diff --git a/hindsight-api/hindsight_api/api/http.py b/hindsight-api/hindsight_api/api/http.py index c27acdd0..76ee9861 100644 --- a/hindsight-api/hindsight_api/api/http.py +++ b/hindsight-api/hindsight_api/api/http.py @@ -281,6 +281,13 @@ class RecallResponse(BaseModel): chunks: dict[str, ChunkData] | None = Field(default=None, description="Chunks for facts, keyed by chunk_id") +class EntityInput(BaseModel): + """Entity to associate with retained content.""" + + text: str = Field(description="The entity name/text") + type: str | None = Field(default=None, description="Optional entity type (e.g., 'PERSON', 'ORG', 'CONCEPT')") + + class MemoryItem(BaseModel): """Single memory item for retain.""" @@ -292,6 +299,7 @@ class MemoryItem(BaseModel): "context": "team meeting", "metadata": {"source": "slack", "channel": "engineering"}, "document_id": "meeting_notes_2024_01_15", + "entities": [{"text": "Alice"}, {"text": "ML model", "type": "CONCEPT"}], } }, ) @@ -301,6 +309,10 @@ class MemoryItem(BaseModel): context: str | None = None metadata: dict[str, str] | None = None document_id: str | None = Field(default=None, description="Optional document ID for this memory item.") + entities: list[EntityInput] | None = Field( + default=None, + description="Optional entities to combine with auto-extracted entities.", + ) @field_validator("timestamp", mode="before") @classmethod @@ -1986,6 +1998,10 @@ async def api_retain( content_dict["metadata"] = item.metadata if item.document_id: content_dict["document_id"] = item.document_id + if item.entities: + content_dict["entities"] = [ + {"text": e.text, "type": e.type or "CONCEPT"} for e in item.entities + ] contents.append(content_dict) if request.async_: diff --git a/hindsight-api/hindsight_api/engine/retain/entity_processing.py b/hindsight-api/hindsight_api/engine/retain/entity_processing.py index 12e6409a..5200f9ea 100644 --- a/hindsight-api/hindsight_api/engine/retain/entity_processing.py +++ b/hindsight-api/hindsight_api/engine/retain/entity_processing.py @@ -13,16 +13,23 @@ async def process_entities_batch( - entity_resolver, conn, bank_id: str, unit_ids: list[str], facts: list[ProcessedFact], log_buffer: list[str] = None + entity_resolver, + conn, + bank_id: str, + unit_ids: list[str], + facts: list[ProcessedFact], + log_buffer: list[str] = None, + user_entities_per_content: dict[int, list[dict]] = None, ) -> list[EntityLink]: """ Process entities for all facts and create entity links. This function: 1. Extracts entity mentions from fact texts - 2. Resolves entity names to canonical entities - 3. Creates entity records in the database - 4. Returns entity links ready for insertion + 2. Merges user-provided entities with LLM-extracted entities + 3. Resolves entity names to canonical entities + 4. Creates entity records in the database + 5. Returns entity links ready for insertion Args: entity_resolver: EntityResolver instance for entity resolution @@ -31,6 +38,7 @@ async def process_entities_batch( unit_ids: List of unit IDs (same length as facts) facts: List of ProcessedFact objects log_buffer: Optional buffer for detailed logging + user_entities_per_content: Dict mapping content_index to list of user-provided entities Returns: List of EntityLink objects for batch insertion @@ -41,14 +49,33 @@ async def process_entities_batch( if len(unit_ids) != len(facts): raise ValueError(f"Mismatch between unit_ids ({len(unit_ids)}) and facts ({len(facts)})") + user_entities_per_content = user_entities_per_content or {} + # Extract data for link_utils function fact_texts = [fact.fact_text for fact in facts] # Use occurred_start if available, otherwise use mentioned_at for entity timestamps fact_dates = [fact.occurred_start if fact.occurred_start is not None else fact.mentioned_at for fact in facts] - # Convert EntityRef objects to dict format expected by link_utils - entities_per_fact = [ - [{"text": entity.name, "type": "CONCEPT"} for entity in (fact.entities or [])] for fact in facts - ] + + # Convert EntityRef objects to dict format and merge with user-provided entities + entities_per_fact = [] + for fact in facts: + # Start with LLM-extracted entities + llm_entities = [{"text": entity.name, "type": "CONCEPT"} for entity in (fact.entities or [])] + + # Get user entities for this content (use content_index from fact) + user_entities = user_entities_per_content.get(fact.content_index, []) + + # Merge with case-insensitive deduplication + seen_texts = {e["text"].lower() for e in llm_entities} + for user_entity in user_entities: + if user_entity["text"].lower() not in seen_texts: + llm_entities.append({ + "text": user_entity["text"], + "type": user_entity.get("type", "CONCEPT"), + }) + seen_texts.add(user_entity["text"].lower()) + + entities_per_fact.append(llm_entities) # Use existing link_utils function for entity processing entity_links = await link_utils.extract_entities_batch_optimized( diff --git a/hindsight-api/hindsight_api/engine/retain/orchestrator.py b/hindsight-api/hindsight_api/engine/retain/orchestrator.py index 133d1090..7cebf666 100644 --- a/hindsight-api/hindsight_api/engine/retain/orchestrator.py +++ b/hindsight-api/hindsight_api/engine/retain/orchestrator.py @@ -91,6 +91,7 @@ async def retain_batch( context=item.get("context", ""), event_date=item.get("event_date") or utcnow(), metadata=item.get("metadata", {}), + entities=item.get("entities", []), ) contents.append(content) @@ -352,8 +353,18 @@ async def retain_batch( # Process entities step_start = time.time() + # Build map of content_index -> user entities for merging + user_entities_per_content = { + idx: content.entities for idx, content in enumerate(contents) if content.entities + } entity_links = await entity_processing.process_entities_batch( - entity_resolver, conn, bank_id, unit_ids, non_duplicate_facts, log_buffer + entity_resolver, + conn, + bank_id, + unit_ids, + non_duplicate_facts, + log_buffer, + user_entities_per_content=user_entities_per_content, ) log_buffer.append(f"[6] Process entities: {len(entity_links)} links in {time.time() - step_start:.3f}s") diff --git a/hindsight-api/hindsight_api/engine/retain/types.py b/hindsight-api/hindsight_api/engine/retain/types.py index d6b5df1c..b0bfef17 100644 --- a/hindsight-api/hindsight_api/engine/retain/types.py +++ b/hindsight-api/hindsight_api/engine/retain/types.py @@ -20,6 +20,7 @@ class RetainContentDict(TypedDict, total=False): event_date: When the content occurred (optional, defaults to now) metadata: Custom key-value metadata (optional) document_id: Document ID for this content item (optional) + entities: User-provided entities to merge with extracted entities (optional) """ content: str # Required @@ -27,6 +28,7 @@ class RetainContentDict(TypedDict, total=False): event_date: datetime metadata: dict[str, str] document_id: str + entities: list[dict[str, str]] # [{"text": "...", "type": "..."}] def _now_utc() -> datetime: @@ -46,6 +48,7 @@ class RetainContent: context: str = "" event_date: datetime = field(default_factory=_now_utc) metadata: dict[str, str] = field(default_factory=dict) + entities: list[dict[str, str]] = field(default_factory=list) # User-provided entities @dataclass @@ -152,6 +155,9 @@ class ProcessedFact: # DB fields (set after insertion) unit_id: UUID | None = None + # Track which content this fact came from (for user entity merging) + content_index: int = 0 + @property def is_duplicate(self) -> bool: """Check if this fact was marked as a duplicate.""" @@ -194,6 +200,7 @@ def from_extracted_fact( entities=entities, causal_relations=extracted_fact.causal_relations, chunk_id=chunk_id, + content_index=extracted_fact.content_index, ) diff --git a/hindsight-clients/python/hindsight_client/hindsight_client.py b/hindsight-clients/python/hindsight_client/hindsight_client.py index 6af56599..ae54f0e5 100644 --- a/hindsight-clients/python/hindsight_client/hindsight_client.py +++ b/hindsight-clients/python/hindsight_client/hindsight_client.py @@ -112,6 +112,7 @@ def retain( context: Optional[str] = None, document_id: Optional[str] = None, metadata: Optional[Dict[str, str]] = None, + entities: Optional[List[Dict[str, str]]] = None, ) -> RetainResponse: """ Store a single memory (simplified interface). @@ -123,13 +124,14 @@ def retain( context: Optional context description document_id: Optional document ID for grouping metadata: Optional user-defined metadata + entities: Optional list of entities [{"text": "...", "type": "..."}] Returns: RetainResponse with success status """ return self.retain_batch( bank_id=bank_id, - items=[{"content": content, "timestamp": timestamp, "context": context, "metadata": metadata}], + items=[{"content": content, "timestamp": timestamp, "context": context, "metadata": metadata, "entities": entities}], document_id=document_id, ) @@ -145,24 +147,34 @@ def retain_batch( Args: bank_id: The memory bank ID - items: List of memory items with 'content' and optional 'timestamp', 'context', 'metadata', 'document_id' + items: List of memory items with 'content' and optional 'timestamp', 'context', 'metadata', 'document_id', 'entities' document_id: Optional document ID for grouping memories (applied to items that don't have their own) retain_async: If True, process asynchronously in background (default: False) Returns: RetainResponse with success status and item count """ - memory_items = [ - memory_item.MemoryItem( - content=item["content"], - timestamp=item.get("timestamp"), - context=item.get("context"), - metadata=item.get("metadata"), - # Use item's document_id if provided, otherwise fall back to batch-level document_id - document_id=item.get("document_id") or document_id, + from hindsight_client_api.models.entity_input import EntityInput + + memory_items = [] + for item in items: + entities = None + if item.get("entities"): + entities = [ + EntityInput(text=e["text"], type=e.get("type")) + for e in item["entities"] + ] + memory_items.append( + memory_item.MemoryItem( + content=item["content"], + timestamp=item.get("timestamp"), + context=item.get("context"), + metadata=item.get("metadata"), + # Use item's document_id if provided, otherwise fall back to batch-level document_id + document_id=item.get("document_id") or document_id, + entities=entities, + ) ) - for item in items - ] request_obj = retain_request.RetainRequest( items=memory_items, @@ -312,24 +324,34 @@ async def aretain_batch( Args: bank_id: The memory bank ID - items: List of memory items with 'content' and optional 'timestamp', 'context', 'metadata', 'document_id' + items: List of memory items with 'content' and optional 'timestamp', 'context', 'metadata', 'document_id', 'entities' document_id: Optional document ID for grouping memories (applied to items that don't have their own) retain_async: If True, process asynchronously in background (default: False) Returns: RetainResponse with success status and item count """ - memory_items = [ - memory_item.MemoryItem( - content=item["content"], - timestamp=item.get("timestamp"), - context=item.get("context"), - metadata=item.get("metadata"), - # Use item's document_id if provided, otherwise fall back to batch-level document_id - document_id=item.get("document_id") or document_id, + from hindsight_client_api.models.entity_input import EntityInput + + memory_items = [] + for item in items: + entities = None + if item.get("entities"): + entities = [ + EntityInput(text=e["text"], type=e.get("type")) + for e in item["entities"] + ] + memory_items.append( + memory_item.MemoryItem( + content=item["content"], + timestamp=item.get("timestamp"), + context=item.get("context"), + metadata=item.get("metadata"), + # Use item's document_id if provided, otherwise fall back to batch-level document_id + document_id=item.get("document_id") or document_id, + entities=entities, + ) ) - for item in items - ] request_obj = retain_request.RetainRequest( items=memory_items, @@ -346,6 +368,7 @@ async def aretain( context: Optional[str] = None, document_id: Optional[str] = None, metadata: Optional[Dict[str, str]] = None, + entities: Optional[List[Dict[str, str]]] = None, ) -> RetainResponse: """ Store a single memory (async). @@ -357,13 +380,14 @@ async def aretain( context: Optional context description document_id: Optional document ID for grouping metadata: Optional user-defined metadata + entities: Optional list of entities [{"text": "...", "type": "..."}] Returns: RetainResponse with success status """ return await self.aretain_batch( bank_id=bank_id, - items=[{"content": content, "timestamp": timestamp, "context": context, "metadata": metadata}], + items=[{"content": content, "timestamp": timestamp, "context": context, "metadata": metadata, "entities": entities}], document_id=document_id, ) diff --git a/hindsight-clients/python/hindsight_client_api/models/__init__.py b/hindsight-clients/python/hindsight_client_api/models/__init__.py index 0dde8e0c..794bf21e 100644 --- a/hindsight-clients/python/hindsight_client_api/models/__init__.py +++ b/hindsight-clients/python/hindsight_client_api/models/__init__.py @@ -31,6 +31,7 @@ from hindsight_client_api.models.document_response import DocumentResponse from hindsight_client_api.models.entity_detail_response import EntityDetailResponse from hindsight_client_api.models.entity_include_options import EntityIncludeOptions +from hindsight_client_api.models.entity_input import EntityInput from hindsight_client_api.models.entity_list_item import EntityListItem from hindsight_client_api.models.entity_list_response import EntityListResponse from hindsight_client_api.models.entity_observation_response import EntityObservationResponse diff --git a/hindsight-clients/python/hindsight_client_api/models/entity_input.py b/hindsight-clients/python/hindsight_client_api/models/entity_input.py new file mode 100644 index 00000000..d9af3b92 --- /dev/null +++ b/hindsight-clients/python/hindsight_client_api/models/entity_input.py @@ -0,0 +1,82 @@ +# coding: utf-8 + +""" + Hindsight HTTP API + + HTTP API for Hindsight + + The version of the OpenAPI document: 0.1.0 + Generated by OpenAPI Generator (https://openapi-generator.tech) + + Do not edit the class manually. +""" # noqa: E501 + + +from __future__ import annotations +import pprint +import re # noqa: F401 +import json + +from pydantic import BaseModel, ConfigDict, Field, StrictStr +from typing import Any, ClassVar, Dict, List, Optional +from typing import Optional, Set +from typing_extensions import Self + + +class EntityInput(BaseModel): + """ + Entity to associate with retained content. + """ # noqa: E501 + + text: StrictStr = Field(description="The entity name/text") + type: Optional[StrictStr] = Field( + default=None, description="Optional entity type (e.g., 'PERSON', 'ORG', 'CONCEPT')" + ) + __properties: ClassVar[List[str]] = ["text", "type"] + + model_config = ConfigDict( + populate_by_name=True, + validate_assignment=True, + protected_namespaces=(), + ) + + def to_str(self) -> str: + """Returns the string representation of the model using alias""" + return pprint.pformat(self.model_dump(by_alias=True)) + + def to_json(self) -> str: + """Returns the JSON representation of the model using alias""" + return json.dumps(self.to_dict()) + + @classmethod + def from_json(cls, json_str: str) -> Optional[Self]: + """Create an instance of EntityInput from a JSON string""" + return cls.from_dict(json.loads(json_str)) + + def to_dict(self) -> Dict[str, Any]: + """Return the dictionary representation of the model using alias.""" + excluded_fields: Set[str] = set([]) + + _dict = self.model_dump( + by_alias=True, + exclude=excluded_fields, + exclude_none=True, + ) + # set to None if type (nullable) is None + # and model_fields_set contains the field + if self.type is None and "type" in self.model_fields_set: + _dict["type"] = None + + return _dict + + @classmethod + def from_dict(cls, obj: Optional[Dict[str, Any]]) -> Optional[Self]: + """Create an instance of EntityInput from a dict""" + if obj is None: + return None + + if not isinstance(obj, dict): + return cls.model_validate(obj) + + _obj = cls.model_validate({"text": obj.get("text"), "type": obj.get("type")}) + return _obj diff --git a/hindsight-clients/python/hindsight_client_api/models/memory_item.py b/hindsight-clients/python/hindsight_client_api/models/memory_item.py index 9fe7507c..51eb824d 100644 --- a/hindsight-clients/python/hindsight_client_api/models/memory_item.py +++ b/hindsight-clients/python/hindsight_client_api/models/memory_item.py @@ -18,10 +18,11 @@ import json from datetime import datetime -from pydantic import BaseModel, ConfigDict, StrictStr +from pydantic import BaseModel, ConfigDict, Field, StrictStr from typing import Any, ClassVar, Dict, List, Optional from typing import Optional, Set from typing_extensions import Self +from hindsight_client_api.models.entity_input import EntityInput class MemoryItem(BaseModel): """ @@ -32,7 +33,11 @@ class MemoryItem(BaseModel): context: Optional[StrictStr] = None metadata: Optional[Dict[str, StrictStr]] = None document_id: Optional[StrictStr] = None - __properties: ClassVar[List[str]] = ["content", "timestamp", "context", "metadata", "document_id"] + entities: Optional[List[EntityInput]] = Field( + default=None, + description="Optional entities to combine with auto-extracted entities." + ) + __properties: ClassVar[List[str]] = ["content", "timestamp", "context", "metadata", "document_id", "entities"] model_config = ConfigDict( populate_by_name=True, @@ -93,6 +98,11 @@ def to_dict(self) -> Dict[str, Any]: if self.document_id is None and "document_id" in self.model_fields_set: _dict['document_id'] = None + # set to None if entities (nullable) is None + # and model_fields_set contains the field + if self.entities is None and "entities" in self.model_fields_set: + _dict['entities'] = None + return _dict @classmethod @@ -109,7 +119,8 @@ def from_dict(cls, obj: Optional[Dict[str, Any]]) -> Optional[Self]: "timestamp": obj.get("timestamp"), "context": obj.get("context"), "metadata": obj.get("metadata"), - "document_id": obj.get("document_id") + "document_id": obj.get("document_id"), + "entities": [EntityInput.from_dict(_item) for _item in obj["entities"]] if obj.get("entities") is not None else None }) return _obj diff --git a/hindsight-clients/typescript/src/index.ts b/hindsight-clients/typescript/src/index.ts index 4078a473..4a509380 100644 --- a/hindsight-clients/typescript/src/index.ts +++ b/hindsight-clients/typescript/src/index.ts @@ -50,12 +50,18 @@ export interface HindsightClientOptions { apiKey?: string; } +export interface EntityInput { + text: string; + type?: string; +} + export interface MemoryItemInput { content: string; timestamp?: string | Date; context?: string; metadata?: Record; document_id?: string; + entities?: EntityInput[]; } export class HindsightClient { @@ -78,9 +84,23 @@ export class HindsightClient { async retain( bankId: string, content: string, - options?: { timestamp?: Date | string; context?: string; metadata?: Record; documentId?: string; async?: boolean } + options?: { + timestamp?: Date | string; + context?: string; + metadata?: Record; + documentId?: string; + async?: boolean; + entities?: EntityInput[]; + } ): Promise { - const item: { content: string; timestamp?: string; context?: string; metadata?: Record; document_id?: string } = { content }; + const item: { + content: string; + timestamp?: string; + context?: string; + metadata?: Record; + document_id?: string; + entities?: EntityInput[]; + } = { content }; if (options?.timestamp) { item.timestamp = options.timestamp instanceof Date @@ -96,6 +116,9 @@ export class HindsightClient { if (options?.documentId) { item.document_id = options.documentId; } + if (options?.entities) { + item.entities = options.entities; + } const response = await sdk.retainMemories({ client: this.client, @@ -115,6 +138,7 @@ export class HindsightClient { context: item.context, metadata: item.metadata, document_id: item.document_id, + entities: item.entities, timestamp: item.timestamp instanceof Date ? item.timestamp.toISOString() diff --git a/hindsight-control-plane/src/lib/api.ts b/hindsight-control-plane/src/lib/api.ts index 27d6355a..d91b34b6 100644 --- a/hindsight-control-plane/src/lib/api.ts +++ b/hindsight-control-plane/src/lib/api.ts @@ -85,6 +85,9 @@ export class ControlPlaneClient { content: string; timestamp?: string; context?: string; + metadata?: Record; + document_id?: string; + entities?: Array<{ text: string; type?: string }>; }>; document_id?: string; async?: boolean;