Skip to content

Commit e22d4e5

Browse files
fix: fix missing content in extraction prompt
1 parent daf7f86 commit e22d4e5

File tree

2 files changed

+13
-4
lines changed

2 files changed

+13
-4
lines changed

graphgen/models/extractor/schema_guided_extractor.py

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -60,7 +60,9 @@ def build_prompt(self, text: str) -> str:
6060
return prompt
6161

6262
async def extract(self, chunk: dict) -> dict:
63-
text = chunk.get("text", "")
63+
_chunk_id = list(chunk.keys())[0]
64+
text = chunk[_chunk_id].get("content", "")
65+
6466
prompt = self.build_prompt(text)
6567
response = await self.llm_client.generate_answer(prompt)
6668
try:
@@ -74,13 +76,20 @@ async def extract(self, chunk: dict) -> dict:
7476
return {}
7577
main_keys_info = {key: extracted_info[key] for key in self.required_keys}
7678
logger.debug("Extracted info: %s", extracted_info)
77-
return {compute_dict_hash(main_keys_info, prefix="extract"): extracted_info}
79+
80+
# add chunk metadata
81+
extracted_info["_chunk_id"] = _chunk_id
82+
83+
return {
84+
compute_dict_hash(main_keys_info, prefix="extract-"): extracted_info
85+
}
7886
except json.JSONDecodeError:
7987
logger.error("Failed to parse extraction response: %s", response)
8088
return {}
8189

90+
@staticmethod
8291
async def merge_extractions(
83-
self, extraction_list: List[Dict[str, dict]]
92+
extraction_list: List[Dict[str, dict]]
8493
) -> Dict[str, dict]:
8594
"""
8695
Merge multiple extraction results based on their hashes.

graphgen/operators/split/split_chunks.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -64,7 +64,7 @@ async def chunk_documents(
6464
compute_content_hash(txt, prefix="chunk-"): {
6565
"content": txt,
6666
"type": "text",
67-
"full_doc_id": doc_key,
67+
"_full_docs_id": doc_key,
6868
"length": len(tokenizer_instance.encode(txt))
6969
if tokenizer_instance
7070
else len(txt),

0 commit comments

Comments
 (0)