fix: include chunk content in extraction prompt

ChenZiHong-Gavin · ChenZiHong-Gavin · commit e22d4e574f65 · 2025-11-07T19:20:49.000+08:00
diff --git a/graphgen/models/extractor/schema_guided_extractor.py b/graphgen/models/extractor/schema_guided_extractor.py
@@ -60,7 +60,9 @@ def build_prompt(self, text: str) -> str:
         return prompt
 
     async def extract(self, chunk: dict) -> dict:
-        text = chunk.get("text", "")
+        _chunk_id = list(chunk.keys())[0]
+        text = chunk[_chunk_id].get("content", "")
+
         prompt = self.build_prompt(text)
         response = await self.llm_client.generate_answer(prompt)
         try:
@@ -74,13 +76,20 @@ async def extract(self, chunk: dict) -> dict:
                 return {}
             main_keys_info = {key: extracted_info[key] for key in self.required_keys}
             logger.debug("Extracted info: %s", extracted_info)
-            return {compute_dict_hash(main_keys_info, prefix="extract"): extracted_info}
+
+            # add chunk metadata
+            extracted_info["_chunk_id"] = _chunk_id
+
+            return {
+                compute_dict_hash(main_keys_info, prefix="extract-"): extracted_info
+            }
         except json.JSONDecodeError:
             logger.error("Failed to parse extraction response: %s", response)
             return {}
 
+    @staticmethod
     async def merge_extractions(
-        self, extraction_list: List[Dict[str, dict]]
+        extraction_list: List[Dict[str, dict]]
     ) -> Dict[str, dict]:
         """
         Merge multiple extraction results based on their hashes.
diff --git a/graphgen/operators/split/split_chunks.py b/graphgen/operators/split/split_chunks.py
@@ -64,7 +64,7 @@ async def chunk_documents(
                 compute_content_hash(txt, prefix="chunk-"): {
                     "content": txt,
                     "type": "text",
-                    "full_doc_id": doc_key,
+                    "_full_docs_id": doc_key,
                     "length": len(tokenizer_instance.encode(txt))
                     if tokenizer_instance
                     else len(txt),