37 changes: 21 additions & 16 deletions graphgen/models/generator/aggregated_generator.py
@@ -1,4 +1,5 @@
-from typing import Any
+import re
+from typing import Any, Optional
 
 from graphgen.bases import BaseGenerator
 from graphgen.templates import AGGREGATED_GENERATION_PROMPT
@@ -56,19 +57,21 @@ def build_prompt(
         return prompt
 
     @staticmethod
-    def parse_rephrased_text(response: str) -> str:
+    def parse_rephrased_text(response: str) -> Optional[str]:
         """
         Parse the rephrased text from the response.
         :param response:
         :return: rephrased text
         """
-        if "Rephrased Text:" in response:
-            rephrased_text = response.split("Rephrased Text:")[1].strip()
-        elif "重述文本:" in response:
-            rephrased_text = response.split("重述文本:")[1].strip()
+        rephrased_match = re.search(
+            r"<rephrased_text>(.*?)</rephrased_text>", response, re.DOTALL
+        )
+        if rephrased_match:
+            rephrased_text = rephrased_match.group(1).strip()
         else:
-            rephrased_text = response.strip()
-        return rephrased_text.strip('"')
+            logger.warning("Failed to parse rephrased text from response: %s", response)
+            return None
+        return rephrased_text.strip('"').strip("'")
 
     @staticmethod
     def _build_prompt_for_question_generation(answer: str) -> str:
@@ -85,15 +88,13 @@ def _build_prompt_for_question_generation(answer: str) -> str:
 
     @staticmethod
     def parse_response(response: str) -> dict:
-        if response.startswith("Question:"):
-            question = response[len("Question:") :].strip()
-        elif response.startswith("问题:"):
-            question = response[len("问题:") :].strip()
+        question_match = re.search(r"<question>(.*?)</question>", response, re.DOTALL)
+        if question_match:
+            question = question_match.group(1).strip()
         else:
-            question = response.strip()
-        return {
-            "question": question,
-        }
+            logger.warning("Failed to parse question from response: %s", response)
+            return {"question": ""}
+        return {"question": question.strip('"').strip("'")}
 
     async def generate(
         self,
@@ -110,9 +111,13 @@ async def generate(
         rephrasing_prompt = self.build_prompt(batch)
         response = await self.llm_client.generate_answer(rephrasing_prompt)
         context = self.parse_rephrased_text(response)
+        if not context:
+            return result
         question_generation_prompt = self._build_prompt_for_question_generation(context)
         response = await self.llm_client.generate_answer(question_generation_prompt)
         question = self.parse_response(response)["question"]
+        if not question:
+            return result
         logger.debug("Question: %s", question)
         logger.debug("Answer: %s", context)
         qa_pairs = {
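The change above replaces locale-specific prefix splitting ("Rephrased Text:" / "重述文本:", the Chinese equivalent) with a single tag-based contract. A minimal sketch of what the parser now expects, with a hypothetical model response (the real one is produced by AGGREGATED_GENERATION_PROMPT):

import re
from typing import Optional

def parse_rephrased_text(response: str) -> Optional[str]:
    # Mirrors the parser above: grab the tag body, then trim surrounding quotes.
    match = re.search(r"<rephrased_text>(.*?)</rephrased_text>", response, re.DOTALL)
    if match:
        return match.group(1).strip().strip('"').strip("'")
    return None  # generate() treats None as "skip this batch"

response = "Sure!\n<rephrased_text>\nParis is the capital of France.\n</rephrased_text>"
assert parse_rephrased_text(response) == "Paris is the capital of France."
assert parse_rephrased_text("no tags here") is None

One tag scheme now covers both the English and Chinese prompts, and an unparseable completion surfaces as None instead of being passed through verbatim.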
18 changes: 10 additions & 8 deletions graphgen/models/generator/atomic_generator.py
@@ -1,3 +1,4 @@
+import re
 from typing import Any
 
 from graphgen.bases import BaseGenerator
@@ -29,17 +30,18 @@ def parse_response(response: str) -> dict:
         :param response:
         :return:
         """
-        if "Question:" in response and "Answer:" in response:
-            question = response.split("Question:")[1].split("Answer:")[0].strip()
-            answer = response.split("Answer:")[1].strip()
-        elif "问题:" in response and "答案:" in response:
-            question = response.split("问题:")[1].split("答案:")[0].strip()
-            answer = response.split("答案:")[1].strip()
+        question_match = re.search(r"<question>(.*?)</question>", response, re.DOTALL)
+        answer_match = re.search(r"<answer>(.*?)</answer>", response, re.DOTALL)
+
+        if question_match and answer_match:
+            question = question_match.group(1).strip()
+            answer = answer_match.group(1).strip()
         else:
             logger.warning("Failed to parse response: %s", response)
             return {}
-        question = question.strip('"')
-        answer = answer.strip('"')
+
+        question = question.strip('"').strip("'")
+        answer = answer.strip('"').strip("'")
         logger.debug("Question: %s", question)
         logger.debug("Answer: %s", answer)
         return {
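The atomic generator gets the same treatment. The detail that matters is re.DOTALL: by default "." stops at newlines, so a multi-line answer would silently fail to match. A small illustration with made-up text:

import re

response = """<question>What is GraphGen?</question>
<answer>A synthetic-data framework.
It builds QA pairs from knowledge graphs.</answer>"""

plain = re.search(r"<answer>(.*?)</answer>", response)
dotall = re.search(r"<answer>(.*?)</answer>", response, re.DOTALL)
assert plain is None  # the newline inside the answer defeats "."
assert "knowledge graphs" in dotall.group(1)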
33 changes: 20 additions & 13 deletions graphgen/models/generator/cot_generator.py
@@ -1,3 +1,4 @@
+import re
 from typing import Any
 
 from graphgen.bases import BaseGenerator
@@ -67,22 +68,26 @@ def build_prompt_for_cot_generation(
 
     @staticmethod
     def parse_response(response: str) -> dict:
-        if "Question:" in response and "Reasoning-Path Design:" in response:
-            question = (
-                response.split("Question:")[1]
-                .split("Reasoning-Path Design:")[0]
-                .strip()
-            )
-            reasoning_path = response.split("Reasoning-Path Design:")[1].strip()
-        elif "问题:" in response and "推理路径设计:" in response:
-            question = response.split("问题:")[1].split("推理路径设计:")[0].strip()
-            reasoning_path = response.split("推理路径设计:")[1].strip()
+        """
+        Parse CoT template from response.
+        :param response:
+        :return: dict with question and reasoning_path
+        """
+        question_match = re.search(r"<question>(.*?)</question>", response, re.DOTALL)
+        reasoning_path_match = re.search(
+            r"<reasoning_path>(.*?)</reasoning_path>", response, re.DOTALL
+        )
+
+        if question_match and reasoning_path_match:
+            question = question_match.group(1).strip()
+            reasoning_path = reasoning_path_match.group(1).strip()
         else:
-            logger.warning("Failed to parse CoT template: %s", response)
+            logger.warning("Failed to parse response: %s", response)
             return {}
 
-        question = question.strip('"')
-        reasoning_path = reasoning_path.strip('"')
+        question = question.strip('"').strip("'")
+        reasoning_path = reasoning_path.strip('"').strip("'")
 
         logger.debug("CoT Question: %s", question)
         logger.debug("CoT Reasoning Path: %s", reasoning_path)
         return {
@@ -105,6 +110,8 @@ async def generate(
         prompt = self.build_prompt(batch)
         response = await self.llm_client.generate_answer(prompt)
         response = self.parse_response(response)
+        if not response:
+            return result
         question, reasoning_path = response["question"], response["reasoning_path"]
         prompt = self.build_prompt_for_cot_generation(batch, question, reasoning_path)
         cot_answer = await self.llm_client.generate_answer(prompt)
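parse_response now reports failure as an empty dict, and generate() checks it before indexing; previously a malformed completion reached response["question"] and raised KeyError. A simplified sketch of that failure path (parse_cot is a stand-in name for the method above):

import re

def parse_cot(response: str) -> dict:
    q = re.search(r"<question>(.*?)</question>", response, re.DOTALL)
    p = re.search(r"<reasoning_path>(.*?)</reasoning_path>", response, re.DOTALL)
    if q and p:
        return {"question": q.group(1).strip(), "reasoning_path": p.group(1).strip()}
    return {}  # empty dict signals "unparseable"

parsed = parse_cot("malformed completion with no tags")
if not parsed:
    pass  # generate() returns early here instead of raising KeyError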
18 changes: 10 additions & 8 deletions graphgen/models/generator/multi_hop_generator.py
@@ -1,3 +1,4 @@
+import re
 from typing import Any
 
 from graphgen.bases import BaseGenerator
@@ -32,17 +33,18 @@ def build_prompt(
 
     @staticmethod
     def parse_response(response: str) -> dict:
-        if "Question:" in response and "Answer:" in response:
-            question = response.split("Question:")[1].split("Answer:")[0].strip()
-            answer = response.split("Answer:")[1].strip()
-        elif "问题:" in response and "答案:" in response:
-            question = response.split("问题:")[1].split("答案:")[0].strip()
-            answer = response.split("答案:")[1].strip()
+        question_match = re.search(r"<question>(.*?)</question>", response, re.DOTALL)
+        answer_match = re.search(r"<answer>(.*?)</answer>", response, re.DOTALL)
+
+        if question_match and answer_match:
+            question = question_match.group(1).strip()
+            answer = answer_match.group(1).strip()
         else:
             logger.warning("Failed to parse response: %s", response)
             return {}
-        question = question.strip('"')
-        answer = answer.strip('"')
+
+        question = question.strip('"').strip("'")
+        answer = answer.strip('"').strip("'")
         logger.debug("Question: %s", question)
         logger.debug("Answer: %s", answer)
         return {
35 changes: 16 additions & 19 deletions graphgen/models/generator/vqa_generator.py
@@ -1,3 +1,4 @@
+import re
 from typing import Any
 
 from graphgen.bases import BaseGenerator
@@ -38,25 +39,21 @@ def parse_response(response: str) -> Any:
         :return: QA pairs
         """
         qa_pairs = {}
-        qa_list = response.strip().split("\n\n")
-        for qa in qa_list:
-            if "Question:" in qa and "Answer:" in qa:
-                question = qa.split("Question:")[1].split("Answer:")[0].strip()
-                answer = qa.split("Answer:")[1].strip()
-            elif "问题:" in qa and "答案:" in qa:
-                question = qa.split("问题:")[1].split("答案:")[0].strip()
-                answer = qa.split("答案:")[1].strip()
-            else:
-                logger.error("Failed to parse QA pair: %s", qa)
-                continue
-            question = question.strip('"')
-            answer = answer.strip('"')
-            logger.debug("Question: %s", question)
-            logger.debug("Answer: %s", answer)
-            qa_pairs[compute_content_hash(question)] = {
-                "question": question,
-                "answer": answer,
-            }
+        pattern = r"<question>(.*?)</question>\s*<answer>(.*?)</answer>"
+        matches = re.findall(pattern, response, re.DOTALL)
+
+        if matches:
+            for question, answer in matches:
+                question = question.strip().strip('"').strip("'")
+                answer = answer.strip().strip('"').strip("'")
+                logger.debug("Question: %s", question)
+                logger.debug("Answer: %s", answer)
+                qa_pairs[compute_content_hash(question)] = {
+                    "question": question,
+                    "answer": answer,
+                }
+        else:
+            logger.warning("Error parsing the response %s", response)
         return qa_pairs
 
     async def generate(
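Instead of splitting the response on blank lines, the VQA parser now pulls out every adjacent <question>/<answer> pair in one findall pass, tolerating arbitrary whitespace between the tags. A sketch with a made-up response and a stand-in hash (the repo's compute_content_hash is assumed to key each pair on its question text):

import hashlib
import re

def compute_content_hash(text: str) -> str:  # stand-in for the repo helper
    return hashlib.md5(text.encode("utf-8")).hexdigest()

response = """<question>Q1?</question><answer>A1</answer>
<question>Q2?</question>
<answer>A2</answer>"""

pattern = r"<question>(.*?)</question>\s*<answer>(.*?)</answer>"
qa_pairs = {
    compute_content_hash(q.strip()): {"question": q.strip(), "answer": a.strip()}
    for q, a in re.findall(pattern, response, re.DOTALL)
}
assert len(qa_pairs) == 2  # both pairs captured, keyed by content hash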
12 changes: 7 additions & 5 deletions graphgen/models/llm/local/vllm_wrapper.py
@@ -16,7 +16,7 @@ def __init__(
         model: str,
         tensor_parallel_size: int = 1,
         gpu_memory_utilization: float = 0.9,
-        temperature: float = 0.0,
+        temperature: float = 0.6,
         top_p: float = 1.0,
         topk: int = 5,
         **kwargs: Any,
@@ -66,7 +66,7 @@ async def generate_answer(
         sp = self.SamplingParams(
             temperature=self.temperature if self.temperature > 0 else 1.0,
             top_p=self.top_p if self.temperature > 0 else 1.0,
-            max_tokens=extra.get("max_new_tokens", 512),
+            max_tokens=extra.get("max_new_tokens", 2048),
         )
 
         result_generator = self.engine.generate(full_prompt, sp, request_id=request_id)
@@ -82,7 +82,7 @@
 
     async def generate_topk_per_token(
         self, text: str, history: Optional[List[str]] = None, **extra: Any
-    ) -> List[Token]:
+    ) -> List[Token]:
         full_prompt = self._build_inputs(text, history)
         request_id = f"graphgen_topk_{uuid.uuid4()}"
@@ -110,7 +110,9 @@ async def generate_topk_per_token(
 
         candidate_tokens = []
         for _, logprob_obj in top_logprobs.items():
-            tok_str = logprob_obj.decoded_token.strip() if logprob_obj.decoded_token else ""
+            tok_str = (
+                logprob_obj.decoded_token.strip() if logprob_obj.decoded_token else ""
+            )
             prob = float(math.exp(logprob_obj.logprob))
             candidate_tokens.append(Token(tok_str, prob))
@@ -120,7 +122,7 @@
             main_token = Token(
                 text=candidate_tokens[0].text,
                 prob=candidate_tokens[0].prob,
-                top_candidates=candidate_tokens
+                top_candidates=candidate_tokens,
             )
             return [main_token]
         return []
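Two defaults also change in the vLLM wrapper: temperature moves from 0.0 (greedy decoding) to 0.6, and the max_tokens fallback grows from 512 to 2048, presumably to leave room for the longer tag-wrapped completions the new parsers expect. The visible sampling logic, sketched with a plain dict standing in for vLLM's SamplingParams:

def effective_sampling(temperature: float, top_p: float, **extra) -> dict:
    # temperature <= 0 pins both knobs to 1.0, as in generate_answer above
    return {
        "temperature": temperature if temperature > 0 else 1.0,
        "top_p": top_p if temperature > 0 else 1.0,
        "max_tokens": extra.get("max_new_tokens", 2048),  # was 512
    }

assert effective_sampling(0.6, 1.0)["max_tokens"] == 2048
assert effective_sampling(0.6, 0.9, max_new_tokens=4096)["max_tokens"] == 4096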
3 changes: 3 additions & 0 deletions graphgen/operators/generate/generate_service.py
@@ -61,6 +61,9 @@ def generate(self, items: list[dict]) -> list[dict]:
                 unit="batch",
             )
 
+        # Filter out empty results
+        results = [res for res in results if res]
+
         results = self.generator.format_generation_results(
             results, output_data_format=self.data_format
         )
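Since the generators above may now return empty results for unparseable completions, the service drops falsy entries before formatting. A toy illustration:

results = [{"question": "Q1", "answer": "A1"}, {}, None]
results = [res for res in results if res]  # keeps only the non-empty result
assert len(results) == 1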
8 changes: 4 additions & 4 deletions graphgen/operators/read/read.py
@@ -50,7 +50,7 @@ def _build_reader(suffix: str, cache_dir: str | None, **reader_kwargs):
 def read(
     input_path: Union[str, List[str]],
     allowed_suffix: Optional[List[str]] = None,
-    cache_dir: Optional[str] = "cache",
+    working_dir: Optional[str] = "cache",
     parallelism: int = 4,
     recursive: bool = True,
     **reader_kwargs: Any,
@@ -60,7 +60,7 @@ def read(
 
     :param input_path: File or directory path(s) to read from
     :param allowed_suffix: List of allowed file suffixes (e.g., ['pdf', 'txt'])
-    :param cache_dir: Directory to cache intermediate files (PDF processing)
+    :param working_dir: Directory to cache intermediate files (PDF processing)
     :param parallelism: Number of parallel workers
     :param recursive: Whether to scan directories recursively
     :param reader_kwargs: Additional kwargs passed to readers
@@ -70,7 +70,7 @@ def read(
     # 1. Scan all paths to discover files
     logger.info("[READ] Scanning paths: %s", input_path)
     scanner = ParallelFileScanner(
-        cache_dir=cache_dir,
+        cache_dir=working_dir,
         allowed_suffix=allowed_suffix,
         rescan=False,
         max_workers=parallelism if parallelism > 0 else 1,
@@ -100,7 +100,7 @@ def read(
     # 3. Create read tasks
     read_tasks = []
     for suffix, file_paths in files_by_suffix.items():
-        reader = _build_reader(suffix, cache_dir, **reader_kwargs)
+        reader = _build_reader(suffix, working_dir, **reader_kwargs)
         ds = reader.read(file_paths)
         read_tasks.append(ds)
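The public read() keyword is renamed from cache_dir to working_dir; internally the value is still forwarded as cache_dir to ParallelFileScanner and _build_reader, so only keyword call sites need updating. A hypothetical invocation under the new signature (paths and suffixes are illustrative):

from graphgen.operators.read.read import read  # import path inferred from the file location

docs = read(
    input_path="data/corpus",      # file or directory path(s)
    allowed_suffix=["pdf", "txt"],
    working_dir="cache",           # formerly cache_dir
    parallelism=4,
    recursive=True,
)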