refactor: use XML-formatted prompts and tag-based response parsing in generators

ChenZiHong-Gavin · ChenZiHong-Gavin · commit de4da5f44caa · 2025-12-23T20:39:15.000+08:00
diff --git a/graphgen/models/generator/aggregated_generator.py b/graphgen/models/generator/aggregated_generator.py
@@ -1,4 +1,5 @@
-from typing import Any
+import re
+from typing import Any, Optional
 
 from graphgen.bases import BaseGenerator
 from graphgen.templates import AGGREGATED_GENERATION_PROMPT
@@ -56,19 +57,21 @@ def build_prompt(
         return prompt
 
     @staticmethod
-    def parse_rephrased_text(response: str) -> str:
+    def parse_rephrased_text(response: str) -> Optional[str]:
         """
         Parse the rephrased text from the response.
         :param response:
         :return: rephrased text
         """
-        if "Rephrased Text:" in response:
-            rephrased_text = response.split("Rephrased Text:")[1].strip()
-        elif "重述文本:" in response:
-            rephrased_text = response.split("重述文本:")[1].strip()
+        rephrased_match = re.search(
+            r"<rephrased_text>(.*?)</rephrased_text>", response, re.DOTALL
+        )
+        if rephrased_match:
+            rephrased_text = rephrased_match.group(1).strip()
         else:
-            rephrased_text = response.strip()
-        return rephrased_text.strip('"')
+            logger.warning("Failed to parse rephrased text from response: %s", response)
+            return None
+        return rephrased_text.strip('"').strip("'")
 
     @staticmethod
     def _build_prompt_for_question_generation(answer: str) -> str:
@@ -85,15 +88,13 @@ def _build_prompt_for_question_generation(answer: str) -> str:
 
     @staticmethod
     def parse_response(response: str) -> dict:
-        if response.startswith("Question:"):
-            question = response[len("Question:") :].strip()
-        elif response.startswith("问题："):
-            question = response[len("问题：") :].strip()
+        question_match = re.search(r"<question>(.*?)</question>", response, re.DOTALL)
+        if question_match:
+            question = question_match.group(1).strip()
         else:
-            question = response.strip()
-        return {
-            "question": question,
-        }
+            logger.warning("Failed to parse question from response: %s", response)
+            return {"question": ""}
+        return {"question": question.strip('"').strip("'")}
 
     async def generate(
         self,
@@ -110,9 +111,13 @@ async def generate(
         rephrasing_prompt = self.build_prompt(batch)
         response = await self.llm_client.generate_answer(rephrasing_prompt)
         context = self.parse_rephrased_text(response)
+        if not context:
+            return result
         question_generation_prompt = self._build_prompt_for_question_generation(context)
         response = await self.llm_client.generate_answer(question_generation_prompt)
         question = self.parse_response(response)["question"]
+        if not question:
+            return result
         logger.debug("Question: %s", question)
         logger.debug("Answer: %s", context)
         qa_pairs = {
diff --git a/graphgen/models/generator/atomic_generator.py b/graphgen/models/generator/atomic_generator.py
@@ -1,3 +1,4 @@
+import re
 from typing import Any
 
 from graphgen.bases import BaseGenerator
@@ -29,17 +30,18 @@ def parse_response(response: str) -> dict:
         :param response:
         :return:
         """
-        if "Question:" in response and "Answer:" in response:
-            question = response.split("Question:")[1].split("Answer:")[0].strip()
-            answer = response.split("Answer:")[1].strip()
-        elif "问题：" in response and "答案：" in response:
-            question = response.split("问题：")[1].split("答案：")[0].strip()
-            answer = response.split("答案：")[1].strip()
+        question_match = re.search(r"<question>(.*?)</question>", response, re.DOTALL)
+        answer_match = re.search(r"<answer>(.*?)</answer>", response, re.DOTALL)
+
+        if question_match and answer_match:
+            question = question_match.group(1).strip()
+            answer = answer_match.group(1).strip()
         else:
             logger.warning("Failed to parse response: %s", response)
             return {}
-        question = question.strip('"')
-        answer = answer.strip('"')
+
+        question = question.strip('"').strip("'")
+        answer = answer.strip('"').strip("'")
         logger.debug("Question: %s", question)
         logger.debug("Answer: %s", answer)
         return {
diff --git a/graphgen/models/generator/cot_generator.py b/graphgen/models/generator/cot_generator.py
@@ -1,3 +1,4 @@
+import re
 from typing import Any
 
 from graphgen.bases import BaseGenerator
@@ -67,22 +68,26 @@ def build_prompt_for_cot_generation(
 
     @staticmethod
     def parse_response(response: str) -> dict:
-        if "Question:" in response and "Reasoning-Path Design:" in response:
-            question = (
-                response.split("Question:")[1]
-                .split("Reasoning-Path Design:")[0]
-                .strip()
-            )
-            reasoning_path = response.split("Reasoning-Path Design:")[1].strip()
-        elif "问题：" in response and "推理路径设计：" in response:
-            question = response.split("问题：")[1].split("推理路径设计：")[0].strip()
-            reasoning_path = response.split("推理路径设计：")[1].strip()
+        """
+        Parse the question and reasoning path from the XML-tagged response.
+        :param response:
+        :return: dict with "question" and "reasoning_path" keys, or an empty dict if either tag is missing
+        """
+        question_match = re.search(r"<question>(.*?)</question>", response, re.DOTALL)
+        reasoning_path_match = re.search(
+            r"<reasoning_path>(.*?)</reasoning_path>", response, re.DOTALL
+        )
+
+        if question_match and reasoning_path_match:
+            question = question_match.group(1).strip()
+            reasoning_path = reasoning_path_match.group(1).strip()
         else:
-            logger.warning("Failed to parse CoT template: %s", response)
+            logger.warning("Failed to parse response: %s", response)
             return {}
 
-        question = question.strip('"')
-        reasoning_path = reasoning_path.strip('"')
+        question = question.strip('"').strip("'")
+        reasoning_path = reasoning_path.strip('"').strip("'")
+
         logger.debug("CoT Question: %s", question)
         logger.debug("CoT Reasoning Path: %s", reasoning_path)
         return {
@@ -105,6 +110,8 @@ async def generate(
         prompt = self.build_prompt(batch)
         response = await self.llm_client.generate_answer(prompt)
         response = self.parse_response(response)
+        if not response:
+            return result
         question, reasoning_path = response["question"], response["reasoning_path"]
         prompt = self.build_prompt_for_cot_generation(batch, question, reasoning_path)
         cot_answer = await self.llm_client.generate_answer(prompt)
diff --git a/graphgen/models/generator/multi_hop_generator.py b/graphgen/models/generator/multi_hop_generator.py
@@ -1,3 +1,4 @@
+import re
 from typing import Any
 
 from graphgen.bases import BaseGenerator
@@ -32,17 +33,18 @@ def build_prompt(
 
     @staticmethod
     def parse_response(response: str) -> dict:
-        if "Question:" in response and "Answer:" in response:
-            question = response.split("Question:")[1].split("Answer:")[0].strip()
-            answer = response.split("Answer:")[1].strip()
-        elif "问题：" in response and "答案：" in response:
-            question = response.split("问题：")[1].split("答案：")[0].strip()
-            answer = response.split("答案：")[1].strip()
+        question_match = re.search(r"<question>(.*?)</question>", response, re.DOTALL)
+        answer_match = re.search(r"<answer>(.*?)</answer>", response, re.DOTALL)
+
+        if question_match and answer_match:
+            question = question_match.group(1).strip()
+            answer = answer_match.group(1).strip()
         else:
             logger.warning("Failed to parse response: %s", response)
             return {}
-        question = question.strip('"')
-        answer = answer.strip('"')
+
+        question = question.strip('"').strip("'")
+        answer = answer.strip('"').strip("'")
         logger.debug("Question: %s", question)
         logger.debug("Answer: %s", answer)
         return {
diff --git a/graphgen/models/generator/vqa_generator.py b/graphgen/models/generator/vqa_generator.py
@@ -1,3 +1,4 @@
+import re
 from typing import Any
 
 from graphgen.bases import BaseGenerator
@@ -38,25 +39,21 @@ def parse_response(response: str) -> Any:
         :return: QA pairs
         """
         qa_pairs = {}
-        qa_list = response.strip().split("\n\n")
-        for qa in qa_list:
-            if "Question:" in qa and "Answer:" in qa:
-                question = qa.split("Question:")[1].split("Answer:")[0].strip()
-                answer = qa.split("Answer:")[1].strip()
-            elif "问题：" in qa and "答案：" in qa:
-                question = qa.split("问题：")[1].split("答案：")[0].strip()
-                answer = qa.split("答案：")[1].strip()
-            else:
-                logger.error("Failed to parse QA pair: %s", qa)
-                continue
-            question = question.strip('"')
-            answer = answer.strip('"')
-            logger.debug("Question: %s", question)
-            logger.debug("Answer: %s", answer)
-            qa_pairs[compute_content_hash(question)] = {
-                "question": question,
-                "answer": answer,
-            }
+        pattern = r"<question>(.*?)</question>\s*<answer>(.*?)</answer>"
+        matches = re.findall(pattern, response, re.DOTALL)
+
+        if matches:
+            for question, answer in matches:
+                question = question.strip().strip('"').strip("'")
+                answer = answer.strip().strip('"').strip("'")
+                logger.debug("Question: %s", question)
+                logger.debug("Answer: %s", answer)
+                qa_pairs[compute_content_hash(question)] = {
+                    "question": question,
+                    "answer": answer,
+                }
+            return qa_pairs
+        logger.warning("Error parsing the response %s", response)
         return qa_pairs
 
     async def generate(
diff --git a/graphgen/operators/generate/generate_service.py b/graphgen/operators/generate/generate_service.py
@@ -61,6 +61,9 @@ def generate(self, items: list[dict]) -> list[dict]:
             unit="batch",
         )
 
+        # Filter out empty results
+        results = [res for res in results if res]
+
         results = self.generator.format_generation_results(
             results, output_data_format=self.data_format
         )
diff --git a/graphgen/templates/generation/aggregated_generation.py b/graphgen/templates/generation/aggregated_generation.py
@@ -132,6 +132,8 @@
    - Logical consistency throughout
    - Clear cause-and-effect relationships
 
+**Attention: Please directly provide the rephrased text without any additional content or analysis.**
+
 ################
 -ENTITIES-
 ################
@@ -175,6 +177,8 @@
     - 整体逻辑一致性
     - 清晰的因果关系
 
+**注意： 请你直接给出重述文本，不要输出任何额外的内容，也不要进行任何分析。**
+
 ################
 -实体-
 ################
@@ -191,32 +195,52 @@
 ################
 请在下方直接输出连贯的重述文本，不要输出任何额外的内容。
 
+输出格式：
+<rephrased_text>rephrased_text_here</rephrased_text>
+
 重述文本:
 """
 
 REQUIREMENT_EN = """
 ################
 Please directly output the coherent rephrased text below, without any additional content.
 
+Output format:
+<rephrased_text>rephrased_text_here</rephrased_text>
+
 Rephrased Text:
 """
 
 QUESTION_GENERATION_EN: str = """The answer to a question is provided. Please generate a question that corresponds to the answer.
 
-################
-Answer:
-{answer}
-################
+The answer for which a question needs to be generated is as follows:
+<answer>{answer}</answer>
+
+Please note the following requirements:
+1. Only output one question text without any additional explanations or analysis.
+2. Do not repeat the content of the answer or any fragments of it.
+3. The question must be independently understandable and fully match the answer.
+
+Output format:
+<question>question_text</question>
+
 Question:
 """
 
 QUESTION_GENERATION_ZH: str = """下面提供了一个问题的答案，请生成一个与答案对应的问题。
 
-################
-答案：
-{answer}
-################
-问题：
+需要生成问题的答案如下：
+<answer>{answer}</answer>
+
+请注意下列要求：
+1. 仅输出一个问题文本，不得包含任何额外说明或分析
+2. 不得重复答案内容或其中任何片段
+3. 问题必须可独立理解且与答案完全匹配
+
+输出格式：
+<question>question_text</question>
+
+问题:
 """
 
 AGGREGATED_GENERATION_PROMPT = {
diff --git a/graphgen/templates/generation/atomic_generation.py b/graphgen/templates/generation/atomic_generation.py
@@ -1,28 +1,44 @@
 # pylint: disable=C0301
 TEMPLATE_EN: str = """You are given a text passage. Your task is to generate a question and answer (QA) pair based on the content of that text.
-The answer should be accurate and directly derived from the text. Make sure the QA pair is relevant to the main theme or important details of the given text. 
-For example:
-Question: What is the effect of overexpressing the BG1 gene on grain size and development?
-Answer: Overexpression of the BG1 gene leads to significantly increased grain size, demonstrating its role in grain development.
 
-Question: What role does TAC4 play in the gravitropism of rice shoots?
-Answer: TAC4 is a key regulator of gravitropism in rice shoots, promoting the bending of shoots towards the gravity vector.
+Please note the following requirements:
+1. Output only one QA pair without any additional explanations or analysis.
+2. Do not repeat the content of the answer or any part of it.
+3. The answer should be accurate and directly derived from the text. Make sure the QA pair is relevant to the main theme or important details of the given text.
+
+Output format:
+<question>question_text</question>
+<answer>answer_text</answer>
+
+For example:
+<question>What is the effect of overexpressing the BG1 gene on grain size and development?</question>
+<answer>Overexpression of the BG1 gene leads to significantly increased grain size, demonstrating its role in grain development.</answer>
 
 Here is the text passage you need to generate a QA pair for:
 {context}
+
+Output:
 """
 
 TEMPLATE_ZH: str = """给定一个文本段落。你的任务是根据该文本的内容生成一个问答（QA）对。
-答案应准确且直接从文本中得出。确保QA对与给定文本的主题或重要细节相关。
-例如：
-问题：过表达BG1基因对谷粒大小和发育有什么影响？
-答案：BG1基因的过表达显著增加了谷粒大小，表明其在谷物发育中的作用。
 
-问题：TAC4在水稻茎的重力性状中扮演什么角色？
-答案：TAC4是水稻茎重力性状的关键调节因子，促进茎向重力矢量弯曲。
+请注意下列要求：
+1. 仅输出一个问答（QA）对，不得包含任何额外说明或分析
+2. 不得重复答案内容或其中任何片段
+3. 答案应准确且直接从文本中得出。确保QA对与给定文本的主题或重要细节相关。
+
+输出格式如下：
+<question>question_text</question>
+<answer>answer_text</answer>
+
+例如：
+<question>过表达BG1基因对谷粒大小和发育有什么影响？</question>
+<answer>BG1基因的过表达显著增加了谷粒大小，表明其在谷物发育中的作用。</answer>
 
 以下是你需要为其生成QA对的文本段落：
 {context}
+
+输出：
 """
 
 
diff --git a/graphgen/templates/generation/cot_generation.py b/graphgen/templates/generation/cot_generation.py
diff --git a/graphgen/templates/generation/multi_hop_generation.py b/graphgen/templates/generation/multi_hop_generation.py
diff --git a/graphgen/templates/generation/vqa_generation.py b/graphgen/templates/generation/vqa_generation.py

Original file line number	Diff line number	Diff line change
`@@ -61,6 +61,9 @@ def generate(self, items: list[dict]) -> list[dict]:`
`61`	`61`	`unit="batch",`
`62`	`62`	`)`
`63`	`63`
	`64`	`+ # Filter out empty results`
	`65`	`+ results = [res for res in results if res]`
	`66`	`+`
`64`	`67`	`results = self.generator.format_generation_results(`
`65`	`68`	`results, output_data_format=self.data_format`
`66`	`69`	`)`