37 changes: 21 additions & 16 deletions graphgen/models/generator/aggregated_generator.py
@@ -1,4 +1,5 @@
-from typing import Any
+import re
+from typing import Any, Optional
 
 from graphgen.bases import BaseGenerator
 from graphgen.templates import AGGREGATED_GENERATION_PROMPT
@@ -56,19 +57,21 @@ def build_prompt(
         return prompt
 
     @staticmethod
-    def parse_rephrased_text(response: str) -> str:
+    def parse_rephrased_text(response: str) -> Optional[str]:
         """
         Parse the rephrased text from the response.
         :param response:
         :return: rephrased text
         """
-        if "Rephrased Text:" in response:
-            rephrased_text = response.split("Rephrased Text:")[1].strip()
-        elif "重述文本:" in response:
-            rephrased_text = response.split("重述文本:")[1].strip()
+        rephrased_match = re.search(
+            r"<rephrased_text>(.*?)</rephrased_text>", response, re.DOTALL
+        )
+        if rephrased_match:
+            rephrased_text = rephrased_match.group(1).strip()
         else:
-            rephrased_text = response.strip()
-        return rephrased_text.strip('"')
+            logger.warning("Failed to parse rephrased text from response: %s", response)
+            return None
+        return rephrased_text.strip('"').strip("'")
 
     @staticmethod
     def _build_prompt_for_question_generation(answer: str) -> str:
@@ -85,15 +88,13 @@ def _build_prompt_for_question_generation(answer: str) -> str:
 
     @staticmethod
     def parse_response(response: str) -> dict:
-        if response.startswith("Question:"):
-            question = response[len("Question:") :].strip()
-        elif response.startswith("问题:"):
-            question = response[len("问题:") :].strip()
+        question_match = re.search(r"<question>(.*?)</question>", response, re.DOTALL)
+        if question_match:
+            question = question_match.group(1).strip()
         else:
-            question = response.strip()
-        return {
-            "question": question,
-        }
+            logger.warning("Failed to parse question from response: %s", response)
+            return {"question": ""}
+        return {"question": question.strip('"').strip("'")}
 
     async def generate(
         self,
@@ -110,9 +111,13 @@ async def generate(
         rephrasing_prompt = self.build_prompt(batch)
         response = await self.llm_client.generate_answer(rephrasing_prompt)
         context = self.parse_rephrased_text(response)
+        if not context:
+            return result
         question_generation_prompt = self._build_prompt_for_question_generation(context)
         response = await self.llm_client.generate_answer(question_generation_prompt)
         question = self.parse_response(response)["question"]
+        if not question:
+            return result
         logger.debug("Question: %s", question)
         logger.debug("Answer: %s", context)
         qa_pairs = {
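The change above replaces locale-specific prefix splitting ("Rephrased Text:" / "重述文本:", the Chinese equivalent) with a single tag-based contract. A minimal sketch of what the parser now expects, with a hypothetical model response (the real one is produced by AGGREGATED_GENERATION_PROMPT):

import re
from typing import Optional

def parse_rephrased_text(response: str) -> Optional[str]:
    # Mirrors the parser above: grab the tag body, then trim surrounding quotes.
    match = re.search(r"<rephrased_text>(.*?)</rephrased_text>", response, re.DOTALL)
    if match:
        return match.group(1).strip().strip('"').strip("'")
    return None  # generate() treats None as "skip this batch"

response = "Sure!\n<rephrased_text>\nParis is the capital of France.\n</rephrased_text>"
assert parse_rephrased_text(response) == "Paris is the capital of France."
assert parse_rephrased_text("no tags here") is None

One tag scheme now covers both the English and Chinese prompts, and an unparseable completion surfaces as None instead of being passed through verbatim.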
18 changes: 10 additions & 8 deletions graphgen/models/generator/atomic_generator.py
@@ -1,3 +1,4 @@
+import re
 from typing import Any
 
 from graphgen.bases import BaseGenerator
@@ -29,17 +30,18 @@ def parse_response(response: str) -> dict:
         :param response:
         :return:
         """
-        if "Question:" in response and "Answer:" in response:
-            question = response.split("Question:")[1].split("Answer:")[0].strip()
-            answer = response.split("Answer:")[1].strip()
-        elif "问题:" in response and "答案:" in response:
-            question = response.split("问题:")[1].split("答案:")[0].strip()
-            answer = response.split("答案:")[1].strip()
+        question_match = re.search(r"<question>(.*?)</question>", response, re.DOTALL)
+        answer_match = re.search(r"<answer>(.*?)</answer>", response, re.DOTALL)
+
+        if question_match and answer_match:
+            question = question_match.group(1).strip()
+            answer = answer_match.group(1).strip()
         else:
             logger.warning("Failed to parse response: %s", response)
             return {}
-        question = question.strip('"')
-        answer = answer.strip('"')
+
+        question = question.strip('"').strip("'")
+        answer = answer.strip('"').strip("'")
         logger.debug("Question: %s", question)
         logger.debug("Answer: %s", answer)
         return {
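The atomic generator gets the same treatment. The detail that matters is re.DOTALL: by default "." stops at newlines, so a multi-line answer would silently fail to match. A small illustration with made-up text:

import re

response = """<question>What is GraphGen?</question>
<answer>A synthetic-data framework.
It builds QA pairs from knowledge graphs.</answer>"""

plain = re.search(r"<answer>(.*?)</answer>", response)
dotall = re.search(r"<answer>(.*?)</answer>", response, re.DOTALL)
assert plain is None  # the newline inside the answer defeats "."
assert "knowledge graphs" in dotall.group(1)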
33 changes: 20 additions & 13 deletions graphgen/models/generator/cot_generator.py
@@ -1,3 +1,4 @@
+import re
 from typing import Any
 
 from graphgen.bases import BaseGenerator
@@ -67,22 +68,26 @@ def build_prompt_for_cot_generation(
 
     @staticmethod
     def parse_response(response: str) -> dict:
-        if "Question:" in response and "Reasoning-Path Design:" in response:
-            question = (
-                response.split("Question:")[1]
-                .split("Reasoning-Path Design:")[0]
-                .strip()
-            )
-            reasoning_path = response.split("Reasoning-Path Design:")[1].strip()
-        elif "问题:" in response and "推理路径设计:" in response:
-            question = response.split("问题:")[1].split("推理路径设计:")[0].strip()
-            reasoning_path = response.split("推理路径设计:")[1].strip()
+        """
+        Parse CoT template from response.
+        :param response:
+        :return: dict with question and reasoning_path
+        """
+        question_match = re.search(r"<question>(.*?)</question>", response, re.DOTALL)
+        reasoning_path_match = re.search(
+            r"<reasoning_path>(.*?)</reasoning_path>", response, re.DOTALL
+        )
+
+        if question_match and reasoning_path_match:
+            question = question_match.group(1).strip()
+            reasoning_path = reasoning_path_match.group(1).strip()
         else:
-            logger.warning("Failed to parse CoT template: %s", response)
+            logger.warning("Failed to parse response: %s", response)
             return {}
 
-        question = question.strip('"')
-        reasoning_path = reasoning_path.strip('"')
+        question = question.strip('"').strip("'")
+        reasoning_path = reasoning_path.strip('"').strip("'")
 
         logger.debug("CoT Question: %s", question)
         logger.debug("CoT Reasoning Path: %s", reasoning_path)
         return {
@@ -105,6 +110,8 @@ async def generate(
         prompt = self.build_prompt(batch)
         response = await self.llm_client.generate_answer(prompt)
         response = self.parse_response(response)
+        if not response:
+            return result
         question, reasoning_path = response["question"], response["reasoning_path"]
         prompt = self.build_prompt_for_cot_generation(batch, question, reasoning_path)
         cot_answer = await self.llm_client.generate_answer(prompt)
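parse_response now reports failure as an empty dict, and generate() checks it before indexing; previously a malformed completion reached response["question"] and raised KeyError. A simplified sketch of that failure path (parse_cot is a stand-in name for the method above):

import re

def parse_cot(response: str) -> dict:
    q = re.search(r"<question>(.*?)</question>", response, re.DOTALL)
    p = re.search(r"<reasoning_path>(.*?)</reasoning_path>", response, re.DOTALL)
    if q and p:
        return {"question": q.group(1).strip(), "reasoning_path": p.group(1).strip()}
    return {}  # empty dict signals "unparseable"

parsed = parse_cot("malformed completion with no tags")
if not parsed:
    pass  # generate() returns early here instead of raising KeyError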
18 changes: 10 additions & 8 deletions graphgen/models/generator/multi_hop_generator.py
@@ -1,3 +1,4 @@
+import re
 from typing import Any
 
 from graphgen.bases import BaseGenerator
@@ -32,17 +33,18 @@ def build_prompt(
 
     @staticmethod
     def parse_response(response: str) -> dict:
-        if "Question:" in response and "Answer:" in response:
-            question = response.split("Question:")[1].split("Answer:")[0].strip()
-            answer = response.split("Answer:")[1].strip()
-        elif "问题:" in response and "答案:" in response:
-            question = response.split("问题:")[1].split("答案:")[0].strip()
-            answer = response.split("答案:")[1].strip()
+        question_match = re.search(r"<question>(.*?)</question>", response, re.DOTALL)
+        answer_match = re.search(r"<answer>(.*?)</answer>", response, re.DOTALL)
+
+        if question_match and answer_match:
+            question = question_match.group(1).strip()
+            answer = answer_match.group(1).strip()
         else:
             logger.warning("Failed to parse response: %s", response)
             return {}
-        question = question.strip('"')
-        answer = answer.strip('"')
+
+        question = question.strip('"').strip("'")
+        answer = answer.strip('"').strip("'")
         logger.debug("Question: %s", question)
         logger.debug("Answer: %s", answer)
         return {
35 changes: 16 additions & 19 deletions graphgen/models/generator/vqa_generator.py
@@ -1,3 +1,4 @@
+import re
 from typing import Any
 
 from graphgen.bases import BaseGenerator
@@ -38,25 +39,21 @@ def parse_response(response: str) -> Any:
         :return: QA pairs
         """
         qa_pairs = {}
-        qa_list = response.strip().split("\n\n")
-        for qa in qa_list:
-            if "Question:" in qa and "Answer:" in qa:
-                question = qa.split("Question:")[1].split("Answer:")[0].strip()
-                answer = qa.split("Answer:")[1].strip()
-            elif "问题:" in qa and "答案:" in qa:
-                question = qa.split("问题:")[1].split("答案:")[0].strip()
-                answer = qa.split("答案:")[1].strip()
-            else:
-                logger.error("Failed to parse QA pair: %s", qa)
-                continue
-            question = question.strip('"')
-            answer = answer.strip('"')
-            logger.debug("Question: %s", question)
-            logger.debug("Answer: %s", answer)
-            qa_pairs[compute_content_hash(question)] = {
-                "question": question,
-                "answer": answer,
-            }
+        pattern = r"<question>(.*?)</question>\s*<answer>(.*?)</answer>"
+        matches = re.findall(pattern, response, re.DOTALL)
+
+        if matches:
+            for question, answer in matches:
+                question = question.strip().strip('"').strip("'")
+                answer = answer.strip().strip('"').strip("'")
+                logger.debug("Question: %s", question)
+                logger.debug("Answer: %s", answer)
+                qa_pairs[compute_content_hash(question)] = {
+                    "question": question,
+                    "answer": answer,
+                }
+        else:
+            logger.warning("Error parsing the response %s", response)
         return qa_pairs
 
     async def generate(
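Instead of splitting the response on blank lines, the VQA parser now pulls out every adjacent <question>/<answer> pair in one findall pass, tolerating arbitrary whitespace between the tags. A sketch with a made-up response and a stand-in hash (the repo's compute_content_hash is assumed to key each pair on its question text):

import hashlib
import re

def compute_content_hash(text: str) -> str:  # stand-in for the repo helper
    return hashlib.md5(text.encode("utf-8")).hexdigest()

response = """<question>Q1?</question><answer>A1</answer>
<question>Q2?</question>
<answer>A2</answer>"""

pattern = r"<question>(.*?)</question>\s*<answer>(.*?)</answer>"
qa_pairs = {
    compute_content_hash(q.strip()): {"question": q.strip(), "answer": a.strip()}
    for q, a in re.findall(pattern, response, re.DOTALL)
}
assert len(qa_pairs) == 2  # both pairs captured, keyed by content hash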
12 changes: 7 additions & 5 deletions graphgen/models/llm/local/vllm_wrapper.py
@@ -16,7 +16,7 @@ def __init__(
         model: str,
         tensor_parallel_size: int = 1,
         gpu_memory_utilization: float = 0.9,
-        temperature: float = 0.0,
+        temperature: float = 0.6,
         top_p: float = 1.0,
         topk: int = 5,
         **kwargs: Any,
@@ -66,7 +66,7 @@ async def generate_answer(
         sp = self.SamplingParams(
             temperature=self.temperature if self.temperature > 0 else 1.0,
             top_p=self.top_p if self.temperature > 0 else 1.0,
-            max_tokens=extra.get("max_new_tokens", 512),
+            max_tokens=extra.get("max_new_tokens", 2048),
         )
 
         result_generator = self.engine.generate(full_prompt, sp, request_id=request_id)
@@ -82,7 +82,7 @@
 
     async def generate_topk_per_token(
         self, text: str, history: Optional[List[str]] = None, **extra: Any
-    ) -> List[Token]:
+    ) -> List[Token]:
         full_prompt = self._build_inputs(text, history)
         request_id = f"graphgen_topk_{uuid.uuid4()}"
@@ -110,7 +110,9 @@ async def generate_topk_per_token(
 
         candidate_tokens = []
         for _, logprob_obj in top_logprobs.items():
-            tok_str = logprob_obj.decoded_token.strip() if logprob_obj.decoded_token else ""
+            tok_str = (
+                logprob_obj.decoded_token.strip() if logprob_obj.decoded_token else ""
+            )
             prob = float(math.exp(logprob_obj.logprob))
             candidate_tokens.append(Token(tok_str, prob))
@@ -120,7 +122,7 @@
             main_token = Token(
                 text=candidate_tokens[0].text,
                 prob=candidate_tokens[0].prob,
-                top_candidates=candidate_tokens
+                top_candidates=candidate_tokens,
             )
             return [main_token]
         return []
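Two defaults also change in the vLLM wrapper: temperature moves from 0.0 (greedy decoding) to 0.6, and the max_tokens fallback grows from 512 to 2048, presumably to leave room for the longer tag-wrapped completions the new parsers expect. The visible sampling logic, sketched with a plain dict standing in for vLLM's SamplingParams:

def effective_sampling(temperature: float, top_p: float, **extra) -> dict:
    # temperature <= 0 pins both knobs to 1.0, as in generate_answer above
    return {
        "temperature": temperature if temperature > 0 else 1.0,
        "top_p": top_p if temperature > 0 else 1.0,
        "max_tokens": extra.get("max_new_tokens", 2048),  # was 512
    }

assert effective_sampling(0.6, 1.0)["max_tokens"] == 2048
assert effective_sampling(0.6, 0.9, max_new_tokens=4096)["max_tokens"] == 4096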
3 changes: 3 additions & 0 deletions graphgen/operators/generate/generate_service.py
@@ -61,6 +61,9 @@ def generate(self, items: list[dict]) -> list[dict]:
                 unit="batch",
             )
 
+        # Filter out empty results
+        results = [res for res in results if res]
+
         results = self.generator.format_generation_results(
             results, output_data_format=self.data_format
         )
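Since the generators above may now return empty results for unparseable completions, the service drops falsy entries before formatting. A toy illustration:

results = [{"question": "Q1", "answer": "A1"}, {}, None]
results = [res for res in results if res]  # keeps only the non-empty result
assert len(results) == 1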
8 changes: 4 additions & 4 deletions graphgen/operators/read/read.py
@@ -50,7 +50,7 @@ def _build_reader(suffix: str, cache_dir: str | None, **reader_kwargs):
 def read(
     input_path: Union[str, List[str]],
     allowed_suffix: Optional[List[str]] = None,
-    cache_dir: Optional[str] = "cache",
+    working_dir: Optional[str] = "cache",
     parallelism: int = 4,
     recursive: bool = True,
     **reader_kwargs: Any,
@@ -60,7 +60,7 @@ def read(
 
     :param input_path: File or directory path(s) to read from
     :param allowed_suffix: List of allowed file suffixes (e.g., ['pdf', 'txt'])
-    :param cache_dir: Directory to cache intermediate files (PDF processing)
+    :param working_dir: Directory to cache intermediate files (PDF processing)
     :param parallelism: Number of parallel workers
     :param recursive: Whether to scan directories recursively
     :param reader_kwargs: Additional kwargs passed to readers
@@ -70,7 +70,7 @@ def read(
     # 1. Scan all paths to discover files
     logger.info("[READ] Scanning paths: %s", input_path)
     scanner = ParallelFileScanner(
-        cache_dir=cache_dir,
+        cache_dir=working_dir,
         allowed_suffix=allowed_suffix,
         rescan=False,
         max_workers=parallelism if parallelism > 0 else 1,
@@ -100,7 +100,7 @@ def read(
     # 3. Create read tasks
     read_tasks = []
     for suffix, file_paths in files_by_suffix.items():
-        reader = _build_reader(suffix, cache_dir, **reader_kwargs)
+        reader = _build_reader(suffix, working_dir, **reader_kwargs)
         ds = reader.read(file_paths)
         read_tasks.append(ds)
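The public read() keyword is renamed from cache_dir to working_dir; internally the value is still forwarded as cache_dir to ParallelFileScanner and _build_reader, so only keyword call sites need updating. A hypothetical invocation under the new signature (paths and suffixes are illustrative):

from graphgen.operators.read.read import read  # import path inferred from the file location

docs = read(
    input_path="data/corpus",      # file or directory path(s)
    allowed_suffix=["pdf", "txt"],
    working_dir="cache",           # formerly cache_dir
    parallelism=4,
    recursive=True,
)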