
Commit 7a1457f

feat(graphgen): add vqa configs
1 parent 010b9ae commit 7a1457f

16 files changed: +120 -65 lines changed

graphgen/configs/aggregated_config.yaml

Lines changed: 2 additions & 2 deletions

@@ -1,5 +1,5 @@
 read:
-  input_file: resources/input_examples/jsonl_demo.jsonl # input file path, support json, jsonl, txt. See resources/input_examples for examples
+  input_file: resources/input_examples/jsonl_demo.jsonl # input file path, support json, jsonl, txt, pdf. See resources/input_examples for examples
 split:
   chunk_size: 1024 # chunk size for text splitting
   chunk_overlap: 100 # chunk overlap for text splitting
@@ -18,5 +18,5 @@ partition: # graph partition configuration
     max_tokens_per_community: 10240 # max tokens per community
     unit_sampling: max_loss # unit sampling strategy, support: random, max_loss, min_loss
 generate:
-  mode: aggregated # atomic, aggregated, multi_hop, cot
+  mode: aggregated # atomic, aggregated, multi_hop, cot, vqa
   data_format: ChatML # Alpaca, Sharegpt, ChatML

graphgen/configs/atomic_config.yaml

Lines changed: 2 additions & 2 deletions

@@ -1,5 +1,5 @@
 read:
-  input_file: resources/input_examples/json_demo.json # input file path, support json, jsonl, txt, csv. See resources/input_examples for examples
+  input_file: resources/input_examples/json_demo.json # input file path, support json, jsonl, txt, pdf. See resources/input_examples for examples
 split:
   chunk_size: 1024 # chunk size for text splitting
   chunk_overlap: 100 # chunk overlap for text splitting
@@ -15,5 +15,5 @@ partition: # graph partition configuration
   method_params:
     max_units_per_community: 1 # atomic partition, one node or edge per community
 generate:
-  mode: atomic # atomic, aggregated, multi_hop, cot
+  mode: atomic # atomic, aggregated, multi_hop, cot, vqa
   data_format: Alpaca # Alpaca, Sharegpt, ChatML

graphgen/configs/cot_config.yaml

Lines changed: 2 additions & 2 deletions

@@ -1,5 +1,5 @@
 read:
-  input_file: resources/input_examples/txt_demo.txt # input file path, support json, jsonl, txt. See resources/input_examples for examples
+  input_file: resources/input_examples/txt_demo.txt # input file path, support json, jsonl, txt, pdf. See resources/input_examples for examples
 split:
   chunk_size: 1024 # chunk size for text splitting
   chunk_overlap: 100 # chunk overlap for text splitting
@@ -15,5 +15,5 @@ partition: # graph partition configuration
     use_lcc: false # whether to use the largest connected component
     random_seed: 42 # random seed for partitioning
 generate:
-  mode: cot # atomic, aggregated, multi_hop, cot
+  mode: cot # atomic, aggregated, multi_hop, cot, vqa
   data_format: Sharegpt # Alpaca, Sharegpt, ChatML

graphgen/configs/multi_hop_config.yaml

Lines changed: 2 additions & 2 deletions

@@ -1,5 +1,5 @@
 read:
-  input_file: resources/input_examples/csv_demo.csv # input file path, support json, jsonl, txt. See resources/input_examples for examples
+  input_file: resources/input_examples/csv_demo.csv # input file path, support json, jsonl, txt, pdf. See resources/input_examples for examples
 split:
   chunk_size: 1024 # chunk size for text splitting
   chunk_overlap: 100 # chunk overlap for text splitting
@@ -18,5 +18,5 @@ partition: # graph partition configuration
     max_tokens_per_community: 10240 # max tokens per community
     unit_sampling: random # unit sampling strategy, support: random, max_loss, min_loss
 generate:
-  mode: multi_hop # strategy for generating multi-hop QA pairs
+  mode: multi_hop # atomic, aggregated, multi_hop, cot, vqa
   data_format: ChatML # Alpaca, Sharegpt, ChatML

graphgen/configs/vqa_config.yaml

Lines changed: 22 additions & 0 deletions

@@ -0,0 +1,22 @@
+read:
+  input_file: resources/input_examples/pdf_demo.pdf # input file path, support json, jsonl, txt, pdf. See resources/input_examples for examples
+split:
+  chunk_size: 1024 # chunk size for text splitting
+  chunk_overlap: 100 # chunk overlap for text splitting
+search: # web search configuration
+  enabled: false # whether to enable web search
+  search_types: ["google"] # search engine types, support: google, bing, uniprot, wikipedia
+quiz_and_judge: # quiz and test whether the LLM masters the knowledge points
+  enabled: true
+  quiz_samples: 2 # number of quiz samples to generate
+  re_judge: false # whether to re-judge the existing quiz samples
+partition: # graph partition configuration
+  method: ece # ece is a custom partition method based on comprehension loss
+  method_params:
+    max_units_per_community: 20 # max nodes and edges per community
+    min_units_per_community: 5 # min nodes and edges per community
+    max_tokens_per_community: 10240 # max tokens per community
+    unit_sampling: max_loss # unit sampling strategy, support: random, max_loss, min_loss
+generate:
+  mode: vqa # atomic, aggregated, multi_hop, cot, vqa
+  data_format: ChatML # Alpaca, Sharegpt, ChatML
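
For reference, a minimal sketch of how a config like the one above could be loaded and inspected, assuming PyYAML; the mode check mirrors the comment in the config and is illustrative, not code from this commit:

# Load a GraphGen YAML config and read the fields touched by this commit.
# Assumes PyYAML (yaml.safe_load); not part of the repository code.
import yaml

with open("graphgen/configs/vqa_config.yaml", encoding="utf-8") as f:
    config = yaml.safe_load(f)

mode = config["generate"]["mode"]          # "vqa" for this config
input_file = config["read"]["input_file"]  # a .pdf path in the vqa example

if mode not in {"atomic", "aggregated", "multi_hop", "cot", "vqa"}:
    raise ValueError(f"Unsupported generation mode: {mode}")

print(f"mode={mode}, input_file={input_file}")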

graphgen/generate.py

Lines changed: 4 additions & 18 deletions

@@ -72,24 +72,10 @@ def main():
 
     graph_gen.search(search_config=config["search"])
 
-    # Use pipeline according to the output data type
-    if mode in ["atomic", "aggregated", "multi_hop"]:
-        logger.info("Generation mode set to '%s'. Start generation.", mode)
-        if "quiz_and_judge" in config and config["quiz_and_judge"]["enabled"]:
-            graph_gen.quiz_and_judge(quiz_and_judge_config=config["quiz_and_judge"])
-        else:
-            logger.warning(
-                "Quiz and Judge strategy is disabled. Edge sampling falls back to random."
-            )
-            assert (
-                config["partition"]["method"] == "ece"
-                and "method_params" in config["partition"]
-            ), "Only ECE partition with edge sampling is supported."
-            config["partition"]["method_params"]["edge_sampling"] = "random"
-    elif mode == "cot":
-        logger.info("Generation mode set to 'cot'. Start generation.")
-    else:
-        raise ValueError(f"Unsupported output data type: {mode}")
+    graph_gen.quiz_and_judge(quiz_and_judge_config=config["quiz_and_judge"])
+
+    # TODO: add data filtering step here in the future
+    # graph_gen.filter(filter_config=config["filter"])
 
     graph_gen.generate(
         partition_config=config["partition"],

graphgen/graphgen.py

Lines changed: 1 addition & 1 deletion

@@ -91,7 +91,7 @@ async def insert(self, read_config: Dict, split_config: Dict):
         insert chunks into the graph
         """
         # Step 1: Read files
-        data = read_files(read_config["input_file"])
+        data = read_files(read_config["input_file"], self.working_dir)
         if len(data) == 0:
             logger.warning("No data to process")
             return
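
The diff above only changes the call site; the read_files implementation is not shown. A rough sketch of what an extension-based dispatch with the new working_dir argument might look like, assuming the reader interface from csv_reader.py (read(file_path) -> List[Dict[str, Any]]); the output_dir keyword on PDFReader is a hypothetical placeholder:

# Illustrative sketch only; the real graphgen read_files is not part of this diff.
import os
from typing import Any, Dict, List

from graphgen.models import CSVReader, JSONLReader, JSONReader, PDFReader, TXTReader


def read_files(input_file: str, working_dir: str) -> List[Dict[str, Any]]:
    ext = os.path.splitext(input_file)[1].lower()
    if ext == ".pdf":
        # Hypothetical kwarg: a PDF reader may need somewhere to put extracted assets.
        reader = PDFReader(output_dir=working_dir)
    elif ext == ".csv":
        reader = CSVReader()
    elif ext == ".json":
        reader = JSONReader()
    elif ext == ".jsonl":
        reader = JSONLReader()
    elif ext == ".txt":
        reader = TXTReader()
    else:
        raise ValueError(f"Unsupported file type: {ext}")
    return reader.read(input_file)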

graphgen/models/__init__.py

Lines changed: 1 addition & 1 deletion

@@ -14,7 +14,7 @@
     ECEPartitioner,
     LeidenPartitioner,
 )
-from .reader import CsvReader, JsonlReader, JsonReader, TxtReader
+from .reader import CSVReader, JSONLReader, JSONReader, PDFReader, TXTReader
 from .search.db.uniprot_search import UniProtSearch
 from .search.kg.wiki_search import WikiSearch
 from .search.web.bing_search import BingSearch

graphgen/models/reader/__init__.py

Lines changed: 5 additions & 4 deletions

@@ -1,4 +1,5 @@
-from .csv_reader import CsvReader
-from .json_reader import JsonReader
-from .jsonl_reader import JsonlReader
-from .txt_reader import TxtReader
+from .csv_reader import CSVReader
+from .json_reader import JSONReader
+from .jsonl_reader import JSONLReader
+from .pdf_reader import PDFReader
+from .txt_reader import TXTReader
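
The new pdf_reader module itself is outside this excerpt; a minimal sketch of a PDFReader matching the BaseReader interface shown in csv_reader.py, assuming the pypdf package (the real implementation may differ, e.g. by also extracting page images for VQA):

# Illustrative sketch; not the graphgen implementation.
from typing import Any, Dict, List

from pypdf import PdfReader  # assumed third-party dependency

from graphgen.bases.base_reader import BaseReader


class PDFReader(BaseReader):
    def read(self, file_path: str) -> List[Dict[str, Any]]:
        reader = PdfReader(file_path)
        docs: List[Dict[str, Any]] = []
        for page_number, page in enumerate(reader.pages, start=1):
            text = (page.extract_text() or "").strip()
            if text:
                # Record schema here is illustrative; it follows no fixed graphgen contract.
                docs.append({"content": text, "page": page_number})
        return docs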

graphgen/models/reader/csv_reader.py

Lines changed: 1 addition & 1 deletion

@@ -5,7 +5,7 @@
 from graphgen.bases.base_reader import BaseReader
 
 
-class CsvReader(BaseReader):
+class CSVReader(BaseReader):
     def read(self, file_path: str) -> List[Dict[str, Any]]:
 
         df = pd.read_csv(file_path)
