Skip to content
50 changes: 28 additions & 22 deletions graphgen/configs/aggregated_config.yaml
Original file line number Diff line number Diff line change
@@ -1,22 +1,28 @@
read:
input_file: resources/input_examples/jsonl_demo.jsonl # input file path, support json, jsonl, txt, pdf. See resources/input_examples for examples
split:
chunk_size: 1024 # chunk size for text splitting
chunk_overlap: 100 # chunk overlap for text splitting
search: # web search configuration
enabled: false # whether to enable web search
search_types: ["google"] # search engine types, support: google, bing, uniprot, wikipedia
quiz_and_judge: # quiz and test whether the LLM masters the knowledge points
enabled: true
quiz_samples: 2 # number of quiz samples to generate
re_judge: false # whether to re-judge the existing quiz samples
partition: # graph partition configuration
method: ece # ece is a custom partition method based on comprehension loss
method_params:
max_units_per_community: 20 # max nodes and edges per community
min_units_per_community: 5 # min nodes and edges per community
max_tokens_per_community: 10240 # max tokens per community
unit_sampling: max_loss # unit sampling strategy, support: random, max_loss, min_loss
generate:
mode: aggregated # atomic, aggregated, multi_hop, cot, vqa
data_format: ChatML # Alpaca, Sharegpt, ChatML
pipeline:
- name: read
params:
input_file: resources/input_examples/jsonl_demo.jsonl # input file path, support json, jsonl, txt, pdf. See resources/input_examples for examples
chunk_size: 1024 # chunk size for text splitting
chunk_overlap: 100 # chunk overlap for text splitting

- name: build_kg

- name: quiz_and_judge
params:
quiz_samples: 2 # number of quiz samples to generate
re_judge: false # whether to re-judge the existing quiz samples

- name: partition
deps: [quiz_and_judge] # ece depends on the quiz_and_judge step
params:
method: ece # ece is a custom partition method based on comprehension loss
method_params:
max_units_per_community: 20 # max nodes and edges per community
min_units_per_community: 5 # min nodes and edges per community
max_tokens_per_community: 10240 # max tokens per community
unit_sampling: max_loss # unit sampling strategy, support: random, max_loss, min_loss

- name: generate
params:
method: aggregated # atomic, aggregated, multi_hop, cot, vqa
data_format: ChatML # Alpaca, Sharegpt, ChatML
37 changes: 18 additions & 19 deletions graphgen/configs/atomic_config.yaml
Original file line number Diff line number Diff line change
@@ -1,19 +1,18 @@
read:
input_file: resources/input_examples/json_demo.json # input file path, support json, jsonl, txt, csv, pdf. See resources/input_examples for examples
split:
chunk_size: 1024 # chunk size for text splitting
chunk_overlap: 100 # chunk overlap for text splitting
search: # web search configuration
enabled: false # whether to enable web search
search_types: ["google"] # search engine types, support: google, bing, uniprot, wikipedia
quiz_and_judge: # quiz and test whether the LLM masters the knowledge points
enabled: true
quiz_samples: 2 # number of quiz samples to generate
re_judge: false # whether to re-judge the existing quiz samples
partition: # graph partition configuration
method: dfs # partition method, support: dfs, bfs, ece, leiden
method_params:
max_units_per_community: 1 # atomic partition, one node or edge per community
generate:
mode: atomic # atomic, aggregated, multi_hop, cot, vqa
data_format: Alpaca # Alpaca, Sharegpt, ChatML
pipeline:
- name: read
params:
input_file: resources/input_examples/json_demo.json # input file path, support json, jsonl, txt, csv, pdf. See resources/input_examples for examples
chunk_size: 1024 # chunk size for text splitting
chunk_overlap: 100 # chunk overlap for text splitting

- name: build_kg

- name: partition
params:
method: dfs # partition method, support: dfs, bfs, ece, leiden
method_params:
max_units_per_community: 1 # atomic partition, one node or edge per community
- name: generate
params:
method: atomic # atomic, aggregated, multi_hop, cot, vqa
data_format: Alpaca # Alpaca, Sharegpt, ChatML
40 changes: 21 additions & 19 deletions graphgen/configs/cot_config.yaml
Original file line number Diff line number Diff line change
@@ -1,19 +1,21 @@
read:
input_file: resources/input_examples/txt_demo.txt # input file path, support json, jsonl, txt, pdf. See resources/input_examples for examples
split:
chunk_size: 1024 # chunk size for text splitting
chunk_overlap: 100 # chunk overlap for text splitting
search: # web search configuration
enabled: false # whether to enable web search
search_types: ["google"] # search engine types, support: google, bing, uniprot, wikipedia
quiz_and_judge: # quiz and test whether the LLM masters the knowledge points
enabled: false
partition: # graph partition configuration
method: leiden # leiden is a partitioner detection algorithm
method_params:
max_size: 20 # Maximum size of communities
use_lcc: false # whether to use the largest connected component
random_seed: 42 # random seed for partitioning
generate:
mode: cot # atomic, aggregated, multi_hop, cot, vqa
data_format: Sharegpt # Alpaca, Sharegpt, ChatML
pipeline:
- name: read
params:
input_file: resources/input_examples/txt_demo.txt # input file path, support json, jsonl, txt, pdf. See resources/input_examples for examples
chunk_size: 1024 # chunk size for text splitting
chunk_overlap: 100 # chunk overlap for text splitting

- name: build_kg

- name: partition
params:
method: leiden # leiden is a community detection algorithm
method_params:
max_size: 20 # Maximum size of communities
use_lcc: false # whether to use the largest connected component
random_seed: 42 # random seed for partitioning

- name: generate
params:
method: cot # atomic, aggregated, multi_hop, cot, vqa
data_format: Sharegpt # Alpaca, Sharegpt, ChatML
44 changes: 22 additions & 22 deletions graphgen/configs/multi_hop_config.yaml
Original file line number Diff line number Diff line change
@@ -1,22 +1,22 @@
read:
input_file: resources/input_examples/csv_demo.csv # input file path, support json, jsonl, txt, pdf. See resources/input_examples for examples
split:
chunk_size: 1024 # chunk size for text splitting
chunk_overlap: 100 # chunk overlap for text splitting
search: # web search configuration
enabled: false # whether to enable web search
search_types: ["google"] # search engine types, support: google, bing, uniprot, wikipedia
quiz_and_judge: # quiz and test whether the LLM masters the knowledge points
enabled: false
quiz_samples: 2 # number of quiz samples to generate
re_judge: false # whether to re-judge the existing quiz samples
partition: # graph partition configuration
method: ece # ece is a custom partition method based on comprehension loss
method_params:
max_units_per_community: 3 # max nodes and edges per community, for multi-hop, we recommend setting it to 3
min_units_per_community: 3 # min nodes and edges per community, for multi-hop, we recommend setting it to 3
max_tokens_per_community: 10240 # max tokens per community
unit_sampling: random # unit sampling strategy, support: random, max_loss, min_loss
generate:
mode: multi_hop # atomic, aggregated, multi_hop, cot, vqa
data_format: ChatML # Alpaca, Sharegpt, ChatML
pipeline:
- name: read
params:
input_file: resources/input_examples/csv_demo.csv # input file path, support json, jsonl, txt, pdf. See resources/input_examples for examples
chunk_size: 1024 # chunk size for text splitting
chunk_overlap: 100 # chunk overlap for text splitting

- name: build_kg

- name: partition
params:
method: ece # ece is a custom partition method based on comprehension loss
method_params:
max_units_per_community: 3 # max nodes and edges per community, for multi-hop, we recommend setting it to 3
min_units_per_community: 3 # min nodes and edges per community, for multi-hop, we recommend setting it to 3
max_tokens_per_community: 10240 # max tokens per community
unit_sampling: random # unit sampling strategy, support: random, max_loss, min_loss

- name: generate
params:
method: multi_hop # atomic, aggregated, multi_hop, cot, vqa
data_format: ChatML # Alpaca, Sharegpt, ChatML
38 changes: 20 additions & 18 deletions graphgen/configs/vqa_config.yaml
Original file line number Diff line number Diff line change
@@ -1,18 +1,20 @@
read:
input_file: resources/input_examples/vqa_demo.json # input file path, support json, jsonl, txt, pdf. See resources/input_examples for examples
split:
chunk_size: 1024 # chunk size for text splitting
chunk_overlap: 100 # chunk overlap for text splitting
search: # web search configuration
enabled: false # whether to enable web search
search_types: ["google"] # search engine types, support: google, bing, uniprot, wikipedia
quiz_and_judge: # quiz and test whether the LLM masters the knowledge points
enabled: false
partition: # graph partition configuration
method: anchor_bfs # partition method
method_params:
anchor_type: image # node type to select anchor nodes
max_units_per_community: 10 # atomic partition, one node or edge per community
generate:
mode: vqa # atomic, aggregated, multi_hop, cot, vqa
data_format: ChatML # Alpaca, Sharegpt, ChatML
pipeline:
- name: read
params:
input_file: resources/input_examples/vqa_demo.json # input file path, support json, jsonl, txt, pdf. See resources/input_examples for examples
chunk_size: 1024 # chunk size for text splitting
chunk_overlap: 100 # chunk overlap for text splitting

- name: build_kg

- name: partition
params:
method: anchor_bfs # partition method
method_params:
anchor_type: image # node type to select anchor nodes
max_units_per_community: 10 # max nodes and edges per community

- name: generate
params:
method: vqa # atomic, aggregated, multi_hop, cot, vqa
data_format: ChatML # Alpaca, Sharegpt, ChatML
121 changes: 121 additions & 0 deletions graphgen/engine.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,121 @@
"""
orchestration engine for GraphGen
"""

import copy
import threading
import traceback
from functools import wraps
from typing import Any, Callable, List


class Context(dict):
    """Thread-safe dict shared between pipeline operations.

    A per-instance lock guards reads and writes so that concurrently
    running operations can exchange intermediate results safely.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Per-instance lock: the previous class-level lock was shared by
        # every Context, needlessly serializing unrelated contexts.
        self._lock = threading.Lock()

    def set(self, k, v):
        """Store value *v* under key *k* while holding the lock."""
        with self._lock:
            self[k] = v

    def get(self, k, default=None):
        """Return the value for *k*, or *default* if the key is absent."""
        with self._lock:
            return super().get(k, default)


class OpNode:
    """A named pipeline operation: its dependencies and its callable."""

    def __init__(
        self, name: str, deps: List[str], func: Callable[["OpNode", Context], Any]
    ):
        self.name = name
        self.deps = deps
        self.func = func


def op(name: str, deps=None):
    """Decorator tagging a function as a pipeline operation.

    The decorated function gains an ``op_node`` attribute that records its
    name, dependencies and a callable adapter used by the engine.
    """
    dependencies = deps or []

    def decorator(func):
        @wraps(func)
        def wrapped(*args, **kwargs):
            return func(*args, **kwargs)

        # Adapter unpacks the shared context as keyword arguments.
        wrapped.op_node = OpNode(
            name, dependencies, lambda self, ctx: func(self, **ctx)
        )
        return wrapped

    return decorator


class Engine:
    """Runs a DAG of operations on worker threads, respecting dependencies."""

    def __init__(self, max_workers: int = 4):
        # Upper bound on how many operations may execute concurrently.
        self.max_workers = max_workers

    def run(self, ops: List[OpNode], ctx: Context):
        """Execute *ops* in dependency order, sharing *ctx* between them.

        :param ops: operation nodes to execute
        :param ctx: shared context passed to every operation
        :raises ValueError: on unknown or cyclic dependencies
        :raises RuntimeError: if any operation raised during execution
        """
        name2op = {operation.name: operation for operation in ops}

        # Reject dependencies that do not name a known operation early;
        # otherwise they would surface below as a confusing "cycle" error.
        for node in ops:
            unknown = [d for d in node.deps if d not in name2op]
            if unknown:
                raise ValueError(
                    f"Operation '{node.name}' depends on unknown operations: {unknown}"
                )

        # Kahn's algorithm: peel off nodes whose dependencies are satisfied.
        remaining = {n: set(name2op[n].deps) for n in name2op}
        topo = [n for n, pending in remaining.items() if not pending]
        for cur in topo:  # `topo` grows while we iterate over it
            for child, pending in remaining.items():
                if cur in pending:
                    pending.remove(cur)
                    if not pending:
                        topo.append(child)

        if len(topo) != len(ops):
            raise ValueError(
                "Cyclic dependencies detected among operations. "
                "Please check your configuration."
            )

        # Limit concurrency; a slot is held only while an op actually runs.
        sem = threading.Semaphore(self.max_workers)
        done = {n: threading.Event() for n in name2op}
        exc = {}

        def _exec(n: str):
            node = name2op[n]
            # Wait for dependencies BEFORE acquiring a worker slot: waiting
            # while holding the semaphore can deadlock when a dependency's
            # own thread is blocked on slot acquisition (e.g. a dependency
            # chain longer than max_workers).
            for d in node.deps:
                done[d].wait()
            if any(d in exc for d in node.deps):
                exc[n] = "Skipped due to failed dependencies"
                done[n].set()
                return
            try:
                with sem:
                    node.func(node, ctx)
            except Exception:  # pylint: disable=broad-except
                exc[n] = traceback.format_exc()
            finally:
                done[n].set()

        threads = [
            threading.Thread(target=_exec, args=(n,), daemon=True) for n in topo
        ]
        for t in threads:
            t.start()
        for t in threads:
            t.join()
        if exc:
            raise RuntimeError(
                "Some operations failed:\n"
                + "\n".join(f"---- {name} ----\n{tb}" for name, tb in exc.items())
            )


def collect_ops(config: dict, graph_gen) -> List[OpNode]:
    """
    Build operation nodes from a yaml pipeline config.

    :param config: parsed yaml config with a ``pipeline`` list of stages
    :param graph_gen: object exposing one decorated method per stage name
    :return: fresh operation nodes ready to be passed to :class:`Engine`
    """
    ops: List[OpNode] = []
    for stage in config["pipeline"]:
        name = stage["name"]
        method = getattr(graph_gen, name)

        # Copy the decorator-attached node instead of mutating it in place:
        # the node is shared via the underlying function object, so in-place
        # overrides of deps/func would leak into every later collect_ops()
        # call and into other GraphGen instances.
        op_node = copy.copy(method.op_node)

        # Runtime dependencies from the config override the decorator defaults.
        op_node.deps = stage.get("deps", op_node.deps)

        if "params" in stage:
            # Bind method and params as defaults to avoid late-binding closures.
            op_node.func = lambda self, ctx, m=method, p=stage["params"]: m(p)
        else:
            op_node.func = lambda self, ctx, m=method: m()
        ops.append(op_node)
    return ops
1 change: 1 addition & 0 deletions graphgen/evaluate.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
# TODO: this module needs refactoring to merge into GraphGen framework
"""Evaluate the quality of the generated text using various metrics"""

import argparse
Expand Down
Loading