Commit 011a1e5

Merge branch 'feature/orchestration-operation' of https://github.com/open-sciencelab/GraphGen into feature/schema_guided_build
2 parents 9de5b2f + ae31db9 commit 011a1e5

22 files changed: +382 -141 lines changed
Lines changed: 26 additions & 22 deletions
@@ -1,22 +1,26 @@
-read:
-  input_file: resources/input_examples/jsonl_demo.jsonl # input file path, support json, jsonl, txt, pdf. See resources/input_examples for examples
-split:
-  chunk_size: 1024 # chunk size for text splitting
-  chunk_overlap: 100 # chunk overlap for text splitting
-search: # web search configuration
-  enabled: false # whether to enable web search
-  search_types: ["google"] # search engine types, support: google, bing, uniprot, wikipedia
-quiz_and_judge: # quiz and test whether the LLM masters the knowledge points
-  enabled: true
-  quiz_samples: 2 # number of quiz samples to generate
-  re_judge: false # whether to re-judge the existing quiz samples
-partition: # graph partition configuration
-  method: ece # ece is a custom partition method based on comprehension loss
-  method_params:
-    max_units_per_community: 20 # max nodes and edges per community
-    min_units_per_community: 5 # min nodes and edges per community
-    max_tokens_per_community: 10240 # max tokens per community
-    unit_sampling: max_loss # unit sampling strategy, support: random, max_loss, min_loss
-generate:
-  mode: aggregated # atomic, aggregated, multi_hop, cot, vqa
-  data_format: ChatML # Alpaca, Sharegpt, ChatML
+pipeline:
+  - name: insert
+    params:
+      input_file: resources/input_examples/jsonl_demo.jsonl # input file path, support json, jsonl, txt, pdf. See resources/input_examples for examples
+      chunk_size: 1024 # chunk size for text splitting
+      chunk_overlap: 100 # chunk overlap for text splitting
+
+  - name: quiz_and_judge
+    params:
+      quiz_samples: 2 # number of quiz samples to generate
+      re_judge: false # whether to re-judge the existing quiz samples
+
+  - name: partition
+    deps: [insert, quiz_and_judge] # ece depends on both insert and quiz_and_judge steps
+    params:
+      method: ece # ece is a custom partition method based on comprehension loss
+      method_params:
+        max_units_per_community: 20 # max nodes and edges per community
+        min_units_per_community: 5 # min nodes and edges per community
+        max_tokens_per_community: 10240 # max tokens per community
+        unit_sampling: max_loss # unit sampling strategy, support: random, max_loss, min_loss
+
+  - name: generate
+    params:
+      method: aggregated # atomic, aggregated, multi_hop, cot, vqa
+      data_format: ChatML # Alpaca, Sharegpt, ChatML
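
Note: the schema change above replaces the fixed read/split/search/quiz_and_judge/partition/generate sections with an ordered pipeline of named stages, each carrying its own params and an optional deps override. Below is a minimal sketch of driving a config in this shape through the new engine; the config path and the GraphGen constructor are illustrative assumptions, and it presumes the insert, quiz_and_judge, partition, and generate methods are decorated with @op so that collect_ops can read their op_node metadata.

    import yaml

    from graphgen.engine import Context, Engine, collect_ops
    from graphgen.graphgen import GraphGen  # hypothetical import path

    # Load a pipeline config like the one shown above (path is a placeholder).
    with open("graphgen/configs/my_config.yaml", encoding="utf-8") as f:
        config = yaml.safe_load(f)

    graph_gen = GraphGen()  # illustrative constructor
    ops = collect_ops(config, graph_gen)  # one OpNode per pipeline stage
    Engine(max_workers=4).run(ops, Context())  # executes stages respecting deps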
Lines changed: 15 additions & 19 deletions
@@ -1,19 +1,15 @@
-read:
-  input_file: resources/input_examples/json_demo.json # input file path, support json, jsonl, txt, csv, pdf. See resources/input_examples for examples
-split:
-  chunk_size: 1024 # chunk size for text splitting
-  chunk_overlap: 100 # chunk overlap for text splitting
-search: # web search configuration
-  enabled: false # whether to enable web search
-  search_types: ["google"] # search engine types, support: google, bing, uniprot, wikipedia
-quiz_and_judge: # quiz and test whether the LLM masters the knowledge points
-  enabled: true
-  quiz_samples: 2 # number of quiz samples to generate
-  re_judge: false # whether to re-judge the existing quiz samples
-partition: # graph partition configuration
-  method: dfs # partition method, support: dfs, bfs, ece, leiden
-  method_params:
-    max_units_per_community: 1 # atomic partition, one node or edge per community
-generate:
-  mode: atomic # atomic, aggregated, multi_hop, cot, vqa
-  data_format: Alpaca # Alpaca, Sharegpt, ChatML
+pipeline:
+  - name: insert
+    params:
+      input_file: resources/input_examples/json_demo.json # input file path, support json, jsonl, txt, csv, pdf. See resources/input_examples for examples
+      chunk_size: 1024 # chunk size for text splitting
+      chunk_overlap: 100 # chunk overlap for text splitting
+  - name: partition
+    params:
+      method: dfs # partition method, support: dfs, bfs, ece, leiden
+      method_params:
+        max_units_per_community: 1 # atomic partition, one node or edge per community
+  - name: generate
+    params:
+      method: atomic # atomic, aggregated, multi_hop, cot, vqa
+      data_format: Alpaca # Alpaca, Sharegpt, ChatML
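
Only the first config above declares deps explicitly; this one omits the key, so each stage keeps the default dependency list attached by the @op decorator (collect_ops only overrides deps when the stage provides them, via stage.get("deps", op_node.deps)). A hedged sketch of how such defaults might be declared on the host object; the class, the bodies, and the exact default dependency lists are assumptions, not the repository's code.

    from graphgen.engine import op

    class GraphGen:  # placeholder host class for illustration
        @op("insert")
        def insert(self, params=None):
            ...  # read and chunk the input file

        @op("partition", deps=["insert"])  # assumed default: partition needs insert
        def partition(self, params=None):
            ...  # split the knowledge graph into communities

        @op("generate", deps=["partition"])  # assumed default: generate needs partition
        def generate(self, params=None):
            ...  # synthesize QA pairs from each community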

graphgen/configs/cot_config.yaml

Lines changed: 19 additions & 19 deletions
@@ -1,19 +1,19 @@
-read:
-  input_file: resources/input_examples/txt_demo.txt # input file path, support json, jsonl, txt, pdf. See resources/input_examples for examples
-split:
-  chunk_size: 1024 # chunk size for text splitting
-  chunk_overlap: 100 # chunk overlap for text splitting
-search: # web search configuration
-  enabled: false # whether to enable web search
-  search_types: ["google"] # search engine types, support: google, bing, uniprot, wikipedia
-quiz_and_judge: # quiz and test whether the LLM masters the knowledge points
-  enabled: false
-partition: # graph partition configuration
-  method: leiden # leiden is a community detection algorithm
-  method_params:
-    max_size: 20 # Maximum size of communities
-    use_lcc: false # whether to use the largest connected component
-    random_seed: 42 # random seed for partitioning
-generate:
-  mode: cot # atomic, aggregated, multi_hop, cot, vqa
-  data_format: Sharegpt # Alpaca, Sharegpt, ChatML
+pipeline:
+  - name: insert
+    params:
+      input_file: resources/input_examples/txt_demo.txt # input file path, support json, jsonl, txt, pdf. See resources/input_examples for examples
+      chunk_size: 1024 # chunk size for text splitting
+      chunk_overlap: 100 # chunk overlap for text splitting
+
+  - name: partition
+    params:
+      method: leiden # leiden is a community detection algorithm
+      method_params:
+        max_size: 20 # Maximum size of communities
+        use_lcc: false # whether to use the largest connected component
+        random_seed: 42 # random seed for partitioning
+
+  - name: generate
+    params:
+      method: cot # atomic, aggregated, multi_hop, cot, vqa
+      data_format: Sharegpt # Alpaca, Sharegpt, ChatML
Lines changed: 20 additions & 22 deletions
@@ -1,22 +1,20 @@
-read:
-  input_file: resources/input_examples/csv_demo.csv # input file path, support json, jsonl, txt, csv, pdf. See resources/input_examples for examples
-split:
-  chunk_size: 1024 # chunk size for text splitting
-  chunk_overlap: 100 # chunk overlap for text splitting
-search: # web search configuration
-  enabled: false # whether to enable web search
-  search_types: ["google"] # search engine types, support: google, bing, uniprot, wikipedia
-quiz_and_judge: # quiz and test whether the LLM masters the knowledge points
-  enabled: false
-  quiz_samples: 2 # number of quiz samples to generate
-  re_judge: false # whether to re-judge the existing quiz samples
-partition: # graph partition configuration
-  method: ece # ece is a custom partition method based on comprehension loss
-  method_params:
-    max_units_per_community: 3 # max nodes and edges per community, for multi-hop, we recommend setting it to 3
-    min_units_per_community: 3 # min nodes and edges per community, for multi-hop, we recommend setting it to 3
-    max_tokens_per_community: 10240 # max tokens per community
-    unit_sampling: random # unit sampling strategy, support: random, max_loss, min_loss
-generate:
-  mode: multi_hop # atomic, aggregated, multi_hop, cot, vqa
-  data_format: ChatML # Alpaca, Sharegpt, ChatML
+pipeline:
+  - name: insert
+    params:
+      input_file: resources/input_examples/csv_demo.csv # input file path, support json, jsonl, txt, csv, pdf. See resources/input_examples for examples
+      chunk_size: 1024 # chunk size for text splitting
+      chunk_overlap: 100 # chunk overlap for text splitting
+
+  - name: partition
+    params:
+      method: ece # ece is a custom partition method based on comprehension loss
+      method_params:
+        max_units_per_community: 3 # max nodes and edges per community, for multi-hop, we recommend setting it to 3
+        min_units_per_community: 3 # min nodes and edges per community, for multi-hop, we recommend setting it to 3
+        max_tokens_per_community: 10240 # max tokens per community
+        unit_sampling: random # unit sampling strategy, support: random, max_loss, min_loss
+
+  - name: generate
+    params:
+      method: multi_hop # atomic, aggregated, multi_hop, cot, vqa
+      data_format: ChatML # Alpaca, Sharegpt, ChatML

graphgen/configs/vqa_config.yaml

Lines changed: 18 additions & 18 deletions
@@ -1,18 +1,18 @@
-read:
-  input_file: resources/input_examples/vqa_demo.json # input file path, support json, jsonl, txt, pdf. See resources/input_examples for examples
-split:
-  chunk_size: 1024 # chunk size for text splitting
-  chunk_overlap: 100 # chunk overlap for text splitting
-search: # web search configuration
-  enabled: false # whether to enable web search
-  search_types: ["google"] # search engine types, support: google, bing, uniprot, wikipedia
-quiz_and_judge: # quiz and test whether the LLM masters the knowledge points
-  enabled: false
-partition: # graph partition configuration
-  method: anchor_bfs # partition method
-  method_params:
-    anchor_type: image # node type to select anchor nodes
-    max_units_per_community: 10 # max nodes and edges per community
-generate:
-  mode: vqa # atomic, aggregated, multi_hop, cot, vqa
-  data_format: ChatML # Alpaca, Sharegpt, ChatML
+pipeline:
+  - name: insert
+    params:
+      input_file: resources/input_examples/vqa_demo.json # input file path, support json, jsonl, txt, pdf. See resources/input_examples for examples
+      chunk_size: 1024 # chunk size for text splitting
+      chunk_overlap: 100 # chunk overlap for text splitting
+
+  - name: partition
+    params:
+      method: anchor_bfs # partition method
+      method_params:
+        anchor_type: image # node type to select anchor nodes
+        max_units_per_community: 10 # max nodes and edges per community
+
+  - name: generate
+    params:
+      method: vqa # atomic, aggregated, multi_hop, cot, vqa
+      data_format: ChatML # Alpaca, Sharegpt, ChatML
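
anchor_bfs is not defined elsewhere in this diff; read off the params, a plausible interpretation is a breadth-first expansion seeded at every node of anchor_type and capped at max_units_per_community, counting nodes and edges as units. The sketch below is a rough illustration of that idea only; the data layout and the function are invented here, not taken from the repository.

    from collections import deque

    def anchor_bfs(node_types, edges, anchor_type="image", max_units=10):
        """One community per anchor node, grown by BFS until the unit budget is spent."""
        adj = {}
        for u, v in edges:
            adj.setdefault(u, []).append(v)
            adj.setdefault(v, []).append(u)
        communities = []
        for anchor, ntype in node_types.items():
            if ntype != anchor_type:
                continue
            seen, units, queue = {anchor}, 1, deque([anchor])
            while queue and units < max_units:
                cur = queue.popleft()
                for nxt in adj.get(cur, []):
                    if nxt not in seen and units + 2 <= max_units:
                        seen.add(nxt)  # one new node plus the edge reaching it
                        units += 2
                        queue.append(nxt)
            communities.append(seen)
        return communities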

graphgen/engine.py

Lines changed: 118 additions & 0 deletions
@@ -0,0 +1,118 @@
+"""
+orchestration engine for GraphGen
+"""
+
+import threading
+import traceback
+from functools import wraps
+from typing import Any, Callable, List
+
+
+class Context(dict):
+    _lock = threading.Lock()
+
+    def set(self, k, v):
+        with self._lock:
+            self[k] = v
+
+    def get(self, k, default=None):
+        with self._lock:
+            return super().get(k, default)
+
+
+class OpNode:
+    def __init__(
+        self, name: str, deps: List[str], func: Callable[["OpNode", Context], Any]
+    ):
+        self.name, self.deps, self.func = name, deps, func
+
+
+def op(name: str, deps=None):
+    deps = deps or []
+
+    def decorator(func):
+        @wraps(func)
+        def _wrapper(*args, **kwargs):
+            return func(*args, **kwargs)
+
+        _wrapper.op_node = OpNode(name, deps, lambda self, ctx: func(self, **ctx))
+        return _wrapper
+
+    return decorator
+
+
+class Engine:
+    def __init__(self, max_workers: int = 4):
+        self.max_workers = max_workers
+
+    def run(self, ops: List[OpNode], ctx: Context):
+        name2op = {operation.name: operation for operation in ops}
+
+        # topological sort
+        graph = {n: set(name2op[n].deps) for n in name2op}
+        topo = []
+        q = [n for n, d in graph.items() if not d]
+        while q:
+            cur = q.pop(0)
+            topo.append(cur)
+            for child in [c for c, d in graph.items() if cur in d]:
+                graph[child].remove(cur)
+                if not graph[child]:
+                    q.append(child)
+
+        if len(topo) != len(ops):
+            raise ValueError(
+                "Cyclic dependencies detected among operations. "
+                "Please check your configuration."
+            )
+
+        # semaphore for max_workers
+        sem = threading.Semaphore(self.max_workers)
+        done = {n: threading.Event() for n in name2op}
+        exc = {}
+
+        def _exec(n: str):
+            with sem:
+                for d in name2op[n].deps:
+                    done[d].wait()
+                if any(d in exc for d in name2op[n].deps):
+                    exc[n] = Exception("Skipped due to failed dependencies")
+                    done[n].set()
+                    return
+                try:
+                    name2op[n].func(name2op[n], ctx)
+                except Exception:  # pylint: disable=broad-except
+                    exc[n] = traceback.format_exc()
+                done[n].set()
+
+        ts = [threading.Thread(target=_exec, args=(n,), daemon=True) for n in topo]
+        for t in ts:
+            t.start()
+        for t in ts:
+            t.join()
+        if exc:
+            raise RuntimeError(
+                "Some operations failed:\n"
+                + "\n".join(f"---- {op} ----\n{tb}" for op, tb in exc.items())
+            )
+
+
+def collect_ops(config: dict, graph_gen) -> List[OpNode]:
+    """
+    build operation nodes from yaml config
+    :param config
+    :param graph_gen
+    """
+    ops: List[OpNode] = []
+    for stage in config["pipeline"]:
+        name = stage["name"]
+        method = getattr(graph_gen, name)
+        op_node = method.op_node
+
+        # if there are runtime dependencies, override them
+        runtime_deps = stage.get("deps", op_node.deps)
+        op_node.deps = runtime_deps
+
+        op_node.func = lambda self, ctx, m=method, sc=stage: m(sc.get("params"))
+        ops.append(op_node)
+    return ops
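
The engine runs a Kahn-style topological sort purely to detect cycles, then starts one daemon thread per operation; each thread takes a semaphore slot, waits on its dependencies' events, and records a traceback on failure so that dependents are skipped instead of running on bad inputs. Here is a self-contained sketch of that failure path using only the API added in this file; the op names and functions are invented for illustration.

    from graphgen.engine import Context, Engine, OpNode

    def fail(node, ctx):
        raise ValueError("boom")

    def downstream(node, ctx):
        ctx.set("ran", True)  # never reached: its dependency failed

    ops = [
        OpNode("fail", [], fail),
        OpNode("downstream", ["fail"], downstream),
    ]

    ctx = Context()
    try:
        Engine(max_workers=2).run(ops, ctx)
    except RuntimeError as e:
        print(e)  # aggregated report, one "---- name ----" section per failed op
    print(ctx.get("ran"))  # None: downstream was skipped, not executed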

graphgen/evaluate.py

Lines changed: 1 addition & 0 deletions
@@ -1,3 +1,4 @@
+# TODO: this module needs refactoring to merge into GraphGen framework
 """Evaluate the quality of the generated text using various metrics"""

 import argparse
