Commit 011a1e5

Merge branch 'feature/orchestration-operation' of https://github.com/open-sciencelab/GraphGen into feature/schema_guided_build
2 parents 9de5b2f + ae31db9 commit 011a1e5

22 files changed: +382 -141 lines changed
Lines changed: 26 additions & 22 deletions
@@ -1,22 +1,26 @@
-read:
-  input_file: resources/input_examples/jsonl_demo.jsonl # input file path, support json, jsonl, txt, pdf. See resources/input_examples for examples
-split:
-  chunk_size: 1024 # chunk size for text splitting
-  chunk_overlap: 100 # chunk overlap for text splitting
-search: # web search configuration
-  enabled: false # whether to enable web search
-  search_types: ["google"] # search engine types, support: google, bing, uniprot, wikipedia
-quiz_and_judge: # quiz and test whether the LLM masters the knowledge points
-  enabled: true
-  quiz_samples: 2 # number of quiz samples to generate
-  re_judge: false # whether to re-judge the existing quiz samples
-partition: # graph partition configuration
-  method: ece # ece is a custom partition method based on comprehension loss
-  method_params:
-    max_units_per_community: 20 # max nodes and edges per community
-    min_units_per_community: 5 # min nodes and edges per community
-    max_tokens_per_community: 10240 # max tokens per community
-    unit_sampling: max_loss # unit sampling strategy, support: random, max_loss, min_loss
-generate:
-  mode: aggregated # atomic, aggregated, multi_hop, cot, vqa
-  data_format: ChatML # Alpaca, Sharegpt, ChatML
+pipeline:
+  - name: insert
+    params:
+      input_file: resources/input_examples/jsonl_demo.jsonl # input file path, support json, jsonl, txt, pdf. See resources/input_examples for examples
+      chunk_size: 1024 # chunk size for text splitting
+      chunk_overlap: 100 # chunk overlap for text splitting
+
+  - name: quiz_and_judge
+    params:
+      quiz_samples: 2 # number of quiz samples to generate
+      re_judge: false # whether to re-judge the existing quiz samples
+
+  - name: partition
+    deps: [insert, quiz_and_judge] # ece depends on both insert and quiz_and_judge steps
+    params:
+      method: ece # ece is a custom partition method based on comprehension loss
+      method_params:
+        max_units_per_community: 20 # max nodes and edges per community
+        min_units_per_community: 5 # min nodes and edges per community
+        max_tokens_per_community: 10240 # max tokens per community
+        unit_sampling: max_loss # unit sampling strategy, support: random, max_loss, min_loss
+
+  - name: generate
+    params:
+      method: aggregated # atomic, aggregated, multi_hop, cot, vqa
+      data_format: ChatML # Alpaca, Sharegpt, ChatML
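
Note: the schema change above replaces the fixed read/split/search/quiz_and_judge/partition/generate sections with an ordered pipeline of named stages, each carrying its own params and an optional deps override. Below is a minimal sketch of driving a config in this shape through the new engine; the config path and the GraphGen constructor are illustrative assumptions, and it presumes the insert, quiz_and_judge, partition, and generate methods are decorated with @op so that collect_ops can read their op_node metadata.

    import yaml

    from graphgen.engine import Context, Engine, collect_ops
    from graphgen.graphgen import GraphGen  # hypothetical import path

    # Load a pipeline config like the one shown above (path is a placeholder).
    with open("graphgen/configs/my_config.yaml", encoding="utf-8") as f:
        config = yaml.safe_load(f)

    graph_gen = GraphGen()  # illustrative constructor
    ops = collect_ops(config, graph_gen)  # one OpNode per pipeline stage
    Engine(max_workers=4).run(ops, Context())  # executes stages respecting deps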
Lines changed: 15 additions & 19 deletions
@@ -1,19 +1,15 @@
-read:
-  input_file: resources/input_examples/json_demo.json # input file path, support json, jsonl, txt, csv, pdf. See resources/input_examples for examples
-split:
-  chunk_size: 1024 # chunk size for text splitting
-  chunk_overlap: 100 # chunk overlap for text splitting
-search: # web search configuration
-  enabled: false # whether to enable web search
-  search_types: ["google"] # search engine types, support: google, bing, uniprot, wikipedia
-quiz_and_judge: # quiz and test whether the LLM masters the knowledge points
-  enabled: true
-  quiz_samples: 2 # number of quiz samples to generate
-  re_judge: false # whether to re-judge the existing quiz samples
-partition: # graph partition configuration
-  method: dfs # partition method, support: dfs, bfs, ece, leiden
-  method_params:
-    max_units_per_community: 1 # atomic partition, one node or edge per community
-generate:
-  mode: atomic # atomic, aggregated, multi_hop, cot, vqa
-  data_format: Alpaca # Alpaca, Sharegpt, ChatML
+pipeline:
+  - name: insert
+    params:
+      input_file: resources/input_examples/json_demo.json # input file path, support json, jsonl, txt, csv, pdf. See resources/input_examples for examples
+      chunk_size: 1024 # chunk size for text splitting
+      chunk_overlap: 100 # chunk overlap for text splitting
+  - name: partition
+    params:
+      method: dfs # partition method, support: dfs, bfs, ece, leiden
+      method_params:
+        max_units_per_community: 1 # atomic partition, one node or edge per community
+  - name: generate
+    params:
+      method: atomic # atomic, aggregated, multi_hop, cot, vqa
+      data_format: Alpaca # Alpaca, Sharegpt, ChatML
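
Only the first config above declares deps explicitly; this one omits the key, so each stage keeps the default dependency list attached by the @op decorator (collect_ops only overrides deps when the stage provides them, via stage.get("deps", op_node.deps)). A hedged sketch of how such defaults might be declared on the host object; the class, the bodies, and the exact default dependency lists are assumptions, not the repository's code.

    from graphgen.engine import op

    class GraphGen:  # placeholder host class for illustration
        @op("insert")
        def insert(self, params=None):
            ...  # read and chunk the input file

        @op("partition", deps=["insert"])  # assumed default: partition needs insert
        def partition(self, params=None):
            ...  # split the knowledge graph into communities

        @op("generate", deps=["partition"])  # assumed default: generate needs partition
        def generate(self, params=None):
            ...  # synthesize QA pairs from each community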

graphgen/configs/cot_config.yaml

Lines changed: 19 additions & 19 deletions
@@ -1,19 +1,19 @@
-read:
-  input_file: resources/input_examples/txt_demo.txt # input file path, support json, jsonl, txt, pdf. See resources/input_examples for examples
-split:
-  chunk_size: 1024 # chunk size for text splitting
-  chunk_overlap: 100 # chunk overlap for text splitting
-search: # web search configuration
-  enabled: false # whether to enable web search
-  search_types: ["google"] # search engine types, support: google, bing, uniprot, wikipedia
-quiz_and_judge: # quiz and test whether the LLM masters the knowledge points
-  enabled: false
-partition: # graph partition configuration
-  method: leiden # leiden is a community detection algorithm
-  method_params:
-    max_size: 20 # Maximum size of communities
-    use_lcc: false # whether to use the largest connected component
-    random_seed: 42 # random seed for partitioning
-generate:
-  mode: cot # atomic, aggregated, multi_hop, cot, vqa
-  data_format: Sharegpt # Alpaca, Sharegpt, ChatML
+pipeline:
+  - name: insert
+    params:
+      input_file: resources/input_examples/txt_demo.txt # input file path, support json, jsonl, txt, pdf. See resources/input_examples for examples
+      chunk_size: 1024 # chunk size for text splitting
+      chunk_overlap: 100 # chunk overlap for text splitting
+
+  - name: partition
+    params:
+      method: leiden # leiden is a community detection algorithm
+      method_params:
+        max_size: 20 # Maximum size of communities
+        use_lcc: false # whether to use the largest connected component
+        random_seed: 42 # random seed for partitioning
+
+  - name: generate
+    params:
+      method: cot # atomic, aggregated, multi_hop, cot, vqa
+      data_format: Sharegpt # Alpaca, Sharegpt, ChatML
Lines changed: 20 additions & 22 deletions
@@ -1,22 +1,20 @@
-read:
-  input_file: resources/input_examples/csv_demo.csv # input file path, support json, jsonl, txt, csv, pdf. See resources/input_examples for examples
-split:
-  chunk_size: 1024 # chunk size for text splitting
-  chunk_overlap: 100 # chunk overlap for text splitting
-search: # web search configuration
-  enabled: false # whether to enable web search
-  search_types: ["google"] # search engine types, support: google, bing, uniprot, wikipedia
-quiz_and_judge: # quiz and test whether the LLM masters the knowledge points
-  enabled: false
-  quiz_samples: 2 # number of quiz samples to generate
-  re_judge: false # whether to re-judge the existing quiz samples
-partition: # graph partition configuration
-  method: ece # ece is a custom partition method based on comprehension loss
-  method_params:
-    max_units_per_community: 3 # max nodes and edges per community, for multi-hop, we recommend setting it to 3
-    min_units_per_community: 3 # min nodes and edges per community, for multi-hop, we recommend setting it to 3
-    max_tokens_per_community: 10240 # max tokens per community
-    unit_sampling: random # unit sampling strategy, support: random, max_loss, min_loss
-generate:
-  mode: multi_hop # atomic, aggregated, multi_hop, cot, vqa
-  data_format: ChatML # Alpaca, Sharegpt, ChatML
+pipeline:
+  - name: insert
+    params:
+      input_file: resources/input_examples/csv_demo.csv # input file path, support json, jsonl, txt, csv, pdf. See resources/input_examples for examples
+      chunk_size: 1024 # chunk size for text splitting
+      chunk_overlap: 100 # chunk overlap for text splitting
+
+  - name: partition
+    params:
+      method: ece # ece is a custom partition method based on comprehension loss
+      method_params:
+        max_units_per_community: 3 # max nodes and edges per community, for multi-hop, we recommend setting it to 3
+        min_units_per_community: 3 # min nodes and edges per community, for multi-hop, we recommend setting it to 3
+        max_tokens_per_community: 10240 # max tokens per community
+        unit_sampling: random # unit sampling strategy, support: random, max_loss, min_loss
+
+  - name: generate
+    params:
+      method: multi_hop # atomic, aggregated, multi_hop, cot, vqa
+      data_format: ChatML # Alpaca, Sharegpt, ChatML

graphgen/configs/vqa_config.yaml

Lines changed: 18 additions & 18 deletions
@@ -1,18 +1,18 @@
-read:
-  input_file: resources/input_examples/vqa_demo.json # input file path, support json, jsonl, txt, pdf. See resources/input_examples for examples
-split:
-  chunk_size: 1024 # chunk size for text splitting
-  chunk_overlap: 100 # chunk overlap for text splitting
-search: # web search configuration
-  enabled: false # whether to enable web search
-  search_types: ["google"] # search engine types, support: google, bing, uniprot, wikipedia
-quiz_and_judge: # quiz and test whether the LLM masters the knowledge points
-  enabled: false
-partition: # graph partition configuration
-  method: anchor_bfs # partition method
-  method_params:
-    anchor_type: image # node type to select anchor nodes
-    max_units_per_community: 10 # max nodes and edges per community
-generate:
-  mode: vqa # atomic, aggregated, multi_hop, cot, vqa
-  data_format: ChatML # Alpaca, Sharegpt, ChatML
+pipeline:
+  - name: insert
+    params:
+      input_file: resources/input_examples/vqa_demo.json # input file path, support json, jsonl, txt, pdf. See resources/input_examples for examples
+      chunk_size: 1024 # chunk size for text splitting
+      chunk_overlap: 100 # chunk overlap for text splitting
+
+  - name: partition
+    params:
+      method: anchor_bfs # partition method
+      method_params:
+        anchor_type: image # node type to select anchor nodes
+        max_units_per_community: 10 # max nodes and edges per community
+
+  - name: generate
+    params:
+      method: vqa # atomic, aggregated, multi_hop, cot, vqa
+      data_format: ChatML # Alpaca, Sharegpt, ChatML
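
anchor_bfs is not defined elsewhere in this diff; read off the params, a plausible interpretation is a breadth-first expansion seeded at every node of anchor_type and capped at max_units_per_community, counting nodes and edges as units. The sketch below is a rough illustration of that idea only; the data layout and the function are invented here, not taken from the repository.

    from collections import deque

    def anchor_bfs(node_types, edges, anchor_type="image", max_units=10):
        """One community per anchor node, grown by BFS until the unit budget is spent."""
        adj = {}
        for u, v in edges:
            adj.setdefault(u, []).append(v)
            adj.setdefault(v, []).append(u)
        communities = []
        for anchor, ntype in node_types.items():
            if ntype != anchor_type:
                continue
            seen, units, queue = {anchor}, 1, deque([anchor])
            while queue and units < max_units:
                cur = queue.popleft()
                for nxt in adj.get(cur, []):
                    if nxt not in seen and units + 2 <= max_units:
                        seen.add(nxt)  # one new node plus the edge reaching it
                        units += 2
                        queue.append(nxt)
            communities.append(seen)
        return communities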

graphgen/engine.py

Lines changed: 118 additions & 0 deletions
@@ -0,0 +1,118 @@
+"""
+orchestration engine for GraphGen
+"""
+
+import threading
+import traceback
+from functools import wraps
+from typing import Any, Callable, List
+
+
+class Context(dict):
+    _lock = threading.Lock()
+
+    def set(self, k, v):
+        with self._lock:
+            self[k] = v
+
+    def get(self, k, default=None):
+        with self._lock:
+            return super().get(k, default)
+
+
+class OpNode:
+    def __init__(
+        self, name: str, deps: List[str], func: Callable[["OpNode", Context], Any]
+    ):
+        self.name, self.deps, self.func = name, deps, func
+
+
+def op(name: str, deps=None):
+    deps = deps or []
+
+    def decorator(func):
+        @wraps(func)
+        def _wrapper(*args, **kwargs):
+            return func(*args, **kwargs)
+
+        _wrapper.op_node = OpNode(name, deps, lambda self, ctx: func(self, **ctx))
+        return _wrapper
+
+    return decorator
+
+
+class Engine:
+    def __init__(self, max_workers: int = 4):
+        self.max_workers = max_workers
+
+    def run(self, ops: List[OpNode], ctx: Context):
+        name2op = {operation.name: operation for operation in ops}
+
+        # topological sort
+        graph = {n: set(name2op[n].deps) for n in name2op}
+        topo = []
+        q = [n for n, d in graph.items() if not d]
+        while q:
+            cur = q.pop(0)
+            topo.append(cur)
+            for child in [c for c, d in graph.items() if cur in d]:
+                graph[child].remove(cur)
+                if not graph[child]:
+                    q.append(child)
+
+        if len(topo) != len(ops):
+            raise ValueError(
+                "Cyclic dependencies detected among operations. "
+                "Please check your configuration."
+            )
+
+        # semaphore for max_workers
+        sem = threading.Semaphore(self.max_workers)
+        done = {n: threading.Event() for n in name2op}
+        exc = {}
+
+        def _exec(n: str):
+            with sem:
+                for d in name2op[n].deps:
+                    done[d].wait()
+                if any(d in exc for d in name2op[n].deps):
+                    exc[n] = Exception("Skipped due to failed dependencies")
+                    done[n].set()
+                    return
+                try:
+                    name2op[n].func(name2op[n], ctx)
+                except Exception:  # pylint: disable=broad-except
+                    exc[n] = traceback.format_exc()
+                done[n].set()
+
+        ts = [threading.Thread(target=_exec, args=(n,), daemon=True) for n in topo]
+        for t in ts:
+            t.start()
+        for t in ts:
+            t.join()
+        if exc:
+            raise RuntimeError(
+                "Some operations failed:\n"
+                + "\n".join(f"---- {op} ----\n{tb}" for op, tb in exc.items())
+            )
+
+
+def collect_ops(config: dict, graph_gen) -> List[OpNode]:
+    """
+    build operation nodes from yaml config
+    :param config
+    :param graph_gen
+    """
+    ops: List[OpNode] = []
+    for stage in config["pipeline"]:
+        name = stage["name"]
+        method = getattr(graph_gen, name)
+        op_node = method.op_node
+
+        # if there are runtime dependencies, override them
+        runtime_deps = stage.get("deps", op_node.deps)
+        op_node.deps = runtime_deps
+
+        op_node.func = lambda self, ctx, m=method, sc=stage: m(sc.get("params"))
+        ops.append(op_node)
+    return ops
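
The engine runs a Kahn-style topological sort purely to detect cycles, then starts one daemon thread per operation; each thread takes a semaphore slot, waits on its dependencies' events, and records a traceback on failure so that dependents are skipped instead of running on bad inputs. Here is a self-contained sketch of that failure path using only the API added in this file; the op names and functions are invented for illustration.

    from graphgen.engine import Context, Engine, OpNode

    def fail(node, ctx):
        raise ValueError("boom")

    def downstream(node, ctx):
        ctx.set("ran", True)  # never reached: its dependency failed

    ops = [
        OpNode("fail", [], fail),
        OpNode("downstream", ["fail"], downstream),
    ]

    ctx = Context()
    try:
        Engine(max_workers=2).run(ops, ctx)
    except RuntimeError as e:
        print(e)  # aggregated report, one "---- name ----" section per failed op
    print(ctx.get("ran"))  # None: downstream was skipped, not executed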

graphgen/evaluate.py

Lines changed: 1 addition & 0 deletions
@@ -1,3 +1,4 @@
+# TODO: this module needs refactoring to merge into GraphGen framework
 """Evaluate the quality of the generated text using various metrics"""

 import argparse
