Skip to content
50 changes: 28 additions & 22 deletions graphgen/configs/aggregated_config.yaml
Original file line number Diff line number Diff line change
@@ -1,22 +1,28 @@
read:
input_file: resources/input_examples/jsonl_demo.jsonl # input file path, support json, jsonl, txt, pdf. See resources/input_examples for examples
split:
chunk_size: 1024 # chunk size for text splitting
chunk_overlap: 100 # chunk overlap for text splitting
search: # web search configuration
enabled: false # whether to enable web search
search_types: ["google"] # search engine types, support: google, bing, uniprot, wikipedia
quiz_and_judge: # quiz and test whether the LLM masters the knowledge points
enabled: true
quiz_samples: 2 # number of quiz samples to generate
re_judge: false # whether to re-judge the existing quiz samples
partition: # graph partition configuration
method: ece # ece is a custom partition method based on comprehension loss
method_params:
max_units_per_community: 20 # max nodes and edges per community
min_units_per_community: 5 # min nodes and edges per community
max_tokens_per_community: 10240 # max tokens per community
unit_sampling: max_loss # unit sampling strategy, support: random, max_loss, min_loss
generate:
mode: aggregated # atomic, aggregated, multi_hop, cot, vqa
data_format: ChatML # Alpaca, Sharegpt, ChatML
pipeline:
- name: read
params:
input_file: resources/input_examples/jsonl_demo.jsonl # input file path, support json, jsonl, txt, pdf. See resources/input_examples for examples
chunk_size: 1024 # chunk size for text splitting
chunk_overlap: 100 # chunk overlap for text splitting

- name: build_kg

- name: quiz_and_judge
params:
quiz_samples: 2 # number of quiz samples to generate
re_judge: false # whether to re-judge the existing quiz samples

- name: partition
deps: [quiz_and_judge] # ece depends on the quiz_and_judge step
params:
method: ece # ece is a custom partition method based on comprehension loss
method_params:
max_units_per_community: 20 # max nodes and edges per community
min_units_per_community: 5 # min nodes and edges per community
max_tokens_per_community: 10240 # max tokens per community
unit_sampling: max_loss # unit sampling strategy, support: random, max_loss, min_loss

- name: generate
params:
method: aggregated # atomic, aggregated, multi_hop, cot, vqa
data_format: ChatML # Alpaca, Sharegpt, ChatML
37 changes: 18 additions & 19 deletions graphgen/configs/atomic_config.yaml
Original file line number Diff line number Diff line change
@@ -1,19 +1,18 @@
read:
input_file: resources/input_examples/json_demo.json # input file path, support json, jsonl, txt, csv, pdf. See resources/input_examples for examples
split:
chunk_size: 1024 # chunk size for text splitting
chunk_overlap: 100 # chunk overlap for text splitting
search: # web search configuration
enabled: false # whether to enable web search
search_types: ["google"] # search engine types, support: google, bing, uniprot, wikipedia
quiz_and_judge: # quiz and test whether the LLM masters the knowledge points
enabled: true
quiz_samples: 2 # number of quiz samples to generate
re_judge: false # whether to re-judge the existing quiz samples
partition: # graph partition configuration
method: dfs # partition method, support: dfs, bfs, ece, leiden
method_params:
max_units_per_community: 1 # atomic partition, one node or edge per community
generate:
mode: atomic # atomic, aggregated, multi_hop, cot, vqa
data_format: Alpaca # Alpaca, Sharegpt, ChatML
pipeline:
- name: read
params:
input_file: resources/input_examples/json_demo.json # input file path, support json, jsonl, txt, csv, pdf. See resources/input_examples for examples
chunk_size: 1024 # chunk size for text splitting
chunk_overlap: 100 # chunk overlap for text splitting

- name: build_kg

- name: partition
params:
method: dfs # partition method, support: dfs, bfs, ece, leiden
method_params:
max_units_per_community: 1 # atomic partition, one node or edge per community
- name: generate
params:
method: atomic # atomic, aggregated, multi_hop, cot, vqa
data_format: Alpaca # Alpaca, Sharegpt, ChatML
40 changes: 21 additions & 19 deletions graphgen/configs/cot_config.yaml
Original file line number Diff line number Diff line change
@@ -1,19 +1,21 @@
read:
input_file: resources/input_examples/txt_demo.txt # input file path, support json, jsonl, txt, pdf. See resources/input_examples for examples
split:
chunk_size: 1024 # chunk size for text splitting
chunk_overlap: 100 # chunk overlap for text splitting
search: # web search configuration
enabled: false # whether to enable web search
search_types: ["google"] # search engine types, support: google, bing, uniprot, wikipedia
quiz_and_judge: # quiz and test whether the LLM masters the knowledge points
enabled: false
partition: # graph partition configuration
method: leiden # leiden is a partitioner detection algorithm
method_params:
max_size: 20 # Maximum size of communities
use_lcc: false # whether to use the largest connected component
random_seed: 42 # random seed for partitioning
generate:
mode: cot # atomic, aggregated, multi_hop, cot, vqa
data_format: Sharegpt # Alpaca, Sharegpt, ChatML
pipeline:
- name: read
params:
input_file: resources/input_examples/txt_demo.txt # input file path, support json, jsonl, txt, pdf. See resources/input_examples for examples
chunk_size: 1024 # chunk size for text splitting
chunk_overlap: 100 # chunk overlap for text splitting

- name: build_kg

- name: partition
params:
method: leiden # leiden is a community detection algorithm
method_params:
max_size: 20 # Maximum size of communities
use_lcc: false # whether to use the largest connected component
random_seed: 42 # random seed for partitioning

- name: generate
params:
method: cot # atomic, aggregated, multi_hop, cot, vqa
data_format: Sharegpt # Alpaca, Sharegpt, ChatML
44 changes: 22 additions & 22 deletions graphgen/configs/multi_hop_config.yaml
Original file line number Diff line number Diff line change
@@ -1,22 +1,22 @@
read:
input_file: resources/input_examples/csv_demo.csv # input file path, support json, jsonl, txt, pdf. See resources/input_examples for examples
split:
chunk_size: 1024 # chunk size for text splitting
chunk_overlap: 100 # chunk overlap for text splitting
search: # web search configuration
enabled: false # whether to enable web search
search_types: ["google"] # search engine types, support: google, bing, uniprot, wikipedia
quiz_and_judge: # quiz and test whether the LLM masters the knowledge points
enabled: false
quiz_samples: 2 # number of quiz samples to generate
re_judge: false # whether to re-judge the existing quiz samples
partition: # graph partition configuration
method: ece # ece is a custom partition method based on comprehension loss
method_params:
max_units_per_community: 3 # max nodes and edges per community, for multi-hop, we recommend setting it to 3
min_units_per_community: 3 # min nodes and edges per community, for multi-hop, we recommend setting it to 3
max_tokens_per_community: 10240 # max tokens per community
unit_sampling: random # unit sampling strategy, support: random, max_loss, min_loss
generate:
mode: multi_hop # atomic, aggregated, multi_hop, cot, vqa
data_format: ChatML # Alpaca, Sharegpt, ChatML
pipeline:
- name: read
params:
input_file: resources/input_examples/csv_demo.csv # input file path, support json, jsonl, txt, pdf. See resources/input_examples for examples
chunk_size: 1024 # chunk size for text splitting
chunk_overlap: 100 # chunk overlap for text splitting

- name: build_kg

- name: partition
params:
method: ece # ece is a custom partition method based on comprehension loss
method_params:
max_units_per_community: 3 # max nodes and edges per community, for multi-hop, we recommend setting it to 3
min_units_per_community: 3 # min nodes and edges per community, for multi-hop, we recommend setting it to 3
max_tokens_per_community: 10240 # max tokens per community
unit_sampling: random # unit sampling strategy, support: random, max_loss, min_loss

- name: generate
params:
method: multi_hop # atomic, aggregated, multi_hop, cot, vqa
data_format: ChatML # Alpaca, Sharegpt, ChatML
38 changes: 20 additions & 18 deletions graphgen/configs/vqa_config.yaml
Original file line number Diff line number Diff line change
@@ -1,18 +1,20 @@
read:
input_file: resources/input_examples/vqa_demo.json # input file path, support json, jsonl, txt, pdf. See resources/input_examples for examples
split:
chunk_size: 1024 # chunk size for text splitting
chunk_overlap: 100 # chunk overlap for text splitting
search: # web search configuration
enabled: false # whether to enable web search
search_types: ["google"] # search engine types, support: google, bing, uniprot, wikipedia
quiz_and_judge: # quiz and test whether the LLM masters the knowledge points
enabled: false
partition: # graph partition configuration
method: anchor_bfs # partition method
method_params:
anchor_type: image # node type to select anchor nodes
max_units_per_community: 10 # atomic partition, one node or edge per community
generate:
mode: vqa # atomic, aggregated, multi_hop, cot, vqa
data_format: ChatML # Alpaca, Sharegpt, ChatML
pipeline:
- name: read
params:
input_file: resources/input_examples/vqa_demo.json # input file path, support json, jsonl, txt, pdf. See resources/input_examples for examples
chunk_size: 1024 # chunk size for text splitting
chunk_overlap: 100 # chunk overlap for text splitting

- name: build_kg

- name: partition
params:
method: anchor_bfs # partition method
method_params:
anchor_type: image # node type to select anchor nodes
max_units_per_community: 10 # max nodes and edges per community

- name: generate
params:
method: vqa # atomic, aggregated, multi_hop, cot, vqa
data_format: ChatML # Alpaca, Sharegpt, ChatML
121 changes: 121 additions & 0 deletions graphgen/engine.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,121 @@
"""
orchestration engine for GraphGen
"""

import copy
import threading
import traceback
from functools import wraps
from typing import Any, Callable, List


class Context(dict):
    """Thread-safe dict shared between pipeline operations.

    A per-instance lock guards reads and writes so that concurrently
    running operations can exchange intermediate results safely.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Per-instance lock: the previous class-level lock was shared by
        # every Context, needlessly serializing unrelated contexts.
        self._lock = threading.Lock()

    def set(self, k, v):
        """Store value *v* under key *k* while holding the lock."""
        with self._lock:
            self[k] = v

    def get(self, k, default=None):
        """Return the value for *k*, or *default* if the key is absent."""
        with self._lock:
            return super().get(k, default)


class OpNode:
    """A named pipeline operation: its dependencies and its callable."""

    def __init__(
        self, name: str, deps: List[str], func: Callable[["OpNode", Context], Any]
    ):
        self.name = name
        self.deps = deps
        self.func = func


def op(name: str, deps=None):
    """Decorator tagging a function as a pipeline operation.

    The decorated function gains an ``op_node`` attribute that records its
    name, dependencies and a callable adapter used by the engine.
    """
    dependencies = deps or []

    def decorator(func):
        @wraps(func)
        def wrapped(*args, **kwargs):
            return func(*args, **kwargs)

        # Adapter unpacks the shared context as keyword arguments.
        wrapped.op_node = OpNode(
            name, dependencies, lambda self, ctx: func(self, **ctx)
        )
        return wrapped

    return decorator


class Engine:
    """Runs a DAG of operations on worker threads, respecting dependencies."""

    def __init__(self, max_workers: int = 4):
        # Upper bound on how many operations may execute concurrently.
        self.max_workers = max_workers

    def run(self, ops: List[OpNode], ctx: Context):
        """Execute *ops* in dependency order, sharing *ctx* between them.

        :param ops: operation nodes to execute
        :param ctx: shared context passed to every operation
        :raises ValueError: on unknown or cyclic dependencies
        :raises RuntimeError: if any operation raised during execution
        """
        name2op = {operation.name: operation for operation in ops}

        # Reject dependencies that do not name a known operation early;
        # otherwise they would surface below as a confusing "cycle" error.
        for node in ops:
            unknown = [d for d in node.deps if d not in name2op]
            if unknown:
                raise ValueError(
                    f"Operation '{node.name}' depends on unknown operations: {unknown}"
                )

        # Kahn's algorithm: peel off nodes whose dependencies are satisfied.
        remaining = {n: set(name2op[n].deps) for n in name2op}
        topo = [n for n, pending in remaining.items() if not pending]
        for cur in topo:  # `topo` grows while we iterate over it
            for child, pending in remaining.items():
                if cur in pending:
                    pending.remove(cur)
                    if not pending:
                        topo.append(child)

        if len(topo) != len(ops):
            raise ValueError(
                "Cyclic dependencies detected among operations. "
                "Please check your configuration."
            )

        # Limit concurrency; a slot is held only while an op actually runs.
        sem = threading.Semaphore(self.max_workers)
        done = {n: threading.Event() for n in name2op}
        exc = {}

        def _exec(n: str):
            node = name2op[n]
            # Wait for dependencies BEFORE acquiring a worker slot: waiting
            # while holding the semaphore can deadlock when a dependency's
            # own thread is blocked on slot acquisition (e.g. a dependency
            # chain longer than max_workers).
            for d in node.deps:
                done[d].wait()
            if any(d in exc for d in node.deps):
                exc[n] = "Skipped due to failed dependencies"
                done[n].set()
                return
            try:
                with sem:
                    node.func(node, ctx)
            except Exception:  # pylint: disable=broad-except
                exc[n] = traceback.format_exc()
            finally:
                done[n].set()

        threads = [
            threading.Thread(target=_exec, args=(n,), daemon=True) for n in topo
        ]
        for t in threads:
            t.start()
        for t in threads:
            t.join()
        if exc:
            raise RuntimeError(
                "Some operations failed:\n"
                + "\n".join(f"---- {name} ----\n{tb}" for name, tb in exc.items())
            )


def collect_ops(config: dict, graph_gen) -> List[OpNode]:
    """
    Build operation nodes from a yaml pipeline config.

    :param config: parsed yaml config with a ``pipeline`` list of stages
    :param graph_gen: object exposing one decorated method per stage name
    :return: fresh operation nodes ready to be passed to :class:`Engine`
    """
    ops: List[OpNode] = []
    for stage in config["pipeline"]:
        name = stage["name"]
        method = getattr(graph_gen, name)

        # Copy the decorator-attached node instead of mutating it in place:
        # the node is shared via the underlying function object, so in-place
        # overrides of deps/func would leak into every later collect_ops()
        # call and into other GraphGen instances.
        op_node = copy.copy(method.op_node)

        # Runtime dependencies from the config override the decorator defaults.
        op_node.deps = stage.get("deps", op_node.deps)

        if "params" in stage:
            # Bind method and params as defaults to avoid late-binding closures.
            op_node.func = lambda self, ctx, m=method, p=stage["params"]: m(p)
        else:
            op_node.func = lambda self, ctx, m=method: m()
        ops.append(op_node)
    return ops
1 change: 1 addition & 0 deletions graphgen/evaluate.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
# TODO: this module needs refactoring to merge into GraphGen framework
"""Evaluate the quality of the generated text using various metrics"""

import argparse
Expand Down
Loading