24 changes: 17 additions & 7 deletions graphgen/configs/aggregated_config.yaml
@@ -1,22 +1,30 @@
pipeline:
- name: read
- name: read_step # step name is unique in the pipeline, and can be referenced by other steps
op_key: read
params:
input_file: resources/input_examples/jsonl_demo.jsonl # input file path, support json, jsonl, txt, pdf. See resources/input_examples for examples

- name: chunk
- name: chunk_step
op_key: chunk
deps: [read_step] # chunk_step depends on read_step
params:
chunk_size: 1024 # chunk size for text splitting
chunk_overlap: 100 # chunk overlap for text splitting

- name: build_kg
- name: build_kg_step
op_key: build_kg
deps: [chunk_step] # build_kg_step depends on chunk_step

- name: quiz_and_judge
- name: quiz_and_judge_step
op_key: quiz_and_judge
deps: [build_kg_step] # quiz_and_judge_step depends on build_kg_step
params:
quiz_samples: 2 # number of quiz samples to generate
re_judge: false # whether to re-judge the existing quiz samples

- name: partition
deps: [quiz_and_judge] # ece depends on quiz_and_judge steps
- name: partition_step
op_key: partition
deps: [quiz_and_judge_step] # partition_step depends on quiz_and_judge_step
params:
method: ece # ece is a custom partition method based on comprehension loss
method_params:
@@ -25,7 +33,9 @@ pipeline:
max_tokens_per_community: 10240 # max tokens per community
unit_sampling: max_loss # unit sampling strategy, support: random, max_loss, min_loss

- name: generate
- name: generate_step
op_key: generate
deps: [partition_step] # generate_step depends on partition_step
params:
method: aggregated # atomic, aggregated, multi_hop, cot, vqa
data_format: ChatML # Alpaca, Sharegpt, ChatML
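
The renamed steps above make the step `name` a unique identifier within the pipeline, `op_key` the lookup key for the operation to run, and `deps` an explicit list of step names that must finish first. A minimal sketch of checking such a config outside the engine, assuming PyYAML is installed and using the config path above purely for illustration:

```python
# Minimal sketch (not GraphGen's own code): load a pipeline config like the one
# above and check that every `deps` entry points at a declared step `name`,
# mirroring the validation the engine performs on operation names.
import yaml

def check_pipeline(path: str) -> None:
    with open(path, "r", encoding="utf-8") as f:
        config = yaml.safe_load(f)

    names = [stage["name"] for stage in config["pipeline"]]
    if len(names) != len(set(names)):
        raise ValueError("step names must be unique within the pipeline")

    for stage in config["pipeline"]:
        for dep in stage.get("deps", []):
            if dep not in names:
                raise ValueError(f"step {stage['name']!r} depends on unknown step {dep!r}")

check_pipeline("graphgen/configs/aggregated_config.yaml")
```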
20 changes: 15 additions & 5 deletions graphgen/configs/atomic_config.yaml
@@ -1,21 +1,31 @@
pipeline:
- name: read
- name: read_step
op_key: read
params:
input_file: resources/input_examples/json_demo.json # input file path, support json, jsonl, txt, csv, pdf. See resources/input_examples for examples

- name: chunk
- name: chunk_step
op_key: chunk
deps: [read_step] # chunk_step depends on read_step
params:
chunk_size: 1024 # chunk size for text splitting
chunk_overlap: 100 # chunk overlap for text splitting

- name: build_kg
- name: build_kg_step
op_key: build_kg
deps: [chunk_step] # build_kg_step depends on chunk_step

- name: partition
- name: partition_step
op_key: partition
deps: [build_kg_step] # partition_step depends on build_kg_step
params:
method: dfs # partition method, support: dfs, bfs, ece, leiden
method_params:
max_units_per_community: 1 # atomic partition, one node or edge per community
- name: generate

- name: generate_step
op_key: generate
deps: [partition_step] # generate_step depends on partition_step
params:
method: atomic # atomic, aggregated, multi_hop, cot, vqa
data_format: Alpaca # Alpaca, Sharegpt, ChatML
19 changes: 14 additions & 5 deletions graphgen/configs/cot_config.yaml
@@ -1,24 +1,33 @@
pipeline:
- name: read
- name: read_step
op_key: read
params:
input_file: resources/input_examples/txt_demo.txt # input file path, support json, jsonl, txt, pdf. See resources/input_examples for examples

- name: chunk
- name: chunk_step
op_key: chunk
deps: [read_step] # chunk_step depends on read_step
params:
chunk_size: 1024 # chunk size for text splitting
chunk_overlap: 100 # chunk overlap for text splitting

- name: build_kg
- name: build_kg_step
op_key: build_kg
deps: [chunk_step] # build_kg_step depends on chunk_step

- name: partition
- name: partition_step
op_key: partition
deps: [build_kg_step] # partition_step depends on build_kg_step
params:
method: leiden # leiden is a community detection algorithm
method_params:
max_size: 20 # Maximum size of communities
use_lcc: false # whether to use the largest connected component
random_seed: 42 # random seed for partitioning

- name: generate
- name: generate_step
op_key: generate
deps: [partition_step] # generate_step depends on partition_step
params:
method: cot # atomic, aggregated, multi_hop, cot, vqa
data_format: Sharegpt # Alpaca, Sharegpt, ChatML
19 changes: 14 additions & 5 deletions graphgen/configs/multi_hop_config.yaml
@@ -1,16 +1,23 @@
pipeline:
- name: read
- name: read_step
op_key: read
params:
input_file: resources/input_examples/csv_demo.csv # input file path, support json, jsonl, txt, csv, pdf. See resources/input_examples for examples

- name: chunk
- name: chunk_step
op_key: chunk
deps: [read_step] # chunk_step depends on read_step
params:
chunk_size: 1024 # chunk size for text splitting
chunk_overlap: 100 # chunk overlap for text splitting

- name: build_kg
- name: build_kg_step
op_key: build_kg
deps: [chunk_step] # build_kg_step depends on chunk_step

- name: partition
- name: partition_step
op_key: partition
deps: [build_kg_step] # partition_step depends on build_kg_step
params:
method: ece # ece is a custom partition method based on comprehension loss
method_params:
@@ -19,7 +26,9 @@ pipeline:
max_tokens_per_community: 10240 # max tokens per community
unit_sampling: random # unit sampling strategy, support: random, max_loss, min_loss

- name: generate
- name: generate_step
op_key: generate
deps: [partition_step] # generate_step depends on partition_step
params:
method: multi_hop # atomic, aggregated, multi_hop, cot, vqa
data_format: ChatML # Alpaca, Sharegpt, ChatML
11 changes: 8 additions & 3 deletions graphgen/configs/schema_guided_extraction_config.yaml
@@ -1,15 +1,20 @@
pipeline:
- name: read
- name: read_step
op_key: read
params:
input_file: resources/input_examples/extract_demo.txt # input file path, support json, jsonl, txt, pdf. See resources/input_examples for examples

- name: chunk
- name: chunk_step
op_key: chunk
deps: [read_step] # chunk_step depends on read_step
params:
chunk_size: 20480
chunk_overlap: 2000
separators: []

- name: extract
- name: extract_step
op_key: extract
deps: [chunk_step] # extract_step depends on chunk_step
params:
method: schema_guided # extraction method, support: schema_guided
schema_file: graphgen/templates/extraction/schemas/legal_contract.json # schema file path for schema_guided method
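
The `extract_step` points the `schema_guided` method at a JSON schema file. The schema's actual layout is not shown in this diff, so the snippet below is only a hypothetical illustration of how a prompt could be assembled from such a file; the field structure is assumed, not taken from legal_contract.json:

```python
# Illustrative sketch only: the real format of
# graphgen/templates/extraction/schemas/legal_contract.json is not part of this
# diff, so treat the schema handling here as an assumption.
import json

def build_extraction_prompt(schema_path: str, chunk_text: str) -> str:
    with open(schema_path, "r", encoding="utf-8") as f:
        schema = json.load(f)  # assumed: a JSON object describing the target fields

    return (
        "Extract the following fields from the text and answer in JSON.\n"
        f"Schema: {json.dumps(schema, ensure_ascii=False)}\n"
        f"Text: {chunk_text}"
    )
```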
7 changes: 5 additions & 2 deletions graphgen/configs/search_config.yaml
@@ -1,9 +1,12 @@
pipeline:
- name: read
- name: read_step
op_key: read
params:
input_file: resources/input_examples/search_demo.jsonl # input file path, support json, jsonl, txt, pdf. See resources/input_examples for examples

- name: search
- name: search_step
op_key: search
deps: [read_step] # search_step depends on read_step
params:
data_sources: [uniprot] # data source for searcher, support: wikipedia, google, uniprot
uniprot_params:
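
The `search_step` pulls records from UniProt; the `uniprot_params` values sit outside this hunk, so the sketch below is only a generic illustration of a UniProt REST lookup (using the public rest.uniprot.org endpoint and the `requests` package), not GraphGen's searcher:

```python
# Generic illustration of a UniProt search, assuming the `requests` package.
# The query string and result fields are examples, not GraphGen defaults.
import requests

def search_uniprot(query: str, size: int = 5) -> list[dict]:
    resp = requests.get(
        "https://rest.uniprot.org/uniprotkb/search",
        params={"query": query, "format": "json", "size": size},
        timeout=30,
    )
    resp.raise_for_status()
    return resp.json().get("results", [])

hits = search_uniprot("insulin AND organism_id:9606")
print([hit.get("primaryAccession") for hit in hits])
```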
19 changes: 14 additions & 5 deletions graphgen/configs/vqa_config.yaml
@@ -1,23 +1,32 @@
pipeline:
- name: read
- name: read_step
op_key: read
params:
input_file: resources/input_examples/vqa_demo.json # input file path, support json, jsonl, txt, pdf. See resources/input_examples for examples

- name: chunk
- name: chunk_step
op_key: chunk
deps: [read_step] # chunk_step depends on read_step
params:
chunk_size: 1024 # chunk size for text splitting
chunk_overlap: 100 # chunk overlap for text splitting

- name: build_kg
- name: build_kg_step
op_key: build_kg
deps: [chunk_step] # build_kg_step depends on chunk_step

- name: partition
- name: partition_step
op_key: partition
deps: [build_kg_step] # partition_step depends on build_kg_step
params:
method: anchor_bfs # partition method
method_params:
anchor_type: image # node type to select anchor nodes
max_units_per_community: 10 # max nodes/edges per community

- name: generate
- name: generate_step
op_key: generate
deps: [partition_step] # generate_step depends on partition_step
params:
method: vqa # atomic, aggregated, multi_hop, cot, vqa
data_format: ChatML # Alpaca, Sharegpt, ChatML
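
The `anchor_bfs` method grows communities outward from anchor nodes of a chosen type (here `image`) until each community holds at most `max_units_per_community` units. A conceptual sketch of that idea, assuming networkx and an illustrative `entity_type` node attribute; it is not GraphGen's actual partitioner:

```python
# Conceptual sketch of anchor-based BFS partitioning: one community per anchor
# node, grown breadth-first up to a unit budget. The attribute name
# `entity_type` is an assumption for the example.
from collections import deque
import networkx as nx

def anchor_bfs_communities(graph: nx.Graph, anchor_type: str, max_units: int) -> list[list]:
    communities = []
    for anchor, data in graph.nodes(data=True):
        if data.get("entity_type") != anchor_type:
            continue
        visited, queue = {anchor}, deque([anchor])
        while queue and len(visited) < max_units:
            node = queue.popleft()
            for neighbor in graph.neighbors(node):
                if neighbor not in visited and len(visited) < max_units:
                    visited.add(neighbor)
                    queue.append(neighbor)
        communities.append(sorted(visited, key=str))
    return communities
```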
52 changes: 28 additions & 24 deletions graphgen/engine.py
@@ -4,7 +4,6 @@

import threading
import traceback
from functools import wraps
from typing import Any, Callable, List


@@ -27,25 +26,12 @@ def __init__(
self.name, self.deps, self.func = name, deps, func


def op(name: str, deps=None):
deps = deps or []

def decorator(func):
@wraps(func)
def _wrapper(*args, **kwargs):
return func(*args, **kwargs)

_wrapper.op_node = OpNode(name, deps, lambda self, ctx: func(self, **ctx))
return _wrapper

return decorator


class Engine:
def __init__(self, max_workers: int = 4):
self.max_workers = max_workers

def run(self, ops: List[OpNode], ctx: Context):
self._validate(ops)
name2op = {operation.name: operation for operation in ops}

# topological sort
@@ -81,7 +67,7 @@ def _exec(n: str):
return
try:
name2op[n].func(name2op[n], ctx)
except Exception: # pylint: disable=broad-except
except Exception:
exc[n] = traceback.format_exc()
done[n].set()

@@ -96,6 +82,20 @@ def _exec(n: str):
+ "\n".join(f"---- {op} ----\n{tb}" for op, tb in exc.items())
)

@staticmethod
def _validate(ops: List[OpNode]):
name_set = set()
for op in ops:
if op.name in name_set:
raise ValueError(f"Duplicate operation name: {op.name}")
name_set.add(op.name)
for op in ops:
for dep in op.deps:
if dep not in name_set:
raise ValueError(
f"Operation {op.name} has unknown dependency: {dep}"
)


def collect_ops(config: dict, graph_gen) -> List[OpNode]:
"""
@@ -106,16 +106,20 @@ def collect_ops(config: dict, graph_gen) -> List[OpNode]:
ops: List[OpNode] = []
for stage in config["pipeline"]:
name = stage["name"]
method = getattr(graph_gen, name)
op_node = method.op_node

# if there are runtime dependencies, override them
runtime_deps = stage.get("deps", op_node.deps)
op_node.deps = runtime_deps
method_name = stage.get("op_key")
method = getattr(graph_gen, method_name)
deps = stage.get("deps", [])

if "params" in stage:
op_node.func = lambda self, ctx, m=method, sc=stage: m(sc.get("params", {}))

def func(self, ctx, _method=method, _params=stage.get("params", {})):
return _method(_params)

else:
op_node.func = lambda self, ctx, m=method: m()

def func(self, ctx, _method=method):
return _method()

op_node = OpNode(name=name, deps=deps, func=func)
ops.append(op_node)
return ops
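
With the `@op` decorator gone, `collect_ops` now builds each `OpNode` directly from the config stage and binds the resolved method and its params through default arguments on the inner `func`. Those defaults matter: closures capture variables by reference, so without them every step would end up calling whichever method the loop resolved last. A standalone illustration of that pitfall and the fix, separate from the engine code itself:

```python
# Why collect_ops binds through default arguments: the late-binding closure pitfall.
def make_funcs_late_binding(methods):
    funcs = []
    for m in methods:
        funcs.append(lambda: m())  # `m` is looked up when the lambda runs, not when it is defined
    return funcs

def make_funcs_bound(methods):
    funcs = []
    for m in methods:
        funcs.append(lambda _m=m: _m())  # default argument freezes the current value of `m`
    return funcs

methods = [lambda: "read", lambda: "chunk", lambda: "build_kg"]
print([f() for f in make_funcs_late_binding(methods)])  # ['build_kg', 'build_kg', 'build_kg']
print([f() for f in make_funcs_bound(methods)])         # ['read', 'chunk', 'build_kg']
```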