
Commit 7a1457f

feat(graphgen): add vqa configs
1 parent 010b9ae commit 7a1457f

16 files changed: +120 -65 lines changed

graphgen/configs/aggregated_config.yaml

Lines changed: 2 additions & 2 deletions

@@ -1,5 +1,5 @@
 read:
-  input_file: resources/input_examples/jsonl_demo.jsonl # input file path, support json, jsonl, txt. See resources/input_examples for examples
+  input_file: resources/input_examples/jsonl_demo.jsonl # input file path, support json, jsonl, txt, pdf. See resources/input_examples for examples
 split:
   chunk_size: 1024 # chunk size for text splitting
   chunk_overlap: 100 # chunk overlap for text splitting
@@ -18,5 +18,5 @@ partition: # graph partition configuration
     max_tokens_per_community: 10240 # max tokens per community
     unit_sampling: max_loss # unit sampling strategy, support: random, max_loss, min_loss
 generate:
-  mode: aggregated # atomic, aggregated, multi_hop, cot
+  mode: aggregated # atomic, aggregated, multi_hop, cot, vqa
   data_format: ChatML # Alpaca, Sharegpt, ChatML

graphgen/configs/atomic_config.yaml

Lines changed: 2 additions & 2 deletions

@@ -1,5 +1,5 @@
 read:
-  input_file: resources/input_examples/json_demo.json # input file path, support json, jsonl, txt, csv. See resources/input_examples for examples
+  input_file: resources/input_examples/json_demo.json # input file path, support json, jsonl, txt, pdf. See resources/input_examples for examples
 split:
   chunk_size: 1024 # chunk size for text splitting
   chunk_overlap: 100 # chunk overlap for text splitting
@@ -15,5 +15,5 @@ partition: # graph partition configuration
   method_params:
     max_units_per_community: 1 # atomic partition, one node or edge per community
 generate:
-  mode: atomic # atomic, aggregated, multi_hop, cot
+  mode: atomic # atomic, aggregated, multi_hop, cot, vqa
   data_format: Alpaca # Alpaca, Sharegpt, ChatML

graphgen/configs/cot_config.yaml

Lines changed: 2 additions & 2 deletions

@@ -1,5 +1,5 @@
 read:
-  input_file: resources/input_examples/txt_demo.txt # input file path, support json, jsonl, txt. See resources/input_examples for examples
+  input_file: resources/input_examples/txt_demo.txt # input file path, support json, jsonl, txt, pdf. See resources/input_examples for examples
 split:
   chunk_size: 1024 # chunk size for text splitting
   chunk_overlap: 100 # chunk overlap for text splitting
@@ -15,5 +15,5 @@ partition: # graph partition configuration
     use_lcc: false # whether to use the largest connected component
     random_seed: 42 # random seed for partitioning
 generate:
-  mode: cot # atomic, aggregated, multi_hop, cot
+  mode: cot # atomic, aggregated, multi_hop, cot, vqa
   data_format: Sharegpt # Alpaca, Sharegpt, ChatML

graphgen/configs/multi_hop_config.yaml

Lines changed: 2 additions & 2 deletions

@@ -1,5 +1,5 @@
 read:
-  input_file: resources/input_examples/csv_demo.csv # input file path, support json, jsonl, txt. See resources/input_examples for examples
+  input_file: resources/input_examples/csv_demo.csv # input file path, support json, jsonl, txt, pdf. See resources/input_examples for examples
 split:
   chunk_size: 1024 # chunk size for text splitting
   chunk_overlap: 100 # chunk overlap for text splitting
@@ -18,5 +18,5 @@ partition: # graph partition configuration
     max_tokens_per_community: 10240 # max tokens per community
     unit_sampling: random # unit sampling strategy, support: random, max_loss, min_loss
 generate:
-  mode: multi_hop # strategy for generating multi-hop QA pairs
+  mode: multi_hop # atomic, aggregated, multi_hop, cot, vqa
   data_format: ChatML # Alpaca, Sharegpt, ChatML

graphgen/configs/vqa_config.yaml

Lines changed: 22 additions & 0 deletions

@@ -0,0 +1,22 @@
+read:
+  input_file: resources/input_examples/pdf_demo.pdf # input file path, support json, jsonl, txt, pdf. See resources/input_examples for examples
+split:
+  chunk_size: 1024 # chunk size for text splitting
+  chunk_overlap: 100 # chunk overlap for text splitting
+search: # web search configuration
+  enabled: false # whether to enable web search
+  search_types: ["google"] # search engine types, support: google, bing, uniprot, wikipedia
+quiz_and_judge: # quiz and test whether the LLM masters the knowledge points
+  enabled: true
+  quiz_samples: 2 # number of quiz samples to generate
+  re_judge: false # whether to re-judge the existing quiz samples
+partition: # graph partition configuration
+  method: ece # ece is a custom partition method based on comprehension loss
+  method_params:
+    max_units_per_community: 20 # max nodes and edges per community
+    min_units_per_community: 5 # min nodes and edges per community
+    max_tokens_per_community: 10240 # max tokens per community
+    unit_sampling: max_loss # unit sampling strategy, support: random, max_loss, min_loss
+generate:
+  mode: vqa # atomic, aggregated, multi_hop, cot, vqa
+  data_format: ChatML # Alpaca, Sharegpt, ChatML
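
For reference, a minimal sketch of how a config like the one above could be loaded and inspected, assuming PyYAML; the mode check mirrors the comment in the config and is illustrative, not code from this commit:

# Load a GraphGen YAML config and read the fields touched by this commit.
# Assumes PyYAML (yaml.safe_load); not part of the repository code.
import yaml

with open("graphgen/configs/vqa_config.yaml", encoding="utf-8") as f:
    config = yaml.safe_load(f)

mode = config["generate"]["mode"]          # "vqa" for this config
input_file = config["read"]["input_file"]  # a .pdf path in the vqa example

if mode not in {"atomic", "aggregated", "multi_hop", "cot", "vqa"}:
    raise ValueError(f"Unsupported generation mode: {mode}")

print(f"mode={mode}, input_file={input_file}")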

graphgen/generate.py

Lines changed: 4 additions & 18 deletions

@@ -72,24 +72,10 @@ def main():
 
     graph_gen.search(search_config=config["search"])
 
-    # Use pipeline according to the output data type
-    if mode in ["atomic", "aggregated", "multi_hop"]:
-        logger.info("Generation mode set to '%s'. Start generation.", mode)
-        if "quiz_and_judge" in config and config["quiz_and_judge"]["enabled"]:
-            graph_gen.quiz_and_judge(quiz_and_judge_config=config["quiz_and_judge"])
-        else:
-            logger.warning(
-                "Quiz and Judge strategy is disabled. Edge sampling falls back to random."
-            )
-            assert (
-                config["partition"]["method"] == "ece"
-                and "method_params" in config["partition"]
-            ), "Only ECE partition with edge sampling is supported."
-            config["partition"]["method_params"]["edge_sampling"] = "random"
-    elif mode == "cot":
-        logger.info("Generation mode set to 'cot'. Start generation.")
-    else:
-        raise ValueError(f"Unsupported output data type: {mode}")
+    graph_gen.quiz_and_judge(quiz_and_judge_config=config["quiz_and_judge"])
+
+    # TODO: add data filtering step here in the future
+    # graph_gen.filter(filter_config=config["filter"])
 
     graph_gen.generate(
         partition_config=config["partition"],

graphgen/graphgen.py

Lines changed: 1 addition & 1 deletion

@@ -91,7 +91,7 @@ async def insert(self, read_config: Dict, split_config: Dict):
         insert chunks into the graph
         """
         # Step 1: Read files
-        data = read_files(read_config["input_file"])
+        data = read_files(read_config["input_file"], self.working_dir)
         if len(data) == 0:
             logger.warning("No data to process")
             return
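
The diff above only changes the call site; the read_files implementation is not shown. A rough sketch of what an extension-based dispatch with the new working_dir argument might look like, assuming the reader interface from csv_reader.py (read(file_path) -> List[Dict[str, Any]]); the output_dir keyword on PDFReader is a hypothetical placeholder:

# Illustrative sketch only; the real graphgen read_files is not part of this diff.
import os
from typing import Any, Dict, List

from graphgen.models import CSVReader, JSONLReader, JSONReader, PDFReader, TXTReader


def read_files(input_file: str, working_dir: str) -> List[Dict[str, Any]]:
    ext = os.path.splitext(input_file)[1].lower()
    if ext == ".pdf":
        # Hypothetical kwarg: a PDF reader may need somewhere to put extracted assets.
        reader = PDFReader(output_dir=working_dir)
    elif ext == ".csv":
        reader = CSVReader()
    elif ext == ".json":
        reader = JSONReader()
    elif ext == ".jsonl":
        reader = JSONLReader()
    elif ext == ".txt":
        reader = TXTReader()
    else:
        raise ValueError(f"Unsupported file type: {ext}")
    return reader.read(input_file)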

graphgen/models/__init__.py

Lines changed: 1 addition & 1 deletion

@@ -14,7 +14,7 @@
     ECEPartitioner,
     LeidenPartitioner,
 )
-from .reader import CsvReader, JsonlReader, JsonReader, TxtReader
+from .reader import CSVReader, JSONLReader, JSONReader, PDFReader, TXTReader
 from .search.db.uniprot_search import UniProtSearch
 from .search.kg.wiki_search import WikiSearch
 from .search.web.bing_search import BingSearch

graphgen/models/reader/__init__.py

Lines changed: 5 additions & 4 deletions

@@ -1,4 +1,5 @@
-from .csv_reader import CsvReader
-from .json_reader import JsonReader
-from .jsonl_reader import JsonlReader
-from .txt_reader import TxtReader
+from .csv_reader import CSVReader
+from .json_reader import JSONReader
+from .jsonl_reader import JSONLReader
+from .pdf_reader import PDFReader
+from .txt_reader import TXTReader
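
The new pdf_reader module itself is outside this excerpt; a minimal sketch of a PDFReader matching the BaseReader interface shown in csv_reader.py, assuming the pypdf package (the real implementation may differ, e.g. by also extracting page images for VQA):

# Illustrative sketch; not the graphgen implementation.
from typing import Any, Dict, List

from pypdf import PdfReader  # assumed third-party dependency

from graphgen.bases.base_reader import BaseReader


class PDFReader(BaseReader):
    def read(self, file_path: str) -> List[Dict[str, Any]]:
        reader = PdfReader(file_path)
        docs: List[Dict[str, Any]] = []
        for page_number, page in enumerate(reader.pages, start=1):
            text = (page.extract_text() or "").strip()
            if text:
                # Record schema here is illustrative; it follows no fixed graphgen contract.
                docs.append({"content": text, "page": page_number})
        return docs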

graphgen/models/reader/csv_reader.py

Lines changed: 1 addition & 1 deletion

@@ -5,7 +5,7 @@
 from graphgen.bases.base_reader import BaseReader
 
 
-class CsvReader(BaseReader):
+class CSVReader(BaseReader):
     def read(self, file_path: str) -> List[Dict[str, Any]]:
 
         df = pd.read_csv(file_path)
