feat: add schema_guided_extraction config

ChenZiHong-Gavin · ChenZiHong-Gavin · commit c5407b5a72fd · 2025-11-06T11:59:26.000+08:00
diff --git a/graphgen/configs/schema_guided_config.yaml b/graphgen/configs/schema_guided_config.yaml
@@ -1,22 +1,8 @@
 read:
   input_file: resources/input_examples/jsonl_demo.jsonl # input file path, support json, jsonl, txt, pdf. See resources/input_examples for examples
 split:
-  chunk_size: 1024 # chunk size for text splitting
+  chunk_size: 10240 # chunk size for text splitting
   chunk_overlap: 100 # chunk overlap for text splitting
-search: # web search configuration
-  enabled: false # whether to enable web search
-  search_types: ["google"] # search engine types, support: google, bing, uniprot, wikipedia
-quiz_and_judge: # quiz and test whether the LLM masters the knowledge points
-  enabled: true
-  quiz_samples: 2 # number of quiz samples to generate
-  re_judge: false # whether to re-judge the existing quiz samples
-partition: # graph partition configuration
-  method: ece # ece is a custom partition method based on comprehension loss
-  method_params:
-    max_units_per_community: 20 # max nodes and edges per community
-    min_units_per_community: 5 # min nodes and edges per community
-    max_tokens_per_community: 10240 # max tokens per community
-    unit_sampling: max_loss # unit sampling strategy, support: random, max_loss, min_loss
-generate:
-  mode: aggregated # atomic, aggregated, multi_hop, cot, vqa
-  data_format: ChatML # Alpaca, Sharegpt, ChatML
+extract:
+  method: schema_guided # extraction method; supported: schema_guided
+  schema_file: resources/schemas/legal_contract.json # path to the schema file used by the schema_guided method