|
1 | 1 | read: |
2 | 2 | input_file: resources/input_examples/jsonl_demo.jsonl # input file path; supports json, jsonl, txt, and pdf. See resources/input_examples for examples |
3 | 3 | split: |
4 | | - chunk_size: 1024 # chunk size for text splitting |
| 4 | + chunk_size: 10240 # chunk size for text splitting |
5 | 5 | chunk_overlap: 100 # chunk overlap for text splitting |
6 | | -search: # web search configuration |
7 | | - enabled: false # whether to enable web search |
8 | | - search_types: ["google"] # search engine types, support: google, bing, uniprot, wikipedia |
9 | | -quiz_and_judge: # quiz and test whether the LLM masters the knowledge points |
10 | | - enabled: true |
11 | | - quiz_samples: 2 # number of quiz samples to generate |
12 | | - re_judge: false # whether to re-judge the existing quiz samples |
13 | | -partition: # graph partition configuration |
14 | | - method: ece # ece is a custom partition method based on comprehension loss |
15 | | - method_params: |
16 | | - max_units_per_community: 20 # max nodes and edges per community |
17 | | - min_units_per_community: 5 # min nodes and edges per community |
18 | | - max_tokens_per_community: 10240 # max tokens per community |
19 | | - unit_sampling: max_loss # unit sampling strategy, support: random, max_loss, min_loss |
20 | | -generate: |
21 | | - mode: aggregated # atomic, aggregated, multi_hop, cot, vqa |
22 | | - data_format: ChatML # Alpaca, Sharegpt, ChatML |
| 6 | +extract: |
| 7 | + method: schema_guided # extraction method; supported: schema_guided |
| 8 | + schema_file: resources/schemas/legal_contract.json # schema file path for schema_guided method |
0 commit comments