|
| 1 | +processors_to_run: all |
| 2 | + |
| 3 | +output_path: ??? |
| 4 | +# prompt_config: null |
| 5 | +preprocessed_dataset_files: null |
| 6 | +input_files: null |
| 7 | + |
| 8 | + |
| 9 | +# --- Data Keys --- |
| 10 | +# These keys MUST match the output of your preprocessing script. |
| 11 | +input_key: "background" |
| 12 | +output_key: "response" |
| 13 | + |
| 14 | +# --- SFT Formatting --- |
| 15 | +# Define the tokenizer and the final chat format for the model. |
| 16 | +tokenizer: "Qwen/Qwen3-32B" # EDIT THIS or override via CLI |
| 17 | + |
| 18 | +# This uses a separate YAML file to define the chat template. |
| 19 | +# This makes the configuration cleaner and more reusable. |
| 20 | +prompt_config: "/workspace/data/prompt_incident.yaml" # EDIT THIS or override via CLI |
| 21 | + |
| 22 | +# ----------------- |
| 23 | +# --- General Settings --- |
| 24 | +# ----------------- |
| 25 | +do_shuffle: false |
| 26 | +deduplicate: true |
| 27 | +exclude_optional_keys: true |
| 28 | +random_seed: 42 |
| 29 | +num_output_samples: null |
| 30 | +add_correct: true |
| 31 | +add_incorrect: false |
| 32 | +add_unlabeled: true # Set to true as requested |
| 33 | + |
| 34 | +# ----------------- |
| 35 | +# --- Quality Control Filters --- |
| 36 | +# ----------------- |
| 37 | +# All filters below are disabled because the data is text-based reasoning, not math or code. |
| 38 | +# This prevents the pipeline from incorrectly discarding valid data. |
| 39 | + |
| 40 | + |
| 41 | +contamination_file: null |
| 42 | + |
| 43 | +filters: |
| 44 | + drop_multi_boxed: false |
| 45 | + remove_contaminated: false # can be enabled if you have a contamination file |
| 46 | + majority_filter: false |
| 47 | + trim_solutions: false # Your data doesn't use the \boxed{} syntax |
| 48 | + trim_prefix: false |
| 49 | + drop_incorrect_arithmetic: false |
| 50 | + split_arithmetic: false |
| 51 | + remove_len_outlier_problems: false # Keep all data regardless of length |
| 52 | + remove_len_outlier_solutions: false |
| 53 | + code_text_filter: null |
| 54 | + remove_code_errors: false |
| 55 | + remove_verification_code: false |
| 56 | + remove_matplotlib: false |
| 57 | + remove_no_code: false |
| 58 | + remove_no_think_tags: false # Disabled; set to true to drop samples whose output has no </think> tag |
| 59 | + |
| 60 | +# ================================================================================= # |
| 61 | +# Processor Pipeline (Usually does not need to be changed) # |
| 62 | +# ================================================================================= # |
| 63 | +processors: |
| 64 | + - _target_: nemo_skills.training.data_preparation_utils.preprocessing.ReadData |
| 65 | + input_files: ${input_files} # This line ensures the processor gets the input file path |
| 66 | + input_key: ${input_key} |
| 67 | + output_key: ${output_key} |
| 68 | + add_unlabeled: ${add_unlabeled} |
| 69 | + deduplicate: ${deduplicate} |
| 70 | + keys_to_keep: |
| 71 | + - "expected_answer" |
| 72 | + - "incident_identifier" |
| 73 | + - "incident_classification" |
| 74 | + - "urgency_level" |
| 75 | + - "geographical_territory" |
| 76 | + - "incident_subtype" |
| 77 | + - "service_domain" |
| 78 | + - "equipment_provider" |
| 79 | + - "operational_zone" |
| 80 | + - "affected_site" |
| 81 | + - "incident_summary" |
| 82 | + - "detection_timestamp" |
| 83 | + - "escalation_date" |
| 84 | + - "responsible_team" |
| 85 | + - "fault_category" |
| 86 | + - "action_chronicle" |
| 87 | + - "resolution_summary" |
| 88 | + - "resolution_method" |
| 89 | + - "problem_code_reasoning_process" |
| 90 | + |
| 91 | + - _target_: nemo_skills.training.data_preparation_utils.merge_processor.MergeProcessor |
| 92 | + _recursive_: false |
| 93 | + processor_configs: |
| 94 | + - _target_: nemo_skills.training.data_preparation_utils.filters.RemoveContaminated |
| 95 | + should_run: ${filters.remove_contaminated} |
| 96 | + contamination_file: ${contamination_file} |
| 97 | + |
| 98 | + - _target_: nemo_skills.training.data_preparation_utils.filters.DropIfRegexMatch #removing errors |
| 99 | + should_run: ${filters.remove_code_errors} |
| 100 | + text_key: ${output_key} |
| 101 | + regex_patterns: |
| 102 | + - 'Traceback (most recent call last)' |
| 103 | + - '<output cut>' |
| 104 | + - 'Timed out' |
| 105 | + - 'SyntaxError' |
| 106 | + test_cases: |
| 107 | + - { input: { generation: "My solution:\n---Traceback (most recent call last)---\nSomething else" }, output: null } |
| 108 | + - { input: { generation: "My solution:\nSome long output<output cut>\nSomething else" }, output: null } |
| 109 | + - { input: { generation: "My solution:\nTimed out\nSomething else" }, output: null } |
| 110 | + - { input: { generation: "My solution:\n[0;31mSyntaxError\u001b\nSomething else" }, output: null } |
| 111 | + - { input: { generation: "My solution, no errors" }, output: { generation: "My solution, no errors" } } |
| 112 | + |
| 113 | + - _target_: nemo_skills.training.data_preparation_utils.filters.DropIfRegexNotMatch # keep only samples containing both <tool_call> and </tool_call> |
| 114 | + should_run: ${filters.remove_no_code} |
| 115 | + text_key: ${output_key} |
| 116 | + regex_patterns: |
| 117 | + - '<tool_call>' |
| 118 | + - '</tool_call>' |
| 119 | + test_cases: |
| 120 | + - { input: { generation: "My solution:\n---<tool_call>---\nSomething else" }, output: null } |
| 121 | + - { input: { generation: "My solution:\ncode</tool_call>\nSomething else" }, output: null } |
| 122 | + - { input: { generation: "<tool_call>code</tool_call>" }, output: { generation: "<tool_call>code</tool_call>" } } |
| 123 | + |
| 124 | + - _target_: nemo_skills.training.data_preparation_utils.filters.DropIfRegexNotMatch # filtering out if no think tags |
| 125 | + should_run: ${filters.remove_no_think_tags} |
| 126 | + text_key: ${output_key} |
| 127 | + regex_patterns: |
| 128 | + - '</think>' |
| 129 | + test_cases: |
| 130 | + - { input: { generation: "My solution:\n---</think>---\nSomething else" }, output: { generation: "My solution:\n---</think>---\nSomething else" } } |
| 131 | + - { input: { generation: "<think>My solution:\n\nSomething else" }, output: null } |
| 132 | + - { input: { generation: "<think>thinking</think>summary" }, output: { generation: "<think>thinking</think>summary" } } |
| 133 | + |
| 134 | + |
| 135 | + |
| 136 | + - _target_: nemo_skills.training.data_preparation_utils.preprocessing.GroupSamples |
| 137 | + group_key: ${input_key} |
| 138 | + |
| 139 | + - _target_: nemo_skills.training.data_preparation_utils.preprocessing.ShuffleAndDownsampleData |
| 140 | + num_samples: ${num_output_samples} |
| 141 | + random_seed: ${random_seed} |
| 142 | + do_shuffle: ${do_shuffle} |
| 143 | + |
| 144 | + - _target_: nemo_skills.training.data_preparation_utils.preprocessing.WriteFinalSftManifest |
| 145 | + output_manifest_file: ${output_path} |
| 146 | + prompt_config: ${prompt_config} |
| 147 | + tokenizer: ${tokenizer} |
| 148 | + input_key: ${input_key} |
| 149 | + output_key: ${output_key} |
| 150 | + exclude_optional_keys: ${exclude_optional_keys} |
0 commit comments