Skip to content

Commit f6e3505

Browse files
gwarmstrong, acanaveras, cursoragent, and rajeshwarid179
authored
Add noc reasoning tutorial (#1278)
Signed-off-by: Amparo Canaveras <acanaveras@nvidia.com>
Signed-off-by: rajeshwarid179 <rdevaramani@nvidia.com>
Signed-off-by: acanaveras <142839082+acanaveras@users.noreply.github.com>
Signed-off-by: George Armstrong <georgea@nvidia.com>
Co-authored-by: Amparo Canaveras <acanaveras@nvidia.com>
Co-authored-by: Cursor <cursoragent@cursor.com>
Co-authored-by: acanaveras <142839082+acanaveras@users.noreply.github.com>
Co-authored-by: rajeshwarid179 <rdevaramani@nvidia.com>
1 parent fc2072a commit f6e3505

30 files changed

Lines changed: 10470 additions & 0 deletions

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,7 @@ AGENTS.md
50  50      .claude
51  51      .cursor
52  52      .idea
    53  +   site/
53  54
54  55      #scripts at root level
55  56      /*.sh

docs/tutorials/posts/noc-reasoning-agent.md

Lines changed: 605 additions & 0 deletions
Large diffs are not rendered by default.
Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
 1  +   [download]
 2  +   qwen2.5-14=/workspace/models/Qwen2.5-14B-Instruct
 3  +   qwen2.5-32=/workspace/models/Qwen2.5-32B-Instruct
 4  +   gpt-oss-120b=/workspace/models/gpt-oss-120b
 5  +   nemotron-49b-1.5=/workspace/models/Llama-3_3-Nemotron-Super-49B-v1_5
 6  +
 7  +
 8  +   [data_path]
 9  +   original_data_path=data/anonymized-Incidents_Last_6_Months.csv
10  +   incident_json_data=outputs/input_incident.jsonl
Lines changed: 150 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,150 @@
  1  +   processors_to_run: all
  2  +
  3  +   output_path: ???
  4  +   # prompt_config: null
  5  +   preprocessed_dataset_files: null
  6  +   input_files: null
  7  +
  8  +
  9  +   # --- Data Keys ---
 10  +   # These keys MUST match the output of your preprocessing script.
 11  +   input_key: "background"
 12  +   output_key: "response"
 13  +
 14  +   # --- SFT Formatting ---
 15  +   # Define the tokenizer and the final chat format for the model.
 16  +   tokenizer: "Qwen/Qwen3-32B" # EDIT THIS or override via CLI
 17  +
 18  +   # This uses a separate YAML file to define the chat template.
 19  +   # This makes the configuration cleaner and more reusable.
 20  +   prompt_config: "/workspace/data/prompt_incident.yaml" # EDIT THIS or override via CLI
 21  +
 22  +   # -----------------
 23  +   # --- General Settings ---
 24  +   # -----------------
 25  +   do_shuffle: false
 26  +   deduplicate: true
 27  +   exclude_optional_keys: true
 28  +   random_seed: 42
 29  +   num_output_samples: null
 30  +   add_correct: true
 31  +   add_incorrect: false
 32  +   add_unlabeled: true # Set to true as requested
 33  +
 34  +   # -----------------
 35  +   # --- Quality Control Filters ---
 36  +   # -----------------
 37  +   # Most filters are disabled because the data is text-based reasoning, not math or code.
 38  +   # This prevents the pipeline from incorrectly discarding valid data.
 39  +
 40  +
 41  +   contamination_file: null
 42  +
 43  +   filters:
 44  +     drop_multi_boxed: false
 45  +     remove_contaminated: false # can be enabled if you have a contamination file
 46  +     majority_filter: false
 47  +     trim_solutions: false # Your data doesn't use the \boxed{} syntax
 48  +     trim_prefix: false
 49  +     drop_incorrect_arithmetic: false
 50  +     split_arithmetic: false
 51  +     remove_len_outlier_problems: false # Keep all data regardless of length
 52  +     remove_len_outlier_solutions: false
 53  +     code_text_filter: null
 54  +     remove_code_errors: false
 55  +     remove_verification_code: false
 56  +     remove_matplotlib: false
 57  +     remove_no_code: false
 58  +     remove_no_think_tags: false # Enabled, as requested
 59  +
 60  +   # ================================================================================= #
 61  +   #                Processor Pipeline (Usually does not need to be changed)            #
 62  +   # ================================================================================= #
 63  +   processors:
 64  +     - _target_: nemo_skills.training.data_preparation_utils.preprocessing.ReadData
 65  +       input_files: ${input_files} # This line ensures the processor gets the input file path
 66  +       input_key: ${input_key}
 67  +       output_key: ${output_key}
 68  +       add_unlabeled: ${add_unlabeled}
 69  +       deduplicate: ${deduplicate}
 70  +       keys_to_keep:
 71  +         - "expected_answer"
 72  +         - "incident_identifier"
 73  +         - "incident_classification"
 74  +         - "urgency_level"
 75  +         - "geographical_territory"
 76  +         - "incident_subtype"
 77  +         - "service_domain"
 78  +         - "equipment_provider"
 79  +         - "operational_zone"
 80  +         - "affected_site"
 81  +         - "incident_summary"
 82  +         - "detection_timestamp"
 83  +         - "escalation_date"
 84  +         - "responsible_team"
 85  +         - "fault_category"
 86  +         - "action_chronicle"
 87  +         - "resolution_summary"
 88  +         - "resolution_method"
 89  +         - "problem_code_reasoning_process"
 90  +
 91  +     - _target_: nemo_skills.training.data_preparation_utils.merge_processor.MergeProcessor
 92  +       _recursive_: false
 93  +       processor_configs:
 94  +         - _target_: nemo_skills.training.data_preparation_utils.filters.RemoveContaminated
 95  +           should_run: ${filters.remove_contaminated}
 96  +           contamination_file: ${contamination_file}
 97  +
 98  +         - _target_: nemo_skills.training.data_preparation_utils.filters.DropIfRegexMatch #removing errors
 99  +           should_run: ${filters.remove_code_errors}
100  +           text_key: ${output_key}
101  +           regex_patterns:
102  +             - 'Traceback (most recent call last)'
103  +             - '<output cut>'
104  +             - 'Timed out'
105  +             - 'SyntaxError'
106  +           test_cases:
107  +             - { input: { generation: "My solution:\n---Traceback (most recent call last)---\nSomething else" }, output: null }
108  +             - { input: { generation: "My solution:\nSome long output<output cut>\nSomething else" }, output: null }
109  +             - { input: { generation: "My solution:\nTimed out\nSomething else" }, output: null }
110  +             - { input: { generation: "My solution:\n[0;31mSyntaxError\u001b\nSomething else" }, output: null }
111  +             - { input: { generation: "My solution, no errors" }, output: { generation: "My solution, no errors" } }
112  +
113  +         - _target_: nemo_skills.training.data_preparation_utils.filters.DropIfRegexNotMatch # filtering out tool calling
114  +           should_run: ${filters.remove_no_code}
115  +           text_key: ${output_key}
116  +           regex_patterns:
117  +             - '<tool_call>'
118  +             - '</tool_call>'
119  +           test_cases:
120  +             - { input: { generation: "My solution:\n---<tool_call>---\nSomething else" }, output: null }
121  +             - { input: { generation: "My solution:\ncode</tool_call>\nSomething else" }, output: null }
122  +             - { input: { generation: "<tool_call>code</tool_call>" }, output: { generation: "<tool_call>code</tool_call>" } }
123  +
124  +         - _target_: nemo_skills.training.data_preparation_utils.filters.DropIfRegexNotMatch # filtering out if no think tags
125  +           should_run: ${filters.remove_no_think_tags}
126  +           text_key: ${output_key}
127  +           regex_patterns:
128  +             - '</think>'
129  +           test_cases:
130  +             - { input: { generation: "My solution:\n---</think>---\nSomething else" }, output: { generation: "My solution:\n---</think>---\nSomething else" } }
131  +             - { input: { generation: "<think>My solution:\n\nSomething else" }, output: null }
132  +             - { input: { generation: "<think>thinking</think>summary" }, output: { generation: "<think>thinking</think>summary" } }
133  +
134  +
135  +
136  +     - _target_: nemo_skills.training.data_preparation_utils.preprocessing.GroupSamples
137  +       group_key: ${input_key}
138  +
139  +     - _target_: nemo_skills.training.data_preparation_utils.preprocessing.ShuffleAndDownsampleData
140  +       num_samples: ${num_output_samples}
141  +       random_seed: ${random_seed}
142  +       do_shuffle: ${do_shuffle}
143  +
144  +     - _target_: nemo_skills.training.data_preparation_utils.preprocessing.WriteFinalSftManifest
145  +       output_manifest_file: ${output_path}
146  +       prompt_config: ${prompt_config}
147  +       tokenizer: ${tokenizer}
148  +       input_key: ${input_key}
149  +       output_key: ${output_key}
150  +       exclude_optional_keys: ${exclude_optional_keys}

0 commit comments

Comments (0)