-
Notifications
You must be signed in to change notification settings - Fork 258
Expand file tree
/
Copy pathpipeline.yaml
More file actions
117 lines (91 loc) · 3.43 KB
/
pipeline.yaml
File metadata and controls
117 lines (91 loc) · 3.43 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
# ALM (Audio Language Model) Pipeline Configuration
#
# This config processes audio manifests to create training windows
# for Audio Language Models.
#
# Usage (from Curator repo root):
#   python tutorials/audio/alm/main.py \
#     --config-path . \
#     --config-name pipeline \
#     manifest_path=tests/fixtures/audio/alm/sample_input.jsonl
#
# Override values from the command line:
#   python tutorials/audio/alm/main.py \
#     --config-path . \
#     --config-name pipeline \
#     manifest_path=/data/input.jsonl \
#     stages.1.min_speakers=3 \
#     stages.2.overlap_percentage=30
# Hydra defaults list: this file's own values (_self_) come first, followed
# by overrides that set Hydra's job and internal logging config groups to
# "none".
defaults:
  - _self_
  - override hydra/job_logging: none
  - override hydra/hydra_logging: none
# Hydra runtime settings.
#   run.dir: "."          -> use the current directory as the run directory
#   output_subdir: null   -> no Hydra output subdirectory is configured
# Both keys must be nested under `hydra`; `dir` belongs to `run`, while
# `output_subdir` is a direct child of `hydra`.
hydra:
  run:
    dir: .
  output_subdir: null
# Human-readable description of the pipeline, stored as a literal block
# scalar. Every body line must be indented deeper than the key; otherwise the
# scalar ends immediately and lines such as "#################" are read as
# YAML comments instead of text.
documentation: |
  ALM Data Pipeline
  #################
  This config processes audio manifests to create training windows
  for Audio Language Model training.
  It performs the following data processing:
  0. ALM Manifest Reader: Reads JSONL manifest on the worker (not the driver)
  1. ALM Data Builder: Creates training windows from audio segments
  - Filters by sample rate, bandwidth, speaker count
  - Creates windows of target duration (120s ± 10%)
  2. ALM Data Overlap: Filters overlapping windows
  - Removes windows with high overlap
  - Keeps windows closest to target duration
  **Required arguments**:
  * **manifest_path**: Path to input JSONL manifest with audio segments
  **Output format**:
  This config generates output manifest at ``${output_dir}/alm_output.jsonl``
  Output manifest contains the following keys:
  * **audio_filepath (str)**: Path to the audio file
  * **windows (list)**: Training windows with segments and speaker durations
  * **filtered_windows (list)**: Windows after overlap filtering
  * **filtered_dur (float)**: Total duration of filtered windows
  * **stats (dict)**: Processing statistics
# Input JSONL manifest. Required: "???" is the OmegaConf mandatory-value
# marker, so a path has to be supplied (e.g. manifest_path=... on the CLI).
manifest_path: ???

# Directory that receives the pipeline results.
output_dir: "./alm_output"

# Execution backend; either "xenna" (the default) or "ray_data".
backend: "xenna"
# Stage chain definition.
# Each list item is one stage: `_target_` names the class to instantiate and
# the sibling keys nested under it are that stage's parameters. Items are
# addressed by index from the command line (e.g. stages.1.min_speakers=3),
# so the order below must be preserved.
stages:
  # Stage 0: ALM Manifest Reader (CompositeStage)
  # Decomposes into FilePartitioningStage + ALMManifestReaderStage
  # Reads JSONL line-by-line (no Pandas), produces one AudioTask per entry
  - _target_: nemo_curator.stages.audio.alm.ALMManifestReader
    manifest_path: ${manifest_path}
    files_per_partition: 1

  # Stage 1: ALM Data Builder
  # Creates training windows from audio segments
  - _target_: nemo_curator.stages.audio.alm.ALMDataBuilderStage
    # Window duration: target ± (target * tolerance)
    # e.g., 120 ± 12 = 108-132 seconds
    target_window_duration: 120.0
    tolerance: 0.1
    # Audio quality requirements
    min_sample_rate: 16000
    min_bandwidth: 8000
    # Speaker constraints
    min_speakers: 2
    max_speakers: 5
    # Truncation behavior
    truncation: true
    # Fields to drop (comma-separated strings)
    drop_fields: "words"
    drop_fields_top_level: "words,segments"

  # Stage 2: ALM Data Overlap Filter
  # Filters windows based on overlap threshold
  - _target_: nemo_curator.stages.audio.alm.ALMDataOverlapStage
    # Overlap filtering (0 = aggressive, 100 = permissive)
    overlap_percentage: 50
    # NOTE(review): presumably should match Stage 1's target_window_duration
    # (both are 120.0 here) — confirm before changing one without the other.
    target_duration: 120.0

  # Stage 3: ALM Manifest Writer
  # Writes output entries to a JSONL manifest
  - _target_: nemo_curator.stages.audio.alm.ALMManifestWriterStage
    output_path: ${output_dir}/alm_output.jsonl