# Source: NVIDIA-NeMo/Curator (main branch) - tutorials/audio/alm/pipeline.yaml
#
# ALM (Audio Language Model) Pipeline Configuration
#
# This config processes audio manifests to create training windows
# for Audio Language Models.
#
# Usage (from Curator repo root):
#   python tutorials/audio/alm/main.py \
#     --config-path . \
#     --config-name pipeline \
#     manifest_path=tests/fixtures/audio/alm/sample_input.jsonl
#
#   # Override values from command line
#   python tutorials/audio/alm/main.py \
#     --config-path . \
#     --config-name pipeline \
#     manifest_path=/data/input.jsonl \
#     stages.1.min_speakers=3 \
#     stages.2.overlap_percentage=30

# Hydra defaults list. `_self_` positions this file's own values in the
# composition order; the two `override` entries select the `none` option of
# Hydra's job_logging and hydra_logging config groups.
# NOTE(review): `none` presumably means Hydra leaves logging unconfigured so
# the application controls it - confirm against the Hydra logging docs.
defaults:
  - _self_
  - override hydra/job_logging: none
  - override hydra/hydra_logging: none

# Hydra runtime settings.
# NOTE(review): per the Hydra docs, `run.dir` is the run's output directory and
# a null `output_subdir` suppresses the `.hydra` config dump, so together these
# presumably keep Hydra from creating extra directories - confirm.
hydra:
  output_subdir: null
  run:
    dir: "."

# Human-readable description of this pipeline, one numbered entry per stage
# configured under `stages` below.
# NOTE(review): `${output_dir}` inside this text is presumably resolved as an
# OmegaConf interpolation when the value is read - confirm.
documentation: |
  ALM Data Pipeline
  #################
  This config processes audio manifests to create training windows
  for Audio Language Model training.

  It performs the following data processing:

  0. ALM Manifest Reader: Reads JSONL manifest on the worker (not the driver)

  1. ALM Data Builder: Creates training windows from audio segments
     - Filters by sample rate, bandwidth, speaker count
     - Creates windows of target duration (120s ± 10%)

  2. ALM Data Overlap: Filters overlapping windows
     - Removes windows with high overlap
     - Keeps windows closest to target duration

  3. ALM Manifest Writer: Writes output entries to a JSONL manifest

  **Required arguments**:

  * **manifest_path**: Path to input JSONL manifest with audio segments

  **Output format**:

  This config generates output manifest at ``${output_dir}/alm_output.jsonl``

  Output manifest contains the following keys:

  * **audio_filepath (str)**: Path to the audio file
  * **windows (list)**: Training windows with segments and speaker durations
  * **filtered_windows (list)**: Windows after overlap filtering
  * **filtered_dur (float)**: Total duration of filtered windows
  * **stats (dict)**: Processing statistics

# Input JSONL manifest. This value is required: `???` leaves it unset, so it
# has to be supplied, e.g. `manifest_path=/data/input.jsonl` on the command line.
manifest_path: ???

# Directory that receives the pipeline results.
output_dir: "./alm_output"

# Execution backend: either "xenna" (the default) or "ray_data".
backend: "xenna"

# Stage chain definition
stages:
  # Stage 0 - ALMManifestReader (a CompositeStage).
  # It decomposes into FilePartitioningStage + ALMManifestReaderStage.
  # The JSONL manifest is read line by line (no Pandas), and every entry
  # produces one AudioTask.
  - _target_: nemo_curator.stages.audio.alm.ALMManifestReader
    files_per_partition: 1
    manifest_path: "${manifest_path}"

  # Stage 1 - ALMDataBuilderStage.
  # Builds training windows out of the audio segments.
  - _target_: nemo_curator.stages.audio.alm.ALMDataBuilderStage

    # Accepted window length is target ± (target * tolerance);
    # with the values below that is 120 ± 12, i.e. 108-132 seconds.
    tolerance: 0.1
    target_window_duration: 120.0

    # Speaker constraints
    min_speakers: 2
    max_speakers: 5

    # Audio quality requirements
    min_bandwidth: 8000
    min_sample_rate: 16000

    # Truncation behavior
    truncation: true

    # Fields to drop, each given as a comma-separated string
    drop_fields_top_level: "words,segments"
    drop_fields: "words"

  # Stage 2 - ALMDataOverlapStage.
  # Filters windows according to the overlap threshold set here.
  - _target_: nemo_curator.stages.audio.alm.ALMDataOverlapStage

    # NOTE(review): same value as stage 1's `target_window_duration`;
    # presumably the two should be kept in sync - confirm.
    target_duration: 120.0

    # Overlap threshold: 0 filters most aggressively, 100 most permissively.
    overlap_percentage: 50

  # Stage 3 - ALMManifestWriterStage.
  # Writes the output entries to the JSONL manifest at `output_path`.
  - _target_: nemo_curator.stages.audio.alm.ALMManifestWriterStage
    output_path: "${output_dir}/alm_output.jsonl"