NVIDIA-NeMo · wprazuch · Nov 13, 2025 · Dec 18, 2025 · Dec 18, 2025 · Dec 20, 2025
diff --git a/cluster_configs/s2s_eval_oci_iad.yaml b/cluster_configs/s2s_eval_oci_iad.yaml
@@ -0,0 +1,52 @@
+executor: slurm
+
+ssh_tunnel:
+  host: draco-oci-login-01.draco-oci-iad.nvidia.com
+  # ------------------------------- Fill this in! -------------------------------
+  user: mmkrtchyan
+  job_dir: /lustre/fsw/portfolios/llmservice/users/mmkrtchyan/workspace/code/nemo-run
+  identity: ""
+  # -----------------------------------------------------------------------------
+
+# If you're running directly from the cluster, you only need to define job_dir and shouldn't use ssh_tunnel.
+# job_dir: <some location on slurm cluster to keep job metadata, uploaded code and generated sbatch files>
+
+account: llmservice_nemo_speechlm
+# account: llmservice_nemo_reasoning
+partition: batch_block1,batch_block3,batch_block4
+cpu_partition: cpu
+job_name_prefix: ""
+
+containers:
+  trtllm: /lustre/fsw/portfolios/llmservice/users/igitman/llm/images/nemo-skills-trtllm-0.7.0.sqsh
+  vllm: /lustre/fsw/portfolios/llmservice/users/igitman/llm/images/nemo-skills-vllm-0.7.0.sqsh
+  sglang: /lustre/fsw/portfolios/llmservice/users/igitman/llm/images/nemo-skills-sglang-0.7.0.sqsh
+  nemo: /lustre/fsw/portfolios/llmservice/users/igitman/llm/images/nemo-skills-nemo-0.7.0.sqsh
+  nemo-rl: /lustre/fsw/portfolios/llmservice/users/igitman/llm/images/nemo-skills-nemo-rl-0.7.0.sqsh
+  megatron: /lustre/fsw/portfolios/llmservice/users/igitman/llm/images/nemo-skills-megatron-0.7.0.sqsh
+  sandbox: /lustre/fsw/portfolios/llmservice/users/igitman/llm/images/nemo-skills-sandbox-0.7.1.sqsh
+  nemo-skills: /lustre/fsw/portfolios/llmservice/users/igitman/llm/images/nemo-skills-0.7.0.sqsh
+  verl: /lustre/fsw/portfolios/llmservice/users/igitman/llm/images/nemo-skills-verl-0.7.0.sqsh
+
+mounts:
+  - /lustre:/lustre
+  - /lustre/fs12/portfolios/llmservice/projects/llmservice_nemo_speechlm/data/nemo_skills/dataset:/dataset
+
+required_env_vars:
+  - NV_INFERENCE_KEY
+
+env_vars:
+  # ------------------------------- Fill this in! -------------------------------
+  - HF_HOME=/lustre/fsw/portfolios/llmservice/users/mmkrtchyan/.cache/huggingface
+  - AUDIO_SAVE_DIR=/lustre/fsw/portfolios/convai/users/mmkrtchyan/projects/speechLM/s2s/fullduplexbench_audio_outputs
+  - PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
+  # -----------------------------------------------------------------------------
+
+timeouts:
+  batch_block1,batch_block3,batch_block4: 04:00:00
+  interactive: 04:00:00
+  interactive_singlenode: 04:00:00
+  cpu: 04:00:00
+
+mail_type: FAIL
+mail_user: # <your email goes here>
diff --git a/nemo_skills/dataset/asr-leaderboard/prepare.py b/nemo_skills/dataset/asr-leaderboard/prepare.py
@@ -14,12 +14,17 @@
 
 """Prepare ASR Leaderboard datasets for evaluation.
 
-Downloads and formats datasets from the HuggingFace Open ASR Leaderboard.
+Downloads and formats datasets from the official HF Open ASR Leaderboard ESB
+test-only sorted dataset (hf-audio/esb-datasets-test-only-sorted). This is the
+same data source used by the official leaderboard and the offline NeMo eval
+pipeline, ensuring apples-to-apples WER comparison.
+
 Audio paths in JSONL: /dataset/asr-leaderboard/data/{dataset}/{sample_id}.flac
 
 Usage:
     ns prepare_data asr-leaderboard
     ns prepare_data asr-leaderboard --datasets librispeech_clean ami
+    ns prepare_data asr-leaderboard --datasets earnings22
     ns prepare_data asr-leaderboard --no-audio  # skip saving audio files
 """
 
@@ -34,29 +39,22 @@
 SYSTEM_MESSAGE = "You are a helpful assistant. /no_think"
 MIN_AUDIO_DURATION = 0.1  # Skip audio shorter than this (causes mel spectrogram errors)
 
-# (hf_dataset, hf_config, hf_split, streaming)
+# (hf_repo, config, split, text_field, id_field)
 DATASET_CONFIGS = {
-    "librispeech_clean": ("librispeech_asr", "clean", "test", False),
-    "librispeech_other": ("librispeech_asr", "other", "test", False),
-    "voxpopuli": ("facebook/voxpopuli", "en", "test", False),
-    "tedlium": ("LIUM/tedlium", "release3", "test", False),
-    "gigaspeech": ("speechcolab/gigaspeech", "xs", "test", False),
-    "spgispeech": ("kensho/spgispeech", "test", "test", True),  # streaming to avoid timeout due to large metadata
-    "earnings22": ("distil-whisper/earnings22", "chunked", "test", False),
-    "ami": ("edinburghcstr/ami", "ihm", "test", False),
+    "librispeech_clean": ("hf-audio/esb-datasets-test-only-sorted", "librispeech", "test.clean", "text", "id"),
+    "librispeech_other": ("hf-audio/esb-datasets-test-only-sorted", "librispeech", "test.other", "text", "id"),
+    "voxpopuli": ("hf-audio/esb-datasets-test-only-sorted", "voxpopuli", "test", "text", "id"),
+    "tedlium": ("hf-audio/esb-datasets-test-only-sorted", "tedlium", "test", "text", "id"),
+    "gigaspeech": ("hf-audio/esb-datasets-test-only-sorted", "gigaspeech", "test", "text", "id"),
+    "spgispeech": ("hf-audio/esb-datasets-test-only-sorted", "spgispeech", "test", "text", "id"),
+    "earnings22": ("hf-audio/esb-datasets-test-only-sorted", "earnings22", "test", "text", "id"),
+    "ami": ("hf-audio/esb-datasets-test-only-sorted", "ami", "test", "text", "id"),
 }
 
 
-def save_audio_and_format_entry(entry, dataset_name, audio_dir, sample_idx, with_audio=True):
+def save_audio_and_format_entry(entry, dataset_name, audio_dir, sample_idx, text_field="text", id_field="id", with_audio=True):
     """Format a dataset entry and optionally save audio file."""
-    # Different datasets use different field names for transcription
-    text = (
-        entry.get("text", "")  # ami, LS, gigaspeech, tedlium
-        or entry.get("normalized_text", "")  # voxpopuli
-        or entry.get("transcript", "")  # spgispeech
-        or entry.get("transcription", "")  # earnings22
-    )
-    text = text.strip() if text else ""
+    text = entry.get(text_field, "").strip()
 
     system_message = {"role": "system", "content": SYSTEM_MESSAGE}
     user_message = {"role": "user", "content": "Transcribe the following audio."}
@@ -70,8 +68,8 @@ def save_audio_and_format_entry(entry, dataset_name, audio_dir, sample_idx, with
         if duration < MIN_AUDIO_DURATION:
             return None
 
-        sample_id = entry.get("id", str(sample_idx))
-        audio_filename = f"{sample_id}.flac"
+        sample_id = str(entry.get(id_field, sample_idx)).replace("/", "_")
+        audio_filename = f"{Path(sample_id).stem}.flac"
 
         if with_audio:
             sf.write(str(audio_dir / audio_filename), audio_array, sampling_rate)
@@ -82,14 +80,14 @@ def save_audio_and_format_entry(entry, dataset_name, audio_dir, sample_idx, with
         }
 
     formatted_entry = {
-        "task_type": "ASR_LEADERBOARD",
+        "task_type": "ASR",
         "expected_answer": text,
         "messages": [system_message, user_message],
         "subset_for_metrics": dataset_name,
     }
 
-    if "id" in entry:
-        formatted_entry["id"] = entry["id"]
+    if id_field in entry:
+        formatted_entry["id"] = entry[id_field]
     if "speaker_id" in entry:
         formatted_entry["speaker_id"] = entry["speaker_id"]
 
@@ -101,14 +99,11 @@ def prepare_dataset(dataset_name, output_dir, with_audio=True):
     if dataset_name not in DATASET_CONFIGS:
         raise ValueError(f"Unknown dataset: {dataset_name}. Available: {list(DATASET_CONFIGS.keys())}")
 
-    hf_dataset, hf_config, hf_split, streaming = DATASET_CONFIGS[dataset_name]
+    hf_repo, hf_config, hf_split, text_field, id_field = DATASET_CONFIGS[dataset_name]
 
-    print(f"Loading {dataset_name} from {hf_dataset} (streaming={streaming})...")
+    print(f"Loading {dataset_name} from {hf_repo} (config={hf_config}, split={hf_split})...")
     try:
-        if hf_config:
-            dataset = load_dataset(hf_dataset, hf_config, split=hf_split, trust_remote_code=True, streaming=streaming)
-        else:
-            dataset = load_dataset(hf_dataset, split=hf_split, trust_remote_code=True, streaming=streaming)
+        dataset = load_dataset(hf_repo, hf_config, split=hf_split, trust_remote_code=True)
     except Exception as e:
         print(f"Warning: Failed to load {dataset_name}: {e}")
         return 0
@@ -120,16 +115,13 @@ def prepare_dataset(dataset_name, output_dir, with_audio=True):
         audio_dir.mkdir(parents=True, exist_ok=True)
         print(f"Saving audio files to {audio_dir}")
 
-    if streaming:
-        print(f"Processing {dataset_name} (streaming)...")
-    else:
-        print(f"Processing {len(dataset)} samples from {dataset_name}...")
+    print(f"Processing {len(dataset)} samples from {dataset_name}...")
 
     count = 0
     skipped = 0
     with open(output_file, "w", encoding="utf-8") as fout:
         for idx, entry in enumerate(tqdm(dataset, desc=dataset_name)):
-            formatted = save_audio_and_format_entry(entry, dataset_name, audio_dir, idx, with_audio=with_audio)
+            formatted = save_audio_and_format_entry(entry, dataset_name, audio_dir, idx, text_field=text_field, id_field=id_field, with_audio=with_audio)
             if formatted is None:
                 skipped += 1
                 continue
@@ -160,7 +152,8 @@ def main():
     )
     args = parser.parse_args()
 
-    output_dir = Path(__file__).parent
+    data_dir = Path("/dataset/asr-leaderboard")
+    output_dir = data_dir if data_dir.exists() else Path(__file__).parent
     output_dir.mkdir(parents=True, exist_ok=True)
 
     with_audio = not args.no_audio

diff --git a/...ills/dataset/asr-leaderboard/scripts/asr_leaderboard_s2s_incremental_v2_02mar_config.yaml b/...ills/dataset/asr-leaderboard/scripts/asr_leaderboard_s2s_incremental_v2_02mar_config.yaml
@@ -0,0 +1,59 @@
+# HF Open ASR Leaderboard evaluation - S2S incremental V2 backend, TEXT output only.
+# Evaluates WER on 8 ASR datasets: librispeech_clean, librispeech_other, voxpopuli,
+# tedlium, gigaspeech, spgispeech, earnings22, ami.
+# Checkpoint: Mar 2 2026 (Feb 26 STT + Mar 3 TTS, Megan speaker, force_turn_taking OFF)
+#
+# Run:
+#   python nemo_skills/dataset/asr-leaderboard/scripts/run_eval.py \
+#     --config nemo_skills/dataset/asr-leaderboard/scripts/asr_leaderboard_s2s_incremental_v2_02mar_config.yaml
+
+# Cluster settings
+cluster: s2s_eval_oci_iad
+partition: batch_block1,batch_block3,batch_block4
+cpu_partition: cpu
+
+model: /lustre/fsw/portfolios/convai/users/ecasanova/Checkpoints/Nemotron-VoiceChat-november/duplex-eartts-10min_sw_et_eos_dp_eos_dup_fp32_1delay_02_March_exp_17_afg_long_FT_Megan_msr_34k_steps-stt-AS9.1_11002_new_branch_load_fixed
+
+server_type: vllm
+server_gpus: 1
+num_chunks: 104
+
+server_entrypoint: "-m nemo_skills.inference.server.serve_unified"
+server_args: >-
+  --backend s2s_incremental_v2
+  --no_decode_audio
+  --use_asr_as_response
+  --ignore_system_prompt
+  --speaker_reference /lustre/fsw/portfolios/convai/users/ecasanova/Checkpoints/Mg_a_00759.wav
+  --num_frames_per_inference 3
+  --engine_type vllm_llm_vllm_eartts
+  --use_perception_cache
+  --use_perception_cudagraph
+  --buffer_size_frames 21
+  --codec_token_history_size 60
+  --repetition_penalty 1.0
+  --matmul_precision medium
+  --vllm_gpu_memory_utilization 0.35
+  --vllm_max_model_len 8192
+  --output_dir /lustre/fsw/portfolios/convai/users/mmkrtchyan/projects/speechLM/s2s/MAR_02/asr_leaderboard_incremental_v2_02mar_artifacts_full_no_system_prompt_3frames_per_inference_21buffer
+  --batch_size 1
+  --code_path /lustre/fsw/portfolios/convai/users/mmkrtchyan/projects/speechLM/s2s/NeMo
+  --pip_install "hf-xet==1.1.9 huggingface-hub==0.34.4 nvidia-modelopt==0.33.1 nvidia-modelopt-core==0.33.1 tokenizers==0.22.0 transformers==4.56.0 lhotse==1.32.2 nv-one-logger-core==2.1.0 nv-one-logger-pytorch-lightning-integration==2.1.0 nv-one-logger-training-telemetry==2.1.0 kaldialign==0.9.1 jiwer whisper-normalizer sacrebleu"
+
+server_container: /lustre/fsw/portfolios/llmservice/users/erastorgueva/code/containers/triton25.05_s2svllm26.02.12.sqsh
+server_server_type: vllm_multimodal
+
+# Benchmark name used by nemo-skills eval pipeline
+benchmark: asr-leaderboard
+
+# Paths -- data_dir must match the container mount in cluster config
+# (cluster mounts /lustre/fs12/.../nemo_skills/dataset -> /dataset)
+data_dir: /dataset
+output_dir: /lustre/fsw/portfolios/convai/users/mmkrtchyan/projects/speechLM/s2s/MAR_02/asr_leaderboard_incremental_v5_02mar_artifacts_full_no_system_prompt_3frames_per_inference_21buffer
+
+installation_command: "pip install jiwer whisper-normalizer sacrebleu"
+
+expname: asr_leaderboard_s2s_incremental_v5_02mar
+generation_only: false
+scoring_only: false
+dry_run: false
diff --git a/nemo_skills/dataset/asr-leaderboard/scripts/asr_leaderboard_s2s_incremental_v2_config.yaml b/nemo_skills/dataset/asr-leaderboard/scripts/asr_leaderboard_s2s_incremental_v2_config.yaml
@@ -0,0 +1,65 @@
+# HF Open ASR Leaderboard evaluation - S2S incremental V2 backend, TEXT output only.
+# Evaluates WER on 8 ASR datasets: librispeech_clean, librispeech_other, voxpopuli,
+# tedlium, gigaspeech, spgispeech, earnings22, ami.
+# Checkpoint: Feb 26 2026 (legally friendly personaplex dataset)
+#
+# Prepare data first:
+#   ns prepare_data asr-leaderboard
+#
+# Run:
+#   python nemo_skills/dataset/asr-leaderboard/scripts/run_eval.py \
+#     --config nemo_skills/dataset/asr-leaderboard/scripts/asr_leaderboard_s2s_incremental_v2_config.yaml
+
+# Cluster settings
+cluster: s2s_eval_oci_iad
+partition: batch_block1,batch_block3,batch_block4
+cpu_partition: cpu
+
+model: /lustre/fsw/portfolios/convai/users/ecasanova/Checkpoints/Nemotron-VoiceChat-november/duplex-eartts-10min_sw_et_eos_dp_eos_dup_fp32_1delay_26_Feb_exp_13_afg_14k_steps-stt-AS9.1_11002_new_branch_load_fixed
+
+server_type: vllm
+server_gpus: 1
+num_chunks: 1
+
+server_entrypoint: "-m nemo_skills.inference.server.serve_unified"
+server_args: >-
+  --backend s2s_incremental_v2
+  --no_decode_audio
+  --use_asr_as_response
+  --speaker_reference /lustre/fsw/portfolios/convai/users/ecasanova/Checkpoints/Mg_a_00759.wav
+  --num_frames_per_inference 3
+  --engine_type vllm_llm_vllm_eartts
+  --use_perception_cache
+  --use_perception_cudagraph
+  --buffer_size_frames 20
+  --codec_token_history_size 60
+  --repetition_penalty 1.0
+  --force_turn_taking
+  --force_turn_taking_threshold 40
+  --force_turn_taking_pad_window 25
+  --matmul_precision medium
+  --vllm_gpu_memory_utilization 0.35
+  --vllm_max_model_len 8192
+  --system_prompt "You are a helpful assistant. /no_think"
+  --output_dir /lustre/fsw/portfolios/convai/users/mmkrtchyan/projects/speechLM/s2s/MAR_02/asr_leaderboard_incremental_v2_02mar_full_artifacts
+  --batch_size 2
+  --code_path /lustre/fsw/portfolios/convai/users/mmkrtchyan/projects/speechLM/s2s/NeMo
+  --pip_install "hf-xet==1.1.9 huggingface-hub==0.34.4 nvidia-modelopt==0.33.1 nvidia-modelopt-core==0.33.1 tokenizers==0.22.0 transformers==4.56.0 lhotse==1.32.2 nv-one-logger-core==2.1.0 nv-one-logger-pytorch-lightning-integration==2.1.0 nv-one-logger-training-telemetry==2.1.0 kaldialign==0.9.1 jiwer whisper-normalizer"
+
+server_container: /lustre/fsw/portfolios/llmservice/users/erastorgueva/code/containers/triton25.05_s2svllm26.02.12.sqsh
+server_server_type: vllm_multimodal
+
+# Benchmark name used by nemo-skills eval pipeline
+benchmark: asr-leaderboard
+
+# Paths -- data_dir must match the container mount in cluster config
+# (cluster mounts /lustre/fs12/.../nemo_skills/dataset -> /dataset)
+data_dir: /dataset
+output_dir: /lustre/fsw/portfolios/convai/users/mmkrtchyan/projects/speechLM/s2s/MAR_02/asr_leaderboard_incremental_v2_02mar_full
+
+installation_command: "pip install jiwer whisper-normalizer sacrebleu"
+expname: asr_leaderboard_s2s_incremental_v2_02mar_full
+
+generation_only: false
+scoring_only: false
+dry_run: false
diff --git a/nemo_skills/dataset/asr-leaderboard/scripts/hf_baseline_02mar_config.yaml b/nemo_skills/dataset/asr-leaderboard/scripts/hf_baseline_02mar_config.yaml
@@ -0,0 +1,60 @@
+# HF Open ASR Leaderboard evaluation - (i) BASELINE setup
+# S2S incremental V2 backend, TEXT output only.
+# Evaluates WER on 8 ASR datasets: librispeech_clean, librispeech_other, voxpopuli,
+# tedlium, gigaspeech, spgispeech, earnings22, ami.
+# No inference boosting, no force turn taking.
+# Checkpoint: Mar 2 2026 (Feb 26 STT + Mar 3 TTS, Megan speaker)
+#
+# Run:
+#   python nemo_skills/dataset/asr-leaderboard/scripts/run_eval.py \
+#     --config nemo_skills/dataset/asr-leaderboard/scripts/hf_baseline_02mar_config.yaml
+
+# Cluster settings
+cluster: s2s_eval_oci_iad
+partition: batch_block1,batch_block3,batch_block4
+cpu_partition: cpu
+
+model: /lustre/fsw/portfolios/convai/users/ecasanova/Checkpoints/Nemotron-VoiceChat-november/duplex-eartts-10min_sw_et_eos_dp_eos_dup_fp32_1delay_02_March_exp_17_afg_long_FT_Megan_msr_34k_steps-stt-AS9.1_11002_new_branch_load_fixed
+
+server_type: vllm
+server_gpus: 1
+num_chunks: 104
+
+server_entrypoint: "-m nemo_skills.inference.server.serve_unified"
+server_args: >-
+  --backend s2s_incremental_v2
+  --no_decode_audio
+  --use_asr_as_response
+  --ignore_system_prompt
+  --speaker_reference /lustre/fsw/portfolios/convai/users/ecasanova/Checkpoints/Mg_a_00759.wav
+  --num_frames_per_inference 3
+  --engine_type vllm_llm_vllm_eartts
+  --use_perception_cache
+  --use_perception_cudagraph
+  --buffer_size_frames 21
+  --codec_token_history_size 60
+  --repetition_penalty 1.0
+  --matmul_precision medium
+  --vllm_gpu_memory_utilization 0.35
+  --vllm_max_model_len 8192
+  --output_dir /lustre/fsw/portfolios/convai/users/mmkrtchyan/projects/speechLM/s2s/MAR_02_FIXED/baseline/hf_artifacts
+  --batch_size 1
+  --code_path /lustre/fsw/portfolios/convai/users/mmkrtchyan/projects/speechLM/s2s/NeMo
+  --pip_install "hf-xet==1.1.9 huggingface-hub==0.34.4 nvidia-modelopt==0.33.1 nvidia-modelopt-core==0.33.1 tokenizers==0.22.0 transformers==4.56.0 lhotse==1.32.2 nv-one-logger-core==2.1.0 nv-one-logger-pytorch-lightning-integration==2.1.0 nv-one-logger-training-telemetry==2.1.0 kaldialign==0.9.1 jiwer whisper-normalizer sacrebleu"
+
+server_container: /lustre/fsw/portfolios/llmservice/users/erastorgueva/code/containers/triton25.05_s2svllm26.02.12.sqsh
+server_server_type: vllm_multimodal
+
+# Benchmark name used by nemo-skills eval pipeline
+benchmark: asr-leaderboard
+
+# Paths
+data_dir: /dataset
+output_dir: /lustre/fsw/portfolios/convai/users/mmkrtchyan/projects/speechLM/s2s/MAR_02_FIXED/baseline/hf
+
+installation_command: "pip install jiwer whisper-normalizer sacrebleu"
+
+expname: hf_baseline_02mar
+generation_only: false
+scoring_only: false
+dry_run: false