diff --git a/cluster_configs/s2s_eval_oci_iad.yaml b/cluster_configs/s2s_eval_oci_iad.yaml new file mode 100644 index 0000000000..8a15ec39f9 --- /dev/null +++ b/cluster_configs/s2s_eval_oci_iad.yaml @@ -0,0 +1,52 @@ +executor: slurm + +ssh_tunnel: + host: draco-oci-login-01.draco-oci-iad.nvidia.com + # ------------------------------- Fill this up! ------------------------------- + user: mmkrtchyan + job_dir: /lustre/fsw/portfolios/llmservice/users/mmkrtchyan/workspace/code/nemo-run + identity: "" + # ----------------------------------------------------------------------------- + +# if you're running directly from cluster, you only need to define job_dir and shouldn't use ssh_tunnel +# job_dir: + +account: llmservice_nemo_speechlm +# account: llmservice_nemo_reasoning +partition: batch_block1,batch_block3,batch_block4 +cpu_partition: cpu +job_name_prefix: "" + +containers: + trtllm: /lustre/fsw/portfolios/llmservice/users/igitman/llm/images/nemo-skills-trtllm-0.7.0.sqsh + vllm: /lustre/fsw/portfolios/llmservice/users/igitman/llm/images/nemo-skills-vllm-0.7.0.sqsh + sglang: /lustre/fsw/portfolios/llmservice/users/igitman/llm/images/nemo-skills-sglang-0.7.0.sqsh + nemo: /lustre/fsw/portfolios/llmservice/users/igitman/llm/images/nemo-skills-nemo-0.7.0.sqsh + nemo-rl: /lustre/fsw/portfolios/llmservice/users/igitman/llm/images/nemo-skills-nemo-rl-0.7.0.sqsh + megatron: /lustre/fsw/portfolios/llmservice/users/igitman/llm/images/nemo-skills-megatron-0.7.0.sqsh + sandbox: /lustre/fsw/portfolios/llmservice/users/igitman/llm/images/nemo-skills-sandbox-0.7.1.sqsh + nemo-skills: /lustre/fsw/portfolios/llmservice/users/igitman/llm/images/nemo-skills-0.7.0.sqsh + verl: /lustre/fsw/portfolios/llmservice/users/igitman/llm/images/nemo-skills-verl-0.7.0.sqsh + +mounts: + - /lustre:/lustre + - /lustre/fs12/portfolios/llmservice/projects/llmservice_nemo_speechlm/data/nemo_skills/dataset:/dataset + +required_env_vars: + - NV_INFERENCE_KEY + +env_vars: + # ------------------------------- 
Fill this up! ------------------------------- + - HF_HOME=/lustre/fsw/portfolios/llmservice/users/mmkrtchyan/.cache/huggingface + - AUDIO_SAVE_DIR=/lustre/fsw/portfolios/convai/users/mmkrtchyan/projects/speechLM/s2s/fullduplexbench_audio_outputs + - PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True + # ----------------------------------------------------------------------------- + +timeouts: + batch_block1,batch_block3,batch_block4: 04:00:00 + interactive: 04:00:00 + interactive_singlenode: 04:00:00 + cpu: 04:00:00 + +mail_type: FAIL +mail_user: # diff --git a/nemo_skills/dataset/asr-leaderboard/prepare.py b/nemo_skills/dataset/asr-leaderboard/prepare.py index 25bbafd986..f2b0d11f3b 100644 --- a/nemo_skills/dataset/asr-leaderboard/prepare.py +++ b/nemo_skills/dataset/asr-leaderboard/prepare.py @@ -14,12 +14,17 @@ """Prepare ASR Leaderboard datasets for evaluation. -Downloads and formats datasets from the HuggingFace Open ASR Leaderboard. +Downloads and formats datasets from the official HF Open ASR Leaderboard ESB +test-only sorted dataset (hf-audio/esb-datasets-test-only-sorted). This is the +same data source used by the official leaderboard and the offline NeMo eval +pipeline, ensuring apples-to-apples WER comparison. + Audio paths in JSONL: /dataset/asr-leaderboard/data/{dataset}/{sample_id}.flac Usage: ns prepare_data asr-leaderboard ns prepare_data asr-leaderboard --datasets librispeech_clean ami + ns prepare_data asr-leaderboard --datasets earnings22 ns prepare_data asr-leaderboard --no-audio # skip saving audio files """ @@ -34,29 +39,22 @@ SYSTEM_MESSAGE = "You are a helpful assistant. 
/no_think" MIN_AUDIO_DURATION = 0.1 # Skip audio shorter than this (causes mel spectrogram errors) -# (hf_dataset, hf_config, hf_split, streaming) +# (hf_repo, config, split, text_field, id_field) DATASET_CONFIGS = { - "librispeech_clean": ("librispeech_asr", "clean", "test", False), - "librispeech_other": ("librispeech_asr", "other", "test", False), - "voxpopuli": ("facebook/voxpopuli", "en", "test", False), - "tedlium": ("LIUM/tedlium", "release3", "test", False), - "gigaspeech": ("speechcolab/gigaspeech", "xs", "test", False), - "spgispeech": ("kensho/spgispeech", "test", "test", True), # streaming to avoid timeout due to large metadata - "earnings22": ("distil-whisper/earnings22", "chunked", "test", False), - "ami": ("edinburghcstr/ami", "ihm", "test", False), + "librispeech_clean": ("hf-audio/esb-datasets-test-only-sorted", "librispeech", "test.clean", "text", "id"), + "librispeech_other": ("hf-audio/esb-datasets-test-only-sorted", "librispeech", "test.other", "text", "id"), + "voxpopuli": ("hf-audio/esb-datasets-test-only-sorted", "voxpopuli", "test", "text", "id"), + "tedlium": ("hf-audio/esb-datasets-test-only-sorted", "tedlium", "test", "text", "id"), + "gigaspeech": ("hf-audio/esb-datasets-test-only-sorted", "gigaspeech", "test", "text", "id"), + "spgispeech": ("hf-audio/esb-datasets-test-only-sorted", "spgispeech", "test", "text", "id"), + "earnings22": ("hf-audio/esb-datasets-test-only-sorted", "earnings22", "test", "text", "id"), + "ami": ("hf-audio/esb-datasets-test-only-sorted", "ami", "test", "text", "id"), } -def save_audio_and_format_entry(entry, dataset_name, audio_dir, sample_idx, with_audio=True): +def save_audio_and_format_entry(entry, dataset_name, audio_dir, sample_idx, text_field="text", id_field="id", with_audio=True): """Format a dataset entry and optionally save audio file.""" - # Different datasets use different field names for transcription - text = ( - entry.get("text", "") # ami, LS, gigaspeech, tedlium - or 
entry.get("normalized_text", "") # voxpopuli - or entry.get("transcript", "") # spgispeech - or entry.get("transcription", "") # earnings22 - ) - text = text.strip() if text else "" + text = entry.get(text_field, "").strip() system_message = {"role": "system", "content": SYSTEM_MESSAGE} user_message = {"role": "user", "content": "Transcribe the following audio."} @@ -70,8 +68,8 @@ def save_audio_and_format_entry(entry, dataset_name, audio_dir, sample_idx, with if duration < MIN_AUDIO_DURATION: return None - sample_id = entry.get("id", str(sample_idx)) - audio_filename = f"{sample_id}.flac" + sample_id = str(entry.get(id_field, sample_idx)).replace("/", "_") + audio_filename = f"{Path(sample_id).stem}.flac" if with_audio: sf.write(str(audio_dir / audio_filename), audio_array, sampling_rate) @@ -82,14 +80,14 @@ def save_audio_and_format_entry(entry, dataset_name, audio_dir, sample_idx, with } formatted_entry = { - "task_type": "ASR_LEADERBOARD", + "task_type": "ASR", "expected_answer": text, "messages": [system_message, user_message], "subset_for_metrics": dataset_name, } - if "id" in entry: - formatted_entry["id"] = entry["id"] + if id_field in entry: + formatted_entry["id"] = entry[id_field] if "speaker_id" in entry: formatted_entry["speaker_id"] = entry["speaker_id"] @@ -101,14 +99,11 @@ def prepare_dataset(dataset_name, output_dir, with_audio=True): if dataset_name not in DATASET_CONFIGS: raise ValueError(f"Unknown dataset: {dataset_name}. 
Available: {list(DATASET_CONFIGS.keys())}") - hf_dataset, hf_config, hf_split, streaming = DATASET_CONFIGS[dataset_name] + hf_repo, hf_config, hf_split, text_field, id_field = DATASET_CONFIGS[dataset_name] - print(f"Loading {dataset_name} from {hf_dataset} (streaming={streaming})...") + print(f"Loading {dataset_name} from {hf_repo} (config={hf_config}, split={hf_split})...") try: - if hf_config: - dataset = load_dataset(hf_dataset, hf_config, split=hf_split, trust_remote_code=True, streaming=streaming) - else: - dataset = load_dataset(hf_dataset, split=hf_split, trust_remote_code=True, streaming=streaming) + dataset = load_dataset(hf_repo, hf_config, split=hf_split, trust_remote_code=True) except Exception as e: print(f"Warning: Failed to load {dataset_name}: {e}") return 0 @@ -120,16 +115,13 @@ def prepare_dataset(dataset_name, output_dir, with_audio=True): audio_dir.mkdir(parents=True, exist_ok=True) print(f"Saving audio files to {audio_dir}") - if streaming: - print(f"Processing {dataset_name} (streaming)...") - else: - print(f"Processing {len(dataset)} samples from {dataset_name}...") + print(f"Processing {len(dataset)} samples from {dataset_name}...") count = 0 skipped = 0 with open(output_file, "w", encoding="utf-8") as fout: for idx, entry in enumerate(tqdm(dataset, desc=dataset_name)): - formatted = save_audio_and_format_entry(entry, dataset_name, audio_dir, idx, with_audio=with_audio) + formatted = save_audio_and_format_entry(entry, dataset_name, audio_dir, idx, text_field=text_field, id_field=id_field, with_audio=with_audio) if formatted is None: skipped += 1 continue @@ -160,7 +152,8 @@ def main(): ) args = parser.parse_args() - output_dir = Path(__file__).parent + data_dir = Path("/dataset/asr-leaderboard") + output_dir = data_dir if data_dir.exists() else Path(__file__).parent output_dir.mkdir(parents=True, exist_ok=True) with_audio = not args.no_audio diff --git 
a/nemo_skills/dataset/asr-leaderboard/scripts/asr_leaderboard_s2s_incremental_v2_02mar_config.yaml b/nemo_skills/dataset/asr-leaderboard/scripts/asr_leaderboard_s2s_incremental_v2_02mar_config.yaml new file mode 100644 index 0000000000..6a169c61e9 --- /dev/null +++ b/nemo_skills/dataset/asr-leaderboard/scripts/asr_leaderboard_s2s_incremental_v2_02mar_config.yaml @@ -0,0 +1,59 @@ +# HF Open ASR Leaderboard evaluation - S2S incremental V2 backend, TEXT output only. +# Evaluates WER on 8 ASR datasets: librispeech_clean, librispeech_other, voxpopuli, +# tedlium, gigaspeech, spgispeech, earnings22, ami. +# Checkpoint: Mar 2 2026 (Feb 26 STT + Mar 3 TTS, Megan speaker, force_turn_taking OFF) +# +# Run: +# python nemo_skills/dataset/asr-leaderboard/scripts/run_eval.py \ +# --config nemo_skills/dataset/asr-leaderboard/scripts/asr_leaderboard_s2s_incremental_v2_02mar_config.yaml + +# Cluster settings +cluster: s2s_eval_oci_iad +partition: batch_block1,batch_block3,batch_block4 +cpu_partition: cpu + +model: /lustre/fsw/portfolios/convai/users/ecasanova/Checkpoints/Nemotron-VoiceChat-november/duplex-eartts-10min_sw_et_eos_dp_eos_dup_fp32_1delay_02_March_exp_17_afg_long_FT_Megan_msr_34k_steps-stt-AS9.1_11002_new_branch_load_fixed + +server_type: vllm +server_gpus: 1 +num_chunks: 104 + +server_entrypoint: "-m nemo_skills.inference.server.serve_unified" +server_args: >- + --backend s2s_incremental_v2 + --no_decode_audio + --use_asr_as_response + --ignore_system_prompt + --speaker_reference /lustre/fsw/portfolios/convai/users/ecasanova/Checkpoints/Mg_a_00759.wav + --num_frames_per_inference 3 + --engine_type vllm_llm_vllm_eartts + --use_perception_cache + --use_perception_cudagraph + --buffer_size_frames 21 + --codec_token_history_size 60 + --repetition_penalty 1.0 + --matmul_precision medium + --vllm_gpu_memory_utilization 0.35 + --vllm_max_model_len 8192 + --output_dir 
/lustre/fsw/portfolios/convai/users/mmkrtchyan/projects/speechLM/s2s/MAR_02/asr_leaderboard_incremental_v2_02mar_artifacts_full_no_system_prompt_3frames_per_inference_21buffer + --batch_size 1 + --code_path /lustre/fsw/portfolios/convai/users/mmkrtchyan/projects/speechLM/s2s/NeMo + --pip_install "hf-xet==1.1.9 huggingface-hub==0.34.4 nvidia-modelopt==0.33.1 nvidia-modelopt-core==0.33.1 tokenizers==0.22.0 transformers==4.56.0 lhotse==1.32.2 nv-one-logger-core==2.1.0 nv-one-logger-pytorch-lightning-integration==2.1.0 nv-one-logger-training-telemetry==2.1.0 kaldialign==0.9.1 jiwer whisper-normalizer sacrebleu" + +server_container: /lustre/fsw/portfolios/llmservice/users/erastorgueva/code/containers/triton25.05_s2svllm26.02.12.sqsh +server_server_type: vllm_multimodal + +# Benchmark name used by nemo-skills eval pipeline +benchmark: asr-leaderboard + +# Paths -- data_dir must match the container mount in cluster config +# (cluster mounts /lustre/fs12/.../nemo_skills/dataset -> /dataset) +data_dir: /dataset +output_dir: /lustre/fsw/portfolios/convai/users/mmkrtchyan/projects/speechLM/s2s/MAR_02/asr_leaderboard_incremental_v5_02mar_artifacts_full_no_system_prompt_3frames_per_inference_21buffer + +installation_command: "pip install jiwer whisper-normalizer sacrebleu" + +expname: asr_leaderboard_s2s_incremental_v5_02mar +generation_only: false +scoring_only: false +dry_run: false diff --git a/nemo_skills/dataset/asr-leaderboard/scripts/asr_leaderboard_s2s_incremental_v2_config.yaml b/nemo_skills/dataset/asr-leaderboard/scripts/asr_leaderboard_s2s_incremental_v2_config.yaml new file mode 100644 index 0000000000..e43a73e537 --- /dev/null +++ b/nemo_skills/dataset/asr-leaderboard/scripts/asr_leaderboard_s2s_incremental_v2_config.yaml @@ -0,0 +1,65 @@ +# HF Open ASR Leaderboard evaluation - S2S incremental V2 backend, TEXT output only. +# Evaluates WER on 8 ASR datasets: librispeech_clean, librispeech_other, voxpopuli, +# tedlium, gigaspeech, spgispeech, earnings22, ami. 
+# Checkpoint: Feb 26 2026 (legally friendly personaplex dataset) +# +# Prepare data first: +# ns prepare_data asr-leaderboard +# +# Run: +# python nemo_skills/dataset/asr-leaderboard/scripts/run_eval.py \ +# --config nemo_skills/dataset/asr-leaderboard/scripts/asr_leaderboard_s2s_incremental_v2_config.yaml + +# Cluster settings +cluster: s2s_eval_oci_iad +partition: batch_block1,batch_block3,batch_block4 +cpu_partition: cpu + +model: /lustre/fsw/portfolios/convai/users/ecasanova/Checkpoints/Nemotron-VoiceChat-november/duplex-eartts-10min_sw_et_eos_dp_eos_dup_fp32_1delay_26_Feb_exp_13_afg_14k_steps-stt-AS9.1_11002_new_branch_load_fixed + +server_type: vllm +server_gpus: 1 +num_chunks: 1 + +server_entrypoint: "-m nemo_skills.inference.server.serve_unified" +server_args: >- + --backend s2s_incremental_v2 + --no_decode_audio + --use_asr_as_response + --speaker_reference /lustre/fsw/portfolios/convai/users/ecasanova/Checkpoints/Mg_a_00759.wav + --num_frames_per_inference 3 + --engine_type vllm_llm_vllm_eartts + --use_perception_cache + --use_perception_cudagraph + --buffer_size_frames 20 + --codec_token_history_size 60 + --repetition_penalty 1.0 + --force_turn_taking + --force_turn_taking_threshold 40 + --force_turn_taking_pad_window 25 + --matmul_precision medium + --vllm_gpu_memory_utilization 0.35 + --vllm_max_model_len 8192 + --system_prompt "You are a helpful assistant. 
/no_think" + --output_dir /lustre/fsw/portfolios/convai/users/mmkrtchyan/projects/speechLM/s2s/MAR_02/asr_leaderboard_incremental_v2_02mar_full_artifacts + --batch_size 2 + --code_path /lustre/fsw/portfolios/convai/users/mmkrtchyan/projects/speechLM/s2s/NeMo + --pip_install "hf-xet==1.1.9 huggingface-hub==0.34.4 nvidia-modelopt==0.33.1 nvidia-modelopt-core==0.33.1 tokenizers==0.22.0 transformers==4.56.0 lhotse==1.32.2 nv-one-logger-core==2.1.0 nv-one-logger-pytorch-lightning-integration==2.1.0 nv-one-logger-training-telemetry==2.1.0 kaldialign==0.9.1 jiwer whisper-normalizer" + +server_container: /lustre/fsw/portfolios/llmservice/users/erastorgueva/code/containers/triton25.05_s2svllm26.02.12.sqsh +server_server_type: vllm_multimodal + +# Benchmark name used by nemo-skills eval pipeline +benchmark: asr-leaderboard + +# Paths -- data_dir must match the container mount in cluster config +# (cluster mounts /lustre/fs12/.../nemo_skills/dataset -> /dataset) +data_dir: /dataset +output_dir: /lustre/fsw/portfolios/convai/users/mmkrtchyan/projects/speechLM/s2s/MAR_02/asr_leaderboard_incremental_v2_02mar_full + +installation_command: "pip install jiwer whisper-normalizer sacrebleu" +expname: asr_leaderboard_s2s_incremental_v2_02mar_full + +generation_only: false +scoring_only: false +dry_run: false diff --git a/nemo_skills/dataset/asr-leaderboard/scripts/hf_baseline_02mar_config.yaml b/nemo_skills/dataset/asr-leaderboard/scripts/hf_baseline_02mar_config.yaml new file mode 100644 index 0000000000..7e45c42ac5 --- /dev/null +++ b/nemo_skills/dataset/asr-leaderboard/scripts/hf_baseline_02mar_config.yaml @@ -0,0 +1,60 @@ +# HF Open ASR Leaderboard evaluation - (i) BASELINE setup +# S2S incremental V2 backend, TEXT output only. +# Evaluates WER on 8 ASR datasets: librispeech_clean, librispeech_other, voxpopuli, +# tedlium, gigaspeech, spgispeech, earnings22, ami. +# No inference boosting, no force turn taking. 
+# Checkpoint: Mar 2 2026 (Feb 26 STT + Mar 3 TTS, Megan speaker) +# +# Run: +# python nemo_skills/dataset/asr-leaderboard/scripts/run_eval.py \ +# --config nemo_skills/dataset/asr-leaderboard/scripts/hf_baseline_02mar_config.yaml + +# Cluster settings +cluster: s2s_eval_oci_iad +partition: batch_block1,batch_block3,batch_block4 +cpu_partition: cpu + +model: /lustre/fsw/portfolios/convai/users/ecasanova/Checkpoints/Nemotron-VoiceChat-november/duplex-eartts-10min_sw_et_eos_dp_eos_dup_fp32_1delay_02_March_exp_17_afg_long_FT_Megan_msr_34k_steps-stt-AS9.1_11002_new_branch_load_fixed + +server_type: vllm +server_gpus: 1 +num_chunks: 104 + +server_entrypoint: "-m nemo_skills.inference.server.serve_unified" +server_args: >- + --backend s2s_incremental_v2 + --no_decode_audio + --use_asr_as_response + --ignore_system_prompt + --speaker_reference /lustre/fsw/portfolios/convai/users/ecasanova/Checkpoints/Mg_a_00759.wav + --num_frames_per_inference 3 + --engine_type vllm_llm_vllm_eartts + --use_perception_cache + --use_perception_cudagraph + --buffer_size_frames 21 + --codec_token_history_size 60 + --repetition_penalty 1.0 + --matmul_precision medium + --vllm_gpu_memory_utilization 0.35 + --vllm_max_model_len 8192 + --output_dir /lustre/fsw/portfolios/convai/users/mmkrtchyan/projects/speechLM/s2s/MAR_02_FIXED/baseline/hf_artifacts + --batch_size 1 + --code_path /lustre/fsw/portfolios/convai/users/mmkrtchyan/projects/speechLM/s2s/NeMo + --pip_install "hf-xet==1.1.9 huggingface-hub==0.34.4 nvidia-modelopt==0.33.1 nvidia-modelopt-core==0.33.1 tokenizers==0.22.0 transformers==4.56.0 lhotse==1.32.2 nv-one-logger-core==2.1.0 nv-one-logger-pytorch-lightning-integration==2.1.0 nv-one-logger-training-telemetry==2.1.0 kaldialign==0.9.1 jiwer whisper-normalizer sacrebleu" + +server_container: /lustre/fsw/portfolios/llmservice/users/erastorgueva/code/containers/triton25.05_s2svllm26.02.12.sqsh +server_server_type: vllm_multimodal + +# Benchmark name used by nemo-skills eval pipeline 
+benchmark: asr-leaderboard + +# Paths +data_dir: /dataset +output_dir: /lustre/fsw/portfolios/convai/users/mmkrtchyan/projects/speechLM/s2s/MAR_02_FIXED/baseline/hf + +installation_command: "pip install jiwer whisper-normalizer sacrebleu" + +expname: hf_baseline_02mar +generation_only: false +scoring_only: false +dry_run: false diff --git a/nemo_skills/dataset/asr-leaderboard/scripts/hf_matched_demo_v1_02mar_config.yaml b/nemo_skills/dataset/asr-leaderboard/scripts/hf_matched_demo_v1_02mar_config.yaml new file mode 100644 index 0000000000..f9b052da50 --- /dev/null +++ b/nemo_skills/dataset/asr-leaderboard/scripts/hf_matched_demo_v1_02mar_config.yaml @@ -0,0 +1,61 @@ +# HF Open ASR Leaderboard evaluation - (ii) MATCHED_DEMO_V1 setup +# S2S incremental V2 backend, TEXT output only. +# Baseline + force_turn_taking + inference_user_pad_boost=0.8 +# Requires nemotron_h.py vLLM patch for boost params to take effect. +# Checkpoint: Mar 2 2026 (Feb 26 STT + Mar 3 TTS, Megan speaker) +# +# Run: +# python nemo_skills/dataset/asr-leaderboard/scripts/run_eval.py \ +# --config nemo_skills/dataset/asr-leaderboard/scripts/hf_matched_demo_v1_02mar_config.yaml + +# Cluster settings +cluster: s2s_eval_oci_iad +partition: batch_block1,batch_block3,batch_block4 +cpu_partition: cpu + +model: /lustre/fsw/portfolios/convai/users/ecasanova/Checkpoints/Nemotron-VoiceChat-november/duplex-eartts-10min_sw_et_eos_dp_eos_dup_fp32_1delay_02_March_exp_17_afg_long_FT_Megan_msr_34k_steps-stt-AS9.1_11002_new_branch_load_fixed + +server_type: vllm +server_gpus: 1 +num_chunks: 104 + +server_entrypoint: "-m nemo_skills.inference.server.serve_unified" +server_args: >- + --backend s2s_incremental_v2 + --no_decode_audio + --use_asr_as_response + --ignore_system_prompt + --speaker_reference /lustre/fsw/portfolios/convai/users/ecasanova/Checkpoints/Mg_a_00759.wav + --num_frames_per_inference 3 + --engine_type vllm_llm_vllm_eartts + --use_perception_cache + --use_perception_cudagraph + --buffer_size_frames 
21 + --codec_token_history_size 60 + --repetition_penalty 1.0 + --matmul_precision medium + --vllm_gpu_memory_utilization 0.35 + --vllm_max_model_len 8192 + --force_turn_taking + --inference_user_pad_boost 0.8 + --output_dir /lustre/fsw/portfolios/convai/users/mmkrtchyan/projects/speechLM/s2s/MAR_02_FIXED/matched_demo_v1/hf_artifacts + --batch_size 1 + --code_path /lustre/fsw/portfolios/convai/users/mmkrtchyan/projects/speechLM/s2s/NeMo + --pip_install "hf-xet==1.1.9 huggingface-hub==0.34.4 nvidia-modelopt==0.33.1 nvidia-modelopt-core==0.33.1 tokenizers==0.22.0 transformers==4.56.0 lhotse==1.32.2 nv-one-logger-core==2.1.0 nv-one-logger-pytorch-lightning-integration==2.1.0 nv-one-logger-training-telemetry==2.1.0 kaldialign==0.9.1 jiwer whisper-normalizer sacrebleu" + +server_container: /lustre/fsw/portfolios/llmservice/users/erastorgueva/code/containers/triton25.05_s2svllm26.02.12.sqsh +server_server_type: vllm_multimodal + +# Benchmark name used by nemo-skills eval pipeline +benchmark: asr-leaderboard + +# Paths +data_dir: /dataset +output_dir: /lustre/fsw/portfolios/convai/users/mmkrtchyan/projects/speechLM/s2s/MAR_02_FIXED/matched_demo_v1/hf + +installation_command: "mkdir -p /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models && cp /lustre/fsw/portfolios/convai/users/mmkrtchyan/projects/speechLM/s2s/nemotron_h.py /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/nemotron_h.py && pip install jiwer whisper-normalizer sacrebleu" + +expname: hf_matched_demo_v1_02mar +generation_only: false +scoring_only: false +dry_run: false diff --git a/nemo_skills/dataset/asr-leaderboard/scripts/run_eval.py b/nemo_skills/dataset/asr-leaderboard/scripts/run_eval.py new file mode 100644 index 0000000000..dede430081 --- /dev/null +++ b/nemo_skills/dataset/asr-leaderboard/scripts/run_eval.py @@ -0,0 +1,149 @@ +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Run HF Open ASR Leaderboard evaluation using nemo-skills pipeline. + +Evaluates WER on 8 ASR datasets: librispeech_clean, librispeech_other, +voxpopuli, tedlium, gigaspeech, spgispeech, earnings22, ami. + +The asr-leaderboard dataset uses METRICS_TYPE="audio" and task_type="ASR", +so nemo-skills handles both generation and WER scoring automatically. + +Prepare data first: + ns prepare_data asr-leaderboard + +Usage: + python run_eval.py --config asr_leaderboard_s2s_incremental_v2_02mar_config.yaml + python run_eval.py --config asr_leaderboard_s2s_incremental_v2_02mar_config.yaml --dry_run + python run_eval.py --config asr_leaderboard_s2s_incremental_v2_02mar_config.yaml --generation_only + python run_eval.py --config asr_leaderboard_s2s_incremental_v2_02mar_config.yaml --scoring_only +""" + +import argparse + +import yaml + +from nemo_skills.pipeline.cli import eval as nemo_eval +from nemo_skills.pipeline.cli import wrap_arguments + + +def load_config(config_path: str) -> dict: + with open(config_path, "r") as f: + return yaml.safe_load(f) + + +def run_asr_leaderboard_eval(config: dict): + """Run HF Open ASR Leaderboard evaluation pipeline.""" + benchmark = config.get("benchmark", "asr-leaderboard") + expname = config.get("expname", "asr_leaderboard_eval") + dry_run = config.get("dry_run", False) + generation_only = config.get("generation_only", False) + scoring_only = config.get("scoring_only", False) 
+ + print(f"{'=' * 60}") + print("HF Open ASR Leaderboard Evaluation") + print(f"{'=' * 60}") + split = config.get("split") + + print(f"Benchmark: {benchmark}") + print(f"Model: {config['model']}") + print(f"Output: {config['output_dir']}") + if split: + print(f"Split: {split} (single subset)") + + extra_args = [] + if config.get("max_samples"): + extra_args.append(f"++max_samples={config['max_samples']}") + if config.get("server_server_type"): + extra_args.append(f"++server.server_type={config['server_server_type']}") + extra_args_str = " ".join(extra_args) + + eval_kwargs = dict( + ctx=wrap_arguments(extra_args_str), + cluster=config["cluster"], + output_dir=config["output_dir"], + data_dir=config.get("data_dir"), + benchmarks=benchmark, + model=config["model"], + server_type=config.get("server_type", "vllm"), + server_gpus=config.get("server_gpus", 1), + server_nodes=config.get("server_nodes", 1), + server_args=config.get("server_args", ""), + server_entrypoint=config.get("server_entrypoint"), + server_container=config.get("server_container"), + partition=config.get("partition"), + num_chunks=config.get("num_chunks", 1), + installation_command=config.get("installation_command"), + expname=expname, + auto_summarize_results=True, + dry_run=dry_run, + ) + if config.get("chunk_ids"): + eval_kwargs["chunk_ids"] = config["chunk_ids"] + if split: + eval_kwargs["split"] = split + + if not scoring_only: + print("\n--- Running generation + scoring ---") + nemo_eval(**eval_kwargs) + + if scoring_only: + print("\n--- Running scoring only ---") + nemo_eval(**eval_kwargs) + + print(f"\n{'=' * 60}") + print("Done!") + print(f"{'=' * 60}") + + +def main(): + parser = argparse.ArgumentParser(description="HF Open ASR Leaderboard evaluation") + parser.add_argument("--config", required=True, help="Path to YAML config file") + + parser.add_argument("--cluster", help="Override cluster") + parser.add_argument("--partition", help="Override partition") + parser.add_argument("--model", 
help="Override model path") + parser.add_argument("--output_dir", help="Override output directory") + parser.add_argument("--max_samples", type=int, help="Override max_samples") + parser.add_argument("--num_chunks", type=int, help="Override num_chunks") + parser.add_argument( + "--split", + help="Data split to evaluate (default: test = all datasets). " + "Use a single dataset name (e.g. ami, earnings22) to run only that subset.", + ) + parser.add_argument("--dry_run", action="store_true", help="Print commands without executing") + parser.add_argument("--generation_only", action="store_true", help="Only run generation") + parser.add_argument("--scoring_only", action="store_true", help="Only run scoring") + + args = parser.parse_args() + config = load_config(args.config) + + override_keys = ["cluster", "partition", "model", "output_dir", "max_samples", "num_chunks", "split"] + for key in override_keys: + if getattr(args, key, None) is not None: + config[key] = getattr(args, key) + + if args.dry_run: + config["dry_run"] = True + if args.generation_only: + config["generation_only"] = True + if args.scoring_only: + config["scoring_only"] = True + + run_asr_leaderboard_eval(config) + + +if __name__ == "__main__": + main() diff --git a/nemo_skills/dataset/fdb/README.md b/nemo_skills/dataset/fdb/README.md new file mode 100644 index 0000000000..47b26a5796 --- /dev/null +++ b/nemo_skills/dataset/fdb/README.md @@ -0,0 +1,116 @@ +# Full-Duplex-Bench (FDB) for nemo-skills + +This package prepares the [Full-Duplex-Bench](https://drive.google.com/drive/folders/1DtoxMVO9_Y_nDs2peZtx3pw-U2qYgpd3) dataset for speech-to-speech evaluation in nemo-skills. + +## Dataset versions + +- **v1.0** — writes to `fdb_v1/` (subtests: pause_candor, pause_synthetic, backchannel, turn_taking, interruption). +- **v1.5** — writes to `fdb_v1_5/` (subtests: background_speech, talking_to_other, backchannel, interruption). 
+ +Source layout (v1.0): `candor_pause_handling/{id}/input.wav`, `synthetic_pause_handling/{id}/input.wav`, `icc_backchannel/`, `candor_turn_taking/`, `synthetic_user_interruption/`. + +## Prepare data + +From the repo root (or with `PYTHONPATH` set so `nemo_skills` is importable): + +```bash +# Download v1.0 to default path and prepare +python -m nemo_skills.dataset.fdb.prepare + +# Use existing Full-Duplex-Bench data (no download) +python -m nemo_skills.dataset.fdb.prepare --fdb_data_path /path/to/Full-Duplex-Bench-data + +# Prepare v1.5 only +python -m nemo_skills.dataset.fdb.prepare --version v1.5 --fdb_data_path /path/to/Full-Duplex-Bench-data + +# Skip copying audio (faster, for testing) +python -m nemo_skills.dataset.fdb.prepare --fdb_data_path /path/to/data --no-audio +``` + +Audio files are copied into `fdb_v1/data/` (or `fdb_v1_5/data/`) with names like `pause_candor_18.wav`, `pause_synthetic_18.wav`, `backchannel_0.wav`, etc., so each subtest has distinct filenames. + +## Output layout + +After prepare: + +- `fdb_v1//test.jsonl` — one JSONL per subtest (e.g. pause_candor, pause_synthetic). +- `fdb_v1/data/*.wav` — shared audio directory; entries in JSONL reference `fdb_v1/data/.wav`. + +Each entry includes `messages`, `messages_text_audio`, and `messages_text` variants. 
+ +## Evaluation + +Set `data_dir` in your eval config to the **fdb package directory** (the parent of `fdb_v1`/`fdb_v1_5`), e.g.: + +```yaml +data_dir: /path/to/nemo_skills/dataset/fdb +``` + +### Running evaluation + +Use `scripts/run_eval.py` with a YAML config: + +```bash +# Full run (generate + score) for v1.0 +python scripts/run_eval.py --config scripts/fdb_s2s_offline_v1.0_config.yaml + +# Full run for v1.5 +python scripts/run_eval.py --config scripts/fdb_s2s_offline_v1.5_config.yaml + +# Score only (skip generation, re-run scoring on existing outputs) +python scripts/run_eval.py --config scripts/fdb_s2s_offline_v1.5_config.yaml \ + --scoring_only --scoring_force + +# Score a specific subtest +python scripts/run_eval.py --config scripts/fdb_s2s_offline_v1.5_config.yaml \ + --scoring_only --scoring_force --subtests backchannel + +# Override output directory +python scripts/run_eval.py --config scripts/fdb_s2s_offline_v1.0_config.yaml \ + --output_dir /path/to/output +``` + +Key config fields (see example YAML files): + +| Field | Description | +|---|---| +| `cluster` | Cluster config name (e.g. `s2s_eval_oci_iad`) | +| `server_gpus` | GPUs for the inference server | +| `num_chunks` | Number of parallel inference chunks | +| `fdb_version` | `v1.0` or `v1.5` | +| `data_dir` | Path to the FDB nemo-skills package (parent of `fdb_v1`/`fdb_v1_5`) | +| `output_dir` | Where to write results | +| `fdb_repo_path` | Path to Full-Duplex-Bench clone (for ASR + evaluation scripts) | +| `fdb_data_path` | Path to raw FDB dataset (needed for turn_taking, interruption, behavior subtests) | +| `subtests` | List of subtests to run | + +### Helper scripts + +- `scripts/run_prepare_and_eval_both.sh` — prepare v1.0 and v1.5 then run offline eval for both (intended for cluster use with lustre paths). 
+ +## Source data + +Download from Google Drive: +https://drive.google.com/drive/folders/1DtoxMVO9_Y_nDs2peZtx3pw-U2qYgpd3 + +If you pass `--fdb_data_path` and the requested version (v1.0 or v1.5) is missing, the script can attempt to download and extract it (requires `gdown`). + +## output.wav pipeline (2-channel) + +FDB expects **2-channel (stereo)** `output.wav`. Scoring uses the [Full-Duplex-Bench-NV](https://github.com/kevinhu-nv/Full-Duplex-Bench-NV) repo (or a clone) for ASR and evaluation; see `STEREO_REFERENCE_EVIDENCE.md` for where 2-channel is created vs consumed in that repo and in this codebase. + +1. **Server** (`s2s_voicechat` backend in `recipes/multimodal/server/backends/s2s_voicechat_infer_backend.py`) runs `offline_inference(..., decode_audio=True)`. The backend **outputs 2-channel WAV**: it duplicates the model’s mono output to both channels so the bytes sent in the API response are already stereo. +2. **Client** (`vllm_multimodal._process_audio_response`) decodes and writes those bytes to `output_dir/audio/.wav` as-is (no channel change). +3. **Prepare** (`prepare_fdb_eval_dir`) copies that file to `fdb_prepared//output.wav` and **verifies** it is 2-channel; if mono, the script fails with a clear error. + +So when using the **s2s_voicechat** backend with `--decode_audio`, the server already returns stereo; the prepare step only verifies and copies. + +### Scoring consistency (stereo + backchannel) + +The pipeline is **prepare (stereo output.wav) → ASR (--stereo, ch1) → evaluate**. For consistency: + +- **fdb_repo_path** in your config must point to a Full-Duplex-Bench clone that has: + 1. **get_transcript/asr.py** with `--stereo` (uses channel 1 for transcription). + 2. **evaluation/eval_backchannel.py** that converts stereo to the model channel (ch1) before calling Silero VAD (Silero expects 1D audio). 
+
+The config `fdb_s2s_offline_v1.0_config.yaml` uses **our** Full-Duplex-Bench clone (`.../Full-Duplex-Bench`) for scoring so backchannel runs with the stereo-safe eval. On the cluster, ensure that clone has the updated `eval_backchannel.py` (and `asr.py` with `--stereo`) deployed.
\ No newline at end of file
diff --git a/nemo_skills/dataset/fdb/__init__.py b/nemo_skills/dataset/fdb/__init__.py
new file mode 100644
index 0000000000..c7c93016f1
--- /dev/null
+++ b/nemo_skills/dataset/fdb/__init__.py
@@ -0,0 +1,33 @@
+# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Full-Duplex-Bench: one group with v1 and v1.5 as subgroups (same prepare.py and eval files).
+# Source: https://github.com/DanielLin94144/Full-Duplex-Bench
+
+DATASET_GROUP = "speechlm"
+IS_BENCHMARK_GROUP = True
+
+BENCHMARKS = {
+    # v1.0 (pause split for separate TOR/latency)
+    "fdb_v1.pause_candor": {},
+    "fdb_v1.pause_synthetic": {},
+    "fdb_v1.backchannel": {},
+    "fdb_v1.turn_taking": {},
+    "fdb_v1.interruption": {},
+    # v1.5
+    "fdb_v1_5.background_speech": {},
+    "fdb_v1_5.talking_to_other": {},
+    "fdb_v1_5.backchannel": {},
+    "fdb_v1_5.interruption": {},
+}
diff --git a/nemo_skills/dataset/fdb/fdb_v1/__init__.py b/nemo_skills/dataset/fdb/fdb_v1/__init__.py
new file mode 100644
index 0000000000..17de278128
--- /dev/null
+++ b/nemo_skills/dataset/fdb/fdb_v1/__init__.py
@@ -0,0 +1,27 @@
+# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Full-Duplex-Bench v1.0 subgroup (under fdb/)
+# Source: https://github.com/DanielLin94144/Full-Duplex-Bench
+
+DATASET_GROUP = "speechlm"
+IS_BENCHMARK_GROUP = True
+
+BENCHMARKS = {
+    "fdb_v1.pause_candor": {},
+    "fdb_v1.pause_synthetic": {},
+    "fdb_v1.backchannel": {},
+    "fdb_v1.turn_taking": {},
+    "fdb_v1.interruption": {},
+}
diff --git a/nemo_skills/dataset/fdb/fdb_v1/backchannel/__init__.py b/nemo_skills/dataset/fdb/fdb_v1/backchannel/__init__.py
new file mode 100644
index 0000000000..6d03831160
--- /dev/null
+++ b/nemo_skills/dataset/fdb/fdb_v1/backchannel/__init__.py
@@ -0,0 +1,17 @@
+# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +METRICS_TYPE = "exact_match" +GENERATION_ARGS = "++prompt_format=openai" +EVAL_ARGS = "++eval_type=exact_match" diff --git a/nemo_skills/dataset/fdb/fdb_v1/interruption/__init__.py b/nemo_skills/dataset/fdb/fdb_v1/interruption/__init__.py new file mode 100644 index 0000000000..6d03831160 --- /dev/null +++ b/nemo_skills/dataset/fdb/fdb_v1/interruption/__init__.py @@ -0,0 +1,17 @@ +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +METRICS_TYPE = "exact_match" +GENERATION_ARGS = "++prompt_format=openai" +EVAL_ARGS = "++eval_type=exact_match" diff --git a/nemo_skills/dataset/fdb/fdb_v1/pause_candor/__init__.py b/nemo_skills/dataset/fdb/fdb_v1/pause_candor/__init__.py new file mode 100644 index 0000000000..6d03831160 --- /dev/null +++ b/nemo_skills/dataset/fdb/fdb_v1/pause_candor/__init__.py @@ -0,0 +1,17 @@ +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +METRICS_TYPE = "exact_match" +GENERATION_ARGS = "++prompt_format=openai" +EVAL_ARGS = "++eval_type=exact_match" diff --git a/nemo_skills/dataset/fdb/fdb_v1/pause_synthetic/__init__.py b/nemo_skills/dataset/fdb/fdb_v1/pause_synthetic/__init__.py new file mode 100644 index 0000000000..6d03831160 --- /dev/null +++ b/nemo_skills/dataset/fdb/fdb_v1/pause_synthetic/__init__.py @@ -0,0 +1,17 @@ +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +METRICS_TYPE = "exact_match" +GENERATION_ARGS = "++prompt_format=openai" +EVAL_ARGS = "++eval_type=exact_match" diff --git a/nemo_skills/dataset/fdb/fdb_v1/turn_taking/__init__.py b/nemo_skills/dataset/fdb/fdb_v1/turn_taking/__init__.py new file mode 100644 index 0000000000..6d03831160 --- /dev/null +++ b/nemo_skills/dataset/fdb/fdb_v1/turn_taking/__init__.py @@ -0,0 +1,17 @@ +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +METRICS_TYPE = "exact_match" +GENERATION_ARGS = "++prompt_format=openai" +EVAL_ARGS = "++eval_type=exact_match" diff --git a/nemo_skills/dataset/fdb/fdb_v1_5/__init__.py b/nemo_skills/dataset/fdb/fdb_v1_5/__init__.py new file mode 100644 index 0000000000..80253d5e54 --- /dev/null +++ b/nemo_skills/dataset/fdb/fdb_v1_5/__init__.py @@ -0,0 +1,26 @@ +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +# Full-Duplex-Bench v1.5 subgroup (under fdb/) +# Source: https://github.com/DanielLin94144/Full-Duplex-Bench + +DATASET_GROUP = "speechlm" +IS_BENCHMARK_GROUP = True + +BENCHMARKS = { + "fdb_v1_5.background_speech": {}, + "fdb_v1_5.talking_to_other": {}, + "fdb_v1_5.backchannel": {}, + "fdb_v1_5.interruption": {}, +} diff --git a/nemo_skills/dataset/fdb/fdb_v1_5/backchannel/__init__.py b/nemo_skills/dataset/fdb/fdb_v1_5/backchannel/__init__.py new file mode 100644 index 0000000000..6d03831160 --- /dev/null +++ b/nemo_skills/dataset/fdb/fdb_v1_5/backchannel/__init__.py @@ -0,0 +1,17 @@ +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +METRICS_TYPE = "exact_match" +GENERATION_ARGS = "++prompt_format=openai" +EVAL_ARGS = "++eval_type=exact_match" diff --git a/nemo_skills/dataset/fdb/fdb_v1_5/background_speech/__init__.py b/nemo_skills/dataset/fdb/fdb_v1_5/background_speech/__init__.py new file mode 100644 index 0000000000..6d03831160 --- /dev/null +++ b/nemo_skills/dataset/fdb/fdb_v1_5/background_speech/__init__.py @@ -0,0 +1,17 @@ +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +METRICS_TYPE = "exact_match" +GENERATION_ARGS = "++prompt_format=openai" +EVAL_ARGS = "++eval_type=exact_match" diff --git a/nemo_skills/dataset/fdb/fdb_v1_5/interruption/__init__.py b/nemo_skills/dataset/fdb/fdb_v1_5/interruption/__init__.py new file mode 100644 index 0000000000..6d03831160 --- /dev/null +++ b/nemo_skills/dataset/fdb/fdb_v1_5/interruption/__init__.py @@ -0,0 +1,17 @@ +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +METRICS_TYPE = "exact_match" +GENERATION_ARGS = "++prompt_format=openai" +EVAL_ARGS = "++eval_type=exact_match" diff --git a/nemo_skills/dataset/fdb/fdb_v1_5/talking_to_other/__init__.py b/nemo_skills/dataset/fdb/fdb_v1_5/talking_to_other/__init__.py new file mode 100644 index 0000000000..6d03831160 --- /dev/null +++ b/nemo_skills/dataset/fdb/fdb_v1_5/talking_to_other/__init__.py @@ -0,0 +1,17 @@ +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +METRICS_TYPE = "exact_match" +GENERATION_ARGS = "++prompt_format=openai" +EVAL_ARGS = "++eval_type=exact_match" diff --git a/nemo_skills/dataset/fdb/prepare.py b/nemo_skills/dataset/fdb/prepare.py new file mode 100644 index 0000000000..20f1a65fa8 --- /dev/null +++ b/nemo_skills/dataset/fdb/prepare.py @@ -0,0 +1,637 @@ +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import argparse +import json +from pathlib import Path + +import soundfile as sf +from tqdm import tqdm + +# Subtest configurations for Full-Duplex-Bench +# Based on the four main evaluation dimensions +# Pause split into candor and synthetic for separate TOR/latency reporting +SUBTESTS = { + "pause_candor": { + "has_reference": True, + "metrics_type": "exact_match", + "eval_args": "++eval_type=exact_match", + "description": "Evaluate model's ability to handle pauses (candor / natural data)", + }, + "pause_synthetic": { + "has_reference": True, + "metrics_type": "exact_match", + "eval_args": "++eval_type=exact_match", + "description": "Evaluate model's ability to handle pauses (synthetic data)", + }, + "backchannel": { + "has_reference": True, + "metrics_type": "exact_match", + "eval_args": "++eval_type=exact_match", + "description": "Evaluate model's backchanneling behavior (e.g., 'uh-huh', 'yeah')", + }, + "turn_taking": { + "has_reference": True, + "metrics_type": "exact_match", + "eval_args": "++eval_type=exact_match", + "description": "Evaluate model's turn-taking capabilities", + }, + "interruption": { + "has_reference": True, + "metrics_type": "exact_match", + "eval_args": "++eval_type=exact_match", + "description": "Evaluate model's handling of interruptions", + }, + # v1.5-only subtasks (overlap-focused) + "background_speech": { + "has_reference": True, + "metrics_type": "exact_match", + "eval_args": "++eval_type=exact_match", + "description": "Evaluate model's response with background speech", + }, + "talking_to_other": { + "has_reference": True, + "metrics_type": "exact_match", + "eval_args": "++eval_type=exact_match", + "description": "Evaluate model's response when user talks to others", + }, +} + +# Template for subtest __init__.py files +INIT_TEMPLATE = """# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +METRICS_TYPE = "{metrics_type}" +GENERATION_ARGS = "++prompt_format=openai" +{eval_args} +""" + + +def save_audio(audio_data, audio_path, sampling_rate=16000): + """Save audio data to a WAV file.""" + audio_path.parent.mkdir(parents=True, exist_ok=True) + sf.write(str(audio_path), audio_data, sampling_rate) + + +def format_entry(entry, subtest_name, config, audio_dir, entry_idx, no_audio=False, dataset_name="fdb_v1"): + """Format a single entry for nemo-skills with OpenAI messages format. 
+ + Creates three message variants in a single entry: + - messages: audio only (for speech-only evaluation) + - messages_text_audio: both text and audio + - messages_text: text only (for text-only comparison) + """ + prompt_text = entry.get("prompt", entry.get("question", "")) + + formatted = { + "problem": prompt_text, + } + + # Add expected answer if available + if config.get("has_reference") and "reference" in entry: + formatted["expected_answer"] = entry["reference"] + elif config.get("has_reference") and "answer" in entry: + formatted["expected_answer"] = entry["answer"] + + # Preserve additional metadata fields + for field in ["id", "dataset", "sample_id", "category", "duration", "overlap_duration"]: + if field in entry: + formatted[field] = entry[field] + + # System message (shared across all variants) + system_message = {"role": "system", "content": "You are a helpful assistant."} + + # Audio filename: {subtest_name}_{sample_id} so we get pause_candor_18.wav, pause_synthetic_18.wav + # (not just "pause_18" which would collide when both candor and synthetic write to the same audio_dir) + sample_id = entry.get("sample_id", str(entry_idx)) + audio_id = f"{subtest_name}_{sample_id}" + + # Handle audio - copy/link files and get audio info + audio_info = None + if not no_audio: + # Check if audio_path is provided (file already exists) + if "audio_path" in entry: + import shutil + from pathlib import Path + + source_audio = Path(entry["audio_path"]) + if source_audio.exists(): + audio_dest = audio_dir / f"{audio_id}.wav" + + # Copy audio file to our data directory + shutil.copy(source_audio, audio_dest) + + audio_info = {"audio": {"path": f"{dataset_name}/data/{audio_id}.wav"}} + formatted["audio_path"] = f"data/{audio_id}.wav" + + # Handle direct audio data (for compatibility) + elif "audio" in entry and entry["audio"] is not None: + audio_path = audio_dir / f"{audio_id}.wav" + + # Handle different audio data formats + if isinstance(entry["audio"], dict) and 
"array" in entry["audio"]: + save_audio(entry["audio"]["array"], audio_path, entry["audio"].get("sampling_rate", 16000)) + else: + # If audio is already a numpy array or similar + save_audio(entry["audio"], audio_path) + + audio_info = {"audio": {"path": f"{dataset_name}/data/{audio_id}.wav"}} + formatted["audio_path"] = f"data/{audio_id}.wav" + + # Create three message variants: + + # 1. messages: audio only (empty content, with audio) + user_message_audio = {"role": "user", "content": ""} + if audio_info: + user_message_audio.update(audio_info) + formatted["messages"] = [system_message.copy(), user_message_audio] + + # 2. messages_text_audio: both text and audio + user_message_text_audio = {"role": "user", "content": prompt_text} + if audio_info: + user_message_text_audio.update(audio_info) + formatted["messages_text_audio"] = [system_message.copy(), user_message_text_audio] + + # 3. messages_text: text only (no audio) + user_message_text = {"role": "user", "content": prompt_text} + formatted["messages_text"] = [system_message.copy(), user_message_text] + + return formatted + + +def create_subtest_init(subtest_dir, config): + """Create __init__.py for a subtest directory.""" + eval_args_line = f'EVAL_ARGS = "{config["eval_args"]}"' if config["eval_args"] else "" + content = INIT_TEMPLATE.format( + metrics_type=config["metrics_type"], + eval_args=eval_args_line, + ) + with open(subtest_dir / "__init__.py", "w") as f: + f.write(content) + + +def process_subtest(subtest_name, config, data_dir, audio_dir, fdb_data_path, no_audio=False, version="v1.0", dataset_name="fdb_v1"): + """Process a single subtest and save to JSONL. 
+ + Each entry contains three message variants: + - messages: audio only (for speech-only evaluation) + - messages_text_audio: both text and audio + - messages_text: text only (for text-only comparison) + + Full-Duplex-Bench structure (v1.0 or v1.5): + - candor_pause_handling/{ID}/input.wav, pause.json, transcription.json + - candor_turn_taking/{ID}/input.wav, turn_taking.json, transcription.json + - icc_backchannel/{ID}/input.wav, transcription.json + - synthetic_pause_handling/{ID}/input.wav, pause.json, transcription.json + - synthetic_user_interruption/{ID}/input.wav, context.wav, interrupt.wav, interrupt.json + """ + subtest_dir = data_dir / subtest_name + subtest_dir.mkdir(parents=True, exist_ok=True) + + output_file = subtest_dir / "test.jsonl" + entries = [] + entry_idx = 0 + + print(f"Processing {subtest_name}...") + print(f" Description: {config['description']}") + + # Map subtests to Full-Duplex-Bench dataset folders (v1.0 vs v1.5 have different folders) + subtest_mapping_v1_0 = { + "pause_candor": ["candor_pause_handling"], + "pause_synthetic": ["synthetic_pause_handling"], + "backchannel": ["icc_backchannel"], + "turn_taking": ["candor_turn_taking"], + "interruption": ["synthetic_user_interruption"], + } + # v1.5 Drive folder names: background_speech, talking_to_other, user_interruption, user_backchannel + subtest_mapping_v1_5 = { + "background_speech": ["background_speech"], + "talking_to_other": ["talking_to_other"], + "backchannel": ["user_backchannel"], + "interruption": ["user_interruption"], + } + subtest_mapping = subtest_mapping_v1_5 if version == "v1.5" else subtest_mapping_v1_0 + dataset_folders = subtest_mapping.get(subtest_name, []) + if not dataset_folders: + print(f" Warning: Unknown subtest {subtest_name}") + return 0 + + # Version folder names (v1.0 / v1_0 and v1.5 / v1_5) + version_folders = ("v1.0", "v1_0") if version == "v1.0" else ("v1.5", "v1_5") + # Load test cases from each dataset folder + test_cases = [] + for folder_name in 
dataset_folders: + # Try different possible paths + possible_paths = [ + fdb_data_path / version_folders[0] / folder_name, + fdb_data_path / version_folders[1] / folder_name, + fdb_data_path / folder_name, # Direct + ] + + folder_path = None + for path in possible_paths: + if path.exists(): + folder_path = path + break + + if folder_path is None: + print(" Warning: Dataset folder not found. Tried:") + for path in possible_paths: + print(f" - {path}") + continue + + # Find all sample directories (numeric IDs) + sample_dirs = sorted([d for d in folder_path.iterdir() if d.is_dir()]) + print(f" Found {len(sample_dirs)} samples in {folder_name}") + + # FDB candor_turn_taking and synthetic_user_interruption use 1-based sample IDs (no 0) + skip_id_zero = folder_name in ("candor_turn_taking", "synthetic_user_interruption") + + for sample_dir in sample_dirs: + sample_id = sample_dir.name + if skip_id_zero and sample_id == "0": + continue + input_wav = sample_dir / "input.wav" + + if not input_wav.exists(): + print(f" Warning: input.wav not found in {sample_dir}") + continue + + # Load transcription if available + transcription_file = sample_dir / "transcription.json" + transcription = "" + if transcription_file.exists(): + with open(transcription_file, "r") as f: + trans_data = json.load(f) + # Extract text from transcription + if isinstance(trans_data, dict): + transcription = trans_data.get("text", "") + elif isinstance(trans_data, list) and len(trans_data) > 0: + # Word-level transcription (FDB uses "text" per segment, some datasets use "word") + transcription = " ".join([w.get("text", w.get("word", "")) for w in trans_data]) + + # Treat whitespace-only transcription as empty (use fallback so problem/content are meaningful) + prompt_text = (transcription or "").strip() or "Respond to the user's speech in the audio." 
+ # Create test case + test_case = { + "id": f"{folder_name}_{sample_id}", + "audio_path": str(input_wav), + "prompt": prompt_text, + "dataset": folder_name, + "sample_id": sample_id, + } + + test_cases.append(test_case) + + if not test_cases: + print(" No test cases found. Make sure you downloaded the dataset from:") + print(" https://drive.google.com/drive/folders/1DtoxMVO9_Y_nDs2peZtx3pw-U2qYgpd3") + return 0 + + # Process each test case + for entry in tqdm(test_cases, desc=" Processing entries"): + formatted = format_entry( + entry, + subtest_name, + config, + audio_dir, + entry_idx, + no_audio=no_audio, + dataset_name=dataset_name, + ) + entries.append(formatted) + entry_idx += 1 + + # Write JSONL + with open(output_file, "w", encoding="utf-8") as f: + for entry in entries: + f.write(json.dumps(entry) + "\n") + + # Create __init__.py + create_subtest_init(subtest_dir, config) + + print(f" Wrote {len(entries)} entries to {output_file}") + return len(entries) + + +# All five zip names per version on Google Drive (must all be downloaded and extracted for full dataset) +EXPECTED_V1_ZIPS = [ + "candor_pause_handling.zip", + "candor_turn_taking.zip", + "icc_backchannel.zip", + "synthetic_pause_handling.zip", + "synthetic_user_interruption.zip", +] + +# v1.5 Drive folder names (after download/extract) +EXPECTED_V1_5_ZIPS = [ + "background_speech", + "talking_to_other", + "user_interruption", + "user_backchannel", +] + +# Google Drive: root has v1.0 and v1.5 subfolders +# https://drive.google.com/drive/folders/1DtoxMVO9_Y_nDs2peZtx3pw-U2qYgpd3 +FDB_ROOT_FOLDER_ID = "1DtoxMVO9_Y_nDs2peZtx3pw-U2qYgpd3" +FDB_V1_0_FOLDER_ID = "1hxzRk7xgtdr5ZEoctnp0sFK0COv91W3h" # v1.0 subfolder (direct) + + +def download_dataset(download_dir: Path, version: str = "v1.0") -> bool: + """Download the Full-Duplex-Bench v1.0 or v1.5 from Google Drive, then unzip. 
+ + Root folder (v1.0 + v1.5): https://drive.google.com/drive/folders/1DtoxMVO9_Y_nDs2peZtx3pw-U2qYgpd3 + - v1.0: we download the v1.0 subfolder directly into download_dir/v1.0. + - v1.5: we download the ROOT folder so that download_dir/v1.5 is populated (enter v1.5 in Drive); + then we extract only the zips under download_dir/v1.5. + + Returns: + True if download successful, False otherwise + """ + try: + import gdown + except ImportError: + print("\nError: 'gdown' package not found. Install it with:") + print(" pip install gdown") + return False + + import shutil + import zipfile + + if version == "v1.0": + folder_id = FDB_V1_0_FOLDER_ID + version_dir_name = "v1.0" + output_for_download = download_dir / version_dir_name + version_dir = output_for_download + zip_description = "candor_pause_handling, candor_turn_taking, icc_backchannel, synthetic_pause_handling, synthetic_user_interruption" + else: + # v1.5: download ROOT folder so we get both v1.0 and v1.5; we only use v1.5 + folder_id = FDB_ROOT_FOLDER_ID + version_dir_name = "v1.5" + output_for_download = download_dir # root download -> download_dir/v1.0 and download_dir/v1.5 + version_dir = download_dir / "v1.5" + zip_description = "background_speech, talking_to_other, user_interruption, user_backchannel" + + folder_url = f"https://drive.google.com/drive/folders/{folder_id}" + + print("\n" + "=" * 60) + print(f"Downloading Full-Duplex-Bench {version} from Google Drive") + print("=" * 60) + print(f"Source: {folder_url}") + if version == "v1.5": + print("(Downloading root folder; v1.5 zips will be taken from download_dir/v1.5)") + print(f"Destination: {version_dir}") + print(f"Zips expected: {zip_description}") + print("=" * 60 + "\n") + + try: + output_for_download.mkdir(parents=True, exist_ok=True) + + print(f"Downloading {'root (v1.0 + v1.5)' if version == 'v1.5' else version} dataset...") + gdown.download_folder( + url=folder_url, + output=str(output_for_download), + quiet=False, + use_cookies=False, + 
remaining_ok=True, + ) + + # For v1.5 we downloaded root -> find v1.5 or v1_5 under download_dir (maybe one level down) + if version == "v1.5": + if not version_dir.exists(): + version_dir = download_dir / "v1_5" + if not version_dir.exists(): + for sub in download_dir.iterdir(): + if sub.is_dir(): + for name in ("v1.5", "v1_5"): + candidate = sub / name + if candidate.exists(): + version_dir = candidate + break + if version_dir.exists(): + break + if not version_dir.exists(): + print(f"After download, v1.5 folder not found under {download_dir}. Check Drive structure.") + return False + + # gdown may create a single subfolder with the folder name; collect zips from version_dir and one level down + zip_files = list(version_dir.glob("*.zip")) or list(version_dir.rglob("*.zip")) + if not zip_files and any(version_dir.iterdir()): + sub = next((d for d in version_dir.iterdir() if d.is_dir()), None) + if sub: + sub_zips = list(sub.rglob("*.zip")) + if sub_zips: + for z in sub_zips: + dest = version_dir / z.name + if not dest.exists() or dest.stat().st_size != z.stat().st_size: + shutil.move(str(z), str(dest)) + zip_files = list(version_dir.glob("*.zip")) + + print("\n" + "=" * 60) + print("Download completed.") + print("=" * 60 + "\n") + + if not zip_files: + zip_files = list(version_dir.glob("*.zip")) or list(version_dir.rglob("*.zip")) + if not zip_files: + print(f"Warning: No .zip files found under {version_dir_name}. 
Check Drive permissions or download manually.") + return True + + # Extract all zip files + print(f"Extracting {len(zip_files)} ZIP file(s) in {version_dir_name}...") + for zip_file in tqdm(sorted(zip_files), desc="Extracting"): + try: + extract_dir = zip_file.parent + with zipfile.ZipFile(zip_file, "r") as zip_ref: + zip_ref.extractall(extract_dir) + zip_file.unlink() + print(f" Extracted: {zip_file.name}") + except Exception as e: + print(f" Warning: Failed to extract {zip_file.name}: {e}") + + print(f"\nExtracted {len(zip_files)} ZIP file(s).") + + # Verify expected folders exist (names without .zip for v1.0; explicit list for v1.5) + if version == "v1.5": + expected_folders = EXPECTED_V1_5_ZIPS + else: + expected_folders = [p.replace(".zip", "") for p in EXPECTED_V1_ZIPS] + found = [d.name for d in version_dir.iterdir() if d.is_dir() and not d.name.startswith(".")] + missing = [f for f in expected_folders if f not in found] + if missing: + print(f"Warning: After extraction, missing folder(s): {missing}") + print(" Re-run prepare (without --fdb_data_path to re-download) or add the missing zip(s) manually to:") + print(f" {version_dir}") + else: + print(f"All expected {version} dataset folders are present.") + + return True + + except Exception as e: + print(f"\nError downloading dataset: {e}") + print("\nYou can manually download from:") + print(" https://drive.google.com/drive/folders/1DtoxMVO9_Y_nDs2peZtx3pw-U2qYgpd3") + print(f" Open the {version_dir_name} subfolder, download all 5 zips, and extract them into:") + print(f" {version_dir}") + return False + + +def main(): + parser = argparse.ArgumentParser( + description="Prepare Full-Duplex-Bench dataset for nemo-skills", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +Examples: + # Download v1.0 to default path (s2s/Full-Duplex-Bench-data) and prepare + python prepare.py + + # Prepare v1.5 (use existing fdb_data_path that contains v1.5 or v1_5 folder) + python prepare.py --version v1.5 
--fdb_data_path /path/to/Full-Duplex-Bench-data + + # Use existing dataset (no download) + python prepare.py --fdb_data_path /path/to/Full-Duplex-Bench-data + + # Download to a specific location and prepare + python prepare.py --fdb_data_path /path/to/download/location + """, + ) + parser.add_argument( + "--version", + type=str, + choices=["v1.0", "v1.5"], + default="v1.0", + help="Dataset version. v1.0 writes to fdb/fdb_v1/; v1.5 writes to fdb/fdb_v1_5/ (run separately).", + ) + parser.add_argument( + "--fdb_data_path", + type=str, + default=None, + help="Path to Full-Duplex-Bench dataset root. If not set, downloads selected version to s2s/Full-Duplex-Bench-data and prepares.", + ) + parser.add_argument( + "--subtests", + nargs="+", + default=None, + help="Specific subtests to process (default: all)", + ) + parser.add_argument( + "--no-audio", + action="store_true", + help="Skip processing audio files (faster, for testing)", + ) + parser.add_argument( + "--data_dir", + type=str, + default=None, + help="Output root for fdb_v1/ or fdb_v1_5/ (default: package dir). 
Use this to write to Lustre on the cluster so test.jsonl and data/ match.", + ) + args = parser.parse_args() + + version = args.version + base_dir = Path(args.data_dir) if args.data_dir else Path(__file__).parent # fdb package dir (v1 and v1_5 are subgroups under it) + if version == "v1.0": + data_dir = base_dir / "fdb_v1" + dataset_name = "fdb_v1" + else: + data_dir = base_dir / "fdb_v1_5" + dataset_name = "fdb_v1_5" + data_dir.mkdir(parents=True, exist_ok=True) + audio_dir = data_dir / "data" + audio_dir.mkdir(parents=True, exist_ok=True) + + _skills_dir = base_dir.parent.parent.parent + _s2s_root = _skills_dir.parent + _default_fdb_data = _s2s_root / "Full-Duplex-Bench-data" + + if args.fdb_data_path: + fdb_data_path = Path(args.fdb_data_path) + if not fdb_data_path.exists(): + print(f"\nError: Full-Duplex-Bench data path not found: {fdb_data_path}") + print("Run without --fdb_data_path to download, or use:") + print(" https://drive.google.com/drive/folders/1DtoxMVO9_Y_nDs2peZtx3pw-U2qYgpd3") + return + # If requested version data not present, download and extract first + if version == "v1.5": + v1_5_dir = fdb_data_path / "v1.5" + v1_5_alt = fdb_data_path / "v1_5" + has_data = any( + (v1_5_dir / folder).exists() or (v1_5_alt / folder).exists() + for folder in EXPECTED_V1_5_ZIPS + ) + if not has_data: + print(f"v1.5 data not found under {fdb_data_path}. Downloading and extracting v1.5...") + if not download_dataset(fdb_data_path, version="v1.5"): + print("\nDownload failed. Exiting.") + return + elif version == "v1.0": + v1_0_dir = fdb_data_path / "v1.0" + v1_0_alt = fdb_data_path / "v1_0" + expected_v1_0 = [p.replace(".zip", "") for p in EXPECTED_V1_ZIPS] + has_data = any( + (v1_0_dir / folder).exists() or (v1_0_alt / folder).exists() + for folder in expected_v1_0 + ) + if not has_data: + print(f"v1.0 data not found under {fdb_data_path}. Downloading and extracting v1.0...") + if not download_dataset(fdb_data_path, version="v1.0"): + print("\nDownload failed. 
Exiting.") + return + else: + download_path = _default_fdb_data + print(f"Downloading {version} to {download_path} (overwriting if present)...") + if not download_dataset(download_path, version=version): + print("\nDownload failed. Exiting.") + return + fdb_data_path = download_path + + if args.subtests: + subtests_to_process = args.subtests + elif version == "v1.5": + subtests_to_process = ["background_speech", "talking_to_other", "backchannel", "interruption"] + else: + subtests_to_process = ["pause_candor", "pause_synthetic", "backchannel", "turn_taking", "interruption"] + + print(f"Processing {len(subtests_to_process)} subtests for {version} (dataset_name={dataset_name})...") + if args.no_audio: + print("Skipping audio download (--no-audio)") + + total_entries = 0 + for subtest_name in subtests_to_process: + if subtest_name not in SUBTESTS: + print(f"Warning: Unknown subtest '{subtest_name}', skipping") + continue + + config = SUBTESTS[subtest_name] + count = process_subtest( + subtest_name, config, data_dir, audio_dir, fdb_data_path, + no_audio=args.no_audio, version=version, dataset_name=dataset_name, + ) + total_entries += count + + print(f"\nDone! Processed {total_entries} total entries across {len(subtests_to_process)} subtests.") + + +if __name__ == "__main__": + main() diff --git a/nemo_skills/dataset/fdb/scripts/fdb_s2s_incremental_config.yaml b/nemo_skills/dataset/fdb/scripts/fdb_s2s_incremental_config.yaml new file mode 100644 index 0000000000..1c62286f0c --- /dev/null +++ b/nemo_skills/dataset/fdb/scripts/fdb_s2s_incremental_config.yaml @@ -0,0 +1,52 @@ +# Full-Duplex-Bench v1.0 evaluation configuration for S2S incremental backend +# Turn taking only (merge run). 
Run: python run_eval.py --config fdb_s2s_incremental_config.yaml + +cluster: s2s_eval_oci_iad +partition: batch_block1,batch_block3,batch_block4 +cpu_partition: cpu + +model: /lustre/fsw/portfolios/convai/users/ecasanova/Checkpoints/Nemotron-VoiceChat-november/duplex-eartts-2mim_sw_et_eos_dp_eos_dup_fp32-stt-3-december_stt_edresson_model_R_digits_norm_eip_0.1_EA_model_step_9005 + +server_type: vllm +server_gpus: 4 +num_chunks: 12 + +server_entrypoint: "-m nemo_skills.inference.server.serve_unified" +server_args: >- + --backend s2s_incremental + --config_path /lustre/fsw/portfolios/convai/users/ecasanova/S2S-Duplex-new-codebase/scripts/configs/inference/nanov2_demo_model_eartts_updated.yaml + --speaker_reference /lustre/fsw/portfolios/convai/users/ecasanova/S2S-full-duplex/inference_references/Emma_S3_A1_SC7_singleturntarget_21_channel_1_audio_in.wav + --ignore_system_prompt + --num_frames_per_inference 2 + --silence_padding_sec 0.0 + --extra_decoding_seconds 10 + --inference_pad_boost -0.5 + --inference_bos_boost 0.5 + --inference_eos_boost 0 + --output_frame_alignment + --session_artifacts_dir /lustre/fsw/portfolios/convai/users/mmkrtchyan/projects/speechLM/s2s/fdb_v1_incremental_artifacts + --code_path /lustre/fsw/portfolios/convai/users/kevinhu/S2S-Duplex-new-codebase/branches/NeMo-release_not_rebased + +server_container: /lustre/fsw/portfolios/convai/users/ecasanova/docker_images/nemo_duplex_november_eartts.sqsh +scoring_container: /lustre/fsw/portfolios/convai/users/ecasanova/docker_images/nemo_duplex_november_eartts.sqsh + +server_server_type: vllm_multimodal + +fdb_version: v1.0 +data_dir: /lustre/fsw/portfolios/convai/users/mmkrtchyan/projects/speechLM/s2s/Skills/nemo_skills/dataset/fdb +output_dir: /lustre/fsw/portfolios/convai/users/mmkrtchyan/projects/speechLM/s2s/fdb_v1_0_incremental_tt +# fdb_repo_path: /lustre/fsw/portfolios/convai/users/mmkrtchyan/projects/speechLM/s2s/Full-Duplex-Bench +fdb_repo_path: 
/lustre/fsw/portfolios/llmservice/users/kevinhu/Full-Duplex-Bench-NV +fdb_data_path: /lustre/fsw/portfolios/convai/users/mmkrtchyan/projects/speechLM/s2s/Full-Duplex-Bench-data + +subtests: + - turn_taking + +scoring_installation_command: "pip install numpy scipy scikit-learn silero-vad soundfile" + +expname: fdb_v1_s2s_incremental +# max_samples: 10 + +generation_only: false +scoring_only: false +dry_run: false diff --git a/nemo_skills/dataset/fdb/scripts/fdb_s2s_incremental_v2_02mar_config.yaml b/nemo_skills/dataset/fdb/scripts/fdb_s2s_incremental_v2_02mar_config.yaml new file mode 100644 index 0000000000..4aa1bd9a44 --- /dev/null +++ b/nemo_skills/dataset/fdb/scripts/fdb_s2s_incremental_v2_02mar_config.yaml @@ -0,0 +1,68 @@ +# Full-Duplex-Bench v1.0 evaluation configuration for S2S incremental V2 backend +# Uses NemotronVoicechatInferenceWrapper with vLLM + perception/codec caches. +# Checkpoint: Mar 2 2026 (Feb 26 STT + Mar 3 TTS, Megan speaker, force_turn_taking OFF) +# +# Run: +# python nemo_skills/dataset/fdb/scripts/run_eval.py \ +# --config nemo_skills/dataset/fdb/scripts/fdb_s2s_incremental_v2_02mar_config.yaml + +cluster: s2s_eval_oci_iad +partition: batch_block1,batch_block3,batch_block4 +cpu_partition: cpu + +model: /lustre/fsw/portfolios/convai/users/ecasanova/Checkpoints/Nemotron-VoiceChat-november/duplex-eartts-10min_sw_et_eos_dp_eos_dup_fp32_1delay_02_March_exp_17_afg_long_FT_Megan_msr_34k_steps-stt-AS9.1_11002_new_branch_load_fixed + +server_type: vllm +server_gpus: 1 +num_chunks: 8 + +server_entrypoint: "-m nemo_skills.inference.server.serve_unified" +server_args: >- + --backend s2s_incremental_v2 + --decode_audio + --speaker_reference /lustre/fsw/portfolios/convai/users/ecasanova/Checkpoints/Mg_a_00759.wav + --num_frames_per_inference 3 + --engine_type vllm_llm_vllm_eartts + --use_perception_cache + --use_perception_cudagraph + --use_codec_cache + --buffer_size_frames 20 + --codec_token_history_size 60 + --repetition_penalty 1.0 + 
--matmul_precision medium + --vllm_gpu_memory_utilization 0.35 + --vllm_max_model_len 8192 + --ignore_system_prompt + --merge_user_channel_v2 + --output_dir /lustre/fsw/portfolios/convai/users/mmkrtchyan/projects/speechLM/s2s/MAR_02/fdb_v1_0_incremental_v2_02mar_artifacts_no-force_turn_taking-ignore_system_prompt + --code_path /lustre/fsw/portfolios/convai/users/mmkrtchyan/projects/speechLM/s2s/NeMo + --pip_install "hf-xet==1.1.9 huggingface-hub==0.34.4 nvidia-modelopt==0.33.1 nvidia-modelopt-core==0.33.1 tokenizers==0.22.0 transformers==4.56.0 lhotse==1.32.2 nv-one-logger-core==2.1.0 nv-one-logger-pytorch-lightning-integration==2.1.0 nv-one-logger-training-telemetry==2.1.0 kaldialign==0.9.1" + +server_container: /lustre/fsw/portfolios/llmservice/users/erastorgueva/code/containers/triton25.05_s2svllm26.02.12.sqsh +scoring_container: /lustre/fsw/portfolios/convai/users/ecasanova/docker_images/nemo_duplex_november_eartts.sqsh + +server_server_type: vllm_multimodal + +fdb_version: v1.0 + +data_dir: /lustre/fsw/portfolios/convai/users/mmkrtchyan/projects/speechLM/s2s/Skills/nemo_skills/dataset/fdb +output_dir: /lustre/fsw/portfolios/convai/users/mmkrtchyan/projects/speechLM/s2s/MAR_02/fdb_v1_0_incremental_v2_02mar_no-force_turn_taking-ignore_system_prompt +fdb_repo_path: /lustre/fsw/portfolios/convai/users/mmkrtchyan/projects/speechLM/s2s/Full-Duplex-Bench +fdb_data_path: /lustre/fsw/portfolios/convai/users/mmkrtchyan/projects/speechLM/s2s/Full-Duplex-Bench-data + +subtests: + - backchannel + - pause_candor + - pause_synthetic + - turn_taking + - interruption + +installation_command: "" +scoring_installation_command: "pip install numpy scipy scikit-learn silero-vad soundfile" + +expname: fdb_v1_0_s2s_incremental_v2_02mar +# max_samples: 50 + +generation_only: false +scoring_only: false +dry_run: false diff --git a/nemo_skills/dataset/fdb/scripts/fdb_s2s_incremental_v2_26feb_config.yaml b/nemo_skills/dataset/fdb/scripts/fdb_s2s_incremental_v2_26feb_config.yaml new file 
mode 100644 index 0000000000..d9a4e2571b --- /dev/null +++ b/nemo_skills/dataset/fdb/scripts/fdb_s2s_incremental_v2_26feb_config.yaml @@ -0,0 +1,73 @@ +# Full-Duplex-Bench v1.0 evaluation configuration for S2S incremental V2 backend +# Uses NemotronVoicechatInferenceWrapper with vLLM + perception/codec caches. +# Checkpoint: Feb 26 2026 (legally friendly personaplex dataset) +# +# Run: +# python nemo_skills/dataset/fdb/scripts/run_eval.py \ +# --config nemo_skills/dataset/fdb/scripts/fdb_s2s_incremental_v2_26feb_config.yaml + +cluster: s2s_eval_oci_iad +partition: batch_block1,batch_block3,batch_block4 +cpu_partition: cpu + +model: /lustre/fsw/portfolios/convai/users/ecasanova/Checkpoints/Nemotron-VoiceChat-november/duplex-eartts-10min_sw_et_eos_dp_eos_dup_fp32_1delay_26_Feb_exp_13_afg_14k_steps-stt-AS9.1_11002_new_branch_load_fixed + +server_type: vllm +server_gpus: 1 +num_chunks: 8 + +server_entrypoint: "-m nemo_skills.inference.server.serve_unified" +server_args: >- + --backend s2s_incremental_v2 + --decode_audio + --speaker_reference /lustre/fsw/portfolios/convai/users/ecasanova/Checkpoints/Mg_a_00759.wav + --num_frames_per_inference 3 + --engine_type vllm_llm_vllm_eartts + --use_perception_cache + --use_perception_cudagraph + --use_codec_cache + --buffer_size_frames 20 + --codec_token_history_size 60 + --pad_to_duration_secs 40 + --repetition_penalty 1.0 + --force_turn_taking + --force_turn_taking_threshold 40 + --force_turn_taking_pad_window 25 + --matmul_precision medium + --vllm_gpu_memory_utilization 0.35 + --vllm_max_model_len 8192 + --ignore_system_prompt + --system_prompt "You are an AI assistant developed by NVIDIA. Your name is NVIDIA VoiceChat. Your job is to be helpful and have engaging conversations in English. Maintain a warm tone. Keep the dialogue open and ongoing. You must provide diverse responses, rephrase answers if user asks the same questions." 
+ --merge_user_channel_v2 + --output_dir /lustre/fsw/portfolios/convai/users/mmkrtchyan/projects/speechLM/s2s/FEB_26/fdb_v1_0_incremental_v2_26feb_artifacts + --code_path /lustre/fsw/portfolios/convai/users/mmkrtchyan/projects/speechLM/s2s/NeMo + --pip_install "hf-xet==1.1.9 huggingface-hub==0.34.4 nvidia-modelopt==0.33.1 nvidia-modelopt-core==0.33.1 tokenizers==0.22.0 transformers==4.56.0 lhotse==1.32.2 nv-one-logger-core==2.1.0 nv-one-logger-pytorch-lightning-integration==2.1.0 nv-one-logger-training-telemetry==2.1.0 kaldialign==0.9.1" + +server_container: /lustre/fsw/portfolios/llmservice/users/erastorgueva/code/containers/triton25.05_s2svllm26.02.12.sqsh +scoring_container: /lustre/fsw/portfolios/convai/users/ecasanova/docker_images/nemo_duplex_november_eartts.sqsh + +server_server_type: vllm_multimodal + +fdb_version: v1.0 + +data_dir: /lustre/fsw/portfolios/convai/users/mmkrtchyan/projects/speechLM/s2s/Skills/nemo_skills/dataset/fdb +output_dir: /lustre/fsw/portfolios/convai/users/mmkrtchyan/projects/speechLM/s2s/FEB_26/fdb_v1_0_incremental_v2_26feb +fdb_repo_path: /lustre/fsw/portfolios/convai/users/mmkrtchyan/projects/speechLM/s2s/Full-Duplex-Bench +fdb_data_path: /lustre/fsw/portfolios/convai/users/mmkrtchyan/projects/speechLM/s2s/Full-Duplex-Bench-data + +subtests: + - backchannel + - pause_candor + - pause_synthetic + - turn_taking + - interruption + +installation_command: "" +scoring_installation_command: "pip install numpy scipy scikit-learn silero-vad soundfile" + +expname: fdb_v1_0_s2s_incremental_v2_26feb +# max_samples: 50 + +generation_only: false +scoring_only: false +dry_run: false diff --git a/nemo_skills/dataset/fdb/scripts/fdb_s2s_incremental_v2_config.yaml b/nemo_skills/dataset/fdb/scripts/fdb_s2s_incremental_v2_config.yaml new file mode 100644 index 0000000000..c193af5712 --- /dev/null +++ b/nemo_skills/dataset/fdb/scripts/fdb_s2s_incremental_v2_config.yaml @@ -0,0 +1,73 @@ +# Full-Duplex-Bench v1.0 evaluation configuration for S2S 
incremental V2 backend +# Uses NemotronVoicechatInferenceWrapper with vLLM + perception/codec caches. +# Checkpoint: merged EarTTS+LLM (same as run_s2s_nemo_inference_pipeline.sh reference) +# +# Run: +# python nemo_skills/dataset/fdb/scripts/run_eval.py \ +# --config nemo_skills/dataset/fdb/scripts/fdb_s2s_incremental_v2_config.yaml + +cluster: s2s_eval_oci_iad +partition: batch_block1,batch_block3,batch_block4 +cpu_partition: cpu + +model: /lustre/fsw/portfolios/convai/users/ecasanova/Checkpoints/Nemotron-VoiceChat-november/duplex-eartts-2mim_sw_et_eos_dp_eos_dup_fp32_1delay_20_Feb_exp_3.1_afg_40k_steps-stt-AS7.8_11460_new_branch_load_fixed + +server_type: vllm +server_gpus: 1 +num_chunks: 1 + +server_entrypoint: "-m nemo_skills.inference.server.serve_unified" +server_args: >- + --backend s2s_incremental_v2 + --decode_audio + --speaker_reference /lustre/fsw/portfolios/convai/users/ecasanova/Checkpoints/Mg_a_00759.wav + --num_frames_per_inference 3 + --engine_type vllm_llm_vllm_eartts + --use_perception_cache + --use_perception_cudagraph + --use_codec_cache + --buffer_size_frames 20 + --codec_token_history_size 60 + --pad_to_duration_secs 40 + --repetition_penalty 1.0 + --force_turn_taking + --force_turn_taking_threshold 40 + --force_turn_taking_pad_window 25 + --matmul_precision medium + --vllm_gpu_memory_utilization 0.35 + --vllm_max_model_len 8192 + --ignore_system_prompt + --system_prompt "You are an AI assistant developed by NVIDIA. Your name is NVIDIA VoiceChat. Your job is to be helpful and have engaging conversations in English. Maintain a warm tone. Keep the dialogue open and ongoing. You must provide diverse responses, rephrase answers if user asks the same questions." 
+ --merge_user_channel_v2 + --output_dir /lustre/fsw/portfolios/convai/users/mmkrtchyan/projects/speechLM/s2s/fdb_v1_0_incremental_v2_artifacts + --code_path /lustre/fsw/portfolios/convai/users/mmkrtchyan/projects/speechLM/s2s/NeMo + --pip_install "hf-xet==1.1.9 huggingface-hub==0.34.4 nvidia-modelopt==0.33.1 nvidia-modelopt-core==0.33.1 tokenizers==0.22.0 transformers==4.56.0 lhotse==1.32.2 nv-one-logger-core==2.1.0 nv-one-logger-pytorch-lightning-integration==2.1.0 nv-one-logger-training-telemetry==2.1.0 kaldialign==0.9.1" + +server_container: /lustre/fsw/portfolios/llmservice/users/erastorgueva/code/containers/triton25.05_s2svllm26.02.12.sqsh +scoring_container: /lustre/fsw/portfolios/convai/users/ecasanova/docker_images/nemo_duplex_november_eartts.sqsh + +server_server_type: vllm_multimodal + +fdb_version: v1.0 + +data_dir: /lustre/fsw/portfolios/convai/users/mmkrtchyan/projects/speechLM/s2s/Skills/nemo_skills/dataset/fdb +output_dir: /lustre/fsw/portfolios/convai/users/mmkrtchyan/projects/speechLM/s2s/fdb_v1_0_incremental_v2 +fdb_repo_path: /lustre/fsw/portfolios/convai/users/mmkrtchyan/projects/speechLM/s2s/Full-Duplex-Bench +fdb_data_path: /lustre/fsw/portfolios/convai/users/mmkrtchyan/projects/speechLM/s2s/Full-Duplex-Bench-data + +subtests: + - backchannel + - pause_candor + - pause_synthetic + - turn_taking + - interruption + +installation_command: "" +scoring_installation_command: "pip install numpy scipy scikit-learn silero-vad soundfile" + +expname: fdb_v1_0_s2s_incremental_v2 +# max_samples: 50 + +generation_only: false +scoring_only: false +dry_run: false diff --git a/nemo_skills/dataset/fdb/scripts/fdb_s2s_incremental_v2_v1.5_02mar_config.yaml b/nemo_skills/dataset/fdb/scripts/fdb_s2s_incremental_v2_v1.5_02mar_config.yaml new file mode 100644 index 0000000000..1f3bb38a4c --- /dev/null +++ b/nemo_skills/dataset/fdb/scripts/fdb_s2s_incremental_v2_v1.5_02mar_config.yaml @@ -0,0 +1,67 @@ +# Full-Duplex-Bench v1.5 evaluation configuration for S2S 
incremental V2 backend +# Uses NemotronVoicechatInferenceWrapper with vLLM + perception/codec caches. +# Checkpoint: Mar 2 2026 (Feb 26 STT + Mar 3 TTS, Megan speaker, force_turn_taking OFF) +# +# Run: +# python nemo_skills/dataset/fdb/scripts/run_eval.py \ +# --config nemo_skills/dataset/fdb/scripts/fdb_s2s_incremental_v2_v1.5_02mar_config.yaml + +cluster: s2s_eval_oci_iad +partition: batch_block1,batch_block3,batch_block4 +cpu_partition: cpu + +model: /lustre/fsw/portfolios/convai/users/ecasanova/Checkpoints/Nemotron-VoiceChat-november/duplex-eartts-10min_sw_et_eos_dp_eos_dup_fp32_1delay_02_March_exp_17_afg_long_FT_Megan_msr_34k_steps-stt-AS9.1_11002_new_branch_load_fixed + +server_type: vllm +server_gpus: 1 +num_chunks: 8 + +server_entrypoint: "-m nemo_skills.inference.server.serve_unified" +server_args: >- + --backend s2s_incremental_v2 + --decode_audio + --speaker_reference /lustre/fsw/portfolios/convai/users/ecasanova/Checkpoints/Mg_a_00759.wav + --num_frames_per_inference 3 + --engine_type vllm_llm_vllm_eartts + --use_perception_cache + --use_perception_cudagraph + --use_codec_cache + --buffer_size_frames 20 + --codec_token_history_size 60 + --repetition_penalty 1.0 + --matmul_precision medium + --vllm_gpu_memory_utilization 0.35 + --vllm_max_model_len 8192 + --ignore_system_prompt + --merge_user_channel_v2 + --output_dir /lustre/fsw/portfolios/convai/users/mmkrtchyan/projects/speechLM/s2s/MAR_02/fdb_v1_5_incremental_v2_02mar_artifacts_force_turn_taking + --code_path /lustre/fsw/portfolios/convai/users/mmkrtchyan/projects/speechLM/s2s/NeMo + --pip_install "hf-xet==1.1.9 huggingface-hub==0.34.4 nvidia-modelopt==0.33.1 nvidia-modelopt-core==0.33.1 tokenizers==0.22.0 transformers==4.56.0 lhotse==1.32.2 nv-one-logger-core==2.1.0 nv-one-logger-pytorch-lightning-integration==2.1.0 nv-one-logger-training-telemetry==2.1.0 kaldialign==0.9.1" + +server_container: /lustre/fsw/portfolios/llmservice/users/erastorgueva/code/containers/triton25.05_s2svllm26.02.12.sqsh 
+scoring_container: /lustre/fsw/portfolios/convai/users/ecasanova/docker_images/nemo_duplex_november_eartts.sqsh + +server_server_type: vllm_multimodal + +fdb_version: v1.5 + +data_dir: /lustre/fsw/portfolios/convai/users/mmkrtchyan/projects/speechLM/s2s/Skills/nemo_skills/dataset/fdb +output_dir: /lustre/fsw/portfolios/convai/users/mmkrtchyan/projects/speechLM/s2s/MAR_02/fdb_v1_5_incremental_v2_02mar_force_turn_taking +fdb_repo_path: /lustre/fsw/portfolios/convai/users/mmkrtchyan/projects/speechLM/s2s/Full-Duplex-Bench +fdb_data_path: /lustre/fsw/portfolios/convai/users/mmkrtchyan/projects/speechLM/s2s/Full-Duplex-Bench-data + +subtests: + - backchannel + # - background_speech + # - talking_to_other + # - interruption + +installation_command: "" +scoring_installation_command: "pip install numpy scipy scikit-learn silero-vad soundfile" + +expname: fdb_v1_5_s2s_incremental_v2_02mar +# max_samples: 10 + +generation_only: false +scoring_only: false +dry_run: false diff --git a/nemo_skills/dataset/fdb/scripts/fdb_s2s_incremental_v2_v1.5_26feb_config.yaml b/nemo_skills/dataset/fdb/scripts/fdb_s2s_incremental_v2_v1.5_26feb_config.yaml new file mode 100644 index 0000000000..4e403f3af0 --- /dev/null +++ b/nemo_skills/dataset/fdb/scripts/fdb_s2s_incremental_v2_v1.5_26feb_config.yaml @@ -0,0 +1,72 @@ +# Full-Duplex-Bench v1.5 evaluation configuration for S2S incremental V2 backend +# Uses NemotronVoicechatInferenceWrapper with vLLM + perception/codec caches. 
+# Checkpoint: Feb 26 2026 (legally friendly personaplex dataset) +# +# Run: +# python nemo_skills/dataset/fdb/scripts/run_eval.py \ +# --config nemo_skills/dataset/fdb/scripts/fdb_s2s_incremental_v2_v1.5_26feb_config.yaml + +cluster: s2s_eval_oci_iad +partition: batch_block1,batch_block3,batch_block4 +cpu_partition: cpu + +model: /lustre/fsw/portfolios/convai/users/ecasanova/Checkpoints/Nemotron-VoiceChat-november/duplex-eartts-10min_sw_et_eos_dp_eos_dup_fp32_1delay_26_Feb_exp_13_afg_14k_steps-stt-AS9.1_11002_new_branch_load_fixed + +server_type: vllm +server_gpus: 1 +num_chunks: 8 + +server_entrypoint: "-m nemo_skills.inference.server.serve_unified" +server_args: >- + --backend s2s_incremental_v2 + --decode_audio + --speaker_reference /lustre/fsw/portfolios/convai/users/ecasanova/Checkpoints/Mg_a_00759.wav + --num_frames_per_inference 3 + --engine_type vllm_llm_vllm_eartts + --use_perception_cache + --use_perception_cudagraph + --use_codec_cache + --buffer_size_frames 20 + --codec_token_history_size 60 + --pad_to_duration_secs 40 + --repetition_penalty 1.0 + --force_turn_taking + --force_turn_taking_threshold 40 + --force_turn_taking_pad_window 25 + --matmul_precision medium + --vllm_gpu_memory_utilization 0.35 + --vllm_max_model_len 8192 + --ignore_system_prompt + --system_prompt "You are an AI assistant developed by NVIDIA. Your name is NVIDIA VoiceChat. Your job is to be helpful and have engaging conversations in English. Maintain a warm tone. Keep the dialogue open and ongoing. You must provide diverse responses, rephrase answers if user asks the same questions." 
+ --merge_user_channel_v2 + --output_dir /lustre/fsw/portfolios/convai/users/mmkrtchyan/projects/speechLM/s2s/FEB_26/fdb_v1_5_incremental_v2_26feb_artifacts + --code_path /lustre/fsw/portfolios/convai/users/mmkrtchyan/projects/speechLM/s2s/NeMo + --pip_install "hf-xet==1.1.9 huggingface-hub==0.34.4 nvidia-modelopt==0.33.1 nvidia-modelopt-core==0.33.1 tokenizers==0.22.0 transformers==4.56.0 lhotse==1.32.2 nv-one-logger-core==2.1.0 nv-one-logger-pytorch-lightning-integration==2.1.0 nv-one-logger-training-telemetry==2.1.0 kaldialign==0.9.1" + +server_container: /lustre/fsw/portfolios/llmservice/users/erastorgueva/code/containers/triton25.05_s2svllm26.02.12.sqsh +scoring_container: /lustre/fsw/portfolios/convai/users/ecasanova/docker_images/nemo_duplex_november_eartts.sqsh + +server_server_type: vllm_multimodal + +fdb_version: v1.5 + +data_dir: /lustre/fsw/portfolios/convai/users/mmkrtchyan/projects/speechLM/s2s/Skills/nemo_skills/dataset/fdb +output_dir: /lustre/fsw/portfolios/convai/users/mmkrtchyan/projects/speechLM/s2s/FEB_26/fdb_v1_5_incremental_v2_26feb +fdb_repo_path: /lustre/fsw/portfolios/convai/users/mmkrtchyan/projects/speechLM/s2s/Full-Duplex-Bench +fdb_data_path: /lustre/fsw/portfolios/convai/users/mmkrtchyan/projects/speechLM/s2s/Full-Duplex-Bench-data + +subtests: + - backchannel + # - background_speech + # - talking_to_other + # - interruption + +installation_command: "" +scoring_installation_command: "pip install numpy scipy scikit-learn silero-vad soundfile" + +expname: fdb_v1_5_s2s_incremental_v2_26feb +# max_samples: 10 + +generation_only: false +scoring_only: false +dry_run: false diff --git a/nemo_skills/dataset/fdb/scripts/fdb_s2s_incremental_v2_v1.5_config.yaml b/nemo_skills/dataset/fdb/scripts/fdb_s2s_incremental_v2_v1.5_config.yaml new file mode 100644 index 0000000000..be084fdc76 --- /dev/null +++ b/nemo_skills/dataset/fdb/scripts/fdb_s2s_incremental_v2_v1.5_config.yaml @@ -0,0 +1,72 @@ +# Full-Duplex-Bench v1.5 evaluation configuration for 
S2S incremental V2 backend +# Uses NemotronVoicechatInferenceWrapper with vLLM + perception/codec caches. +# Checkpoint: merged EarTTS+LLM (same as run_s2s_nemo_inference_pipeline.sh reference) +# +# Run: +# python nemo_skills/dataset/fdb/scripts/run_eval.py \ +# --config nemo_skills/dataset/fdb/scripts/fdb_s2s_incremental_v2_v1.5_config.yaml + +cluster: s2s_eval_oci_iad +partition: batch_block1,batch_block3,batch_block4 +cpu_partition: cpu + +model: /lustre/fsw/portfolios/convai/users/ecasanova/Checkpoints/Nemotron-VoiceChat-november/duplex-eartts-2mim_sw_et_eos_dp_eos_dup_fp32_1delay_20_Feb_exp_3.1_afg_40k_steps-stt-AS7.8_11460_new_branch_load_fixed + +server_type: vllm +server_gpus: 1 +num_chunks: 32 + +server_entrypoint: "-m nemo_skills.inference.server.serve_unified" +server_args: >- + --backend s2s_incremental_v2 + --decode_audio + --speaker_reference /lustre/fsw/portfolios/convai/users/ecasanova/Checkpoints/Mg_a_00759.wav + --num_frames_per_inference 3 + --engine_type vllm_llm_vllm_eartts + --use_perception_cache + --use_perception_cudagraph + --use_codec_cache + --buffer_size_frames 20 + --codec_token_history_size 60 + --pad_to_duration_secs 40 + --repetition_penalty 1.0 + --force_turn_taking + --force_turn_taking_threshold 40 + --force_turn_taking_pad_window 25 + --matmul_precision medium + --vllm_gpu_memory_utilization 0.35 + --vllm_max_model_len 8192 + --ignore_system_prompt + --system_prompt "You are an AI assistant developed by NVIDIA. Your name is NVIDIA VoiceChat. Your job is to be helpful and have engaging conversations in English. Maintain a warm tone. Keep the dialogue open and ongoing. You must provide diverse responses, rephrase answers if user asks the same questions." 
+ --merge_user_channel_v2 + --output_dir /lustre/fsw/portfolios/convai/users/mmkrtchyan/projects/speechLM/s2s/fdb_v1_5_incremental_v2_artifacts + --code_path /lustre/fsw/portfolios/convai/users/mmkrtchyan/projects/speechLM/s2s/NeMo + --pip_install "hf-xet==1.1.9 huggingface-hub==0.34.4 nvidia-modelopt==0.33.1 nvidia-modelopt-core==0.33.1 tokenizers==0.22.0 transformers==4.56.0 lhotse==1.32.2 nv-one-logger-core==2.1.0 nv-one-logger-pytorch-lightning-integration==2.1.0 nv-one-logger-training-telemetry==2.1.0 kaldialign==0.9.1" + +server_container: /lustre/fsw/portfolios/llmservice/users/erastorgueva/code/containers/triton25.05_s2svllm26.02.12.sqsh +scoring_container: /lustre/fsw/portfolios/convai/users/ecasanova/docker_images/nemo_duplex_november_eartts.sqsh + +server_server_type: vllm_multimodal + +fdb_version: v1.5 + +data_dir: /lustre/fsw/portfolios/convai/users/mmkrtchyan/projects/speechLM/s2s/Skills/nemo_skills/dataset/fdb +output_dir: /lustre/fsw/portfolios/convai/users/mmkrtchyan/projects/speechLM/s2s/fdb_v1_5_incremental_v2 +fdb_repo_path: /lustre/fsw/portfolios/convai/users/mmkrtchyan/projects/speechLM/s2s/Full-Duplex-Bench +fdb_data_path: /lustre/fsw/portfolios/convai/users/mmkrtchyan/projects/speechLM/s2s/Full-Duplex-Bench-data + +subtests: + - background_speech + - talking_to_other + - backchannel + - interruption + +installation_command: "" +scoring_installation_command: "pip install numpy scipy scikit-learn silero-vad soundfile" + +expname: fdb_v1_5_s2s_incremental_v2 +# max_samples: 10 + +generation_only: false +scoring_only: false +dry_run: false diff --git a/nemo_skills/dataset/fdb/scripts/fdb_s2s_offline_v1.0_config.yaml b/nemo_skills/dataset/fdb/scripts/fdb_s2s_offline_v1.0_config.yaml new file mode 100644 index 0000000000..1f148a7838 --- /dev/null +++ b/nemo_skills/dataset/fdb/scripts/fdb_s2s_offline_v1.0_config.yaml @@ -0,0 +1,68 @@ +# Full-Duplex-Bench v1.0 evaluation configuration for S2S voicechat offline backend + +cluster: s2s_eval_oci_iad 
+partition: batch_block1,batch_block3,batch_block4 +# partition: interactive +cpu_partition: cpu + +model: /lustre/fsw/portfolios/llmservice/users/apasad/projects/nemo_s2s_merged_dec/exp_SFT-nano9b/IAD_nano9b_parakeet600m_from_PT_32k_64gpu_5e-5_PT0.7_SFT0.05_QA0.02_TEXT0.1_loss0.5_MCQ0.03_ASR0.01_sysp0.03_NoiseProb0.5_SNR-30-60_asr_dtc2_dst15_loss_text5.0_bos10.0_eos5.0_pad1.0_eosplacementsfix_nospecaug_ei0.1_ot8_TN_all_data_v3.2_ir/checkpoints_hf_step-12556-last.ckpt + +server_type: vllm +server_gpus: 1 +num_chunks: 32 # Adjust based on dataset size + +# Use serve_unified with S2S voicechat backend +server_entrypoint: "-m nemo_skills.inference.server.serve_unified" +server_args: >- + --backend s2s_voicechat + --decode_audio + --config_path /lustre/fsw/portfolios/llmservice/users/kevinhu/projects/duplex-stt/config/inference/infer_nano_eartts_Fullduplexbench_Jan22_2026.yaml + --speaker_reference /lustre/fsw/portfolios/convai/users/ecasanova/S2S-full-duplex/inference_references/Emma_S3_A1_SC7_singleturntarget_21_channel_1_audio_in.wav + --tts_ckpt_path /lustre/fsw/portfolios/convai/users/ecasanova/Checkpoints/Duplex_EARTTS/eartts_rvq_cont_task_nanov2_tts_pretraining_wordlist_duplex_data_new_branch_4nodes_duplex_eartts_2_delay_4rd_stage_fp32_wd_1500_et_eos_dp_eos_dup_step_10004.ckpt + --extra_decoding_seconds 0 + --ignore_system_prompt + --inference_pad_boost -0.5 + --inference_bos_boost 0.5 + --inference_eos_boost 0 + --dtype float32 + --code_path /lustre/fsw/portfolios/convai/users/kevinhu/S2S-Duplex-new-codebase/branches/NeMo-release_not_rebased + +server_container: /lustre/fsw/portfolios/convai/users/ecasanova/docker_images/nemo_duplex_november_eartts.sqsh +scoring_container: /lustre/fsw/portfolios/convai/users/ecasanova/docker_images/nemo_duplex_november_eartts.sqsh + +server_server_type: vllm_multimodal + +fdb_version: v1.0 + +# data_dir must point to fdb so benchmarks resolve to fdb/fdb_v1/* and fdb/fdb_v1_5/* +data_dir: 
/lustre/fsw/portfolios/convai/users/mmkrtchyan/projects/speechLM/s2s/Skills/nemo_skills/dataset/fdb +output_dir: /lustre/fsw/portfolios/convai/users/mmkrtchyan/projects/speechLM/s2s/fdb_v1_0_voicechat_tt_boost_0s_ignore_system_prompt_trimmed +fdb_repo_path: /lustre/fsw/portfolios/convai/users/mmkrtchyan/projects/speechLM/s2s/Full-Duplex-Bench +# fdb_repo_path: /lustre/fsw/portfolios/llmservice/users/kevinhu/Full-Duplex-Bench-NV +fdb_data_path: /lustre/fsw/portfolios/convai/users/mmkrtchyan/projects/speechLM/s2s/Full-Duplex-Bench-data + +# Turn taking only (boost + merge run) +subtests: + - backchannel + - pause_candor + - pause_synthetic + - turn_taking + - interruption + +installation_command: "" +scoring_installation_command: "pip install numpy scipy scikit-learn silero-vad soundfile" + +expname: fdb_v1_0_s2s_offline +# max_samples: 50 # Uncomment to limit samples for testing + +# FDB prepare always writes stereo (2ch) output.wav and runs ASR with --stereo; no config to disable. +# Stereo convention with --merge_user_channel flag (matches reference inference/scoring): ch0 = user/input, ch1 = model/output. + +# Always use more tokens (1024) and reference sampling (temperature=0.8, top_p=0.8). See REFERENCE_INFERENCE_PARAMS.md. +inference_overrides: "++inference.temperature=0.8 ++inference.top_p=0.8 ++inference.tokens_to_generate=1024" + +# FDB prepare always writes stereo (2ch) output.wav and runs ASR with --stereo; no config to disable. 
+ +generation_only: false +scoring_only: false +dry_run: false diff --git a/nemo_skills/dataset/fdb/scripts/fdb_s2s_offline_v1.0_stt11460_sw_config.yaml b/nemo_skills/dataset/fdb/scripts/fdb_s2s_offline_v1.0_stt11460_sw_config.yaml new file mode 100644 index 0000000000..236049596a --- /dev/null +++ b/nemo_skills/dataset/fdb/scripts/fdb_s2s_offline_v1.0_stt11460_sw_config.yaml @@ -0,0 +1,72 @@ +# Full-Duplex-Bench v1.0 evaluation configuration for S2S voicechat offline backend +# STT ckpt: checkpoints_hf_11460 (fix_seq_mask variant) +# S2S ckpt: /lustre/fsw/portfolios/convai/users/ecasanova/Checkpoints/Nemotron-VoiceChat-november/duplex-eartts-2mim_sw_et_eos_dp_eos_dup_fp32_1delay_main_branch-stt-AS7.8_11460_old_branch_generated_fixed/ +# TTS ckpt: sliding window checkpoint (delay_1, step_24011) +# Ref: https://docs.google.com/document/d/1tLJlb5Fi8ECdoLwbnjQ2lM5JIj8OeVo_2PeoDzK9jQE/edit?tab=t.0#heading=h.d57shsy818cf + +cluster: s2s_eval_oci_iad +partition: batch_block1,batch_block3,batch_block4 +# partition: interactive +cpu_partition: cpu + +model: /lustre/fsw/portfolios/llmservice/users/apasad/projects/nemo_s2s_merged_dec/exp_SFT-nano9b/IAD_nano9b_parakeet600m_from_PT_32k_64gpu_5e-5_PT0.5_SFT0.15_QA0.02_TEXT0.1_loss0.5_MCQ0.03_prompt2_ASR0.01_fillerlong_offset2_sysp0.05_NoiseDefault_asr_dtc2_dst15_lossDefault_ei0.033_ot8_TN_all_data_v3.2_ir_fix_seq_mask/checkpoints_hf_11460 + +server_type: vllm +server_gpus: 1 +num_chunks: 32 # Adjust based on dataset size + +# Use serve_unified with S2S voicechat backend +server_entrypoint: "-m nemo_skills.inference.server.serve_unified" +server_args: >- + --backend s2s_voicechat + --decode_audio + --config_path /lustre/fsw/portfolios/llmservice/users/kevinhu/projects/duplex-stt/config/inference/infer_nano_eartts_Fullduplexbench_Jan22_2026.yaml + --speaker_reference /lustre/fsw/portfolios/convai/users/ecasanova/S2S-full-duplex/inference_references/Emma_S3_A1_SC7_singleturntarget_21_channel_1_audio_in.wav + --tts_ckpt_path 
/lustre/fsw/portfolios/convai/users/ecasanova/Checkpoints/Duplex_EARTTS/eartts_rvq_cont_task_nanov2_tts_pretraining_wordlist_duplex_data_new_branch_new_tok_4nodes_duplex_eartts_nemo_main_delay_1_step_24011-last.ckpt + --extra_decoding_seconds 0 + --ignore_system_prompt + --inference_pad_boost -0.5 + --inference_bos_boost 0.5 + --inference_eos_boost 0 + --dtype float32 + --code_path /lustre/fsw/portfolios/convai/users/kevinhu/S2S-Duplex-new-codebase/branches/NeMo-release_not_rebased + +server_container: /lustre/fsw/portfolios/convai/users/ecasanova/docker_images/nemo_duplex_november_eartts.sqsh +scoring_container: /lustre/fsw/portfolios/convai/users/ecasanova/docker_images/nemo_duplex_november_eartts.sqsh + +server_server_type: vllm_multimodal + +fdb_version: v1.0 + +# data_dir must point to fdb so benchmarks resolve to fdb/fdb_v1/* and fdb/fdb_v1_5/* +data_dir: /lustre/fsw/portfolios/convai/users/mmkrtchyan/projects/speechLM/s2s/Skills/nemo_skills/dataset/fdb +output_dir: /lustre/fsw/portfolios/convai/users/mmkrtchyan/projects/speechLM/s2s/fdb_v1_0_stt11460_sw_voicechat +fdb_repo_path: /lustre/fsw/portfolios/convai/users/mmkrtchyan/projects/speechLM/s2s/Full-Duplex-Bench +# fdb_repo_path: /lustre/fsw/portfolios/llmservice/users/kevinhu/Full-Duplex-Bench-NV +fdb_data_path: /lustre/fsw/portfolios/convai/users/mmkrtchyan/projects/speechLM/s2s/Full-Duplex-Bench-data + +# Turn taking only (boost + merge run) +subtests: + - backchannel + - pause_candor + - pause_synthetic + - turn_taking + - interruption + +installation_command: "" +scoring_installation_command: "pip install numpy scipy scikit-learn silero-vad soundfile" + +expname: fdb_v1_0_stt11460_sw_s2s_offline +# max_samples: 50 # Uncomment to limit samples for testing + +# FDB prepare always writes stereo (2ch) output.wav and runs ASR with --stereo; no config to disable. +# Stereo convention with --merge_user_channel flag (matches reference inference/scoring): ch0 = user/input, ch1 = model/output. 
+ +# Always use more tokens (1024) and reference sampling (temperature=0.8, top_p=0.8). See REFERENCE_INFERENCE_PARAMS.md. +# inference_overrides: "++inference.temperature=0.8 ++inference.top_p=0.8 ++inference.tokens_to_generate=1024" + +# FDB prepare always writes stereo (2ch) output.wav and runs ASR with --stereo; no config to disable. + +generation_only: false +scoring_only: false +dry_run: false diff --git a/nemo_skills/dataset/fdb/scripts/fdb_s2s_offline_v1.5_config.yaml b/nemo_skills/dataset/fdb/scripts/fdb_s2s_offline_v1.5_config.yaml new file mode 100644 index 0000000000..a2d08c60bf --- /dev/null +++ b/nemo_skills/dataset/fdb/scripts/fdb_s2s_offline_v1.5_config.yaml @@ -0,0 +1,55 @@ +# Full-Duplex-Bench v1.5 evaluation configuration for S2S voicechat offline backend + +cluster: s2s_eval_oci_iad +# partition: batch_block1,batch_block3,batch_block4 +partition: interactive +cpu_partition: cpu + +model: /lustre/fsw/portfolios/llmservice/users/apasad/projects/nemo_s2s_merged_dec/exp_SFT-nano9b/IAD_nano9b_parakeet600m_from_PT_32k_64gpu_5e-5_PT0.7_SFT0.05_QA0.02_TEXT0.1_loss0.5_MCQ0.03_ASR0.01_sysp0.03_NoiseProb0.5_SNR-30-60_asr_dtc2_dst15_loss_text5.0_bos10.0_eos5.0_pad1.0_eosplacementsfix_nospecaug_ei0.1_ot8_TN_all_data_v3.2_ir/checkpoints_hf_step-12556-last.ckpt + +server_type: vllm +server_gpus: 1 +num_chunks: 32 + +server_entrypoint: "-m nemo_skills.inference.server.serve_unified" +server_args: >- + --backend s2s_voicechat + --decode_audio + --config_path /lustre/fsw/portfolios/llmservice/users/kevinhu/projects/duplex-stt/config/inference/infer_nano_eartts_Fullduplexbench_Jan22_2026.yaml + --speaker_reference /lustre/fsw/portfolios/convai/users/ecasanova/S2S-full-duplex/inference_references/Emma_S3_A1_SC7_singleturntarget_21_channel_1_audio_in.wav + --tts_ckpt_path 
/lustre/fsw/portfolios/convai/users/ecasanova/Checkpoints/Duplex_EARTTS/eartts_rvq_cont_task_nanov2_tts_pretraining_wordlist_duplex_data_new_branch_4nodes_duplex_eartts_2_delay_4rd_stage_fp32_wd_1500_et_eos_dp_eos_dup_step_10004.ckpt + --extra_decoding_seconds 0 + --ignore_system_prompt + --inference_pad_boost -0.5 + --inference_bos_boost 0.5 + --inference_eos_boost 0 + --dtype float32 + --code_path /lustre/fsw/portfolios/convai/users/kevinhu/S2S-Duplex-new-codebase/branches/NeMo-release_not_rebased + +server_container: /lustre/fsw/portfolios/convai/users/ecasanova/docker_images/nemo_duplex_november_eartts.sqsh +scoring_container: /lustre/fsw/portfolios/convai/users/ecasanova/docker_images/nemo_duplex_november_eartts.sqsh + +server_server_type: vllm_multimodal + +fdb_version: v1.5 + +data_dir: /lustre/fsw/portfolios/convai/users/mmkrtchyan/projects/speechLM/s2s/Skills/nemo_skills/dataset/fdb +output_dir: /lustre/fsw/portfolios/convai/users/mmkrtchyan/projects/speechLM/s2s/fdb_v1_5_s2s_voicechat_32chunks +fdb_repo_path: /lustre/fsw/portfolios/convai/users/mmkrtchyan/projects/speechLM/s2s/Full-Duplex-Bench +fdb_data_path: /lustre/fsw/portfolios/convai/users/mmkrtchyan/projects/speechLM/s2s/Full-Duplex-Bench-data + +subtests: + - background_speech + - talking_to_other + - backchannel + - interruption + +installation_command: "" +scoring_installation_command: "pip install numpy scipy scikit-learn silero-vad soundfile" + +expname: fdb_v1_5_s2s_offline +# max_samples: 10 + +generation_only: false +scoring_only: false +dry_run: false diff --git a/nemo_skills/dataset/fdb/scripts/fdb_s2s_offline_v1.5_stt11460_sw_config.yaml b/nemo_skills/dataset/fdb/scripts/fdb_s2s_offline_v1.5_stt11460_sw_config.yaml new file mode 100644 index 0000000000..c05edf2818 --- /dev/null +++ b/nemo_skills/dataset/fdb/scripts/fdb_s2s_offline_v1.5_stt11460_sw_config.yaml @@ -0,0 +1,58 @@ +# Full-Duplex-Bench v1.5 evaluation configuration for S2S voicechat offline backend +# STT ckpt: 
checkpoints_hf_11460 (fix_seq_mask variant) +# S2S ckpt: /lustre/fsw/portfolios/convai/users/ecasanova/Checkpoints/Nemotron-VoiceChat-november/duplex-eartts-2mim_sw_et_eos_dp_eos_dup_fp32_1delay_main_branch-stt-AS7.8_11460_old_branch_generated_fixed/ +# TTS ckpt: sliding window checkpoint (delay_1, step_24011) +# Ref: https://docs.google.com/document/d/1tLJlb5Fi8ECdoLwbnjQ2lM5JIj8OeVo_2PeoDzK9jQE/edit?tab=t.0#heading=h.d57shsy818cf + +cluster: s2s_eval_oci_iad +partition: batch_block1,batch_block3,batch_block4 +cpu_partition: cpu + +model: /lustre/fsw/portfolios/llmservice/users/apasad/projects/nemo_s2s_merged_dec/exp_SFT-nano9b/IAD_nano9b_parakeet600m_from_PT_32k_64gpu_5e-5_PT0.5_SFT0.15_QA0.02_TEXT0.1_loss0.5_MCQ0.03_prompt2_ASR0.01_fillerlong_offset2_sysp0.05_NoiseDefault_asr_dtc2_dst15_lossDefault_ei0.033_ot8_TN_all_data_v3.2_ir_fix_seq_mask/checkpoints_hf_11460 + +server_type: vllm +server_gpus: 1 +num_chunks: 32 + +server_entrypoint: "-m nemo_skills.inference.server.serve_unified" +server_args: >- + --backend s2s_voicechat + --decode_audio + --config_path /lustre/fsw/portfolios/llmservice/users/kevinhu/projects/duplex-stt/config/inference/infer_nano_eartts_Fullduplexbench_Jan22_2026.yaml + --speaker_reference /lustre/fsw/portfolios/convai/users/ecasanova/S2S-full-duplex/inference_references/Emma_S3_A1_SC7_singleturntarget_21_channel_1_audio_in.wav + --tts_ckpt_path /lustre/fsw/portfolios/convai/users/ecasanova/Checkpoints/Duplex_EARTTS/eartts_rvq_cont_task_nanov2_tts_pretraining_wordlist_duplex_data_new_branch_new_tok_4nodes_duplex_eartts_nemo_main_delay_1_step_24011-last.ckpt + --extra_decoding_seconds 0 + --ignore_system_prompt + --inference_pad_boost -0.5 + --inference_bos_boost 0.5 + --inference_eos_boost 0 + --dtype float32 + --code_path /lustre/fsw/portfolios/convai/users/kevinhu/S2S-Duplex-new-codebase/branches/NeMo-release_not_rebased + +server_container: /lustre/fsw/portfolios/convai/users/ecasanova/docker_images/nemo_duplex_november_eartts.sqsh 
+scoring_container: /lustre/fsw/portfolios/convai/users/ecasanova/docker_images/nemo_duplex_november_eartts.sqsh + +server_server_type: vllm_multimodal + +fdb_version: v1.5 + +data_dir: /lustre/fsw/portfolios/convai/users/mmkrtchyan/projects/speechLM/s2s/Skills/nemo_skills/dataset/fdb +output_dir: /lustre/fsw/portfolios/convai/users/mmkrtchyan/projects/speechLM/s2s/fdb_v1_5_stt11460_sw_voicechat +fdb_repo_path: /lustre/fsw/portfolios/convai/users/mmkrtchyan/projects/speechLM/s2s/Full-Duplex-Bench +fdb_data_path: /lustre/fsw/portfolios/convai/users/mmkrtchyan/projects/speechLM/s2s/Full-Duplex-Bench-data + +subtests: + - background_speech + - talking_to_other + - backchannel + - interruption + +installation_command: "" +scoring_installation_command: "pip install numpy scipy scikit-learn silero-vad soundfile" + +expname: fdb_v1_5_stt11460_sw_s2s_offline +# max_samples: 10 + +generation_only: false +scoring_only: false +dry_run: false diff --git a/nemo_skills/dataset/fdb/scripts/fdb_v1.0_baseline_02mar_config.yaml b/nemo_skills/dataset/fdb/scripts/fdb_v1.0_baseline_02mar_config.yaml new file mode 100644 index 0000000000..45a4708b46 --- /dev/null +++ b/nemo_skills/dataset/fdb/scripts/fdb_v1.0_baseline_02mar_config.yaml @@ -0,0 +1,68 @@ +# Full-Duplex-Bench v1.0 evaluation - (i) BASELINE setup +# S2S incremental V2 backend, all subtests. +# No inference boosting, no force turn taking. 
+# Checkpoint: Mar 2 2026 (Feb 26 STT + Mar 3 TTS, Megan speaker) +# +# Run: +# python nemo_skills/dataset/fdb/scripts/run_eval.py \ +# --config nemo_skills/dataset/fdb/scripts/fdb_v1.0_baseline_02mar_config.yaml + +cluster: s2s_eval_oci_iad +partition: batch_block1,batch_block3,batch_block4 +cpu_partition: cpu + +model: /lustre/fsw/portfolios/convai/users/ecasanova/Checkpoints/Nemotron-VoiceChat-november/duplex-eartts-10min_sw_et_eos_dp_eos_dup_fp32_1delay_02_March_exp_17_afg_long_FT_Megan_msr_34k_steps-stt-AS9.1_11002_new_branch_load_fixed + +server_type: vllm +server_gpus: 1 +num_chunks: 8 + +server_entrypoint: "-m nemo_skills.inference.server.serve_unified" +server_args: >- + --backend s2s_incremental_v2 + --decode_audio + --speaker_reference /lustre/fsw/portfolios/convai/users/ecasanova/Checkpoints/Mg_a_00759.wav + --num_frames_per_inference 3 + --engine_type vllm_llm_vllm_eartts + --use_perception_cache + --use_perception_cudagraph + --use_codec_cache + --buffer_size_frames 21 + --codec_token_history_size 60 + --repetition_penalty 1.0 + --matmul_precision medium + --vllm_gpu_memory_utilization 0.35 + --vllm_max_model_len 8192 + --ignore_system_prompt + --merge_user_channel_v2 + --output_dir /lustre/fsw/portfolios/convai/users/mmkrtchyan/projects/speechLM/s2s/MAR_02_FIXED/baseline/fdb_v1.0_artifacts + --code_path /lustre/fsw/portfolios/convai/users/mmkrtchyan/projects/speechLM/s2s/NeMo + --pip_install "hf-xet==1.1.9 huggingface-hub==0.34.4 nvidia-modelopt==0.33.1 nvidia-modelopt-core==0.33.1 tokenizers==0.22.0 transformers==4.56.0 lhotse==1.32.2 nv-one-logger-core==2.1.0 nv-one-logger-pytorch-lightning-integration==2.1.0 nv-one-logger-training-telemetry==2.1.0 kaldialign==0.9.1" + +server_container: /lustre/fsw/portfolios/llmservice/users/erastorgueva/code/containers/triton25.05_s2svllm26.02.12.sqsh +scoring_container: /lustre/fsw/portfolios/convai/users/ecasanova/docker_images/nemo_duplex_november_eartts.sqsh + +server_server_type: vllm_multimodal + 
+fdb_version: v1.0 + +data_dir: /lustre/fsw/portfolios/convai/users/mmkrtchyan/projects/speechLM/s2s/Skills/nemo_skills/dataset/fdb +output_dir: /lustre/fsw/portfolios/convai/users/mmkrtchyan/projects/speechLM/s2s/MAR_02_FIXED/baseline/fdb_v1.0 +fdb_repo_path: /lustre/fsw/portfolios/convai/users/mmkrtchyan/projects/speechLM/s2s/Full-Duplex-Bench +fdb_data_path: /lustre/fsw/portfolios/convai/users/mmkrtchyan/projects/speechLM/s2s/Full-Duplex-Bench-data + +subtests: + - backchannel + - pause_candor + - pause_synthetic + - turn_taking + - interruption + +installation_command: "" +scoring_installation_command: "pip install numpy scipy scikit-learn silero-vad soundfile" + +expname: fdb_v1.0_baseline_02mar + +generation_only: false +scoring_only: false +dry_run: false diff --git a/nemo_skills/dataset/fdb/scripts/fdb_v1.0_matched_demo_v1_02mar_config.yaml b/nemo_skills/dataset/fdb/scripts/fdb_v1.0_matched_demo_v1_02mar_config.yaml new file mode 100644 index 0000000000..28c6746b16 --- /dev/null +++ b/nemo_skills/dataset/fdb/scripts/fdb_v1.0_matched_demo_v1_02mar_config.yaml @@ -0,0 +1,71 @@ +# Full-Duplex-Bench v1.0 evaluation - (ii) MATCHED_DEMO_V1 setup +# S2S incremental V2 backend, all subtests. +# Baseline + force_turn_taking + inference_user_pad_boost=0.8 +# Requires nemotron_h.py vLLM patch for boost params to take effect. 
+# Checkpoint: Mar 2 2026 (Feb 26 STT + Mar 3 TTS, Megan speaker) +# +# Run: +# python nemo_skills/dataset/fdb/scripts/run_eval.py \ +# --config nemo_skills/dataset/fdb/scripts/fdb_v1.0_matched_demo_v1_02mar_config.yaml + +cluster: s2s_eval_oci_iad +partition: batch_block1,batch_block3,batch_block4 +cpu_partition: cpu + +model: /lustre/fsw/portfolios/convai/users/ecasanova/Checkpoints/Nemotron-VoiceChat-november/duplex-eartts-10min_sw_et_eos_dp_eos_dup_fp32_1delay_02_March_exp_17_afg_long_FT_Megan_msr_34k_steps-stt-AS9.1_11002_new_branch_load_fixed + +server_type: vllm +server_gpus: 1 +num_chunks: 8 + +server_entrypoint: "-m nemo_skills.inference.server.serve_unified" +server_args: >- + --backend s2s_incremental_v2 + --decode_audio + --speaker_reference /lustre/fsw/portfolios/convai/users/ecasanova/Checkpoints/Mg_a_00759.wav + --num_frames_per_inference 3 + --engine_type vllm_llm_vllm_eartts + --use_perception_cache + --use_perception_cudagraph + --use_codec_cache + --buffer_size_frames 21 + --codec_token_history_size 60 + --repetition_penalty 1.0 + --matmul_precision medium + --vllm_gpu_memory_utilization 0.35 + --vllm_max_model_len 8192 + --ignore_system_prompt + --merge_user_channel_v2 + --force_turn_taking + --inference_user_pad_boost 0.8 + --output_dir /lustre/fsw/portfolios/convai/users/mmkrtchyan/projects/speechLM/s2s/MAR_02_FIXED/matched_demo_v1/fdb_v1.0_artifacts + --code_path /lustre/fsw/portfolios/convai/users/mmkrtchyan/projects/speechLM/s2s/NeMo + --pip_install "hf-xet==1.1.9 huggingface-hub==0.34.4 nvidia-modelopt==0.33.1 nvidia-modelopt-core==0.33.1 tokenizers==0.22.0 transformers==4.56.0 lhotse==1.32.2 nv-one-logger-core==2.1.0 nv-one-logger-pytorch-lightning-integration==2.1.0 nv-one-logger-training-telemetry==2.1.0 kaldialign==0.9.1" + +server_container: /lustre/fsw/portfolios/llmservice/users/erastorgueva/code/containers/triton25.05_s2svllm26.02.12.sqsh +scoring_container: 
/lustre/fsw/portfolios/convai/users/ecasanova/docker_images/nemo_duplex_november_eartts.sqsh + +server_server_type: vllm_multimodal + +fdb_version: v1.0 + +data_dir: /lustre/fsw/portfolios/convai/users/mmkrtchyan/projects/speechLM/s2s/Skills/nemo_skills/dataset/fdb +output_dir: /lustre/fsw/portfolios/convai/users/mmkrtchyan/projects/speechLM/s2s/MAR_02_FIXED/matched_demo_v1/fdb_v1.0 +fdb_repo_path: /lustre/fsw/portfolios/convai/users/mmkrtchyan/projects/speechLM/s2s/Full-Duplex-Bench +fdb_data_path: /lustre/fsw/portfolios/convai/users/mmkrtchyan/projects/speechLM/s2s/Full-Duplex-Bench-data + +subtests: + - backchannel + - pause_candor + - pause_synthetic + - turn_taking + - interruption + +installation_command: "mkdir -p /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models && cp /lustre/fsw/portfolios/convai/users/mmkrtchyan/projects/speechLM/s2s/nemotron_h.py /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/nemotron_h.py" +scoring_installation_command: "pip install numpy scipy scikit-learn silero-vad soundfile" + +expname: fdb_v1.0_matched_demo_v1_02mar + +generation_only: false +scoring_only: false +dry_run: false diff --git a/nemo_skills/dataset/fdb/scripts/fdb_v1.5_baseline_02mar_config.yaml b/nemo_skills/dataset/fdb/scripts/fdb_v1.5_baseline_02mar_config.yaml new file mode 100644 index 0000000000..7822fb6878 --- /dev/null +++ b/nemo_skills/dataset/fdb/scripts/fdb_v1.5_baseline_02mar_config.yaml @@ -0,0 +1,64 @@ +# Full-Duplex-Bench v1.5 evaluation - (i) BASELINE setup +# S2S incremental V2 backend, backchannel only. +# No inference boosting, no force turn taking. 
+# Checkpoint: Mar 2 2026 (Feb 26 STT + Mar 3 TTS, Megan speaker) +# +# Run: +# python nemo_skills/dataset/fdb/scripts/run_eval.py \ +# --config nemo_skills/dataset/fdb/scripts/fdb_v1.5_baseline_02mar_config.yaml + +cluster: s2s_eval_oci_iad +partition: batch_block1,batch_block3,batch_block4 +cpu_partition: cpu + +model: /lustre/fsw/portfolios/convai/users/ecasanova/Checkpoints/Nemotron-VoiceChat-november/duplex-eartts-10min_sw_et_eos_dp_eos_dup_fp32_1delay_02_March_exp_17_afg_long_FT_Megan_msr_34k_steps-stt-AS9.1_11002_new_branch_load_fixed + +server_type: vllm +server_gpus: 1 +num_chunks: 8 + +server_entrypoint: "-m nemo_skills.inference.server.serve_unified" +server_args: >- + --backend s2s_incremental_v2 + --decode_audio + --speaker_reference /lustre/fsw/portfolios/convai/users/ecasanova/Checkpoints/Mg_a_00759.wav + --num_frames_per_inference 3 + --engine_type vllm_llm_vllm_eartts + --use_perception_cache + --use_perception_cudagraph + --use_codec_cache + --buffer_size_frames 21 + --codec_token_history_size 60 + --repetition_penalty 1.0 + --matmul_precision medium + --vllm_gpu_memory_utilization 0.35 + --vllm_max_model_len 8192 + --ignore_system_prompt + --merge_user_channel_v2 + --output_dir /lustre/fsw/portfolios/convai/users/mmkrtchyan/projects/speechLM/s2s/MAR_02_FIXED/baseline/fdb_v1.5_artifacts + --code_path /lustre/fsw/portfolios/convai/users/mmkrtchyan/projects/speechLM/s2s/NeMo + --pip_install "hf-xet==1.1.9 huggingface-hub==0.34.4 nvidia-modelopt==0.33.1 nvidia-modelopt-core==0.33.1 tokenizers==0.22.0 transformers==4.56.0 lhotse==1.32.2 nv-one-logger-core==2.1.0 nv-one-logger-pytorch-lightning-integration==2.1.0 nv-one-logger-training-telemetry==2.1.0 kaldialign==0.9.1" + +server_container: /lustre/fsw/portfolios/llmservice/users/erastorgueva/code/containers/triton25.05_s2svllm26.02.12.sqsh +scoring_container: /lustre/fsw/portfolios/convai/users/ecasanova/docker_images/nemo_duplex_november_eartts.sqsh + +server_server_type: vllm_multimodal + 
+fdb_version: v1.5 + +data_dir: /lustre/fsw/portfolios/convai/users/mmkrtchyan/projects/speechLM/s2s/Skills/nemo_skills/dataset/fdb +output_dir: /lustre/fsw/portfolios/convai/users/mmkrtchyan/projects/speechLM/s2s/MAR_02_FIXED/baseline/fdb_v1.5 +fdb_repo_path: /lustre/fsw/portfolios/convai/users/mmkrtchyan/projects/speechLM/s2s/Full-Duplex-Bench +fdb_data_path: /lustre/fsw/portfolios/convai/users/mmkrtchyan/projects/speechLM/s2s/Full-Duplex-Bench-data + +subtests: + - backchannel + +installation_command: "" +scoring_installation_command: "pip install numpy scipy scikit-learn silero-vad soundfile" + +expname: fdb_v1.5_baseline_02mar + +generation_only: false +scoring_only: false +dry_run: false diff --git a/nemo_skills/dataset/fdb/scripts/fdb_v1.5_matched_demo_v1_02mar_config.yaml b/nemo_skills/dataset/fdb/scripts/fdb_v1.5_matched_demo_v1_02mar_config.yaml new file mode 100644 index 0000000000..8e06868abd --- /dev/null +++ b/nemo_skills/dataset/fdb/scripts/fdb_v1.5_matched_demo_v1_02mar_config.yaml @@ -0,0 +1,67 @@ +# Full-Duplex-Bench v1.5 evaluation - (ii) MATCHED_DEMO_V1 setup +# S2S incremental V2 backend, backchannel only. +# Baseline + force_turn_taking + inference_user_pad_boost=0.8 +# Requires nemotron_h.py vLLM patch for boost params to take effect. 
+# Checkpoint: Mar 2 2026 (Feb 26 STT + Mar 3 TTS, Megan speaker) +# +# Run: +# python nemo_skills/dataset/fdb/scripts/run_eval.py \ +# --config nemo_skills/dataset/fdb/scripts/fdb_v1.5_matched_demo_v1_02mar_config.yaml + +cluster: s2s_eval_oci_iad +partition: batch_block1,batch_block3,batch_block4 +cpu_partition: cpu + +model: /lustre/fsw/portfolios/convai/users/ecasanova/Checkpoints/Nemotron-VoiceChat-november/duplex-eartts-10min_sw_et_eos_dp_eos_dup_fp32_1delay_02_March_exp_17_afg_long_FT_Megan_msr_34k_steps-stt-AS9.1_11002_new_branch_load_fixed + +server_type: vllm +server_gpus: 1 +num_chunks: 8 + +server_entrypoint: "-m nemo_skills.inference.server.serve_unified" +server_args: >- + --backend s2s_incremental_v2 + --decode_audio + --speaker_reference /lustre/fsw/portfolios/convai/users/ecasanova/Checkpoints/Mg_a_00759.wav + --num_frames_per_inference 3 + --engine_type vllm_llm_vllm_eartts + --use_perception_cache + --use_perception_cudagraph + --use_codec_cache + --buffer_size_frames 21 + --codec_token_history_size 60 + --repetition_penalty 1.0 + --matmul_precision medium + --vllm_gpu_memory_utilization 0.35 + --vllm_max_model_len 8192 + --ignore_system_prompt + --merge_user_channel_v2 + --force_turn_taking + --inference_user_pad_boost 0.8 + --output_dir /lustre/fsw/portfolios/convai/users/mmkrtchyan/projects/speechLM/s2s/MAR_02_FIXED/matched_demo_v1/fdb_v1.5_artifacts + --code_path /lustre/fsw/portfolios/convai/users/mmkrtchyan/projects/speechLM/s2s/NeMo + --pip_install "hf-xet==1.1.9 huggingface-hub==0.34.4 nvidia-modelopt==0.33.1 nvidia-modelopt-core==0.33.1 tokenizers==0.22.0 transformers==4.56.0 lhotse==1.32.2 nv-one-logger-core==2.1.0 nv-one-logger-pytorch-lightning-integration==2.1.0 nv-one-logger-training-telemetry==2.1.0 kaldialign==0.9.1" + +server_container: /lustre/fsw/portfolios/llmservice/users/erastorgueva/code/containers/triton25.05_s2svllm26.02.12.sqsh +scoring_container: 
/lustre/fsw/portfolios/convai/users/ecasanova/docker_images/nemo_duplex_november_eartts.sqsh + +server_server_type: vllm_multimodal + +fdb_version: v1.5 + +data_dir: /lustre/fsw/portfolios/convai/users/mmkrtchyan/projects/speechLM/s2s/Skills/nemo_skills/dataset/fdb +output_dir: /lustre/fsw/portfolios/convai/users/mmkrtchyan/projects/speechLM/s2s/MAR_02_FIXED/matched_demo_v1/fdb_v1.5 +fdb_repo_path: /lustre/fsw/portfolios/convai/users/mmkrtchyan/projects/speechLM/s2s/Full-Duplex-Bench +fdb_data_path: /lustre/fsw/portfolios/convai/users/mmkrtchyan/projects/speechLM/s2s/Full-Duplex-Bench-data + +subtests: + - backchannel + +installation_command: "mkdir -p /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models && cp /lustre/fsw/portfolios/convai/users/mmkrtchyan/projects/speechLM/s2s/nemotron_h.py /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/nemotron_h.py" +scoring_installation_command: "pip install numpy scipy scikit-learn silero-vad soundfile" + +expname: fdb_v1.5_matched_demo_v1_02mar + +generation_only: false +scoring_only: false +dry_run: false diff --git a/nemo_skills/dataset/fdb/scripts/prepare_fdb_eval_dir.py b/nemo_skills/dataset/fdb/scripts/prepare_fdb_eval_dir.py new file mode 100644 index 0000000000..6f714d254b --- /dev/null +++ b/nemo_skills/dataset/fdb/scripts/prepare_fdb_eval_dir.py @@ -0,0 +1,420 @@ +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +""" +Prepare eval-results for FDB scoring: copy audio to fdb_prepared, optionally run FDB ASR +to produce time-aligned output.json per sample. +""" + +import argparse +import json +import os +import shutil +import subprocess +import sys +from pathlib import Path + +import numpy as np +from typing import Optional + +try: + import soundfile as sf +except ImportError: + sf = None +try: + from scipy.io import wavfile as scipy_wavfile +except ImportError: + scipy_wavfile = None + + +def _version_folders(fdb_version: str): + """Return (primary, alternate) folder names for the given FDB version (e.g. v1.0 -> ('v1.0', 'v1_0')).""" + if fdb_version == "v1.5": + return ("v1.5", "v1_5") + return ("v1.0", "v1_0") + + +def _get_sample_id_and_folder(subtest: str, entry_id: str, fdb_version: str) -> Optional[tuple]: + """Return (sample_id, fdb_folder_name) for looking up input.wav in fdb_data_path, or None.""" + entry_id = str(entry_id) + if fdb_version == "v1.5": + if entry_id.startswith("user_interruption_"): + return (entry_id.replace("user_interruption_", ""), "user_interruption") + # v1.5 turn_taking if we ever support it + return None + # v1.0 + if subtest == "turn_taking": + if entry_id.startswith("candor_turn_taking_"): + return (entry_id.replace("candor_turn_taking_", ""), "candor_turn_taking") + if entry_id.isdigit(): + return (entry_id, "candor_turn_taking") + if subtest == "interruption": + if entry_id.startswith("synthetic_user_interruption_"): + return (entry_id.replace("synthetic_user_interruption_", ""), "synthetic_user_interruption") + if entry_id.isdigit(): + return (entry_id, "synthetic_user_interruption") + return None + + +def _get_input_wav_path( + subtest: str, + entry_id: str, + fdb_data_path: Path, + fdb_version: str, +) -> Optional[Path]: + """Return path to input.wav in FDB data for this sample, or None if not found.""" + t = _get_sample_id_and_folder(subtest, entry_id, fdb_version) + if t is None: + return None + sample_id, folder = t + vers = 
_version_folders(fdb_version) + for ver in vers: + candidate = fdb_data_path / ver / folder / sample_id / "input.wav" + if candidate.exists(): + return candidate + return None + + +def _to_stereo(data: np.ndarray) -> np.ndarray: + """Return 2-channel array (samples, 2). If already stereo, return as-is; if mono, duplicate channel.""" + if data.ndim == 2 and data.shape[1] == 2: + return data + if data.ndim == 1: + return np.stack([data, data], axis=1) + if data.ndim == 2 and data.shape[1] == 1: + return np.concatenate([data, data], axis=1) + # multi-channel: take first two or mean to mono then duplicate + mono = data.mean(axis=1) + return np.stack([mono, mono], axis=1) + + +def _ensure_wav_to_dest(src: Path, dest: Path, stereo: bool = False) -> None: + """Copy wav to dest. For FDB (stereo=True) we ensure 2-channel output: convert mono to stereo by duplicating the channel.""" + dest.parent.mkdir(parents=True, exist_ok=True) + + # Prefer soundfile (handles float and int) + if sf is not None: + try: + data, sr = sf.read(str(src)) + if stereo: + data = _to_stereo(data) + else: + if data.ndim > 1: + data = data.mean(axis=1) + sf.write(str(dest), data, sr) + return + except Exception: + pass + + # Fallback: scipy.io.wavfile (int only) + if scipy_wavfile is not None: + try: + sr, data = scipy_wavfile.read(str(src)) + if stereo: + data = _to_stereo(data) + else: + if data.ndim > 1: + data = data.mean(axis=1) + scipy_wavfile.write(str(dest), sr, data) + return + except Exception: + pass + + # No reader: copy as-is + shutil.copy2(src, dest) + + +def _ensure_wav_mono_model_channel(src: Path, dest: Path) -> None: + """Write dest as mono using the model channel (index 1) when source is 2-channel. 
+ FDB backchannel eval uses Silero VAD which requires mono; merged output.wav is ch0=user, ch1=model.""" + dest.parent.mkdir(parents=True, exist_ok=True) + if sf is not None: + try: + data, sr = sf.read(str(src)) + if data.ndim == 2 and data.shape[1] >= 2: + data = np.asarray(data[:, 1], dtype=data.dtype) # model channel + elif data.ndim > 1: + data = data.mean(axis=1) + sf.write(str(dest), data, sr) + return + except Exception: + pass + if scipy_wavfile is not None: + try: + sr, data = scipy_wavfile.read(str(src)) + if data.ndim == 2 and data.shape[1] >= 2: + data = np.asarray(data[:, 1], dtype=data.dtype) + elif data.ndim > 1: + data = data.mean(axis=1) + scipy_wavfile.write(str(dest), sr, data) + return + except Exception: + pass + shutil.copy2(src, dest) + + +def prepare_fdb_dir( + eval_results_dir: Path, + output_jsonl: Path, + fdb_repo: Path, + subdir_name: str = "fdb_prepared", + asr_task: Optional[str] = None, + run_asr: bool = False, + fdb_data_path: Optional[Path] = None, + subtest: Optional[str] = None, + fdb_version: str = "v1.0", + stereo: bool = True, +) -> Path: + """Build FDB-format dir under eval_results_dir/subdir_name. Returns path to prepared dir. 
+ When stereo=True, output.wav is written as 2-channel; mono sources are converted to stereo by duplicating the channel.""" + fdb_prepared = eval_results_dir / subdir_name + audio_dir = eval_results_dir / "audio" + entries_with_audio = [] + + with open(output_jsonl, "r", encoding="utf-8") as f: + for line in f: + if not line.strip(): + continue + entry = json.loads(line) + entry_id = entry.get("id") or entry.get("sample_id") or f"sample_{len(entries_with_audio)}" + audio_path = None + if "audio" in entry and isinstance(entry["audio"], dict): + audio_path = entry["audio"].get("path") + if audio_path and os.path.isabs(audio_path) and os.path.exists(audio_path): + entries_with_audio.append((entry_id, Path(audio_path))) + elif audio_path: + for base in [audio_dir, eval_results_dir]: + p = base / Path(audio_path).name if not str(audio_path).startswith("audio/") else base / audio_path + if p.exists(): + entries_with_audio.append((entry_id, p)) + break + + if not entries_with_audio: + for wav in (audio_dir if audio_dir.exists() else eval_results_dir).rglob("*.wav"): + entry_id = wav.stem + entries_with_audio.append((entry_id, wav)) + + if not entries_with_audio: + print("No audio found in output.jsonl or audio/") + return fdb_prepared + + def fdb_dir_name(entry_id: str) -> str: + """FDB backchannel eval only considers dirs whose name is purely numeric (spk.isdigit()).""" + if subtest == "backchannel": + e = str(entry_id) + if e.startswith("icc_backchannel_"): + return e.replace("icc_backchannel_", "", 1) + if e.startswith("user_backchannel_"): # v1.5 + return e.replace("user_backchannel_", "", 1) + if e.isdigit(): + return e + return str(entry_id) + + fdb_prepared.mkdir(parents=True, exist_ok=True) + # Match reference (reorganize_candor_outputs): output.wav = model response only; copy input.wav from original for turn_taking/interruption so dir has both. 
+ copy_input_wav_to_dir = subtest in ("turn_taking", "interruption") + # All subtests (including backchannel) use stereo when stereo=True: ch0=user, ch1=model. ASR --stereo uses ch1. + for entry_id, src_wav in entries_with_audio: + dest_dir = fdb_prepared / fdb_dir_name(entry_id) + dest_dir.mkdir(parents=True, exist_ok=True) + dest_wav = dest_dir / "output.wav" + _ensure_wav_to_dest(src_wav, dest_wav, stereo=stereo) + if copy_input_wav_to_dir and fdb_data_path and fdb_data_path.exists(): + input_wav_path = _get_input_wav_path(subtest, entry_id, fdb_data_path, fdb_version) + if input_wav_path is not None: + shutil.copy2(input_wav_path, dest_dir / "input.wav") + + # FDB turn_taking eval requires turn_taking.json (input turn end time) in each sample dir + vers = _version_folders(fdb_version) + if subtest == "turn_taking" and fdb_data_path and fdb_data_path.exists(): + for entry_id, _ in entries_with_audio: + entry_id = str(entry_id) + if entry_id.startswith("candor_turn_taking_"): + sample_id = entry_id.replace("candor_turn_taking_", "") + elif entry_id.isdigit(): + sample_id = entry_id # e.g. 
id "1" -> candor_turn_taking/1/ + else: + continue + for ver in vers: + src = fdb_data_path / ver / "candor_turn_taking" / sample_id / "turn_taking.json" + if src.exists(): + shutil.copy2(src, fdb_prepared / entry_id / "turn_taking.json") + break + else: + print(f"Warning: turn_taking.json not found for {entry_id} (tried {vers})") + + # FDB ASR for interruption expects interrupt.json in each sample dir (to crop audio after interrupt) + # v1.0 uses folder "synthetic_user_interruption" and has interrupt.json; v1.5 uses "user_interruption" and has metadata.json + if subtest == "interruption" and fdb_data_path and fdb_data_path.exists(): + folder_v1_0 = "synthetic_user_interruption" + folder_v1_5 = "user_interruption" + for entry_id, _ in entries_with_audio: + entry_id = str(entry_id) + sample_id = None + if entry_id.startswith("synthetic_user_interruption_"): + sample_id = entry_id.replace("synthetic_user_interruption_", "") + elif entry_id.startswith("user_interruption_"): + sample_id = entry_id.replace("user_interruption_", "") + elif entry_id.isdigit(): + sample_id = entry_id + if sample_id is None: + continue + dest = fdb_prepared / entry_id / "interrupt.json" + dest.parent.mkdir(parents=True, exist_ok=True) + copied = False + for ver in vers: + for folder in (folder_v1_5, folder_v1_0): + src_dir = fdb_data_path / ver / folder / sample_id + src_json = src_dir / "interrupt.json" + if src_json.exists(): + shutil.copy2(src_json, dest) + copied = True + break + # v1.5 has metadata.json only: {context_text, current_turn_text, timestamps} -> build interrupt.json + meta_src = src_dir / "metadata.json" + if meta_src.exists(): + with open(meta_src, "r", encoding="utf-8") as f: + meta = json.load(f) + interrupt_payload = [ + { + "context": meta.get("context_text", ""), + "interrupt": meta.get("current_turn_text", ""), + "timestamp": meta.get("timestamps", [0.0, 0.0]), + } + ] + with open(dest, "w", encoding="utf-8") as f: + json.dump(interrupt_payload, f, indent=2) + copied 
= True + break + if copied: + break + if not copied: + print(f"Warning: interrupt.json/metadata.json not found for {entry_id} (tried {vers}, {folder_v1_5}/{folder_v1_0})") + + # Behavior eval (background_speech, talking_to_other, v1.5 backchannel, v1.5 interruption) needs: + # input.json, clean_input.json, output.json, clean_output.json + # Copy input.wav and clean_input.wav from FDB v1.5 data; write placeholder clean_output.json; ASR will fill input.json and clean_input.json + _behavior_subtests = ("background_speech", "talking_to_other") + _is_v1_5_backchannel = subtest == "backchannel" and fdb_version == "v1.5" + _is_v1_5_interruption = subtest == "interruption" and fdb_version == "v1.5" + if (subtest in _behavior_subtests or _is_v1_5_backchannel or _is_v1_5_interruption) and fdb_data_path and fdb_data_path.exists(): + # Map subtest to v1.5 source folder name + if _is_v1_5_backchannel: + _src_folder = "user_backchannel" + elif _is_v1_5_interruption: + _src_folder = "user_interruption" + else: + _src_folder = subtest + _prefixes = { + "background_speech": "background_speech_", + "talking_to_other": "talking_to_other_", + } + if _is_v1_5_backchannel: + _prefixes["backchannel"] = "user_backchannel_" + if _is_v1_5_interruption: + _prefixes["interruption"] = "user_interruption_" + for entry_id, _ in entries_with_audio: + entry_id = str(entry_id) + sample_id = None + for prefix in _prefixes.values(): + if entry_id.startswith(prefix): + sample_id = entry_id.replace(prefix, "", 1) + break + if sample_id is None and entry_id.isdigit(): + sample_id = entry_id + if sample_id is None: + continue + dest_dir = fdb_prepared / fdb_dir_name(entry_id) + dest_dir.mkdir(parents=True, exist_ok=True) + for ver in vers: + src_dir = fdb_data_path / ver / _src_folder / sample_id + if not src_dir.exists(): + continue + for wav_name in ("input.wav", "clean_input.wav"): + src = src_dir / wav_name + if src.exists(): + shutil.copy2(src, dest_dir / wav_name) + # Placeholder: we don't run 
def main():
    """CLI entry point: prepare an FDB-format directory from eval output.jsonl and audio.

    Exits with status 1 when output.jsonl is missing and status 2 on argument
    errors (including --run_asr given without --asr_task).
    """
    parser = argparse.ArgumentParser(description="Prepare FDB-format dir from eval output.jsonl and audio")
    parser.add_argument("--eval_results_dir", type=Path, required=True)
    parser.add_argument("--fdb_repo", type=Path, required=True)
    parser.add_argument("--subdir", default="fdb_prepared")
    parser.add_argument("--run_asr", action="store_true", help="Run FDB ASR (requires --asr_task)")
    parser.add_argument("--asr_task", choices=["full", "user_interruption"], help="FDB asr.py --task")
    parser.add_argument(
        "--fdb_data_path",
        type=Path,
        default=None,
        help="FDB dataset root; required for turn_taking (turn_taking.json) and interruption (interrupt.json)",
    )
    parser.add_argument(
        "--subtest", default=None, help="Subtest name; needed to copy task metadata (turn_taking.json for turn_taking, interrupt.json for interruption)"
    )
    parser.add_argument(
        "--fdb_version", default="v1.0", choices=["v1.0", "v1.5"], help="FDB dataset version (for metadata paths under fdb_data_path)"
    )
    args = parser.parse_args()

    # Bug fix: previously --run_asr without --asr_task silently skipped ASR
    # (prepare_fdb_dir guards on both); fail fast instead, matching the help text.
    if args.run_asr and not args.asr_task:
        parser.error("--run_asr requires --asr_task")

    output_jsonl = args.eval_results_dir / "output.jsonl"
    if not output_jsonl.exists():
        print(f"Error: {output_jsonl} not found")
        sys.exit(1)

    path = prepare_fdb_dir(
        args.eval_results_dir,
        output_jsonl,
        args.fdb_repo,
        args.subdir,
        asr_task=args.asr_task,
        run_asr=args.run_asr,
        fdb_data_path=args.fdb_data_path,
        subtest=args.subtest,
        fdb_version=args.fdb_version,
        stereo=True,
    )
    print(f"Prepared: {path}")


if __name__ == "__main__":
    main()
+ +Usage: + python run_eval.py --config fdb_s2s_offline_config.yaml +""" + +import argparse +import shlex +from datetime import datetime +from pathlib import Path + +import yaml + +from nemo_skills.pipeline.cli import eval as nemo_eval +from nemo_skills.pipeline.cli import run_cmd, wrap_arguments + +ALL_SUBTESTS = [ + "pause_candor", + "pause_synthetic", + "backchannel", + "turn_taking", + "interruption", +] +# v1.5 has different subtasks (overlap-focused) +ALL_SUBTESTS_V1_5 = [ + "background_speech", + "talking_to_other", + "backchannel", + "interruption", +] + + +def load_config(config_path: str) -> dict: + with open(config_path, "r") as f: + return yaml.safe_load(f) + + +def _benchmark_name(subtest: str, fdb_version: str) -> str: + """Return benchmark name for eval-results path and nemo-skills (e.g. fdb_v1.pause or fdb_v1_5.pause).""" + if fdb_version == "v1.5": + return f"fdb_v1_5.{subtest}" + return f"fdb_v1.{subtest}" + + +def build_score_command(config: dict, subtest: str, force: bool = False) -> str: + """Build the scoring command to run via run_cmd. 
+ + Uses run_fdb_scoring.py to create output compatible with nemo-skills: + - summarized-results/ directory with logs + - metrics.json with evaluation results + """ + fdb_version = config.get("fdb_version", "v1.0") + benchmark = _benchmark_name(subtest, fdb_version) + eval_results_dir = f"{config['output_dir']}/eval-results/{benchmark}" + fdb_repo = config["fdb_repo_path"] + scoring_script = "nemo_skills/dataset/fdb/scripts/run_fdb_scoring.py" + + cmd_args = [ + f"python {scoring_script}", + f"--eval_results_dir {eval_results_dir}", + f"--fdb_repo {fdb_repo}", + f"--subtest {subtest}", + f"--fdb_version {fdb_version}", + ] + if force: + cmd_args.append("--force") + fdb_data_path = config.get("fdb_data_path") + if fdb_data_path: + cmd_args.append(f"--fdb_data_path {shlex.quote(str(fdb_data_path))}") + + return " ".join(cmd_args) + + +def run_fdb_eval(config: dict): + """Run Full-Duplex-Bench evaluation using direct Python calls.""" + + # Parse subtests + fdb_version = config.get("fdb_version", "v1.0") + valid_subtests = ALL_SUBTESTS_V1_5 if fdb_version == "v1.5" else ALL_SUBTESTS + subtests_cfg = config.get("subtests", "all") + if subtests_cfg == "all": + subtests = list(valid_subtests) + elif isinstance(subtests_cfg, str): + subtests = [s.strip() for s in subtests_cfg.split(",")] + else: + subtests = subtests_cfg + + subtests = [s for s in subtests if s in valid_subtests] + if not subtests: + raise ValueError("No valid subtests specified") + + generation_only = config.get("generation_only", False) + scoring_only = config.get("scoring_only", False) + dry_run = config.get("dry_run", False) + + print(f"Processing {len(subtests)} subtests: {', '.join(subtests)}") + print(f"Output directory: {config['output_dir']}") + + # Build base extra args for hydra overrides + # Skip native evaluation for all subtests - FDB scorer handles evaluation + base_extra_args = ["++eval_type=null"] + if config.get("max_samples"): + 
base_extra_args.append(f"++max_samples={config['max_samples']}") + if config.get("server_server_type"): + base_extra_args.append(f"++server.server_type={config['server_server_type']}") + if config.get("api_key_env_var"): + base_extra_args.append(f"++server.api_key_env_var={config['api_key_env_var']}") + if config.get("inference_overrides"): + base_extra_args.extend(config["inference_overrides"].strip().split()) + + for subtest in subtests: + extra_args_str = " ".join(base_extra_args) + print(f"\n{'=' * 60}") + print(f"Processing subtest: {subtest} (FDB {fdb_version})") + print(f"{'=' * 60}") + + benchmark = _benchmark_name(subtest, fdb_version) + expname = f"{config.get('expname', 'fdb')}_{subtest}" + + # Generation phase + if not scoring_only: + print("\n--- Running generation ---") + server_gpus = config.get("server_gpus", 1) + # Use cpu_partition when not self-hosting (external API) + partition = config.get("cpu_partition") if server_gpus == 0 else config.get("partition") + nemo_eval( + ctx=wrap_arguments(extra_args_str), + cluster=config["cluster"], + output_dir=config["output_dir"], + benchmarks=benchmark, + model=config["model"], + server_type=config.get("server_type", "vllm"), + server_gpus=server_gpus, + server_address=config.get("server_address"), + num_chunks=config.get("num_chunks", 1), + server_container=config.get("server_container"), + server_entrypoint=config.get("server_entrypoint"), + data_dir=config.get("data_dir"), + server_args=config.get("server_args", ""), + installation_command=config.get("installation_command"), + partition=partition, + expname=expname, + auto_summarize_results=False, + dry_run=dry_run, + ) + + # Scoring phase + if not generation_only: + print("\n--- Running scoring ---") + score_command = build_score_command(config, subtest, force=config.get("scoring_force", False)) + eval_results_path = f"{config['output_dir']}/eval-results/{benchmark}" + # FDB scoring runs get_transcript/asr.py which requires NeMo + CUDA; use GPU 
partition and 1 GPU + scoring_container = config.get("scoring_container") or config.get("server_container") or "nemo-skills" + scoring_gpus = config.get("scoring_gpus", 1) # ASR needs GPU; default 1 + scoring_partition = config.get("scoring_partition") or config.get("partition") # GPU partition, not cpu_partition + run_cmd( + ctx=wrap_arguments(""), + cluster=config["cluster"], + command=score_command, + container=scoring_container, + partition=scoring_partition, + num_gpus=scoring_gpus, + run_after=[expname] if not scoring_only else None, + expname=f"{expname}_score", + installation_command=config.get("scoring_installation_command"), + log_dir=f"{eval_results_path}/summarized-results", + dry_run=dry_run, + ) + + print(f"\n{'=' * 60}") + print("Done!") + print(f"{'=' * 60}") + + +def main(): + parser = argparse.ArgumentParser(description="Run Full-Duplex-Bench evaluation (generate + score)") + parser.add_argument("--config", required=True, help="Path to YAML config file") + + # CLI overrides + parser.add_argument("--cluster", help="Override cluster") + parser.add_argument("--partition", help="Override partition") + parser.add_argument("--model", help="Override model") + parser.add_argument("--output_dir", help="Override output directory") + parser.add_argument("--subtests", help="Override subtests (comma-separated)") + parser.add_argument("--max_samples", type=int, help="Override max_samples") + parser.add_argument("--dry_run", action="store_true", help="Print commands without executing") + parser.add_argument("--generation_only", action="store_true", help="Only run generation") + parser.add_argument("--scoring_only", action="store_true", help="Only run scoring") + parser.add_argument("--scoring_force", action="store_true", help="Re-run scoring even if metrics.json exists") + + args = parser.parse_args() + + config = load_config(args.config) + + # Apply CLI overrides + for key in ["cluster", "partition", "model", "output_dir", "subtests", "max_samples"]: + if 
getattr(args, key, None) is not None: + config[key] = getattr(args, key) + if args.dry_run: + config["dry_run"] = True + if args.generation_only: + config["generation_only"] = True + if args.scoring_only: + config["scoring_only"] = True + if getattr(args, "scoring_force", False): + config["scoring_force"] = True + + # Add timestamp to output_dir for new runs (so each run has a unique dir). Skip when scoring_only so we use the existing dir. + output_dir = config.get("output_dir", "") + if output_dir and not config.get("scoring_only") and not any(char.isdigit() for char in Path(output_dir).name): + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + config["output_dir"] = f"{output_dir}_{timestamp}" + + run_fdb_eval(config) + + +if __name__ == "__main__": + main() diff --git a/nemo_skills/dataset/fdb/scripts/run_fdb_scoring.py b/nemo_skills/dataset/fdb/scripts/run_fdb_scoring.py new file mode 100644 index 0000000000..52a42c6e55 --- /dev/null +++ b/nemo_skills/dataset/fdb/scripts/run_fdb_scoring.py @@ -0,0 +1,243 @@ +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +FDB scoring: prepare (copy audio to fdb_prepared) -> run ASR -> run FDB evaluate -> write metrics.json. +Used by run_eval.py. 
+""" + +import argparse +import ast +import json +import os +import re +import subprocess +import sys +from pathlib import Path + +ASR_TASK_MAP = { + "pause": "full", + "pause_candor": "full", + "pause_synthetic": "full", + "backchannel": "full", + "turn_taking": "full", + "interruption": "user_interruption", + "background_speech": "full", + "talking_to_other": "full", +} +FDB_TASK_MAP = { + "pause": "pause_handling", + "pause_candor": "pause_handling", + "pause_synthetic": "pause_handling", + "backchannel": "backchannel", + "turn_taking": "smooth_turn_taking", + "interruption": "user_interruption", + "background_speech": "behavior", + "talking_to_other": "behavior", +} +# v1.5 uses behavior eval for backchannel and interruption (not the v1.0 JSD/TOR-based evals) +FDB_TASK_MAP_V1_5 = { + **FDB_TASK_MAP, + "backchannel": "behavior", + "interruption": "behavior", +} +# v1.5 interruption needs full ASR (not user_interruption which crops to post-interrupt) +ASR_TASK_MAP_V1_5 = { + **ASR_TASK_MAP, + "interruption": "full", +} + + +def _convert_stereo_to_mono(fdb_prepared: Path): + """Convert stereo output.wav files to mono (model channel) for Silero-VAD compatibility.""" + try: + import soundfile as sf_mod + except ImportError: + print("Warning: soundfile not available, skipping stereo→mono conversion for timing.") + return + for sample_dir in sorted(fdb_prepared.iterdir()): + out_wav = sample_dir / "output.wav" + if not out_wav.exists(): + continue + try: + data, sr = sf_mod.read(str(out_wav)) + if data.ndim == 2 and data.shape[1] >= 2: + mono = data[:, 1] # ch1 = model channel + sf_mod.write(str(out_wav), mono, sr) + except Exception as e: + print(f"Warning: could not convert {out_wav} to mono: {e}") + + +def main(): + parser = argparse.ArgumentParser(description="FDB prepare + ASR + evaluate -> metrics.json") + parser.add_argument("--eval_results_dir", type=Path, required=True) + parser.add_argument("--fdb_repo", type=Path, required=True) + 
parser.add_argument("--subtest", required=True, choices=list(ASR_TASK_MAP)) + parser.add_argument("--fdb_data_path", type=Path, default=None, help="FDB dataset root; required for turn_taking (turn_taking.json) and interruption (interrupt.json)") + parser.add_argument("--fdb_version", default="v1.0", choices=["v1.0", "v1.5"], help="FDB dataset version (metadata paths and metrics key)") + args = parser.parse_args() + + eval_results_dir = args.eval_results_dir.resolve() + fdb_repo = args.fdb_repo.resolve() + metrics_file = eval_results_dir / "metrics.json" + if not eval_results_dir.exists() or not fdb_repo.exists(): + print("Error: eval_results_dir or fdb_repo not found.") + sys.exit(1) + if not (eval_results_dir / "output.jsonl").exists(): + print("Error: output.jsonl not found.") + sys.exit(1) + + # turn_taking, interruption, background_speech, talking_to_other need fdb_data_path for metadata / input wavs + if args.subtest in ("turn_taking", "interruption", "background_speech", "talking_to_other") and ( + args.fdb_data_path is None or not args.fdb_data_path.exists() + ): + print( + f"Error: --fdb_data_path is required for subtest '{args.subtest}' " + "(FDB dataset root; used to copy task metadata and, for background_speech/talking_to_other, input.wav and clean_input.wav)." 
+ ) + sys.exit(1) + + asr_map = ASR_TASK_MAP_V1_5 if args.fdb_version == "v1.5" else ASR_TASK_MAP + asr_task = asr_map[args.subtest] + task_map = FDB_TASK_MAP_V1_5 if args.fdb_version == "v1.5" else FDB_TASK_MAP + fdb_task = task_map[args.subtest] + prep_script = Path(__file__).resolve().parent / "prepare_fdb_eval_dir.py" + + prep_cmd = [ + sys.executable, str(prep_script), + "--eval_results_dir", str(eval_results_dir), + "--fdb_repo", str(fdb_repo), + "--run_asr", "--asr_task", asr_task, + "--subtest", args.subtest, + "--fdb_version", args.fdb_version, + ] + if args.fdb_data_path is not None: + prep_cmd.extend(["--fdb_data_path", str(args.fdb_data_path)]) + subprocess.run(prep_cmd, check=True) + + fdb_prepared = eval_results_dir / "fdb_prepared" + evaluate_script = fdb_repo / "evaluation" / "evaluate.py" + if not evaluate_script.exists(): + print(f"Error: {evaluate_script} not found") + sys.exit(1) + # Run from evaluation/ so FDB scripts find ./icc_gt_distribution.json (backchannel) and other relative paths. + # Pass through env so NVIDIA_API_KEY is available for interruption/behavior tasks (NVIDIA NIM API). 
+ result = subprocess.run( + [sys.executable, str(evaluate_script), "--task", fdb_task, "--root_dir", str(fdb_prepared)], + cwd=str(fdb_repo / "evaluation"), capture_output=True, text=True, env=os.environ.copy(), + ) + stdout, stderr = result.stdout, result.stderr + print(stdout) + if stderr: + print(stderr, file=sys.stderr) + + metrics = {} + combined = stdout + "\n" + stderr + # Extract explicitly known FDB metric lines + # Per-subtask reporting: turn_taking -> TOR %, latency_ms; interruption -> TOR %, rating (GPT), latency_ms; pause -> candor TOR %, synthetic TOR % + explicit_metrics = [ + ("JSD - Mean", "jsd"), + ("TOR - Mean", "tor"), + ("Frequency - Mean", "frequency"), + ("Average take turn", "turn"), + ("Average latency", "latency"), + ("Average rating", "rating"), # GPT/LLM judge score (interruption only) + ("Candor - TOR %", "tor_candor_pct"), # pause: candor subset + ("Synthetic - TOR %", "tor_synthetic_pct"), # pause: synthetic subset + ] + for name, key in explicit_metrics: + m = re.search(rf"{re.escape(name)}\s*(?:\(s\))?\s*:\s*([0-9.]+)", combined) + if m: + try: + metrics[key] = float(m.group(1)) + except ValueError: + pass + # TOR as percentage (0-100) for turn_taking and interruption + if "turn" in metrics: + metrics["tor_pct"] = round(metrics["turn"] * 100, 2) + # Latency in ms for turn_taking and interruption + if "latency" in metrics: + metrics["latency_ms"] = round(metrics["latency"] * 1000, 2) + # Behavior eval (background_speech, talking_to_other): "Ratios (C-axis): {'C_RESPOND': 0.8, 'C_RESUME': 0.2}" + ratios_match = re.search(r"Ratios \(C-axis\):\s*(.+)", combined) + if ratios_match: + raw = ratios_match.group(1).strip().split("\n")[0].strip() + # Trim to balanced {...} (in case of trailing text) + if raw.startswith("{"): + end = raw.rfind("}") + if end != -1: + raw = raw[: end + 1] + try: + behavior_ratios = ast.literal_eval(raw) + if isinstance(behavior_ratios, dict): + metrics["behavior_ratios"] = {k: round(float(v), 4) for k, v in 
behavior_ratios.items()} + for k, v in behavior_ratios.items(): + metrics[f"behavior_{k}"] = round(float(v), 4) + # Add missing C_* keys as 0 for consistent schema + for key in ("C_RESPOND", "C_RESUME", "C_UNCERTAIN_HANDLING", "C_UNKNOWN"): + if key not in metrics["behavior_ratios"]: + metrics["behavior_ratios"][key] = 0.0 + metrics[f"behavior_{key}"] = 0.0 + except (ValueError, SyntaxError): + pass + # --- Timing metrics (Stop Latency & Response Latency) for v1.5 --- + # get_timing.py uses Silero-VAD on input.wav / output.wav to compute overlap and gap intervals. + # Silero-VAD expects mono; output.wav may be stereo (ch0=user, ch1=model) so convert to mono first. + if args.fdb_version == "v1.5": + timing_script = fdb_repo / "evaluation" / "get_timing.py" + if timing_script.exists(): + _convert_stereo_to_mono(fdb_prepared) + print(f"Running timing analysis (get_timing.py) on {fdb_prepared} ...") + timing_result = subprocess.run( + [sys.executable, str(timing_script), "--root_dir", str(fdb_prepared)], + cwd=str(fdb_repo / "evaluation"), capture_output=True, text=True, + ) + print(timing_result.stdout) + if timing_result.stderr: + print(timing_result.stderr, file=sys.stderr) + + stop_durations = [] + resp_durations = [] + for sample_dir in sorted(fdb_prepared.iterdir()): + lat_file = sample_dir / "latency_intervals.json" + if not lat_file.exists(): + continue + with open(lat_file, "r") as lf: + lat_data = json.load(lf) + for s, e in lat_data.get("latency_stop_list", []): + stop_durations.append(e - s) + for s, e in lat_data.get("latency_resp_list", []): + resp_durations.append(e - s) + + if stop_durations: + metrics["stop_latency"] = round(sum(stop_durations) / len(stop_durations), 4) + metrics["stop_latency_ms"] = round(metrics["stop_latency"] * 1000, 2) + if resp_durations: + metrics["response_latency"] = round(sum(resp_durations) / len(resp_durations), 4) + metrics["response_latency_ms"] = round(metrics["response_latency"] * 1000, 2) + else: + print(f"Warning: 
{timing_script} not found, skipping timing metrics.") + + if not metrics: + metrics["status"] = "no_metrics_found" + + benchmark_key = f"fdb_v1_5.{args.subtest}" if args.fdb_version == "v1.5" else f"fdb_v1.{args.subtest}" + metrics_file.parent.mkdir(parents=True, exist_ok=True) + with open(metrics_file, "w") as f: + json.dump({benchmark_key: {"greedy": metrics}}, f, indent=2) + print(f"Metrics written to {metrics_file}") + + +if __name__ == "__main__": + main() diff --git a/nemo_skills/dataset/fdb/scripts/run_prepare_and_eval_both.sh b/nemo_skills/dataset/fdb/scripts/run_prepare_and_eval_both.sh new file mode 100755 index 0000000000..9290c0cec5 --- /dev/null +++ b/nemo_skills/dataset/fdb/scripts/run_prepare_and_eval_both.sh @@ -0,0 +1,39 @@ +#!/usr/bin/env bash +# Prepare FDB v1.0 and v1.5, then run s2s voicechat offline evaluation for both. +# +# IMPORTANT: Run this script ON THE CLUSTER (where your repo lives on lustre). +# The eval config uses data_dir on lustre; the prepared test.jsonl must exist at +# that path. If you only ran prepare locally, either run this script on the +# cluster or rsync the prepared fdb dir to the cluster at the same path. +# +# Run from Skills/ with PYTHONPATH=$(pwd). For v1.5: ensure Full-Duplex-Bench-data +# on the cluster contains v1.5/ or v1_5/, or install gdown to download. 
+ +set -e +SKILLS_DIR="/home/mmkrtchyan/projects/speechLM/s2s/Skills" +cd "$SKILLS_DIR" +export PYTHONPATH="${SKILLS_DIR}:${PYTHONPATH}" + +# Paths: use same lustre paths as in fdb_s2s_offline_*_config.yaml so prepared data is where eval expects it +FDB_DATA="${FDB_DATA_PATH:-$SKILLS_DIR/../Full-Duplex-Bench-data}" +CONFIG_DIR="nemo_skills/dataset/fdb/scripts" + +echo "=== 1) Prepare FDB v1.0 ===" +python -m nemo_skills.dataset.fdb.prepare --fdb_data_path "$FDB_DATA" --version v1.0 + +echo "" +echo "=== 2) Prepare FDB v1.5 (skips if v1.5 data missing; install gdown to download) ===" +python -m nemo_skills.dataset.fdb.prepare --fdb_data_path "$FDB_DATA" --version v1.5 || true + +echo "" +echo "=== 3) Run s2s voicechat offline eval for v1.0 ===" +python nemo_skills/dataset/fdb/scripts/run_eval.py \ + --config "$CONFIG_DIR/fdb_s2s_offline_v1.0_config.yaml" + +echo "" +echo "=== 4) Run s2s voicechat offline eval for v1.5 ===" +python nemo_skills/dataset/fdb/scripts/run_eval.py \ + --config "$CONFIG_DIR/fdb_s2s_offline_v1.5_config.yaml" + +echo "" +echo "Done. Check output_dir in the configs for eval-results and metrics.json." 
diff --git a/nemo_skills/dataset/mmau-pro/closed_form/__init__.py b/nemo_skills/dataset/mmau-pro/closed_form/__init__.py index 4e3b424d84..4390c1d887 100644 --- a/nemo_skills/dataset/mmau-pro/closed_form/__init__.py +++ b/nemo_skills/dataset/mmau-pro/closed_form/__init__.py @@ -16,6 +16,7 @@ METRICS_TYPE = "mmau_pro_closed_form" SCORE_MODULE = "nemo_skills.evaluation.metrics.mmau_pro_metrics" GENERATION_ARGS = "++prompt_format=openai" +EVAL_ARGS = "++eval_type=mmau-pro" # NVEmbed judge configuration for closed-form evaluation JUDGE_PIPELINE_ARGS = { diff --git a/nemo_skills/dataset/mmau-pro/open_ended/__init__.py b/nemo_skills/dataset/mmau-pro/open_ended/__init__.py index 22773d6fed..c5f09272d2 100644 --- a/nemo_skills/dataset/mmau-pro/open_ended/__init__.py +++ b/nemo_skills/dataset/mmau-pro/open_ended/__init__.py @@ -23,4 +23,4 @@ "server_type": "openai", "server_address": "https://integrate.api.nvidia.com/v1", } -JUDGE_ARGS = "++prompt_config=judge/speechlm ++generation_key=judgement" +JUDGE_ARGS = "++prompt_config=judge/mmau-pro ++generation_key=judgement" diff --git a/nemo_skills/dataset/mmau-pro/prepare.py b/nemo_skills/dataset/mmau-pro/prepare.py index a6f04d621b..0ea66ec2b7 100644 --- a/nemo_skills/dataset/mmau-pro/prepare.py +++ b/nemo_skills/dataset/mmau-pro/prepare.py @@ -75,8 +75,8 @@ def format_entry(entry, with_audio=False): if category == "open": content = entry["question"] elif choices and len(choices) > 1: - options_text = "\n".join(f"{chr(65 + i)}. {choice}" for i, choice in enumerate(choices)) - content = f"{entry['question']}\n\n{options_text}" + options_text = "\n".join(f"{chr(65 + i)}) {choice}" for i, choice in enumerate(choices)) + content = f"{entry['question']}\n\n{options_text}\n\nRespond with the complete text of the correct option, not just the letter." 
else: content = entry["question"] @@ -84,13 +84,18 @@ def format_entry(entry, with_audio=False): if entry.get("audio_path"): audio_path = entry["audio_path"] - - if isinstance(audio_path, list) and audio_path: - user_message["audios"] = [{"path": path, "duration": 10.0} for path in audio_path] - elif isinstance(audio_path, str): - user_message["audio"] = {"path": audio_path, "duration": 10.0} - - formatted_entry["messages"] = [user_message] + # Prepend /dataset/mmau-pro/ to make paths absolute for cluster + if len(audio_path) == 1: + user_message["audio"] = {"path": f"/dataset/mmau-pro/{audio_path[0]}"} + else: + user_message["audios"] = [{"path": f"/dataset/mmau-pro/{path}"} for path in audio_path] + + # Don't use /no_think for open-ended questions to allow reasoning + system_content = "You are a helpful assistant." + if category != "open": + system_content += " /no_think" + + formatted_entry["messages"] = [{"role": "system", "content": system_content}, user_message] return formatted_entry diff --git a/nemo_skills/dataset/s2s_demo/README.md b/nemo_skills/dataset/s2s_demo/README.md new file mode 100644 index 0000000000..0428d79892 --- /dev/null +++ b/nemo_skills/dataset/s2s_demo/README.md @@ -0,0 +1,236 @@ +# End2end Evaluation of the Duplex Speech2Speech Model Based on Nemo-Skills + +The suggested recipe performs inference with a duplex s2s model including TTS. It's based on the incremental decoding scripts. Currently the [demo_20251124](demo_20251124/) test set and Voicebench are supported. + +For demo the recipe runs scoring based on Kevin's script which includes: +- Turn-taking evaluation. +- User ASR quality evaluation. +- Agent TTS quality evaluation. +- Special symbol balance. +- Agent content quality evaluation (LLM based). + +For Voicebench it runs original Voicebench evaluation. The only change made there is support for nvidia inference API endpoints. 
+
+In addition to average metrics the scoring recipe saves all kinds of alignments and error scores for each sample.
+
+## Getting started
+
+Currently different scripts are used to run demo and Voicebench. The configs are similar but not identical; they should be unified going forward.
+
+1. Go to https://inference.nvidia.com/ to get an API key.
+2. Clone this branch:
+```bash
+git clone git@github.com:NVIDIA-NeMo/Skills.git nemo-skills
+cd nemo-skills
+git checkout vmendelev/2512_s2s_eval
+```
+3. Create a `.venv` and install nemo-skills:
+
+```bash
+cd /path/to/nemo-skills
+python -m venv .venv
+source .venv/bin/activate
+pip install -e .
+```
+
+4. Decide which cluster you want to work on and set up the corresponding cluster configuration. The example configuration for draco_oci (oci_iad) is provided [here](../../../cluster_configs/). You can get more configurations [here](https://gitlab-master.nvidia.com/igitman/nemo-skills-configs/-/tree/main/cluster_configs/v0.7.1?ref_type=heads). Don't forget to update the user name and folder names, and add `/lustre` to the mounts list.
+
+### Dataset preparation
+#### Option 1
+Use the data directory which is already present on draco_oci, or copy it to another cluster if necessary: `/lustre/fsw/portfolios/llmservice/users/vmendelev/experiments/voicebench_test/data_dir`
+
+#### Option 2
+You can prepare the datasets again; the instructions are given later in this document.
+
+### Running the tests
+
+5. Look into the [demo config](scripts/s2s_demo_eval_config.yaml) or [voicebench config](../voicebench/scripts/voicebench_s2s_session_full_config.yaml) and make sure that the required artifacts are in the specified places.
+6. Check the `data_dir` parameter. It should point to a folder on the cluster with `s2s_demo` and `voicebench` folders from `nemo_skills/datasets`. Demo wav files should be in `s2s_demo/demo_20251124/data`.
On draco everything is in `/lustre/fsw/portfolios/llmservice/users/vmendelev/experiments/voicebench_test/data_dir` +7. Adjust `output_dir`. +8. Set the `num_chunks` to be e.g. 8 to make the thing run faster. This is applied to each sub-benchmark. For demo 8 is enough, for voicebench I would recommend 32 or more because of big test sets like MMSU. In this case runtime will be about 3 hrs. +9. Set `max_samples` to e.g. 2 if you want a fast run. +10. Make sure that `$NVIDIA_API_KEY` is set to a correct value. +11. Run the below command: + +```bash +# DEMO +cd /path/to/nemo-skills && \ +source .venv/bin/activate && \ +NEMO_SKILLS_DISABLE_UNCOMMITTED_CHANGES_CHECK=1 python nemo_skills/dataset/s2s_demo/scripts/run_s2s_demo_eval.py \ + --config nemo_skills/dataset/s2s_demo/scripts/s2s_demo_eval_config.yaml +``` + +or + +```bash +# VOICEBENCH +cd /path/to/nemo-skills && \ +source .venv/bin/activate && \ +NEMO_SKILLS_DISABLE_UNCOMMITTED_CHANGES_CHECK=1 python nemo_skills/dataset/voicebench/scripts/generate_from_api_and_score_official.py --config nemo_skills/dataset/voicebench/scripts/voicebench_s2s_session_full_config.yaml +``` + +## Running the Server Only (for Manual Testing) + +If you want to run just the unified server for manual testing or integration with external clients (e.g., AU-Harness), you can start it directly without installing nemo-skills. Just copy the code folder to the cluster. 
+ +**On draco_oci:** + +### Option 1: Text-only mode (no TTS) + +Create `run_server.sbatch`: +```bash +#!/bin/bash +#SBATCH --partition=batch_block1,batch_block3,batch_block4 +#SBATCH --account=convai_convaird_nemo-speech +#SBATCH --nodes=1 +#SBATCH --gpus=1 +#SBATCH --time=02:00:00 +#SBATCH --output=server_%j.log + +srun --container-image=/lustre/fsw/portfolios/convai/users/ecasanova/docker_images/nemo_duplex_november_eartts.sqsh \ + --container-mounts="/lustre:/lustre,/path/to/your/workspace:/workspace" \ + bash -c 'cd /workspace && \ + export PYTHONPATH="/workspace/ns_eval:$PYTHONPATH" && \ + export HF_HOME=/lustre/fsw/portfolios/llmservice/users/YOUR_USER/.cache/huggingface && \ + export INCLUDE_DEBUG_INFO=true && \ + python -m nemo_skills.inference.server.serve_unified \ + --backend s2s_session \ + --model /lustre/fsw/portfolios/convai/users/ecasanova/Checkpoints/Nemotron-VoiceChat-november/duplex-eartts-2mim_sw_et_eos_dp_eos_dup_fp32-stt-3-december_stt_edresson_model_R_digits_norm_eip_0.1_EA_model_step_9005 \ + --config_path /lustre/fsw/portfolios/convai/users/ecasanova/S2S-Duplex-new-codebase/scripts/configs/inference/nanov2_demo_model_eartts_updated.yaml \ + --code_path /lustre/fsw/portfolios/convai/users/kevinhu/S2S-Duplex-new-codebase/branches/NeMo-release_not_rebased \ + --ignore_system_prompt \ + --num_frames_per_inference 2 \ + --silence_padding_sec 0.0 \ + --session_artifacts_dir /lustre/fsw/portfolios/llmservice/users/YOUR_USER/tmp/s2s_artifacts \ + --no_decode_audio \ + --response_end_detection_mode eos \ + --eos_detection_window 10 \ + --port 8000' +``` + +### Option 2: With audio output (TTS enabled) + +Create `run_server_sound.sbatch`: +```bash +#!/bin/bash +#SBATCH --partition=batch_block1,batch_block3,batch_block4 +#SBATCH --account=convai_convaird_nemo-speech +#SBATCH --nodes=1 +#SBATCH --gpus=1 +#SBATCH --time=02:00:00 +#SBATCH --output=server_%j.log + +srun 
--container-image=/lustre/fsw/portfolios/convai/users/ecasanova/docker_images/nemo_duplex_november_eartts.sqsh \ + --container-mounts="/lustre:/lustre,/path/to/your/workspace:/workspace" \ + bash -c 'cd /workspace && \ + export PYTHONPATH="/workspace/ns_eval:$PYTHONPATH" && \ + export HF_HOME=/lustre/fsw/portfolios/llmservice/users/YOUR_USER/.cache/huggingface && \ + export INCLUDE_DEBUG_INFO=true && \ + python -m nemo_skills.inference.server.serve_unified \ + --backend s2s_session \ + --model /lustre/fsw/portfolios/convai/users/ecasanova/Checkpoints/Nemotron-VoiceChat-november/duplex-eartts-2mim_sw_et_eos_dp_eos_dup_fp32-stt-3-december_stt_edresson_model_R_digits_norm_eip_0.1_EA_model_step_9005 \ + --config_path /lustre/fsw/portfolios/convai/users/ecasanova/S2S-Duplex-new-codebase/scripts/configs/inference/nanov2_demo_model_eartts_updated.yaml \ + --code_path /lustre/fsw/portfolios/convai/users/kevinhu/S2S-Duplex-new-codebase/branches/NeMo-release_not_rebased \ + --ignore_system_prompt \ + --num_frames_per_inference 2 \ + --silence_padding_sec 0.0 \ + --session_artifacts_dir /lustre/fsw/portfolios/llmservice/users/YOUR_USER/tmp/s2s_artifacts \ + --port 8001' +``` + +Submit with `sbatch run_server.sbatch` or `sbatch run_server_sound.sbatch`. 
+ +**Notes:** +- Replace `YOUR_USER` with your username and `/path/to/your/workspace` with your workspace path containing the code +- `--session_artifacts_dir` - where session audio artifacts are saved +- `INCLUDE_DEBUG_INFO=true` - includes debug info (with ASR transcription) in responses; set to `false` to disable +- Text-only mode uses `--no_decode_audio`, `--response_end_detection_mode eos` (stops after consecutive PAD tokens) +- Audio mode uses default energy-based response end detection (TTS silence) +- The server exposes OpenAI-compatible `/v1/chat/completions` endpoint +- The server supports multi-turn conversations with automatic session management based on conversation history hashing + +Check `server_.log` for server output. Once running, the log will show which node it's on (e.g., `batch-block1-3196:8000`). You can then send requests from the login node or any machine that can reach the compute node. + +## Incremental and Session Backends + +The demo test is run with incremental backend which assumes silences already in the test audio or you can use `silence_padding_sec` to add trailing pause automatically. + +**Session backend.** This backend is used for Voicebench. The difference from the incremental one is that here we feed the user turn (no added pause), wait for the model to respond, then feed the second one and so on. We can also feed 2 turns in one go and preprogram feeding the second one at a predefined point while the system is responding to the first turn instead of 0s. With this one can enable a dialog between e.g Gemini and our model. + +### NeMo Integration + +The incremental backend interfaces with NeMo's `speechlm2` collection. The main model class is `NemotronVoiceChat` from `nemo.collections.speechlm2.models.nemotron_voicechat`. The frame-by-frame inference uses the following key interfaces: + +1. **Perception** (`stt_model.perception`) - Encodes raw audio waveform into frame-level embeddings. 
Called once per inference step with a sliding window audio buffer. + +2. **LLM Forward** (`stt_model.__call__`) - Generates agent response and ASR tokens from audio embeddings. Supports two modes: + - `DynamicCache` mode (for most models) - uses HuggingFace's dynamic KV cache + - `input_embeds_history` mode (for Mamba models) - accumulates all input embeddings + +3. **TTS Code Generation** (`tts_model.infer_codes_one_step`) - Generates audio codec tokens frame-by-frame from text tokens. Maintains its own `past_key_values` cache. + +4. **Audio Codec Decoding** (`tts_model.audio_codec.decode`) - Converts codec tokens to waveform audio. Called with a sliding window of codec tokens. + +The backend implementation is in `recipes/multimodal/server/backends/s2s_incremental_backend.py`. The session backend (`s2s_session_backend.py`) extends the incremental backend to persist state (LLM cache, audio buffers, frame index) across multiple turns. + +This should be based on the Niva code going forward. + +In case you want to use a different nemo branch -- just replace the path to nemo code in the config. + +## Preparing Tests + +Standard nemo-skills `test.jsonl` and corresponding set of wav files are what we need per benchmark. The current demo test set can be found at `/lustre/fsw/portfolios/llmservice/users/vmendelev/experiments/voicebench_test/data_dir/s2s_demo` and it was obtained from the available lhotse shars via this [script](convert_lhotse_to_eval.py). If you want to add more samples you need to add them to the `test.jsonl` and copy wav to the data folder. Or you can use the [script](convert_lhotse_to_eval.py) to convert another dataset. No reference transcription or segmentation is needed. 
+ +With Voicebench you can use the standard nemo-skills preparation command: + +```bash +cd /path/to/nemo-skills && \ +source .venv/bin/activate && \ +python -m nemo_skills.dataset.prepare voicebench +``` + +### VoiceBench Scoring Setup + +VoiceBench scoring uses the GPT judge (`api_judge.py`) for certain subtests. The original VoiceBench only supports OpenAI API, so modifications are needed for NVIDIA API support. + +**On draco_oci:** A pre-modified VoiceBench repository is already available. Just set `voicebench_repo_path` in your config to: +``` +/lustre/fsw/portfolios/llmservice/users/vmendelev/code/VoiceBench +``` + +The key modified file is `api_judge.py` (full path: `/lustre/fsw/portfolios/llmservice/users/vmendelev/code/VoiceBench/api_judge.py`) which adds: +- `--api_type nvidia` argument to use NVIDIA inference API +- `--nvidia_model` argument to specify the model (e.g., `meta/llama-3.1-70b-instruct`) +- Initializes OpenAI client with `https://inference-api.nvidia.com/v1` base URL + +**On other clusters:** Clone VoiceBench, copy the modified `api_judge.py` from draco, and update `voicebench_repo_path` in your config. 
+ +## Comparing Multiple Evaluation Results + +Use the `compare_eval_results.py` script to compare metrics from multiple model evaluations and generate a Markdown report: + +```bash +cd /path/to/nemo-skills && \ +source .venv/bin/activate && \ +python nemo_skills/dataset/s2s_demo/scripts/compare_eval_results.py \ + --eval_folders \ + "draco-oci-login-01.draco-oci-iad.nvidia.com:/lustre/fsw/portfolios/llmservice/users/vmendelev/tmp/s2s_demo_eval_t4:November Model" \ + "draco-oci-login-01.draco-oci-iad.nvidia.com:/lustre/fsw/portfolios/llmservice/users/vmendelev/experiments/voicebench_s2s_session_full/runs/voicebench_20251230_041530:November Model" \ + "draco-oci-login-01.draco-oci-iad.nvidia.com:/lustre/fsw/portfolios/llmservice/users/vmendelev/tmp/s2s_demo_eval_t4_c3:December Model" \ + "draco-oci-login-01.draco-oci-iad.nvidia.com:/lustre/fsw/portfolios/llmservice/users/vmendelev/experiments/voicebench_s2s_session_full/runs/voicebench_20251229_113456:December Model" \ + --output /tmp/comparison_report.md +``` + +The script supports: +- **Remote folders via SSH**: `hostname:/path/to/folder:DisplayName` +- **Local folders**: `/path/to/folder:DisplayName` +- **Mixed**: compare models from different clusters or local/remote + +See [example report](scripts/comparison_report.md) for output format. + +## TODOs +0. Refactor the backends to directly call Niva. +1. Integrate vLLM and Triton. +2. Add WandB integration. +4. Refactor configs and scripts used to run the tests. +5. Refactor decmo scoding script. +6. Add batching support. 
def parse_args(argv=None):
    """Parse command-line arguments for the lhotse -> eval JSONL converter.

    Args:
        argv: Optional list of argument strings. When ``None`` (the default)
            ``argparse`` falls back to ``sys.argv[1:]``, preserving the
            original CLI behavior while allowing programmatic invocation
            (e.g. from tests).

    Returns:
        argparse.Namespace with ``input_dir``, ``output_dir``,
        ``system_prompt``, ``audio_subdir`` and ``dataset_prefix`` attributes.
    """
    parser = argparse.ArgumentParser(description="Convert lhotse dataset to eval JSONL format")
    parser.add_argument(
        "--input-dir",
        type=str,
        required=True,
        help="Path to lhotse dataset directory containing cuts.*.jsonl.gz and recording.*.tar",
    )
    parser.add_argument(
        "--output-dir",
        type=str,
        required=True,
        help="Output directory for test.jsonl and extracted audio files",
    )
    parser.add_argument(
        "--system-prompt",
        type=str,
        default="You are a helpful assistant.",
        help="System prompt to use for all samples",
    )
    parser.add_argument(
        "--audio-subdir",
        type=str,
        default="data",
        help="Subdirectory name for audio files in output",
    )
    parser.add_argument(
        "--dataset-prefix",
        type=str,
        default="s2s_demo",
        help="Dataset prefix for audio paths in messages",
    )
    return parser.parse_args(argv)
def load_cuts(input_dir: Path) -> list[dict]:
    """Load all cuts from ``cuts.*.jsonl.gz`` files under *input_dir*.

    Files are processed in sorted order so the resulting list (and therefore
    the emitted ``test.jsonl``) is deterministic. Blank lines are skipped.
    """
    cuts = []
    for cuts_file in sorted(input_dir.glob("cuts.*.jsonl.gz")):
        with gzip.open(cuts_file, "rt") as f:
            for line in f:
                if line.strip():
                    cuts.append(json.loads(line))
    return cuts


def extract_audio_files(input_dir: Path, output_audio_dir: Path, recording_ids: set[str]):
    """Extract the requested recordings from ``recording.*.tar`` archives.

    Audio entries are stored in the archives as ``{recording_id}.flac``,
    where the recording id itself may already contain an extension such as
    ``.wav`` or ``.flac``. Each matching entry is written to
    *output_audio_dir* under its base file name only.

    Returns:
        The set of recording ids that were actually extracted; callers can
        diff this against *recording_ids* to detect missing audio.
    """
    output_audio_dir.mkdir(parents=True, exist_ok=True)
    extracted = set()

    for tar_file in sorted(input_dir.glob("recording.*.tar")):
        with tarfile.open(tar_file, "r") as tar:
            for member in tar.getmembers():
                if not member.name.endswith(".flac"):
                    continue
                base_name = member.name[:-5]  # strip the trailing ".flac"
                if base_name not in recording_ids:
                    continue
                # Write the payload ourselves instead of tar.extract(): this
                # never honors directory components embedded in member names
                # (tar path-traversal guard) and avoids mutating the TarInfo.
                source = tar.extractfile(member)
                if source is None:
                    # Non-regular member (dir/link); nothing to write.
                    continue
                target = output_audio_dir / Path(member.name).name
                with source:
                    target.write_bytes(source.read())
                extracted.add(base_name)
                print(f"Extracted: {target.name}")

    return extracted


def convert_cut_to_eval_format(cut: dict, audio_subdir: str, dataset_prefix: str, system_prompt: str) -> dict:
    """Convert a single lhotse cut to the nemo-skills evaluation format.

    The returned entry has an empty ``problem`` (audio-only, no reference
    text), a relative ``audio_path`` for local bookkeeping, and an
    OpenAI-style ``messages`` list whose user turn points at the audio file
    via the dataset-prefixed path.
    """
    recording_id = cut["id"]
    audio_filename = f"{recording_id}.flac"
    audio_path = f"{audio_subdir}/{audio_filename}"
    full_audio_path = f"{dataset_prefix}/{audio_path}"

    return {
        "problem": "",
        "audio_path": audio_path,
        "messages": [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": "", "audio": {"path": full_audio_path}},
        ],
    }


def main():
    """CLI entry point: load cuts, extract their audio, write ``test.jsonl``."""
    args = parse_args()
    input_dir = Path(args.input_dir)
    output_dir = Path(args.output_dir)
    output_audio_dir = output_dir / args.audio_subdir

    print(f"Loading cuts from {input_dir}...")
    cuts = load_cuts(input_dir)
    print(f"Found {len(cuts)} cuts")

    # Get all recording IDs referenced by the cuts.
    recording_ids = {cut["id"] for cut in cuts}

    print(f"Extracting {len(recording_ids)} audio files...")
    extracted = extract_audio_files(input_dir, output_audio_dir, recording_ids)
    missing = recording_ids - extracted
    if missing:
        # Surface incomplete datasets now rather than failing at eval time
        # when a referenced wav/flac turns out not to exist.
        print(f"WARNING: {len(missing)} recording(s) had no audio in the tar archives: {sorted(missing)[:10]}")

    # Convert to eval format.
    print("Converting to eval format...")
    output_dir.mkdir(parents=True, exist_ok=True)
    output_file = output_dir / "test.jsonl"

    with open(output_file, "w") as f:
        for cut in cuts:
            entry = convert_cut_to_eval_format(cut, args.audio_subdir, args.dataset_prefix, args.system_prompt)
            f.write(json.dumps(entry) + "\n")

    print(f"Wrote {len(cuts)} entries to {output_file}")


if __name__ == "__main__":
    main()
; the European Council promised to make some changes", "audio_path": "data/10.zhehuai_on_elena_s_words2_noise_45db.wav.flac", "messages": [{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "", "audio": {"path": "s2s_demo/demo_20251124/data/10.zhehuai_on_elena_s_words2_noise_45db.wav.flac"}}]} +{"problem": "What actions did the European Council commit to take? ; the European Council promised to make some changes | The voice on the hotline was Gary Schipper. ; Who was the voice on the hotline?", "audio_path": "data/11.zhehuai_on_elena_s_words2_laptop_24db.wav.flac", "messages": [{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "", "audio": {"path": "s2s_demo/demo_20251124/data/11.zhehuai_on_elena_s_words2_laptop_24db.wav.flac"}}]} +{"problem": "The voice on the hotline was Gary Schipper. ; Who was the voice on the hotline? | What is the consequence of not being truthful? ; Because then you're not being truthful.", "audio_path": "data/12.zhehuai_on_elena_s_words2_laptop_noise_24db.wav.flac", "messages": [{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "", "audio": {"path": "s2s_demo/demo_20251124/data/12.zhehuai_on_elena_s_words2_laptop_noise_24db.wav.flac"}}]} +{"problem": "What is the consequence of not being truthful? ; Because then you're not being truthful. | What is the speaker intending to do at the conclusion of their statement? ; President, let me conclude by", "audio_path": "data/13.ankita_eng_medicine_art_inferred_chunks_combined_40db.wav.flac", "messages": [{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "", "audio": {"path": "s2s_demo/demo_20251124/data/13.ankita_eng_medicine_art_inferred_chunks_combined_40db.wav.flac"}}]} +{"problem": "What is the speaker intending to do at the conclusion of their statement? 
; President, let me conclude by | How should actions at the UN level be characterized? ; at UN level must be inclusive,", "audio_path": "data/14.elena_eng_medicine_art_inferred_chunks_combined_40db.wav.flac", "messages": [{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "", "audio": {"path": "s2s_demo/demo_20251124/data/14.elena_eng_medicine_art_inferred_chunks_combined_40db.wav.flac"}}]} +{"problem": "at UN level must be inclusive, ; How should actions at the UN level be characterized? | What type of orientation does the speaker advocate for? ; I would deem it a customer driven orientation versus an internal orientation. So we're really exposing and directing our team leaders and our people inside that know and are very knowledgeable", "audio_path": "data/5.inferred_chunks_input_elena_wired_earbuds_with_mic_no_reply.wav.flac", "messages": [{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "", "audio": {"path": "s2s_demo/demo_20251124/data/5.inferred_chunks_input_elena_wired_earbuds_with_mic_no_reply.wav.flac"}}]} +{"problem": "I would deem it a customer driven orientation versus an internal orientation. So we're really exposing and directing our team leaders and our people inside that know and are very knowledgeable ; What type of orientation does the speaker advocate for? | Who did Blair's predecessor as Labour leader, John Smith, choose as Shadow Lord Chancellor? ; Blair's predecessor as Labour leader, John Smith, had chosen Irvine as Shadow Lord Chancellor.", "audio_path": "data/6.inferred_chunks_input_elena_gb.wav.flac", "messages": [{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "", "audio": {"path": "s2s_demo/demo_20251124/data/6.inferred_chunks_input_elena_gb.wav.flac"}}]} +{"problem": "Who did Blair's predecessor as Labour leader, John Smith, choose as Shadow Lord Chancellor? 
; Blair's predecessor as Labour leader, John Smith, had chosen Irvine as Shadow Lord Chancellor. | How many magistrates' courts are there in Hong Kong? ; There are currently seven magistrates' courts in Hong Kong.", "audio_path": "data/7.inferred_chunks_input_elena_us.wav.flac", "messages": [{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "", "audio": {"path": "s2s_demo/demo_20251124/data/7.inferred_chunks_input_elena_us.wav.flac"}}]} +{"problem": "There are currently seven magistrates' courts in Hong Kong. ; How many magistrates' courts are there in Hong Kong? | How will existing energy efficiency incentive schemes be improved? ; Existing energy efficiency incentive schemes will also be enhanced.", "audio_path": "data/8.zhehuai_on_elena_s_words_55db.wav.flac", "messages": [{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "", "audio": {"path": "s2s_demo/demo_20251124/data/8.zhehuai_on_elena_s_words_55db.wav.flac"}}]} +{"problem": "How will existing energy efficiency incentive schemes be improved? ; Existing energy efficiency incentive schemes will also be enhanced. | How awkward you are, said the pretty housemaid. ; What did the pretty housemaid say to indicate awkwardness?", "audio_path": "data/9.zhehuai_on_elena_s_words2_55db.wav.flac", "messages": [{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "", "audio": {"path": "s2s_demo/demo_20251124/data/9.zhehuai_on_elena_s_words2_55db.wav.flac"}}]} +{"problem": "How awkward you are, said the pretty housemaid. ; What did the pretty housemaid say to indicate awkwardness? | Is it possible to handle the situation? 
; You can cope with it.", "audio_path": "data/a1-zhehuai-flower1_55db.wav.flac", "messages": [{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "", "audio": {"path": "s2s_demo/demo_20251124/data/a1-zhehuai-flower1_55db.wav.flac"}}]} +{"problem": "You can cope with it. ; Is it possible to handle the situation? | Who did the individual bump into after forgetting to watch his feet? ; him? He'd forgotten to watch, and was surprised to find his feet on the steps of the apartment building. He jerked back, and bumped into someone. Sorry. The words came from behind him, automatically, and he turned to see the slim young man stepping aside.", "audio_path": "data/a2-zhehuai-flower2_55db.wav.flac", "messages": [{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "", "audio": {"path": "s2s_demo/demo_20251124/data/a2-zhehuai-flower2_55db.wav.flac"}}]} +{"problem": "him? He'd forgotten to watch, and was surprised to find his feet on the steps of the apartment building. He jerked back, and bumped into someone. Sorry. The words came from behind him, automatically, and he turned to see the slim young man stepping aside. ; Who did the individual bump into after forgetting to watch his feet? | What television show were Lee and Kaiine fans of? ; Lee and Kaiine were also huge fans of Doctor Who.", "audio_path": "data/a3-zhehuai-flower3_55db.wav.flac", "messages": [{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "", "audio": {"path": "s2s_demo/demo_20251124/data/a3-zhehuai-flower3_55db.wav.flac"}}]} +{"problem": "What television show were Lee and Kaiine fans of? ; Lee and Kaiine were also huge fans of Doctor Who. | The Observation level is called the GeO Deck. 
; What is the name of the Observation level?", "audio_path": "data/a4-cc_script_qwen_elena_engineering.wav.flac", "messages": [{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "", "audio": {"path": "s2s_demo/demo_20251124/data/a4-cc_script_qwen_elena_engineering.wav.flac"}}]} +{"problem": "What is the name of the Observation level? ; The Observation level is called the GeO Deck. | Today we are naturally focused on global challenges, ; What are we primarily focused on today in the context of global affairs?", "audio_path": "data/a5-zhehuai-friends.wav.flac", "messages": [{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "", "audio": {"path": "s2s_demo/demo_20251124/data/a5-zhehuai-friends.wav.flac"}}]} +{"problem": "What are we primarily focused on today in the context of global affairs? ; Today we are naturally focused on global challenges, | Iran has an important influence in the region, so ; What role does Iran play in its region?", "audio_path": "data/a6-zhehuai-career.wav.flac", "messages": [{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "", "audio": {"path": "s2s_demo/demo_20251124/data/a6-zhehuai-career.wav.flac"}}]} +{"problem": "What role does Iran play in its region? ; Iran has an important influence in the region, so | All Saints then parted company with their record label. ; Which record label did All Saints part ways with?", "audio_path": "data/a7-zhehuai-phoebe.wav.flac", "messages": [{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "", "audio": {"path": "s2s_demo/demo_20251124/data/a7-zhehuai-phoebe.wav.flac"}}]} +{"problem": "All Saints then parted company with their record label. ; Which record label did All Saints part ways with? | performance orientation and a result oriented CAP. 
; What type of orientation does the CAP exhibit?", "audio_path": "data/a8-zhehuai-joey.wav.flac", "messages": [{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "", "audio": {"path": "s2s_demo/demo_20251124/data/a8-zhehuai-joey.wav.flac"}}]} +{"problem": "performance orientation and a result oriented CAP. ; What type of orientation does the CAP exhibit? | That is a question for the next manager. ; Who should handle the mentioned question?", "audio_path": "data/a9-zhehuai-ross.wav.flac", "messages": [{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "", "audio": {"path": "s2s_demo/demo_20251124/data/a9-zhehuai-ross.wav.flac"}}]} +{"problem": "Who should handle the mentioned question? ; That is a question for the next manager. | What did Fitzsimons continue to be involved in after withdrawing from politics? ; While withdrawing from politics, Fitzsimons remained active in civic and business affairs.", "audio_path": "data/aa-zhehuai-rachel.wav.flac", "messages": [{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "", "audio": {"path": "s2s_demo/demo_20251124/data/aa-zhehuai-rachel.wav.flac"}}]} +{"problem": "Hey, I'm thinking about switching careers after five years in marketing. Any advice on making the transition smooth that makes sense. I'm leaning towards product management. How can I prove I'm ready for that role? Should I get a certification or just rely on that portfolio? Got it. And what about networking? I don't know many product managers. Informational interviews sound intimidating. Any tips on how to approach them? How long should I be studying before applying for junior product roles? Should I tailor my resume heavily towards product language? And what about interview prep? Any specific questions I should practice? If I get rejected, how do I stay motivated? Thanks. This roadmap feels doable now. 
I'll start building that portfolio this weekend.", "audio_path": "data/ankita_careeradvice_defaultmiconmacbookpro_nvphoneroom_audacity_2.flac", "messages": [{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "", "audio": {"path": "s2s_demo/demo_20251124/data/ankita_careeradvice_defaultmiconmacbookpro_nvphoneroom_audacity_2.flac"}}]} +{"problem": "Hey, I'm thinking about switching careers after five years in marketing. Any advice on making the transition smooth? That makes sense. I'm leaning towards project management. How can I prove that I'm ready for that role? Should I get a certification or just rely on that portfolio? Got it. What about networking? I don't know many product managers. Informational interviews sound intimidating. Any tips on how to approach them? How long should I be studying before applying for junior product roles? Should I tailor my resume heavily toward product language? What about interview prep? Any specific questions I should practice? If I get rejected, how do I stay motivated? Thanks. This roadmap feels doable now. I'll start building that portfolio this weekend.", "audio_path": "data/ankita_careeradvice_sonywirelessonmacbookpro_nvphoneroom_audacity_1.flac", "messages": [{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "", "audio": {"path": "s2s_demo/demo_20251124/data/ankita_careeradvice_sonywirelessonmacbookpro_nvphoneroom_audacity_1.flac"}}]} +{"problem": "Hey, I'm craving Indian food. Any recommendations for delivery? Do they do spicy? I like it hot, but not mouth on fire hot. What's their best vegetarian option? Should I get raita too? Any deals? I'm on a budget. Perfect. Should I add samosas? Okay, so two curries, naan, raita, samosas. Anything else? Sold. I'll order now. Done. Delivery in thirty? Got it. 
Thanks for the advice.", "audio_path": "data/ankita_restaurantordering_sonywirelessonmacbookpro_nvcafe_audacity_5.flac", "messages": [{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "", "audio": {"path": "s2s_demo/demo_20251124/data/ankita_restaurantordering_sonywirelessonmacbookpro_nvcafe_audacity_5.flac"}}]} +{"problem": "Hey, I'm thinking about planning a trip to Austria this fall. Any ideas where we should start? Tough choice. I'd love to see Vienna's palaces, but the mountain views sound incredible too. Maybe we can do both? When's the best time to visit? What kind of weather should I pack for in September? Got it. I've heard the food is amazing. Any must-try dishes? That sounds delicious. I'm not a huge meat eater. Any other suggestions? Perfect. I've also heard Salzburg is beautiful. Is it worth a visit? Great, I'll add it to my list. Thanks for all the tips.", "audio_path": "data/ankita_tripplanning_logitechdirectionalonmacbookpro_nvphoneroom_audacity_3.flac", "messages": [{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "", "audio": {"path": "s2s_demo/demo_20251124/data/ankita_tripplanning_logitechdirectionalonmacbookpro_nvphoneroom_audacity_3.flac"}}]} +{"problem": "Hey, I'm feeling completely burned out from work lately. I'm working late every night and still feel like I'm falling behind. I feel guilty if I don't finish everything. The emails just never stop. Hmm, that might work. I just find it hard to actually switch off. I used to read, but I haven't picked up a book in months. Okay, I'll try that tonight. 
Thanks for the practical advice.", "audio_path": "data/ankita_worklifebalance_logitechdirectionalonmacbookpro_nvcafe_audacity_4.flac", "messages": [{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "", "audio": {"path": "s2s_demo/demo_20251124/data/ankita_worklifebalance_logitechdirectionalonmacbookpro_nvcafe_audacity_4.flac"}}]} +{"problem": "I'm thinking about buying a house next year, but I'm not sure where to start any device. Got it. I will talk to my bank this weekend. Should I focus on a specific neighborhood or keep my options open? That makes sense. How important is the home inspection? I heard some buyers skip it to speed up the process. I will definitely schedule one. What about making an offer? Should I start low or match the asking price right away? Understood. How much should I expect to pay in closing costs?", "audio_path": "data/edresson_buying_house.flac", "messages": [{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "", "audio": {"path": "s2s_demo/demo_20251124/data/edresson_buying_house.flac"}}]} +{"problem": "Hey, I have been thinking about switching careers lately, but I'm not sure where to start in advise Mostly growth. I feel stuck in my current role like I have been stopped learning anything meaningful and yeah, that's just disappointing Yeah, I'm good at data analysis and project management but I'm not sure where those fit best Okay, that sounds interesting. 
Should I take course first we're starting applying and learning on the job A mix works best makes sense I guess networking would help to write", "audio_path": "data/edresson_carrer.flac", "messages": [{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "", "audio": {"path": "s2s_demo/demo_20251124/data/edresson_carrer.flac"}}]} +{"problem": "", "audio_path": "data/edresson_carrer_internal_microphone.flac", "messages": [{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "", "audio": {"path": "s2s_demo/demo_20251124/data/edresson_carrer_internal_microphone.flac"}}]} +{"problem": "I'm thinking about buying a house next year, but I'm not sure where to start. Any advice you can give me? Got it. I will take to sorry, I will talk to my bank this weekend. Should I focus on specific neighborhoods or keep my options open? That makes sense. How important is the home inspection? I have heard some buyers skip it to speed up the process. I will definitely schedule one. What about make an offer? Should I start low or match the asking price right away? 
Understood, how much should I expect to pay in closing cost?", "audio_path": "data/edresson_fifine_buying_house.flac", "messages": [{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "", "audio": {"path": "s2s_demo/demo_20251124/data/edresson_fifine_buying_house.flac"}}]} +{"problem": "", "audio_path": "data/edresson_fifine_restaurant.flac", "messages": [{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "", "audio": {"path": "s2s_demo/demo_20251124/data/edresson_fifine_restaurant.flac"}}]} +{"problem": "", "audio_path": "data/edresson_fifine_trip.flac", "messages": [{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "", "audio": {"path": "s2s_demo/demo_20251124/data/edresson_fifine_trip.flac"}}]} +{"problem": "I'm thinking about buying a house next year, but I'm not sure where to start. Any advice you can give me? Got it. Sorry, I will talk to my bank this weekend. Should I focus on specific neighborhoods or keep my options open? That makes sense. How important is the home inspection? I have heard some buyers skip it to speed up the process. I will definitely schedule one. What about make an offer? Should I start low or match the asking price right away? Understood, how much should I expect to pay in closing costs?", "audio_path": "data/edresson_smartphone_buying_house_2.flac", "messages": [{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "", "audio": {"path": "s2s_demo/demo_20251124/data/edresson_smartphone_buying_house_2.flac"}}]} +{"problem": "Hey, I'm starving. What should you eat? Pizza sounds perfect. Where should you go? Okay, let's go there. What do you want? I will go for the pepperoni. Yeah, I will get a Coke. Perfect. 
Let's head over there now.", "audio_path": "data/edresson_smartphone_restaurant.flac", "messages": [{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "", "audio": {"path": "s2s_demo/demo_20251124/data/edresson_smartphone_restaurant.flac"}}]} +{"problem": "Hey, planet trip soon. Any ideas where to go? Sure, but where? Caribbean or maybe Thailand? Yeah, but flights might be pricey. What dates? Agreed, should we book hotels or Bambi? Definitely need a place near the beach. Both, maybe a day trip to a template trip to Thai cousin. Can't miss that. Agreed, I will check flight details tonight and try to buy some tickets.", "audio_path": "data/edresson_smartphone_trip.flac", "messages": [{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "", "audio": {"path": "s2s_demo/demo_20251124/data/edresson_smartphone_trip.flac"}}]} +{"problem": "", "audio_path": "data/edresson_trip.flac", "messages": [{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "", "audio": {"path": "s2s_demo/demo_20251124/data/edresson_trip.flac"}}]} +{"problem": "Hey, do you ever feel like work just never ends? I'm always checking mails at home Right, it's so hard to switch off I feel guilty for not being available 24 7 Definitely I start turning off notification after 7 p.m. 
It's a small but helpful like yeah, it's helping It's a process I still struggles but make time for hobbies has helped a lot I Have got back into reading just 30 minutes a night makes a huge difference Absolutely your health is more important than any late night working mail Of course we have to support each other in achieving a better balance", "audio_path": "data/edresson_worklife_balance.flac", "messages": [{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "", "audio": {"path": "s2s_demo/demo_20251124/data/edresson_worklife_balance.flac"}}]} +{"problem": "", "audio_path": "data/restaurant.flac", "messages": [{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "", "audio": {"path": "s2s_demo/demo_20251124/data/restaurant.flac"}}]} +{"problem": "Hey, should I study engineering or art? Oh, what if I like maths? No, but I like it actually. No, I like mathematics. Okay, so which is better for me to study, engineering or art? Um okay, what if I want to do architecture? Huh, thank you. Hey, should I study engineering or art? That's a great question. Both fields have their own unique benefits and challenges. 
Oh, what if I like maths?", "audio_path": "data/moshi_client_input_maths.flac", "messages": [{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "", "audio": {"path": "s2s_demo/demo_20251124/data/moshi_client_input_maths.flac"}}]} +{"problem": "", "audio_path": "data/moshi_client_input_sf.flac", "messages": [{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "", "audio": {"path": "s2s_demo/demo_20251124/data/moshi_client_input_sf.flac"}}]} +{"problem": "", "audio_path": "data/moshi_client_nemo_20251117_151731_input.flac", "messages": [{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "", "audio": {"path": "s2s_demo/demo_20251124/data/moshi_client_nemo_20251117_151731_input.flac"}}]} +{"problem": "Hello, what is the best time of year to visit San Francisco?", "audio_path": "data/moshi_client_nemo_20251117_151800_input.flac", "messages": [{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "", "audio": {"path": "s2s_demo/demo_20251124/data/moshi_client_nemo_20251117_151800_input.flac"}}]} +{"problem": "", "audio_path": "data/moshi_client_nemo_20251117_151913_input.flac", "messages": [{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "", "audio": {"path": "s2s_demo/demo_20251124/data/moshi_client_nemo_20251117_151913_input.flac"}}]} +{"problem": "", "audio_path": "data/moshi_client_nemo_20251117_152033_input.flac", "messages": [{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "", "audio": {"path": "s2s_demo/demo_20251124/data/moshi_client_nemo_20251117_152033_input.flac"}}]} +{"problem": "Hey, um, what's your name? 
What?", "audio_path": "data/moshi_client_nemo_20251117_152527_input.flac", "messages": [{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "", "audio": {"path": "s2s_demo/demo_20251124/data/moshi_client_nemo_20251117_152527_input.flac"}}]} +{"problem": "Hi, how are you? What's your name?", "audio_path": "data/moshi_client_nemo_20251117_152541_input.flac", "messages": [{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "", "audio": {"path": "s2s_demo/demo_20251124/data/moshi_client_nemo_20251117_152541_input.flac"}}]} +{"problem": "Hey, what's your name? Hello, what's your name? What's your name, sir? Please respond to me please.", "audio_path": "data/moshi_client_nemo_20251117_152905_input.flac", "messages": [{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "", "audio": {"path": "s2s_demo/demo_20251124/data/moshi_client_nemo_20251117_152905_input.flac"}}]} +{"problem": "Hello, how are you today? Hello, what's your name? Hello Hello Hello Hello Can you hear me? Hello Hello", "audio_path": "data/moshi_client_nemo_20251117_153048_input.flac", "messages": [{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "", "audio": {"path": "s2s_demo/demo_20251124/data/moshi_client_nemo_20251117_153048_input.flac"}}]} +{"problem": "Hello, how are you today? How are you doing? 
Hello?", "audio_path": "data/moshi_client_nemo_20251117_153058_input.flac", "messages": [{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "", "audio": {"path": "s2s_demo/demo_20251124/data/moshi_client_nemo_20251117_153058_input.flac"}}]} +{"problem": "Hello, hey, how's it going?", "audio_path": "data/moshi_client_nemo_20251117_153436_input.flac", "messages": [{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "", "audio": {"path": "s2s_demo/demo_20251124/data/moshi_client_nemo_20251117_153436_input.flac"}}]} +{"problem": "", "audio_path": "data/moshi_client_nemo_20251117_153453_input.flac", "messages": [{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "", "audio": {"path": "s2s_demo/demo_20251124/data/moshi_client_nemo_20251117_153453_input.flac"}}]} +{"problem": "Hi, Flora. I'm calling to order some flowers for my mom's birthday. Will you be able to make a rush delivery for tomorrow? It's her birthday tomorrow. Okay, so she likes roses and tulips. How about can we get those in yellow? Okay, and can you confirm you're able to make a delivery by 3 p.m. tomorrow to Santa Mars Expressway? Okay, thank you. You have saved my life. Thank you so much. Bye.", "audio_path": "data/kevin_flower1_hyperx.flac", "messages": [{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "", "audio": {"path": "s2s_demo/demo_20251124/data/kevin_flower1_hyperx.flac"}}]} +{"problem": "Hi, Flora. I'm calling to order some flowers for my friend's birthday. So she likes roses and tulips. Yeah, do you think that will look nice? Maybe in like pink? Okay, and can you confirm you're able to deliver that to the main street tomorrow? Okay, and how much for the flowers and for the delivery? Okay, yeah, like, yeah, add some rose and I'm sorry, some chocolates. Ah, okay, yeah, that's everything. Thank you very much. 
Bye.", "audio_path": "data/kevin_flower2_hyperx.flac", "messages": [{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "", "audio": {"path": "s2s_demo/demo_20251124/data/kevin_flower2_hyperx.flac"}}]} +{"problem": "Hi, Flora. I'm calling to order some flowers for my mom's birthday. Yeah, first of all, can I just confirm that you're able to make a rush delivery for this evening? Okay, thank goodness. Yeah, she likes roses and tulips. Do you think you have a bouquet with both of them? Yeah, she really likes pink. Do you have a pink, like a bouquet with lots of pink in it? Okay, cool. And okay, can you deliver to the main street by 5 p.m. today? Okay, yeah, yeah. Please do add a card that says happy birthday, mom. Love you so very much. And also, yeah, put it in all like a nice vas. Yeah, and please add some chocolates as well. Just make sure it's milk chocolate. Okay, oh, thank you so much. You're a lifesaver. Have a good day.", "audio_path": "data/kevin_flower3_hyperx.flac", "messages": [{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "", "audio": {"path": "s2s_demo/demo_20251124/data/kevin_flower3_hyperx.flac"}}]} +{"problem": "Hi Flora, I'm going to request the delivery of flowers for this weekend. Yeah, so it's for my friend's birthday. She really likes tulips. And do you have like, can you have them in blue? Okay, let's see. Yeah, please deliver to the main street this weekend. Sorry, I mean this Saturday. Yeah, Saturday. Can you do by 2 p.m. And yeah, please add like a birthday card that says happy birthday, lots of love from Kevin. Oh yeah, yeah, I think that's everything actually. 
Yeah, thank you very much.", "audio_path": "data/kevin_flower4_hyperx.flac", "messages": [{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "", "audio": {"path": "s2s_demo/demo_20251124/data/kevin_flower4_hyperx.flac"}}]} +{"problem": "Hey, can you help me with my geography homework? Yeah, so the first question is: What is the capital of Australia? Okay, thank you. And then the other question is: what is the longest river in the world? Okay, thank you. And then, like, the final question: this is really a short quiz for some reason. The final question is: What is the tallest mountain in the world? Okay, thank you very much. I think I hope I'm going to get an A on this test.", "audio_path": "data/kevin_geography1_hyperx.flac", "messages": [{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "", "audio": {"path": "s2s_demo/demo_20251124/data/kevin_geography1_hyperx.flac"}}]} +{"problem": "Hi, can you help me with the geography homework, please? Yeah, so it's like asking me some questions. The first one is, what is the capital of Australia? Okay, then the next one says, what is the longest river in the world? Okay, and the final question for some reason, this is a really short quiz. The final question is, what is the highest mountain in the world? Okay, thank you. I really hope those are the correct answers and I could get an A on a test.", "audio_path": "data/kevin_geography2_hyperx.flac", "messages": [{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "", "audio": {"path": "s2s_demo/demo_20251124/data/kevin_geography2_hyperx.flac"}}]} +{"problem": "Hey, it's movie night. Any idea what you are in the mood for tonight? Fair. How about a comedy then? Maybe something like Super Bad or The Nice Guys? Good idea. Spider-Man across the Spider-Verse. It's funny, stylish, and has great music. True, it's around two hours. 
We could do something shorter, like Luca or Zootopia. Yeah, plus the soundtrack is great. Try everything always get stuck in my head. Let's do some popcorn first. If we are still hungry halfway through, we can order pizza. Awesome, starting the movie in five.", "audio_path": "data/kevin_movie1_app3.flac", "messages": [{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "", "audio": {"path": "s2s_demo/demo_20251124/data/kevin_movie1_app3.flac"}}]} +{"problem": "Hey, do you want to watch a movie tonight? I feel like relaxing after a long week. Hmm, I could go for a good mystery or thriller, something that keeps us guessing till the end. I've seen knives out, but not glass onion. Is it as good as the first one? Gone girl sounds intense, but maybe too dark for tonight. I kind of want something lighter. Oh, I love water meaty. That's one perfect. Beautiful visuals, inspiring story, and not too heavy. Check if it's on Netflix first. If not, rinting is fine. Should we start around 8 deal movie night settled? Finally, something easy to agree on.", "audio_path": "data/kevin_movie2_app3.flac", "messages": [{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "", "audio": {"path": "s2s_demo/demo_20251124/data/kevin_movie2_app3.flac"}}]} +{"problem": "Hey, what movie should we watch tonight? I'm in the mood for something fun, but not too heavy. Hmm, that's tempting, but I've seen too many action flicks lately. Maybe something lighter, like a comedy or a mystery. Yeah, that's a good pick. We already watched it already. What about something newer? Maybe a 2024 release. Oh, right. I saw the trailer. It looked entertaining. Do you know if it's streaming anywhere yet? Perfect mix of ridiculous and exciting. Let's do that. Want a popcorn or should we order snacks instead? Let's mix both. 
I will start the movie while you grab the snacks.", "audio_path": "data/kevin_movie3_app3.flac", "messages": [{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "", "audio": {"path": "s2s_demo/demo_20251124/data/kevin_movie3_app3.flac"}}]} diff --git a/nemo_skills/dataset/s2s_demo/scripts/aggregate_llm_judge.py b/nemo_skills/dataset/s2s_demo/scripts/aggregate_llm_judge.py new file mode 100644 index 0000000000..5c5e0ca8e2 --- /dev/null +++ b/nemo_skills/dataset/s2s_demo/scripts/aggregate_llm_judge.py @@ -0,0 +1,124 @@ +#!/usr/bin/env python3 +""" +Aggregate LLM judge results and update metrics.json + output_with_eval.jsonl. + +Usage: + python aggregate_llm_judge.py --results_dir /path/to/eval-results/benchmark + python aggregate_llm_judge.py --results_dir /path/to/eval-results/benchmark --llm_judge_output /path/to/llm_judge/output.jsonl +""" + +import argparse +import json +import os +import re + + +def extract_rating(text: str) -> float | None: + """Extract rating from LLM judge response.""" + if not text: + return None + match = re.search(r"Rating:\s*([0-9]+(?:\.[0-9]+)?)", text, re.IGNORECASE) + if match: + return max(0.0, min(5.0, float(match.group(1)))) + return None + + +def main(): + parser = argparse.ArgumentParser(description="Aggregate LLM judge results") + parser.add_argument("--results_dir", required=True, help="Directory with metrics.json and output_with_eval.jsonl") + parser.add_argument( + "--llm_judge_output", help="Path to LLM judge output.jsonl (default: results_dir/output.jsonl)" + ) + args = parser.parse_args() + + llm_judge_file = args.llm_judge_output or os.path.join(args.results_dir, "output.jsonl") + metrics_file = os.path.join(args.results_dir, "metrics.json") + eval_file = os.path.join(args.results_dir, "output_with_eval.jsonl") + + # Parse LLM judge output into {item_id: rating} + judge_ratings = {} + with open(llm_judge_file, "r") as f: + for line in f: + if not line.strip(): + continue + entry = 
json.loads(line) + item_id = entry.get("item_id", "") + rating = extract_rating(entry.get("generation", "")) + if rating is not None: + judge_ratings[item_id] = rating + + # Aggregate by subset + ratings_by_subset = {"full": [], "sounded": []} + for item_id, rating in judge_ratings.items(): + if item_id.endswith("_full"): + ratings_by_subset["full"].append(rating) + elif item_id.endswith("_sounded"): + ratings_by_subset["sounded"].append(rating) + + # Compute metrics + llm_judge_metrics = {} + all_ratings = list(judge_ratings.values()) + if all_ratings: + avg = sum(all_ratings) / len(all_ratings) + llm_judge_metrics["overall"] = { + "avg_rating": round(avg, 3), + "judge_score": round(avg * 20, 2), + "count": len(all_ratings), + } + for subset, ratings in ratings_by_subset.items(): + if ratings: + avg = sum(ratings) / len(ratings) + llm_judge_metrics[subset] = { + "avg_rating": round(avg, 3), + "judge_score": round(avg * 20, 2), + "count": len(ratings), + } + + # Update metrics.json + metrics = json.load(open(metrics_file)) if os.path.exists(metrics_file) else {} + if "dataset_metrics" in metrics: + metrics["dataset_metrics"]["llm_judge"] = llm_judge_metrics + else: + metrics["llm_judge"] = llm_judge_metrics + with open(metrics_file, "w") as f: + json.dump(metrics, f, indent=2) + print(f"Updated {metrics_file}") + + # Update output_with_eval.jsonl with per-sample scores + if os.path.exists(eval_file): + updated_entries = [] + with open(eval_file, "r") as f: + for line in f: + if not line.strip(): + continue + entry = json.loads(line) + # Get item_id from original_entry + original = entry.get("original_entry", entry) + audio_path = original.get("audio_path", "") or original.get("audio", {}).get("path", "") + base_id = os.path.basename(audio_path).rsplit(".", 1)[0] if audio_path else "" + + # Add judge scores + full_rating = judge_ratings.get(f"{base_id}_full") + sounded_rating = judge_ratings.get(f"{base_id}_sounded") + entry["llm_judge_scores"] = { + "full": 
{"rating": full_rating, "score": round(full_rating * 20, 2) if full_rating else None}, + "sounded": { + "rating": sounded_rating, + "score": round(sounded_rating * 20, 2) if sounded_rating else None, + }, + } + updated_entries.append(entry) + + with open(eval_file, "w") as f: + for entry in updated_entries: + f.write(json.dumps(entry) + "\n") + print(f"Updated {eval_file} with {len(updated_entries)} entries") + + # Print summary + print("\nLLM Judge Results:") + for subset, m in llm_judge_metrics.items(): + print(f" {subset}: {m['avg_rating']:.2f}/5 (score: {m['judge_score']:.1f}/100, n={m['count']})") + + +if __name__ == "__main__": + main() diff --git a/nemo_skills/dataset/s2s_demo/scripts/compare_eval_results.py b/nemo_skills/dataset/s2s_demo/scripts/compare_eval_results.py new file mode 100644 index 0000000000..ab2fc9dc8a --- /dev/null +++ b/nemo_skills/dataset/s2s_demo/scripts/compare_eval_results.py @@ -0,0 +1,538 @@ +#!/usr/bin/env python3 +""" +Compare multiple S2S demo and VoiceBench evaluation results and generate a Markdown report. + +Usage: + python compare_eval_results.py \ + --eval_folders \ + "host:/path/to/s2s_demo_eval:Model A" \ + "host:/path/to/voicebench_eval:Model A" \ + "host:/path/to/s2s_demo_eval2:Model B" \ + "host:/path/to/voicebench_eval2:Model B" \ + --output comparison_report.md + +The script auto-detects whether each folder contains s2s_demo or voicebench results. 
+""" + +import argparse +import json +import os +import subprocess +from typing import Optional + + +def load_json_local(path: str) -> Optional[dict]: + """Load JSON from a local file.""" + if not os.path.exists(path): + return None + with open(path, "r") as f: + return json.load(f) + + +def load_json_remote(host: str, path: str) -> Optional[dict]: + """Load JSON from a remote file via SSH.""" + try: + result = subprocess.run(["ssh", host, f"cat {path}"], capture_output=True, text=True, timeout=30) + if result.returncode != 0: + return None + return json.loads(result.stdout) + except (subprocess.TimeoutExpired, json.JSONDecodeError, Exception): + return None + + +def list_dir_remote(host: str, path: str) -> list[str]: + """List directory contents via SSH.""" + try: + result = subprocess.run(["ssh", host, f"ls {path}"], capture_output=True, text=True, timeout=30) + if result.returncode != 0: + return [] + return [x.strip() for x in result.stdout.strip().split("\n") if x.strip()] + except Exception: + return [] + + +def list_dir_local(path: str) -> list[str]: + """List local directory contents.""" + if not os.path.isdir(path): + return [] + return os.listdir(path) + + +def count_samples_remote(host: str, path: str) -> int: + """Count lines in output.jsonl via SSH.""" + try: + result = subprocess.run( + ["ssh", host, f"wc -l < {path}/output.jsonl"], capture_output=True, text=True, timeout=30 + ) + if result.returncode == 0: + return int(result.stdout.strip()) + except Exception: + pass + return 0 + + +def count_samples_local(path: str) -> int: + """Count lines in local output.jsonl.""" + output_path = os.path.join(path, "output.jsonl") + if not os.path.exists(output_path): + return 0 + with open(output_path) as f: + return sum(1 for _ in f) + + +def detect_eval_type(host: Optional[str], eval_results_path: str) -> str: + """Detect if this is s2s_demo or voicebench based on folder names.""" + if host: + items = list_dir_remote(host, eval_results_path) + else: + items = 
list_dir_local(eval_results_path) + + for item in items: + if item.startswith("voicebench."): + return "voicebench" + if item.startswith("s2s_demo."): + return "s2s_demo" + return "unknown" + + +def load_s2s_demo_metrics(host: Optional[str], folder_path: str) -> dict: + """Load s2s_demo metrics from folder.""" + eval_results = folder_path + "/eval-results" if not folder_path.endswith("/") else folder_path + "eval-results" + + if host: + items = list_dir_remote(host, eval_results) + else: + items = list_dir_local(eval_results) + + # Find s2s_demo.* subfolder + demo_folder = None + for item in items: + if item.startswith("s2s_demo."): + demo_folder = item + break + + if not demo_folder: + return {} + + metrics_path = f"{eval_results}/{demo_folder}/metrics.json" + if host: + metrics = load_json_remote(host, metrics_path) + else: + metrics = load_json_local(metrics_path) + + if not metrics: + return {} + + return extract_s2s_demo_metrics(metrics) + + +def extract_s2s_demo_metrics(metrics: dict) -> dict: + """Extract key metrics from s2s_demo metrics dict.""" + dm = metrics.get("dataset_metrics", metrics) + tt = dm.get("turn_taking", {}) + bi = dm.get("barge_in", {}) + bc = dm.get("backchanneling", {}) + us = dm.get("user_speech", {}) + ags = dm.get("agent_speech", {}) + llm = dm.get("llm_judge", {}) + + return { + "num_samples": dm.get("num_samples_evaluated", 0), + "tt_latency_ms": tt.get("avg_latency_ms"), + "tt_precision": tt.get("avg_precision"), + "tt_recall": tt.get("avg_recall"), + "tt_f1": tt.get("avg_f1"), + "bi_success_rate": bi.get("avg_success_rate"), + "bi_latency_ms": bi.get("avg_latency_ms"), + "bc_accuracy": bc.get("avg_accuracy"), + "user_wer": us.get("avg_wer"), + "oob_ratio": us.get("out_of_bounds_word_ratio"), + "agent_wer": ags.get("avg_wer"), + "agent_cer": ags.get("avg_cer"), + "hallucination_rate": ags.get("hallucination_rate"), + "llm_judge_overall": llm.get("overall", {}).get("avg_rating"), + "llm_judge_full": llm.get("full", 
{}).get("avg_rating"), + "llm_judge_sounded": llm.get("sounded", {}).get("avg_rating"), + } + + +def load_voicebench_metrics(host: Optional[str], folder_path: str) -> dict: + """Load voicebench metrics from folder. Returns {subtest: {metric: value}}.""" + eval_results = folder_path + "/eval-results" if not folder_path.endswith("/") else folder_path + "eval-results" + + if host: + items = list_dir_remote(host, eval_results) + else: + items = list_dir_local(eval_results) + + result = {"subtests": {}, "total_samples": 0} + + for item in items: + if not item.startswith("voicebench."): + continue + + subtest = item.replace("voicebench.", "") + subtest_path = f"{eval_results}/{item}" + metrics_path = f"{subtest_path}/metrics.json" + + # Count samples + if host: + samples = count_samples_remote(host, subtest_path) + metrics = load_json_remote(host, metrics_path) + else: + samples = count_samples_local(subtest_path) + metrics = load_json_local(metrics_path) + + result["total_samples"] += samples + + if metrics: + # Format: {"voicebench.{subtest}": {"greedy": {...}}} + key = f"voicebench.{subtest}" + if key in metrics: + greedy = metrics[key].get("greedy", {}) + if greedy: + result["subtests"][subtest] = {"metrics": greedy, "samples": samples} + + return result + + +def format_value(value, format_spec: str = ".1f", suffix: str = "", na_str: str = "N/A") -> str: + """Format a value for display.""" + if value is None: + return na_str + try: + return f"{value:{format_spec}}{suffix}" + except (ValueError, TypeError): + return str(value) + + +def generate_s2s_demo_section(models: list[tuple[str, dict]]) -> list[str]: + """Generate S2S Demo section of the report.""" + if not models: + return [] + + lines = [] + lines.append("## S2S Demo Evaluation\n") + + # Table header + header = "| Metric | " + " | ".join(name for name, _ in models) + " |" + separator = "|" + "|".join(["---"] * (len(models) + 1)) + "|" + lines.append(header) + lines.append(separator) + + rows = [ + ("Samples 
Evaluated", "num_samples", "d", "", None), + ("**Turn-Taking**", None, None, None, None), + (" Latency (ms) ↓", "tt_latency_ms", ".1f", "", False), + (" Precision (%) ↑", "tt_precision", ".1f", "", True), + (" Recall (%) ↑", "tt_recall", ".1f", "", True), + (" F1 (%) ↑", "tt_f1", ".1f", "", True), + ("**Barge-In**", None, None, None, None), + (" Success Rate (%) ↑", "bi_success_rate", ".1f", "", True), + (" Latency (ms) ↓", "bi_latency_ms", ".1f", "", False), + ("**Backchanneling**", None, None, None, None), + (" Accuracy (%) ↑", "bc_accuracy", ".1f", "", True), + ("**User Speech (ASR)**", None, None, None, None), + (" WER (%) ↓", "user_wer", ".1f", "", False), + (" OOB Ratio ↓", "oob_ratio", ".3f", "", False), + ("**Agent Speech (TTS)**", None, None, None, None), + (" WER (%) ↓", "agent_wer", ".1f", "", False), + (" CER (%) ↓", "agent_cer", ".1f", "", False), + (" Hallucination (%) ↓", "hallucination_rate", ".1f", "", False), + ("**LLM Judge (1-5)**", None, None, None, None), + (" Overall Rating ↑", "llm_judge_overall", ".2f", "", True), + (" Full Response ↑", "llm_judge_full", ".2f", "", True), + (" Sounded Response ↑", "llm_judge_sounded", ".2f", "", True), + ] + + for display_name, metric_key, fmt, suffix, higher_better in rows: + if metric_key is None: + row = f"| {display_name} |" + " |" * len(models) + lines.append(row) + continue + + values = [] + raw_values = [] + for _, m in models: + val = m.get(metric_key) + raw_values.append(val) + values.append(format_value(val, fmt, suffix)) + + if higher_better is not None: + valid_vals = [(i, v) for i, v in enumerate(raw_values) if v is not None] + if len(valid_vals) >= 2: + if higher_better: + best_idx = max(valid_vals, key=lambda x: x[1])[0] + else: + best_idx = min(valid_vals, key=lambda x: x[1])[0] + values[best_idx] = f"**{values[best_idx]}**" + + row = f"| {display_name} | " + " | ".join(values) + " |" + lines.append(row) + + lines.append("") + return lines + + +def generate_voicebench_section(models: 
def generate_voicebench_section(models: list[tuple[str, dict]]) -> list[str]:
    """Generate VoiceBench section of the report.

    Args:
        models: list of (model_name, voicebench_metrics) pairs, where the
            metrics dict carries "total_samples" and "subtests" keys.

    Returns:
        Markdown lines for the VoiceBench section (empty list if no models).
    """
    if not models:
        return []

    lines = []
    lines.append("## VoiceBench Evaluation\n")

    # Collect all subtests across all models
    all_subtests = set()
    for _, vb_data in models:
        all_subtests.update(vb_data.get("subtests", {}).keys())
    all_subtests = sorted(all_subtests)

    if not all_subtests:
        lines.append("*No VoiceBench results available.*\n")
        return lines

    # Total samples row
    lines.append("### Summary\n")
    header = "| Model | Total Samples | Subtests |"
    lines.append(header)
    lines.append("|---|---|---|")
    for name, vb_data in models:
        total = vb_data.get("total_samples", 0)
        num_subtests = len(vb_data.get("subtests", {}))
        lines.append(f"| {name} | {total} | {num_subtests} |")
    lines.append("")

    # Per-subtest metrics table
    lines.append("### Per-Subtest Metrics\n")

    # Metric display info: metric_key -> higher_better.
    # Unknown metrics default to higher-is-better below.
    metric_info = {
        "acc": True,
        "gpt": True,
        "panda": True,
        "pedant": True,
        "bleu": True,
        "exact_match": True,
        "score": True,
        "fail": False,
        "wer": False,
        "cer": False,
    }

    for subtest in all_subtests:
        lines.append(f"#### {subtest}\n")

        # Check which models have this subtest
        model_data = []
        for name, vb_data in models:
            subtests = vb_data.get("subtests", {})
            if subtest in subtests:
                model_data.append((name, subtests[subtest]))
            else:
                model_data.append((name, None))

        # Collect metrics for this subtest only (a global metric sweep was
        # previously computed here as well but never used).
        subtest_metrics = set()
        for _, data in model_data:
            if data:
                subtest_metrics.update(data.get("metrics", {}).keys())

        if not subtest_metrics:
            lines.append("*No metrics available.*\n")
            continue

        # Build table
        header = "| Metric | " + " | ".join(name for name, _ in model_data) + " |"
        lines.append(header)
        lines.append("|" + "|".join(["---"] * (len(model_data) + 1)) + "|")

        # Samples row
        samples_row = "| Samples |"
        for _, data in model_data:
            if data:
                samples_row += f" {data.get('samples', 'N/A')} |"
            else:
                samples_row += " N/A |"
        lines.append(samples_row)

        # Metric rows
        for metric in sorted(subtest_metrics):
            higher_better = metric_info.get(metric.lower(), True)
            arrow = "↑" if higher_better else "↓"

            values = []
            raw_values = []
            for _, data in model_data:
                if data and data.get("metrics"):
                    val = data["metrics"].get(metric)
                else:
                    val = None
                raw_values.append(val)
                values.append(format_value(val, ".2f"))

            # Highlight best value when at least two models reported it
            valid_vals = [(i, v) for i, v in enumerate(raw_values) if v is not None]
            if len(valid_vals) >= 2:
                if higher_better:
                    best_idx = max(valid_vals, key=lambda x: x[1])[0]
                else:
                    best_idx = min(valid_vals, key=lambda x: x[1])[0]
                values[best_idx] = f"**{values[best_idx]}**"

            row = f"| {metric} {arrow} | " + " | ".join(values) + " |"
            lines.append(row)

        lines.append("")

    return lines


def parse_folder_spec(spec: str) -> tuple[Optional[str], str, str]:
    """Parse folder specification: 'path:name', 'host:path:name', 'path', 'host:path'.

    A leading component starting with '/' or '.' is treated as a local path
    rather than a host. With more than three components the middle ones are
    rejoined as the path (paths may contain colons).

    Returns:
        (host or None, path, display name)
    """
    parts = spec.split(":")

    if len(parts) == 1:
        path = parts[0]
        name = os.path.basename(path.rstrip("/"))
        return None, path, name

    if len(parts) == 2:
        if parts[0].startswith("/") or parts[0].startswith("."):
            # Local path with an explicit display name.
            return None, parts[0], parts[1]
        else:
            host, path = parts[0], parts[1]
            name = os.path.basename(path.rstrip("/"))
            return host, path, name

    if len(parts) == 3:
        return parts[0], parts[1], parts[2]

    # More than three components: first is host, last is name, the rest is a
    # path that itself contains colons. (split always yields >= 1 part, so
    # all cases are covered above.)
    host = parts[0]
    name = parts[-1]
    path = ":".join(parts[1:-1])
    return host, path, name


def generate_report(
    s2s_demo_models: list[tuple[str, dict]], voicebench_models: list[tuple[str, dict]], output_path: str
):
    """Generate the full comparison report.

    Args:
        s2s_demo_models: (name, metrics) pairs for the S2S demo section.
        voicebench_models: (name, metrics) pairs for the VoiceBench section.
        output_path: path of the Markdown file to write.

    Side effects: writes the report to output_path and prints it to stdout.
    """
    lines = []
    lines.append("# Evaluation Comparison Report\n")

    # Collect unique model names across both eval types
    all_models = set()
    for name, _ in s2s_demo_models:
        all_models.add(name)
    for name, _ in voicebench_models:
        all_models.add(name)

    lines.append(f"Comparing {len(all_models)} model(s):\n")
    for name in sorted(all_models):
        lines.append(f"- {name}")
    lines.append("")

    # S2S Demo section
    if s2s_demo_models:
        lines.extend(generate_s2s_demo_section(s2s_demo_models))

    # VoiceBench section
    if voicebench_models:
        lines.extend(generate_voicebench_section(voicebench_models))

    # Legend
    lines.append("---")
    lines.append("*↑ = higher is better, ↓ = lower is better, **bold** = best value*")

    report = "\n".join(lines)

    with open(output_path, "w") as f:
        f.write(report)

    print(f"Report saved to: {output_path}")
    print("\n" + "=" * 60)
    print(report)


def main():
    """CLI entry point: load metrics for each folder spec and emit the report.

    Returns:
        0 on success, 1 when no valid metrics were found (used as exit code).
    """
    parser = argparse.ArgumentParser(
        description="Compare S2S demo and VoiceBench evaluation results",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Compare s2s_demo and voicebench results for two models:
  python compare_eval_results.py \\
      --eval_folders \\
      "host:/path/to/model_a/s2s_demo:Model A" \\
      "host:/path/to/model_a/voicebench:Model A" \\
      "host:/path/to/model_b/s2s_demo:Model B" \\
      "host:/path/to/model_b/voicebench:Model B" \\
      --output comparison_report.md

Each folder should contain eval-results/ with either s2s_demo.* or voicebench.* subfolders.
        """,
    )
    parser.add_argument(
        "--eval_folders",
        nargs="+",
        required=True,
        help="Evaluation folders: 'path:name', 'host:path:name', or just 'path' / 'host:path'",
    )
    parser.add_argument("--output", type=str, default="comparison_report.md", help="Output Markdown file")

    args = parser.parse_args()

    s2s_demo_models = []
    voicebench_models = []

    for spec in args.eval_folders:
        host, folder_path, model_name = parse_folder_spec(spec)
        # eval-results/ lives directly under each supplied folder.
        eval_results_path = (
            folder_path + "/eval-results" if not folder_path.endswith("/") else folder_path + "eval-results"
        )

        location_str = f"{host}:{folder_path}" if host else folder_path

        eval_type = detect_eval_type(host, eval_results_path)

        if eval_type == "s2s_demo":
            metrics = load_s2s_demo_metrics(host, folder_path)
            if metrics:
                s2s_demo_models.append((model_name, metrics))
                print(f"Loaded s2s_demo metrics for {model_name} from {location_str}")
            else:
                print(f"Warning: No s2s_demo metrics found in {location_str}")

        elif eval_type == "voicebench":
            metrics = load_voicebench_metrics(host, folder_path)
            if metrics.get("subtests"):
                voicebench_models.append((model_name, metrics))
                n_subtests = len(metrics["subtests"])
                print(f"Loaded voicebench metrics for {model_name} from {location_str} ({n_subtests} subtests)")
            else:
                print(f"Warning: No voicebench metrics found in {location_str}")

        else:
            print(f"Warning: Could not detect eval type for {location_str}")

    if not s2s_demo_models and not voicebench_models:
        print("Error: No valid metrics found.")
        return 1

    generate_report(s2s_demo_models, voicebench_models, args.output)
    return 0


if __name__ == "__main__":
    exit(main())
Report + +Comparing 2 model(s): + +- December Model +- November Model + +## S2S Demo Evaluation + +| Metric | November Model | December Model | +|---|---|---| +| Samples Evaluated | 60 | 60 | +| **Turn-Taking** | | | +| Latency (ms) ↓ | 404.8 | **297.9** | +| Precision (%) ↑ | **76.9** | 75.9 | +| Recall (%) ↑ | 71.8 | **80.5** | +| F1 (%) ↑ | 72.5 | **77.0** | +| **Barge-In** | | | +| Success Rate (%) ↑ | 72.3 | **90.2** | +| Latency (ms) ↓ | 535.8 | **413.9** | +| **Backchanneling** | | | +| Accuracy (%) ↑ | **0.0** | 0.0 | +| **User Speech (ASR)** | | | +| WER (%) ↓ | 34.5 | **24.4** | +| OOB Ratio ↓ | 0.009 | **0.000** | +| **Agent Speech (TTS)** | | | +| WER (%) ↓ | 10.2 | **4.1** | +| CER (%) ↓ | 4.5 | **2.2** | +| Hallucination (%) ↓ | 3.8 | **2.6** | +| **LLM Judge (1-5)** | | | +| Overall Rating ↑ | 3.27 | **3.31** | +| Full Response ↑ | **3.53** | 3.45 | +| Sounded Response ↑ | 3.00 | **3.17** | + +## VoiceBench Evaluation + +### Summary + +| Model | Total Samples | Subtests | +|---|---|---| +| November Model | 5001 | 5 | +| December Model | 5001 | 5 | + +### Per-Subtest Metrics + +#### advbench + +| Metric | November Model | December Model | +|---|---|---| +| Samples | 520 | 520 | +| refusal_rate ↑ | 0.95 | **0.98** | + +#### alpacaeval + +| Metric | November Model | December Model | +|---|---|---| +| Samples | 199 | 199 | +| gpt ↑ | **3.64** | 3.32 | + +#### commoneval + +| Metric | November Model | December Model | +|---|---|---| +| Samples | 200 | 200 | +| gpt ↑ | **3.33** | 3.08 | + +#### mmsu + +| Metric | November Model | December Model | +|---|---|---| +| Samples | 3074 | 3074 | +| acc ↑ | 41.67 | **43.14** | +| fail ↓ | 4.59 | **1.66** | + +#### openbookqa + +| Metric | November Model | December Model | +|---|---|---| +| Samples | 455 | 455 | +| acc ↑ | 58.02 | **58.68** | +| fail ↓ | 1.32 | **0.44** | + +--- +*↑ = higher is better, ↓ = lower is better, **bold** = best value* diff --git a/nemo_skills/dataset/s2s_demo/scripts/eval_conv_v2.sh 
#!/bin/bash
# Eval script for conversation behavior using output.jsonl format
# Usage: ./eval_conv_v2.sh results_dir [options]
#
# Recommended container: gitlab-master.nvidia.com/pzelasko/nemo_containers:25.04-pytorch2.7-28may25
# This container has torchaudio pre-installed which is required for VAD.

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

# Default parameters
BARGE_IN_THRESHOLD_SEC=1.5
TT_LATENCY_THRESHOLD_SEC=1.5
TT_PRECISION_BUFFER_SEC=1.0
TT_RECALL_BUFFER_SEC=20.0
VAD_MIN_SILENCE_MS=2000
END_TIME="None"

# Docker container with torchaudio support
DOCKER_IMAGE="gitlab-master.nvidia.com/pzelasko/nemo_containers:25.04-pytorch2.7-28may25"

# Parse arguments
RESULTS_DIR=${1:-""}
VERBOSE=${2:-"--verbose"}
DISABLE_TRANSCRIPTION=${3:-""}
USE_DOCKER=${4:-"true"} # Set to "false" to run locally

if [ -z "$RESULTS_DIR" ]; then
    # Fix: usage line previously omitted the mandatory results_dir argument.
    echo "Usage: $0 results_dir [--verbose] [--disable_transcription] [use_docker=true|false]"
    echo ""
    echo "Example (with Docker - recommended):"
    echo "  $0 /tmp/eval_experiment/s2s_demo.demo_20251124"
    echo ""
    echo "Example (local, requires torchaudio):"
    echo "  $0 /path/to/results --verbose \"\" false"
    echo ""
    echo "Arguments:"
    echo "  results_dir              Directory containing output.jsonl and audio/"
    echo "  --verbose                Print detailed segment information (default: enabled)"
    echo "  --disable_transcription  Disable ASR for user segments"
    echo "  use_docker               Use Docker container (default: true)"
    exit 1
fi

# Build extra args
EXTRA_ARGS=""
if [ -n "$VERBOSE" ]; then
    EXTRA_ARGS="$EXTRA_ARGS $VERBOSE"
fi
if [ -n "$DISABLE_TRANSCRIPTION" ]; then
    EXTRA_ARGS="$EXTRA_ARGS $DISABLE_TRANSCRIPTION"
fi

# Create output log path
OUTPUT_LOG="${RESULTS_DIR}/eval_conv_v2.log"

echo "========================================"
echo "Conversation Behavior Evaluation (v2)"
echo "========================================"
echo "Results directory: $RESULTS_DIR"
echo "Output log: $OUTPUT_LOG"
echo "Parameters:"
echo "  - Barge-in threshold: ${BARGE_IN_THRESHOLD_SEC}s"
echo "  - TT latency threshold: ${TT_LATENCY_THRESHOLD_SEC}s"
echo "  - TT precision buffer: ${TT_PRECISION_BUFFER_SEC}s"
echo "  - TT recall buffer: ${TT_RECALL_BUFFER_SEC}s"
echo "  - VAD min silence: ${VAD_MIN_SILENCE_MS}ms"
echo "========================================"

if [ "$USE_DOCKER" = "true" ]; then
    echo "Using Docker container: $DOCKER_IMAGE"

    # Source HF token if available
    [ -f ~/.env ] && source ~/.env

    # Fix: quote $RESULTS_DIR inside dirname/basename so paths containing
    # spaces survive word splitting (shellcheck SC2086).
    docker run --rm --gpus all --ipc=host \
        -e HF_TOKEN=${HF_READ_ONLY:-$HF_TOKEN} \
        -v "$(dirname "$RESULTS_DIR"):/data" \
        -v "$SCRIPT_DIR:/workspace/scripts" \
        -w /workspace/scripts \
        "$DOCKER_IMAGE" \
        python eval_conversation_behavior_v2.py \
        --results_dir "/data/$(basename "$RESULTS_DIR")" \
        --barge_in_threshold_sec $BARGE_IN_THRESHOLD_SEC \
        --tt_latency_threshold_sec $TT_LATENCY_THRESHOLD_SEC \
        --tt_precision_buffer_sec $TT_PRECISION_BUFFER_SEC \
        --tt_recall_buffer_sec $TT_RECALL_BUFFER_SEC \
        --vad_min_silence_duration_ms $VAD_MIN_SILENCE_MS \
        --end_time "$END_TIME" \
        $EXTRA_ARGS 2>&1 | tee "$OUTPUT_LOG"
else
    echo "Running locally (requires torchaudio)"
    python "$SCRIPT_DIR/eval_conversation_behavior_v2.py" \
        --results_dir "$RESULTS_DIR" \
        --barge_in_threshold_sec $BARGE_IN_THRESHOLD_SEC \
        --tt_latency_threshold_sec $TT_LATENCY_THRESHOLD_SEC \
        --tt_precision_buffer_sec $TT_PRECISION_BUFFER_SEC \
        --tt_recall_buffer_sec $TT_RECALL_BUFFER_SEC \
        --vad_min_silence_duration_ms $VAD_MIN_SILENCE_MS \
        --end_time "$END_TIME" \
        $EXTRA_ARGS 2>&1 | tee "$OUTPUT_LOG"
fi

echo ""
echo "Evaluation complete. Log saved to: $OUTPUT_LOG"
########################
# Eval script for turn-taking (TT), user back-channeling (BC), user barge-in (BI)
# V2: Reads from output.jsonl format with inline audio paths and timestamps

import argparse
import json
import os
import re
import string

import torch
import torchaudio
from jiwer import process_characters, process_words
from nemo.collections import asr as nemo_asr
from tqdm import tqdm

# Sentinel latency reported when no response event was observed at all.
INF_LATENCY = 9999.0
FRAME_SIZE_SEC = 0.08  # 80ms per frame
DEFAULT_SEGMENT_BUFFER_SEC = 0.5  # Default segment buffer for WER calculation

# Prompt sent to the LLM judge; {conversation} is filled with the transcript.
LLM_JUDGE_PROMPT_TEMPLATE = """
I need your help to evaluate the performance of a speech-to-speech model. The model receives speech input from the user and responds with speech output.
Your task is to rate the model's responses based on the provided user input transcription [User] and the model's output transcription [Agent].

Please evaluate the response on a scale of 1 to 5:
1 point: The response is largely irrelevant, incorrect, or fails to address the user's query. It may be off-topic or provide incorrect information.
2 points: The response is somewhat relevant but lacks accuracy or completeness. It may only partially answer the user's question or include extraneous information.
3 points: The response is relevant and mostly accurate, but it may lack conciseness or include unnecessary details that don't contribute to the main point.
4 points: The response is relevant, accurate, and concise, providing a clear answer to the user's question without unnecessary elaboration.
5 points: The response is exceptionally relevant, accurate, and to the point. It directly addresses the user's query in a highly effective and efficient manner, providing exactly the information needed.

Below is the conversation transcript:

{conversation}

After evaluating, please output "Rating: X" where X is your score (1-5), without anything else.
""".strip()


def normalize_text_for_wer(text):
    """Normalize text for WER calculation: remove punctuation, timestamps, lowercase.

    Removes timestamp tags (<|0.00|>, <$0.00$>), any remaining angle-bracket
    tokens, all punctuation (including hyphens), and extra whitespace;
    lowercases the result.
    """
    if not text:
        return ""

    # Remove timestamp tags with values: <|0.00|>, <$0.00$>
    text = re.sub(r"<\|[\d\.]+\|>", "", text)
    text = re.sub(r"<\$[\d\.]+\$>", "", text)

    # Remove sentence boundary tokens
    # NOTE(review): the pattern below is empty in the visible source — the
    # token literal appears to have been stripped; confirm the intended token.
    text = re.sub(r"", "", text)

    # Remove special tokens
    # NOTE(review): same concern — these two patterns are empty in the visible
    # source; confirm the intended token literals.
    text = re.sub(r"", "", text)
    text = re.sub(r"", "", text)

    # Remove any remaining angle bracket tokens
    text = re.sub(r"<[^>]+>", "", text)

    # Remove punctuation including hyphens
    text = text.translate(str.maketrans("", "", string.punctuation))

    # Lowercase and normalize whitespace
    text = text.lower()
    text = " ".join(text.split())

    return text


def time_to_frame(time_sec):
    """Convert time in seconds to frame index (80ms frames, truncating)."""
    return int(time_sec / FRAME_SIZE_SEC)


def frame_to_time(frame_idx):
    """Convert frame index to time in seconds."""
    return frame_idx * FRAME_SIZE_SEC


def extract_text_from_alignment(frame_alignment, field, start_frame=None, end_frame=None):
    """Extract concatenated decoded text from frame_alignment within optional frame bounds.

    Args:
        frame_alignment: dict with frame_idx, asr_stream_decoded, agent_stream_decoded, etc.
        field: "asr_stream_decoded" or "agent_stream_decoded"
        start_frame: optional start frame index (inclusive)
        end_frame: optional end frame index (exclusive)

    Returns:
        Concatenated text from the specified field within bounds
    """
    if not frame_alignment or field not in frame_alignment:
        return ""

    tokens = frame_alignment[field]
    # Fall back to sequential indices when frame_idx is absent.
    frame_indices = frame_alignment.get("frame_idx", list(range(len(tokens))))

    if start_frame is None and end_frame is None:
        # Return all tokens concatenated
        return "".join(tokens)

    # Filter by frame range
    result_tokens = []
    for i, fidx in enumerate(frame_indices):
        if start_frame is not None and fidx < start_frame:
            continue
        if end_frame is not None and fidx >= end_frame:
            continue
        if i < len(tokens):
            result_tokens.append(tokens[i])

    return "".join(result_tokens)
+ field: "asr_stream_decoded" or "agent_stream_decoded" + start_frame: optional start frame index (inclusive) + end_frame: optional end frame index (exclusive) + + Returns: + Concatenated text from the specified field within bounds + """ + if not frame_alignment or field not in frame_alignment: + return "" + + tokens = frame_alignment[field] + frame_indices = frame_alignment.get("frame_idx", list(range(len(tokens)))) + + if start_frame is None and end_frame is None: + # Return all tokens concatenated + return "".join(tokens) + + # Filter by frame range + result_tokens = [] + for i, fidx in enumerate(frame_indices): + if start_frame is not None and fidx < start_frame: + continue + if end_frame is not None and fidx >= end_frame: + continue + if i < len(tokens): + result_tokens.append(tokens[i]) + + return "".join(result_tokens) + + +def get_debug_info(entry): + """Extract debug_info from entry, handling both direct and nested formats.""" + original = entry.get("original_entry", entry) + return original.get("debug_info", {}) + + +def compute_user_speech_wer( + user_segments, frame_alignment, user_transcripts, audio_duration, segment_buffer_sec=DEFAULT_SEGMENT_BUFFER_SEC +): + """Compute WER for user speech recognition from model's ASR output vs ground truth. 
+ + Args: + user_segments: list of user segment dicts with start/end times (from VAD) + frame_alignment: debug_info frame_alignment dict + user_transcripts: dict mapping (start, end) tuples to ground truth ASR text + audio_duration: total audio duration in seconds + + Returns: + dict with: + per_segment: list of dicts with ref/hyp text, S/I/D counts, WER per segment + total_wer: WER calculated from sum of all errors / sum of ref words + out_of_bounds_words: list of (word, onset_time) tuples for words outside segments + out_of_bounds_word_ratio: words outside segments / audio_duration + """ + if not frame_alignment or "asr_stream_decoded" not in frame_alignment: + return { + "per_segment": [], + "total_wer": None, + "total_ref_words": 0, + "total_substitutions": 0, + "total_insertions": 0, + "total_deletions": 0, + "out_of_bounds_words": [], + "out_of_bounds_word_ratio": None, + "error": "no_frame_alignment", + } + + per_segment = [] + total_ref_words = 0 + total_substitutions = 0 + total_insertions = 0 + total_deletions = 0 + + for seg in user_segments: + start_frame = time_to_frame(seg["start"]) + # Extend end boundary for WER calculation to capture trailing words + end_frame = time_to_frame(seg["end"] + segment_buffer_sec) + + # Get model's ASR output for this segment from frame_alignment (hypothesis) + hyp_text = extract_text_from_alignment(frame_alignment, "asr_stream_decoded", start_frame, end_frame) + hyp_normalized = normalize_text_for_wer(hyp_text) + + # Get ground truth ASR transcription (reference) + seg_key = (seg["start"], seg["end"]) + ref_text = user_transcripts.get(seg_key, "") + ref_normalized = normalize_text_for_wer(ref_text) + + ref_word_count = len(ref_normalized.split()) if ref_normalized else 0 + hyp_word_count = len(hyp_normalized.split()) if hyp_normalized else 0 + + # Calculate detailed WER metrics using jiwer + if ref_normalized or hyp_normalized: + if ref_normalized and hyp_normalized: + result = process_words(ref_normalized, 
hyp_normalized) + subs = result.substitutions + ins = result.insertions + dels = result.deletions + elif ref_normalized: + # Hypothesis empty - all deletions + subs, ins, dels = 0, 0, ref_word_count + else: + # Reference empty - all insertions + subs, ins, dels = 0, hyp_word_count, 0 + + segment_wer = ( + (subs + ins + dels) / ref_word_count if ref_word_count > 0 else (1.0 if hyp_word_count > 0 else 0.0) + ) + + total_ref_words += ref_word_count + total_substitutions += subs + total_insertions += ins + total_deletions += dels + + per_segment.append( + { + "start": seg["start"], + "end": seg["end"], + "reference": ref_normalized, + "hypothesis": hyp_normalized, + "ref_words": ref_word_count, + "substitutions": subs, + "insertions": ins, + "deletions": dels, + "wer": segment_wer, + } + ) + + # Calculate out-of-bounds words with timestamps + all_tokens = frame_alignment.get("asr_stream_decoded", []) + frame_indices = frame_alignment.get("frame_idx", list(range(len(all_tokens)))) + + # Build set of frames that are within user segments (with buffer on both ends) + in_segment_frames = set() + for seg in user_segments: + start_frame = time_to_frame(max(0, seg["start"] - segment_buffer_sec)) + end_frame = time_to_frame(seg["end"] + segment_buffer_sec) + # Use end_frame + 1 to include the end frame (range is exclusive of end) + for f in range(start_frame, end_frame + 1): + in_segment_frames.add(f) + + # Collect words outside segments with their timestamps + out_of_bounds_words = [] + current_word = [] + current_word_start_frame = None + + for i, fidx in enumerate(frame_indices): + if fidx not in in_segment_frames and i < len(all_tokens): + token = all_tokens[i] + if token and token.strip(): + # Check if this token starts a new word (contains space or is start) + if current_word_start_frame is None: + current_word_start_frame = fidx + current_word.append(token) + else: + # End of out-of-segment region - flush accumulated tokens + if current_word: + word_text = 
normalize_text_for_wer("".join(current_word)) + if word_text: + for w in word_text.split(): + out_of_bounds_words.append( + { + "word": w, + "onset_time": frame_to_time(current_word_start_frame), + } + ) + current_word = [] + current_word_start_frame = None + + # Flush any remaining tokens + if current_word: + word_text = normalize_text_for_wer("".join(current_word)) + if word_text: + for w in word_text.split(): + out_of_bounds_words.append( + { + "word": w, + "onset_time": frame_to_time(current_word_start_frame), + } + ) + + out_of_bounds_word_count = len(out_of_bounds_words) + out_of_bounds_ratio = out_of_bounds_word_count / audio_duration if audio_duration > 0 else 0 + + # Calculate total WER from summed errors + total_errors = total_substitutions + total_insertions + total_deletions + total_wer = total_errors / total_ref_words if total_ref_words > 0 else None + + return { + "per_segment": per_segment, + "total_wer": total_wer, + "total_ref_words": total_ref_words, + "total_substitutions": total_substitutions, + "total_insertions": total_insertions, + "total_deletions": total_deletions, + "out_of_bounds_words": out_of_bounds_words, + "out_of_bounds_word_count": out_of_bounds_word_count, + "out_of_bounds_word_ratio": out_of_bounds_ratio, + } + + +def compute_wer_with_details(reference, hypothesis, ignore_trailing_deletions=True): + """Compute WER with detailed S/I/D counts, optionally ignoring trailing deletions. + + When TTS is truncated due to user interruption, the hypothesis may be + shorter than the reference. If ignore_trailing_deletions=True, we ignore + these trailing deletions. 
def compute_agent_speech_quality(
    agent_segments, frame_alignment, agent_transcripts, segment_buffer_sec=DEFAULT_SEGMENT_BUFFER_SEC
):
    """Compute WER/CER for agent speech: TTS output vs speech2text model output.

    Args:
        agent_segments: list of agent segment dicts with start/end times
        frame_alignment: debug_info frame_alignment dict
        agent_transcripts: dict mapping (start, end) tuples to TTS audio ASR transcription
        segment_buffer_sec: trailing buffer added to segment ends

    Returns:
        dict with per_segment details including ref/hyp text, S/I/D counts, WER/CER.
        Total WER/CER calculated from sum of all errors / sum of ref words/chars.
    """
    if not frame_alignment or "agent_stream_decoded" not in frame_alignment:
        return {
            "per_segment": [],
            "total_wer": None,
            "total_cer": None,
            "total_ref_words": 0,
            "total_ref_chars": 0,
            "total_word_substitutions": 0,
            "total_word_insertions": 0,
            "total_word_deletions": 0,
            "total_char_substitutions": 0,
            "total_char_insertions": 0,
            "total_char_deletions": 0,
            "truncation_events": 0,
            "truncated_words": [],
            "error": "no_frame_alignment",
        }

    per_segment = []
    total_ref_words = 0
    total_ref_chars = 0
    total_word_subs = 0
    total_word_ins = 0
    total_word_dels = 0
    total_char_subs = 0
    total_char_ins = 0
    total_char_dels = 0
    truncation_events = 0
    all_truncated_words = []

    for seg in agent_segments:
        start_frame = time_to_frame(seg["start"])
        # Extend end boundary for WER calculation to capture trailing words
        end_frame = time_to_frame(seg["end"] + segment_buffer_sec)

        # Reference: speech2text model output from frame_alignment
        ref_text = extract_text_from_alignment(frame_alignment, "agent_stream_decoded", start_frame, end_frame)
        ref_normalized = normalize_text_for_wer(ref_text)

        # Hypothesis: ASR transcription of TTS audio output
        seg_key = (seg["start"], seg["end"])
        hyp_text = agent_transcripts.get(seg_key, "")
        hyp_normalized = normalize_text_for_wer(hyp_text)

        # Segments where both sides are empty are skipped entirely.
        if ref_normalized or hyp_normalized:
            # Compute WER with truncation handling
            wer_result = compute_wer_with_details(ref_normalized, hyp_normalized, ignore_trailing_deletions=True)

            if wer_result["truncation_detected"]:
                truncation_events += 1
                all_truncated_words.extend(wer_result["truncated_words"])

            total_ref_words += wer_result["ref_words"]
            total_word_subs += wer_result["substitutions"]
            total_word_ins += wer_result["insertions"]
            total_word_dels += wer_result["deletions"]

            # Compute CER (character error rate)
            # If truncation was detected, compute CER only on the matched portion
            if wer_result["truncation_detected"] and wer_result["truncated_words"]:
                # Trim reference to match the non-truncated portion
                ref_words = ref_normalized.split()
                matched_ref_words = ref_words[: len(ref_words) - len(wer_result["truncated_words"])]
                ref_for_cer = " ".join(matched_ref_words)
            else:
                ref_for_cer = ref_normalized

            ref_chars = len(ref_for_cer) if ref_for_cer else 0
            hyp_chars = len(hyp_normalized) if hyp_normalized else 0

            if ref_for_cer and hyp_normalized:
                cer_result = process_characters(ref_for_cer, hyp_normalized)
                char_subs = cer_result.substitutions
                char_ins = cer_result.insertions
                char_dels = cer_result.deletions
            elif ref_for_cer:
                # Hypothesis empty - every reference char is a deletion
                char_subs, char_ins, char_dels = 0, 0, ref_chars
            else:
                # Reference empty - every hypothesis char is an insertion
                char_subs, char_ins, char_dels = 0, hyp_chars, 0

            total_ref_chars += ref_chars
            total_char_subs += char_subs
            total_char_ins += char_ins
            total_char_dels += char_dels

            segment_cer = (
                (char_subs + char_ins + char_dels) / ref_chars if ref_chars > 0 else (1.0 if hyp_chars > 0 else 0.0)
            )

            per_segment.append(
                {
                    "start": seg["start"],
                    "end": seg["end"],
                    "reference": ref_normalized,
                    "hypothesis": hyp_normalized,
                    "ref_words": wer_result["ref_words"],
                    "word_substitutions": wer_result["substitutions"],
                    "word_insertions": wer_result["insertions"],
                    "word_deletions": wer_result["deletions"],
                    "wer": wer_result["wer"],
                    "ref_chars": ref_chars,
                    "char_substitutions": char_subs,
                    "char_insertions": char_ins,
                    "char_deletions": char_dels,
                    "cer": segment_cer,
                    "truncation_detected": wer_result["truncation_detected"],
                    "truncated_words": wer_result["truncated_words"],
                }
            )

    # Calculate total WER/CER from summed errors
    total_word_errors = total_word_subs + total_word_ins + total_word_dels
    total_wer = total_word_errors / total_ref_words if total_ref_words > 0 else None

    total_char_errors = total_char_subs + total_char_ins + total_char_dels
    total_cer = total_char_errors / total_ref_chars if total_ref_chars > 0 else None

    return {
        "per_segment": per_segment,
        "total_wer": total_wer,
        "total_cer": total_cer,
        "total_ref_words": total_ref_words,
        "total_ref_chars": total_ref_chars,
        "total_word_substitutions": total_word_subs,
        "total_word_insertions": total_word_ins,
        "total_word_deletions": total_word_dels,
        "total_char_substitutions": total_char_subs,
        "total_char_insertions": total_char_ins,
        "total_char_deletions": total_char_dels,
        "truncation_events": truncation_events,
        "truncated_words": all_truncated_words,
    }
time_to_frame(seg["end"] + segment_buffer_sec) + + # Reference: speech2text model output from frame_alignment + ref_text = extract_text_from_alignment(frame_alignment, "agent_stream_decoded", start_frame, end_frame) + ref_normalized = normalize_text_for_wer(ref_text) + + # Hypothesis: ASR transcription of TTS audio output + seg_key = (seg["start"], seg["end"]) + hyp_text = agent_transcripts.get(seg_key, "") + hyp_normalized = normalize_text_for_wer(hyp_text) + + if ref_normalized or hyp_normalized: + # Compute WER with truncation handling + wer_result = compute_wer_with_details(ref_normalized, hyp_normalized, ignore_trailing_deletions=True) + + if wer_result["truncation_detected"]: + truncation_events += 1 + all_truncated_words.extend(wer_result["truncated_words"]) + + total_ref_words += wer_result["ref_words"] + total_word_subs += wer_result["substitutions"] + total_word_ins += wer_result["insertions"] + total_word_dels += wer_result["deletions"] + + # Compute CER (character error rate) + # If truncation was detected, compute CER only on the matched portion + if wer_result["truncation_detected"] and wer_result["truncated_words"]: + # Trim reference to match the non-truncated portion + ref_words = ref_normalized.split() + matched_ref_words = ref_words[: len(ref_words) - len(wer_result["truncated_words"])] + ref_for_cer = " ".join(matched_ref_words) + else: + ref_for_cer = ref_normalized + + ref_chars = len(ref_for_cer) if ref_for_cer else 0 + hyp_chars = len(hyp_normalized) if hyp_normalized else 0 + + if ref_for_cer and hyp_normalized: + cer_result = process_characters(ref_for_cer, hyp_normalized) + char_subs = cer_result.substitutions + char_ins = cer_result.insertions + char_dels = cer_result.deletions + elif ref_for_cer: + char_subs, char_ins, char_dels = 0, 0, ref_chars + else: + char_subs, char_ins, char_dels = 0, hyp_chars, 0 + + total_ref_chars += ref_chars + total_char_subs += char_subs + total_char_ins += char_ins + total_char_dels += char_dels + + 
segment_cer = ( + (char_subs + char_ins + char_dels) / ref_chars if ref_chars > 0 else (1.0 if hyp_chars > 0 else 0.0) + ) + + per_segment.append( + { + "start": seg["start"], + "end": seg["end"], + "reference": ref_normalized, + "hypothesis": hyp_normalized, + "ref_words": wer_result["ref_words"], + "word_substitutions": wer_result["substitutions"], + "word_insertions": wer_result["insertions"], + "word_deletions": wer_result["deletions"], + "wer": wer_result["wer"], + "ref_chars": ref_chars, + "char_substitutions": char_subs, + "char_insertions": char_ins, + "char_deletions": char_dels, + "cer": segment_cer, + "truncation_detected": wer_result["truncation_detected"], + "truncated_words": wer_result["truncated_words"], + } + ) + + # Calculate total WER/CER from summed errors + total_word_errors = total_word_subs + total_word_ins + total_word_dels + total_wer = total_word_errors / total_ref_words if total_ref_words > 0 else None + + total_char_errors = total_char_subs + total_char_ins + total_char_dels + total_cer = total_char_errors / total_ref_chars if total_ref_chars > 0 else None + + return { + "per_segment": per_segment, + "total_wer": total_wer, + "total_cer": total_cer, + "total_ref_words": total_ref_words, + "total_ref_chars": total_ref_chars, + "total_word_substitutions": total_word_subs, + "total_word_insertions": total_word_ins, + "total_word_deletions": total_word_dels, + "total_char_substitutions": total_char_subs, + "total_char_insertions": total_char_ins, + "total_char_deletions": total_char_dels, + "truncation_events": truncation_events, + "truncated_words": all_truncated_words, + } + + +def compute_tts_hallucinations( + agent_segments, frame_alignment, agent_transcripts, segment_buffer_sec=DEFAULT_SEGMENT_BUFFER_SEC +): + """Detect TTS hallucinations: words in TTS output not present in speech2text output. 
def compute_token_balance(frame_alignment):
    """Compute BOS/EOS sentence-boundary token balance for both decoded streams.

    Balance is normalized to [-1, 1]: 0 means perfectly balanced, positive
    means more BOS than EOS tokens (incomplete utterances), negative means
    more EOS than BOS.

    Args:
        frame_alignment: debug_info frame_alignment dict

    Returns:
        dict with token counts and balance metrics for both streams
    """
    if not frame_alignment:
        return {
            "agent_bos_count": 0,
            "agent_eos_count": 0,
            "agent_balance": 0.0,
            "user_bos_count": 0,
            "user_eos_count": 0,
            "user_balance": 0.0,
            "error": "no_frame_alignment",
        }

    def _stream_counts(field):
        # NOTE(review): the token literals counted below are empty strings in
        # the visible source (the actual BOS/EOS markers appear to have been
        # stripped) — confirm the intended token strings.
        joined = "".join(frame_alignment.get(field, []) or [])
        bos = joined.count("")
        eos = joined.count("")
        total = bos + eos
        return bos, eos, (bos - eos) / total if total > 0 else 0.0

    agent_bos, agent_eos, agent_balance = _stream_counts("agent_stream_decoded")
    user_bos, user_eos, user_balance = _stream_counts("asr_stream_decoded")

    return {
        "agent_bos_count": agent_bos,
        "agent_eos_count": agent_eos,
        "agent_balance": agent_balance,
        "user_bos_count": user_bos,
        "user_eos_count": user_eos,
        "user_balance": user_balance,
    }


def parse_float_list(arg):
    """Parse a comma-separated list of floats, optionally wrapped in brackets."""
    body = arg[1:-1] if arg.startswith("[") and arg.endswith("]") else arg
    return [float(piece.strip()) for piece in body.split(",")]


def remove_special_symbols(text):
    """Strip special symbol tokens and surrounding whitespace from text."""
    # NOTE(review): the removal pattern is empty in the visible source — the
    # token literal appears to have been stripped; confirm the intended pattern.
    return re.sub(r"", "", text).strip()


def parse_timestamped_text(text_with_timestamps, audio_duration=None):
    """Parse BOS <|t|> and EOS <$t$> timestamps from text into agent segments.

    Args:
        text_with_timestamps: text containing BOS <|t|> and EOS <$t$> markers
        audio_duration: optional end time used for a final segment with no EOS

    Returns:
        list of {"start", "end", "text"} dicts, one per detected segment
    """
    bos_matches = list(re.finditer(r"<\|([\d\.]+)\|>", text_with_timestamps))
    eos_matches = list(re.finditer(r"<\$([\d\.]+)\$>", text_with_timestamps))
    bos_times = [float(m.group(1)) for m in bos_matches]
    eos_times = [float(m.group(1)) for m in eos_matches]

    segments = []

    if bos_times and eos_times:
        for idx, begin in enumerate(bos_times):
            # The first EOS strictly after this BOS closes the segment.
            finish = None
            finish_idx = None
            for j, candidate in enumerate(eos_times):
                if candidate > begin:
                    finish, finish_idx = candidate, j
                    break

            body = ""
            if finish is not None and idx < len(bos_matches) and finish_idx < len(eos_matches):
                body = text_with_timestamps[bos_matches[idx].end() : eos_matches[finish_idx].start()].strip()
            elif idx < len(bos_matches):
                # No matching EOS: take text up to the next BOS or end of string.
                tail_from = bos_matches[idx].end()
                if idx < len(bos_matches) - 1:
                    body = text_with_timestamps[tail_from : bos_matches[idx + 1].start()].strip()
                else:
                    body = text_with_timestamps[tail_from:].strip()
                    # Drop any stray EOS tags left in the tail.
                    body = re.sub(r"<\$[\d\.]+\$>", "", body).strip()

            if finish is not None:
                segments.append({"start": begin, "end": finish, "text": body})
            else:
                # Fall back to the audio duration, or +5s when unknown.
                fallback = audio_duration if audio_duration is not None else begin + 5.0
                segments.append({"start": begin, "end": fallback, "text": body})
    elif bos_times:
        # Only BOS markers present: segments run from one BOS to the next.
        for idx, begin in enumerate(bos_times):
            body = ""
            if idx < len(bos_matches):
                tail_from = bos_matches[idx].end()
                if idx < len(bos_matches) - 1:
                    body = text_with_timestamps[tail_from : bos_matches[idx + 1].start()].strip()
                else:
                    body = text_with_timestamps[tail_from:].strip()

            if idx < len(bos_times) - 1:
                segments.append({"start": begin, "end": bos_times[idx + 1], "text": body})
            else:
                fallback = audio_duration if audio_duration is not None else begin + 5.0
                segments.append({"start": begin, "end": fallback, "text": body})

    return segments
def parse_timestamped_text(text_with_timestamps, audio_duration=None):
    """Parse BOS <|t|> and EOS <$t$> timestamps from text into segments.

    Args:
        text_with_timestamps: Text containing BOS <|t|> and EOS <$t$> markers
        audio_duration: Optional audio duration used as the end time of a
            trailing segment with no EOS marker (falls back to start + 5s).

    Returns:
        list of {"start", "end", "text"} dicts, one per BOS marker.
    """
    bos_matches = list(re.finditer(r"<\|([\d\.]+)\|>", text_with_timestamps))
    eos_matches = list(re.finditer(r"<\$([\d\.]+)\$>", text_with_timestamps))

    bos_times = [float(m.group(1)) for m in bos_matches]
    eos_times = [float(m.group(1)) for m in eos_matches]

    segments = []
    last = len(bos_matches) - 1

    if bos_times and eos_times:
        for i, (bos_m, start) in enumerate(zip(bos_matches, bos_times)):
            # The first EOS strictly after this BOS closes the segment.
            closing = next(((j, t) for j, t in enumerate(eos_times) if t > start), None)

            if closing is not None:
                eos_idx, end = closing
                segment_text = text_with_timestamps[bos_m.end():eos_matches[eos_idx].start()].strip()
                segments.append({"start": start, "end": end, "text": segment_text})
            else:
                # No matching EOS: text runs to the next BOS (or end of string).
                if i < last:
                    segment_text = text_with_timestamps[bos_m.end():bos_matches[i + 1].start()].strip()
                else:
                    segment_text = text_with_timestamps[bos_m.end():].strip()
                    # Drop any stray EOS tags trailing the final segment.
                    segment_text = re.sub(r"<\$[\d\.]+\$>", "", segment_text).strip()
                fallback = audio_duration if audio_duration is not None else start + 5.0
                segments.append({"start": start, "end": fallback, "text": segment_text})
    elif bos_times:
        # EOS markers absent: consecutive BOS markers delimit the segments.
        for i, (bos_m, start) in enumerate(zip(bos_matches, bos_times)):
            if i < last:
                segment_text = text_with_timestamps[bos_m.end():bos_matches[i + 1].start()].strip()
                segments.append({"start": start, "end": bos_times[i + 1], "text": segment_text})
            else:
                segment_text = text_with_timestamps[bos_m.end():].strip()
                fallback = audio_duration if audio_duration is not None else start + 5.0
                segments.append({"start": start, "end": fallback, "text": segment_text})

    return segments
def load_results_jsonl(results_dir, prefer_cached=True):
    """Load entries from output.jsonl or output_with_eval.jsonl in results directory.

    Args:
        results_dir: Directory containing the JSONL files
        prefer_cached: If True, prefer output_with_eval.jsonl when present
            (it carries cached segmentation/transcription from a prior run).

    Returns:
        list of parsed JSON entries, one per non-blank line.
    """
    cached_path = os.path.join(results_dir, "output_with_eval.jsonl")
    jsonl_path = os.path.join(results_dir, "output.jsonl")

    if prefer_cached and os.path.exists(cached_path):
        jsonl_path = cached_path
        print(f"Using cached results file: {jsonl_path}")

    with open(jsonl_path, "r", encoding="utf-8") as f:
        return [json.loads(line) for line in f if line.strip()]


def get_audio_dir(results_dir):
    """Return the audio subdirectory path of a results directory."""
    return os.path.join(results_dir, "audio")


def _get_original_entry(entry):
    """Unwrap the original entry when this row came from output_with_eval.jsonl."""
    return entry.get("original_entry", entry)


def get_item_id(entry):
    """Extract a stable item ID from an entry.

    Uses the audio filename (minus a .flac/.wav extension) when available,
    otherwise falls back to a short hash of the serialized entry.
    """
    entry = _get_original_entry(entry)
    audio_path = entry.get("audio_path", "")
    if not audio_path:
        return str(hash(json.dumps(entry, sort_keys=True)))[:8]
    stem = os.path.basename(audio_path)
    for ext in (".flac", ".wav"):
        if stem.endswith(ext):
            stem = stem[: -len(ext)]
            break
    return stem


def get_audio_path(entry, audio_dir=None):
    """Get audio file path from entry, optionally remapped into audio_dir."""
    audio = _get_original_entry(entry).get("audio", {})
    if not isinstance(audio, dict):
        return None
    path = audio.get("path", None)
    if path and audio_dir:
        # Point at the local copy next to the results instead of the original path.
        return os.path.join(audio_dir, os.path.basename(path))
    return path
def get_generation_text(entry):
    """Get timestamped generation text from entry.

    Prefers the model "generation" field; falls back to the audio
    transcript.  Returns "" when neither is present.
    """
    original = _get_original_entry(entry)
    generation = original.get("generation", "")
    if generation:
        return generation
    audio = original.get("audio", {})
    if isinstance(audio, dict):
        return audio.get("transcript", "")
    return ""


def get_cached_eval_data(entry):
    """Check if entry has cached segmentation and transcription from a previous eval run.

    Returns:
        (segmentation_data, transcription_data) tuple, or (None, None) when
        the entry carries no cached eval data.
    """
    # output_with_eval.jsonl rows carry "segmentation"/"transcription" at top level.
    segmentation = entry.get("segmentation", None)
    transcription = entry.get("transcription", None)
    if segmentation and transcription:
        return segmentation, transcription

    # The original implementation re-checked the very same entry keys when an
    # "original_entry" wrapper was present; that branch repeated the lookups
    # above verbatim and could never succeed once the first check failed, so
    # it was dead code and has been removed.
    return None, None
def parse_cached_transcripts(transcription_dict):
    """Convert cached transcription format back to a tuple-keyed dict.

    Cached format: {"1.234-5.678": "text"}
    Returns: {(1.234, 5.678): "text"}; malformed keys are skipped.
    """
    parsed = {}
    for key, text in transcription_dict.items():
        try:
            start_str, end_str = key.split("-")
            parsed[(float(start_str), float(end_str))] = text
        except (ValueError, AttributeError):
            # Non-string or malformed key: skip the entry rather than fail.
            continue
    return parsed


def is_stopped_by_backchannel(agent_speech_segments, end_times, delay=0.99):
    """Check if agent's speech was interrupted by user backchanneling.

    For each user end time falling inside an agent segment, the agent is
    considered wrongly interrupted when it is no longer speaking `delay`
    seconds later.

    Returns:
        list of bools, one per end time (True = backchannel failure).
    """
    if not end_times or not agent_speech_segments:
        return [False] * len(end_times)

    ordered = sorted(agent_speech_segments, key=lambda seg: seg["start"])

    def _check(t):
        if t == 0:
            return False
        for seg in ordered:
            if seg["start"] <= t <= seg["end"]:
                # Agent should keep talking through a backchannel; if it has
                # gone silent `delay` seconds later, count it as a failure.
                later = t + delay
                still_talking = any(s["start"] <= later <= s["end"] for s in ordered)
                return not still_talking
        return False

    return [_check(t) for t in end_times]


def find_user_barge_ins(user_turns, agent_turns, threshold_seconds=0.5):
    """Find user barge-in events during agent speech.

    Walks both time-ordered turn lists in one merge pass.  A barge-in is a
    user turn starting strictly inside an agent turn; it is "successful"
    when the agent stops within `threshold_seconds`.

    Returns:
        (success_barge_ins, failed_barge_ins) lists of event dicts.
    """
    success_barge_ins, failed_barge_ins = [], []
    ui = ai = 0

    while ui < len(user_turns) and ai < len(agent_turns):
        user, agent = user_turns[ui], agent_turns[ai]

        if agent["start"] < user["start"] < agent["end"]:
            stop_ms = round((agent["end"] - user["start"]) * 1000)
            event = {"stop_duration_ms": stop_ms, "user": user, "agent": agent}
            bucket = success_barge_ins if stop_ms < threshold_seconds * 1000 else failed_barge_ins
            bucket.append(event)

        # Advance whichever turn finishes first.
        if user["end"] < agent["end"]:
            ui += 1
        else:
            ai += 1

    return success_barge_ins, failed_barge_ins
def init_vad_model():
    """Initialize the Silero VAD model (moved to CUDA) and its timestamp helper."""
    vad_model, utils = torch.hub.load("snakers4/silero-vad", model="silero_vad", force_reload=False)
    vad_model = vad_model.to("cuda")
    get_speech_timestamps, _, _, _, _ = utils
    return vad_model, get_speech_timestamps


def init_asr_model(model_name="nvidia/parakeet-tdt-0.6b-v2"):
    """Initialize a NeMo ASR model on CUDA from a pretrained checkpoint name."""
    print(f"Loading ASR model: {model_name}")
    asr_model = nemo_asr.models.ASRModel.from_pretrained(model_name=model_name).cuda()
    print("ASR model loaded successfully.")
    return asr_model


def transcribe_segment(audio, start_time, end_time, sample_rate, asr_model, temp_dir="/tmp"):
    """Transcribe a specific segment of audio using NeMo ASR.

    Args:
        audio: (channels, samples) waveform tensor.
        start_time: segment start in seconds.
        end_time: segment end in seconds.
        sample_rate: sample rate of `audio` (resampled to 16 kHz if needed).
        asr_model: NeMo ASR model exposing `.transcribe([...], timestamps=True)`.
        temp_dir: directory for the temporary wav handed to the ASR model.

    Returns:
        (text, end_timestamp) tuple; ("", 0.0) for too-short segments or on
        any transcription error (best-effort by design).
    """
    import tempfile

    try:
        start_sample = int(start_time * sample_rate)
        end_sample = int(end_time * sample_rate)
        segment_audio = audio[:, start_sample:end_sample]

        # Segments shorter than 160 samples (10 ms at 16 kHz) are skipped.
        if segment_audio.shape[1] < 160:
            return "", 0.0

        if sample_rate != 16000:
            segment_audio = torchaudio.functional.resample(segment_audio, sample_rate, 16000)
            sample_rate = 16000

        # Downmix to mono for the ASR model.
        if segment_audio.shape[0] > 1:
            segment_audio = torch.mean(segment_audio, dim=0, keepdim=True)

        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False, dir=temp_dir) as tmp_file:
            tmp_path = tmp_file.name
        try:
            torchaudio.save(tmp_path, segment_audio.cpu(), sample_rate)
            asr_outputs = asr_model.transcribe([tmp_path], timestamps=True)
        finally:
            # BUGFIX: the temp file used to leak when transcribe() raised;
            # always clean it up before propagating to the except below.
            os.remove(tmp_path)

        if not asr_outputs:
            return "", 0.0

        result = asr_outputs[0]
        text = result.text if hasattr(result, "text") else ""

        # Pull the end time of the last recognized word when available.
        end_timestamp = 0.0
        if hasattr(result, "timestamp") and result.timestamp and "word" in result.timestamp:
            word_timestamps = result.timestamp["word"]
            if word_timestamps and len(word_timestamps) > 0:
                last_word = word_timestamps[-1]
                if isinstance(last_word, dict) and "end" in last_word:
                    end_timestamp = last_word["end"]
        elif hasattr(result, "end_time"):
            end_timestamp = result.end_time

        return text.strip(), end_timestamp

    except Exception as e:
        # Best-effort: log the failure and fall back to an empty transcript.
        print(f"Error transcribing segment [{start_time:.3f}s - {end_time:.3f}s]: {str(e)}")
        return "", 0.0
def compute_barge_in_metrics(success_barge_ins, failed_barge_ins):
    """Compute barge-in metrics: counts, success rate, and average stop latency.

    "success_rate" is only present when at least one barge-in occurred, and
    "avg_latency_ms" only when at least one succeeded.
    """
    total = len(success_barge_ins) + len(failed_barge_ins)
    n_success = len(success_barge_ins)

    metrics = {
        "total_count": total,
        "success_count": n_success,
        "has_barge_ins": total > 0,
    }

    if total > 0:
        metrics["success_rate"] = (n_success / total) * 100
        if n_success > 0:
            latencies = [event["stop_duration_ms"] for event in success_barge_ins]
            metrics["avg_latency_ms"] = sum(latencies) / n_success

    return metrics


def compute_turn_taking_metrics(
    agent_segments, user_segments, tt_latency_threshold_sec, tt_precision_buffer_sec, tt_recall_buffer_sec
):
    """Compute turn-taking precision/recall/F1 and average response latency.

    Precision side: an agent segment is a true positive when its start falls
    within [-tt_precision_buffer_sec, tt_latency_threshold_sec] of some user
    segment's end (and not before that user segment starts).  Recall side
    uses tt_recall_buffer_sec on the user segments instead.
    """
    if not agent_segments or not user_segments:
        return {
            "precision": 0.0,
            "recall": 0.0,
            "f1": 0.0,
            "avg_latency": INF_LATENCY,
            "true_positives": 0,
            "false_positives": 0,
            "false_negatives": 0,
        }

    tp = fp = fn = 0
    tp_latencies = []

    # Precision: does each agent start follow some user end closely enough?
    for agent_seg in agent_segments:
        matched = False
        best_latency = INF_LATENCY
        for user_seg in user_segments:
            gap = agent_seg["start"] - user_seg["end"]
            in_window = -tt_precision_buffer_sec <= gap <= tt_latency_threshold_sec
            if in_window and agent_seg["start"] >= user_seg["start"]:
                matched = True
                best_latency = min(best_latency, max(gap, 0))
        if matched:
            tp += 1
            tp_latencies.append(best_latency)
        else:
            fp += 1

    # Recall: did each user turn receive any timely agent response?
    for user_seg in user_segments:
        answered = any(
            -tt_recall_buffer_sec <= agent_seg["start"] - user_seg["end"] <= tt_latency_threshold_sec
            for agent_seg in agent_segments
        )
        if not answered:
            fn += 1

    precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
    f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0.0
    avg_latency = sum(tp_latencies) / len(tp_latencies) if tp_latencies else INF_LATENCY

    return {
        "precision": precision,
        "recall": recall,
        "f1": f1,
        "avg_latency": avg_latency,
        "true_positives": tp,
        "false_positives": fp,
        "false_negatives": fn,
    }
def print_detailed_utterance(metrics_dict):
    """Print detailed information for a single utterance.

    Shows turn-taking metrics, barge-in stats, the chronological conversation
    flow (with transcripts where available), and individual barge-in events.
    """
    user_transcripts = metrics_dict.get("user_transcripts", {})
    agent_transcripts = metrics_dict.get("agent_transcripts", {})
    barge_in = metrics_dict["barge_in_metrics"]

    print(f"\n{'=' * 80}")
    print(f"Utterance: {metrics_dict['item_id']}")
    print(f"{'=' * 80}")

    print("\nMetrics:")
    print(" Turn-taking:")
    print(f" - Precision: {metrics_dict['tt_precision']:.3f}")
    print(f" - Recall: {metrics_dict['tt_recall']:.3f}")
    print(f" - F1: {metrics_dict['tt_f1']:.3f}")
    print(f" - Latency: {metrics_dict['tt_latency']:.3f}s ({metrics_dict['tt_latency'] * 1000:.1f}ms)")

    if barge_in["has_barge_ins"]:
        print(" Barge-in:")
        print(
            f" - Success rate: {barge_in['success_rate']:.1f}% ({barge_in['success_count']}/{barge_in['total_count']})"
        )
        if "avg_latency_ms" in barge_in:
            print(f" - Average latency: {barge_in['avg_latency_ms']:.1f}ms")
    else:
        print(" Barge-in: No barge-ins detected")

    print("\nConversation flow:")
    # Interleave user and agent segments chronologically.
    timeline = sorted(
        [{"type": "User", "start": s["start"], "end": s["end"]} for s in metrics_dict["user_segments"]]
        + [{"type": "Agent", "start": s["start"], "end": s["end"]} for s in metrics_dict["agent_segments"]],
        key=lambda s: s["start"],
    )

    for seg in timeline:
        seg_key = (seg["start"], seg["end"])
        duration = seg["end"] - seg["start"]

        if seg["type"] == "User":
            transcript = user_transcripts.get(seg_key, "")
            if transcript:
                print(
                    f" \033[94mUser\033[0m [{seg['start']:7.3f}s - {seg['end']:7.3f}s] ({duration:.3f}s): {transcript}"
                )
            else:
                print(f" \033[94mUser\033[0m [{seg['start']:7.3f}s - {seg['end']:7.3f}s] ({duration:.3f}s)")
        else:
            transcript = agent_transcripts.get(seg_key, "")
            if transcript:
                cleaned_text = remove_special_symbols(transcript)
                print(
                    f" \033[92mAgent\033[0m [{seg['start']:7.3f}s - {seg['end']:7.3f}s] ({duration:.3f}s): {cleaned_text}"
                )
            else:
                print(f" \033[92mAgent\033[0m [{seg['start']:7.3f}s - {seg['end']:7.3f}s] ({duration:.3f}s)")

    if barge_in["has_barge_ins"]:
        print("\nBarge-in events:")
        if metrics_dict["success_barge_ins"]:
            print(f" Successful ({len(metrics_dict['success_barge_ins'])}):")
            for bi in metrics_dict["success_barge_ins"]:
                print(f" User barged in at {bi['user']['start']:.3f}s during agent speech")
                print(f" Agent stopped in {bi['stop_duration_ms']:.1f}ms")

        if metrics_dict["failed_barge_ins"]:
            print(f" Failed ({len(metrics_dict['failed_barge_ins'])}):")
            for bi in metrics_dict["failed_barge_ins"]:
                print(f" User barged in at {bi['user']['start']:.3f}s during agent speech")
                print(f" Agent took {bi['stop_duration_ms']:.1f}ms to stop (too slow)")

    print(f"{'=' * 80}\n")


def print_bottom_percentile_utterances(all_metrics, percentile=5):
    """Print utterances in the bottom percentile for barge-in accuracy and turn-taking recall."""
    import numpy as np

    if not all_metrics:
        print("No metrics to analyze.")
        return

    print(f"\n{'#' * 80}")
    print(f"BOTTOM {percentile}% PERCENTILE ANALYSIS")
    print(f"{'#' * 80}\n")

    with_barge_ins = [m for m in all_metrics if m["barge_in_metrics"]["has_barge_ins"]]
    barge_in_rates = [m["barge_in_metrics"]["success_rate"] for m in with_barge_ins]
    tt_recalls = [m["tt_recall"] for m in all_metrics]

    if barge_in_rates:
        barge_in_threshold = np.percentile(barge_in_rates, percentile)
        print(f"Barge-in success rate {percentile}th percentile threshold: {barge_in_threshold:.1f}%")
        print(f" (Based on {len(with_barge_ins)} utterances with barge-ins)\n")
    else:
        barge_in_threshold = None
        print("No utterances with barge-ins detected.\n")

    tt_recall_threshold = np.percentile(tt_recalls, percentile)
    print(f"Turn-taking recall {percentile}th percentile threshold: {tt_recall_threshold:.3f}")
    print(f" (Based on {len(all_metrics)} utterances)\n")

    # Utterances at or below each threshold, worst first.
    worst_barge_in = []
    if barge_in_threshold is not None:
        worst_barge_in = sorted(
            (m for m in with_barge_ins if m["barge_in_metrics"]["success_rate"] <= barge_in_threshold),
            key=lambda m: m["barge_in_metrics"]["success_rate"],
        )

    worst_tt_recall = sorted(
        (m for m in all_metrics if m["tt_recall"] <= tt_recall_threshold),
        key=lambda m: m["tt_recall"],
    )

    if worst_barge_in:
        print(f"\n{'-' * 80}")
        print(f"UTTERANCES WITH LOW BARGE-IN SUCCESS RATE (≤ {barge_in_threshold:.1f}%)")
        print(f"Found {len(worst_barge_in)} utterance(s)")
        print(f"{'-' * 80}")
        for m in worst_barge_in:
            print_detailed_utterance(m)

    if worst_tt_recall:
        print(f"\n{'-' * 80}")
        print(f"UTTERANCES WITH LOW TURN-TAKING RECALL (≤ {tt_recall_threshold:.3f})")
        print(f"Found {len(worst_tt_recall)} utterance(s)")
        print(f"{'-' * 80}")
        for m in worst_tt_recall:
            print_detailed_utterance(m)

    print(f"\n{'#' * 80}")
    print(f"END OF BOTTOM {percentile}% PERCENTILE ANALYSIS")
    print(f"{'#' * 80}\n")
def print_metrics(metrics_dict, verbose=False):
    """Print all evaluation metrics for a conversation.

    No-op unless `verbose` is True.  Emits turn-taking metrics, barge-in
    statistics, backchanneling failures, and (always, given the early return)
    a chronological segment listing with per-user-turn response latencies and
    per-barge-in timing details.
    """
    # Quiet mode: emit nothing at all.
    if not verbose:
        return

    # Collect the barge-in summary lines for the numbered report below.
    barge_in_stats = []
    if metrics_dict["barge_in_metrics"]["has_barge_ins"]:
        barge_in_stats.append(
            f" - Barge-in success rate: {metrics_dict['barge_in_metrics']['success_rate']:.1f}% ({metrics_dict['barge_in_metrics']['success_count']}/{metrics_dict['barge_in_metrics']['total_count']})"
        )
        if "avg_latency_ms" in metrics_dict["barge_in_metrics"]:
            barge_in_stats.append(
                f" - Average barge-in latency: {metrics_dict['barge_in_metrics']['avg_latency_ms']:.1f} ms"
            )
    else:
        barge_in_stats.append(" - No barge-ins detected")

    segment_info = ""
    # NOTE(review): this inner check is always True -- the function already
    # returned above when verbose is False.  Left untouched here; consider
    # removing the dead condition in a follow-up.
    if verbose:
        user_transcripts = metrics_dict.get("user_transcripts", {})
        agent_transcripts = metrics_dict.get("agent_transcripts", {})

        # Merge user and agent segments into one chronological timeline.
        all_segments = []
        for seg in metrics_dict["user_segments"]:
            all_segments.append({"type": "User", "start": seg["start"], "end": seg["end"]})
        for seg in metrics_dict["agent_segments"]:
            all_segments.append({"type": "Agent", "start": seg["start"], "end": seg["end"]})

        all_segments.sort(key=lambda x: x["start"])

        # For each user turn, record the latency to the next agent segment
        # that starts at or after the user turn's end (if any).
        user_to_agent_latencies = {}
        for user_seg in metrics_dict["user_segments"]:
            next_agent = None
            min_latency = float("inf")
            for agent_seg in metrics_dict["agent_segments"]:
                if agent_seg["start"] >= user_seg["end"]:
                    latency = agent_seg["start"] - user_seg["end"]
                    if latency < min_latency:
                        min_latency = latency
                        next_agent = agent_seg

            if next_agent:
                user_to_agent_latencies[(user_seg["start"], user_seg["end"])] = min_latency

        def format_segment(seg):
            # One colorized line per segment; user lines carry the response
            # latency (yellow), agent lines an estimated spoken duration.
            seg_key = (seg["start"], seg["end"])
            transcript = ""

            if seg["type"] == "User":
                if seg_key in user_transcripts:
                    transcript = f" ({user_transcripts[seg_key]})"

                if seg_key in user_to_agent_latencies:
                    latency = user_to_agent_latencies[seg_key]
                    return f" \033[94m{seg['type']:5s}\033[0m [{seg['start']:6.3f}s - {seg['end']:6.3f}s], \033[93m{latency:.3f}s\033[0m{transcript}"
                else:
                    return (
                        f" \033[94m{seg['type']:5s}\033[0m [{seg['start']:6.3f}s - {seg['end']:6.3f}s]{transcript}"
                    )
            else:
                if seg_key in agent_transcripts:
                    cleaned_text = remove_special_symbols(agent_transcripts[seg_key])
                    words = cleaned_text.split()
                    # Rough speaking-rate heuristic: 0.3 s per word.
                    estimate_sec_per_word = 0.3
                    estimated_duration = len(words) * estimate_sec_per_word
                    transcript = f" ({cleaned_text}) [\033[95mest. {estimated_duration:.2f}s\033[0m]"
                return f" \033[92m{seg['type']:5s}\033[0m [{seg['start']:6.3f}s - {seg['end']:6.3f}s]{transcript}"

        segments_str = "\n".join(format_segment(seg) for seg in all_segments)

        # Per-event barge-in detail lines (user/agent spans + stop duration).
        barge_in_segments = []
        if metrics_dict["barge_in_metrics"]["has_barge_ins"]:
            if metrics_dict["success_barge_ins"]:
                barge_in_segments.append(" Successful barge-ins:")
                for bi in metrics_dict["success_barge_ins"]:
                    barge_in_segments.append(f" User: [{bi['user']['start']:.3f}s - {bi['user']['end']:.3f}s]")
                    barge_in_segments.append(f" Agent: [{bi['agent']['start']:.3f}s - {bi['agent']['end']:.3f}s]")
                    barge_in_segments.append(f" Stop duration: {bi['stop_duration_ms']:.3f} ms")

            if metrics_dict["failed_barge_ins"]:
                barge_in_segments.append(" Failed barge-ins:")
                for bi in metrics_dict["failed_barge_ins"]:
                    barge_in_segments.append(f" User: [{bi['user']['start']:.3f}s - {bi['user']['end']:.3f}s]")
                    barge_in_segments.append(f" Agent: [{bi['agent']['start']:.3f}s - {bi['agent']['end']:.3f}s]")
                    barge_in_segments.append(f" Stop duration: {bi['stop_duration_ms']:.3f} ms")

        # chr(10) is "\n" -- f-string expressions cannot contain backslashes
        # on older Python versions, hence the workaround.
        segment_info = f"""
4. Speech segments (chronological order):
{segments_str}
5. Barge-in details:
{chr(10).join(barge_in_segments) if barge_in_segments else " No barge-ins detected"}"""

    output = f"""
Evaluation metrics for conversation {metrics_dict["item_id"]}:
1. Turn-taking metrics:
 - Average latency: {metrics_dict["tt_latency"]:.3f} seconds
 - Precision: {metrics_dict["tt_precision"]:.3f}
 - Recall: {metrics_dict["tt_recall"]:.3f}
 - F1: {metrics_dict["tt_f1"]:.3f}
2. Barge-in statistics:
{chr(10).join(barge_in_stats)}
3. Backchanneling failures: {metrics_dict["bc_failure"]}{segment_info}
{"-" * 50}"""

    print(output)
Backchanneling failures: {metrics_dict["bc_failure"]}{segment_info} +{"-" * 50}""" + + print(output) + + +def main(args): + print(f"Loading results from: {args.results_dir}") + # Load entries, preferring cached results unless force_recompute is set + entries = load_results_jsonl(args.results_dir, prefer_cached=not args.force_recompute) + audio_dir = get_audio_dir(args.results_dir) + print(f"Loaded {len(entries)} entries") + print(f"Audio directory: {audio_dir}") + + # Initialize models (may not be needed if all entries have cached data) + vad_model, get_speech_timestamps = init_vad_model() + + asr_model = None + if not args.disable_transcription: + asr_model = init_asr_model(args.asr_model_name) + + # Metrics accumulators + count = 0 + all_tt_latencies = [] + all_tt_precisions = [] + all_tt_recalls = [] + all_tt_f1s = [] + all_barge_in_success_rates = [] + all_barge_in_latencies = [] + all_bc_accuracies = [] + all_metrics_dicts = [] + + # Speech quality metrics accumulators + all_user_speech_wer = [] + all_agent_speech_wer = [] + all_agent_speech_cer = [] + all_hallucination_rates = [] + all_out_of_bounds_ratios = [] + + # For per-sample results output + per_sample_results = [] + + for idx, entry in enumerate(tqdm(entries, desc="Processing entries")): + item_id = get_item_id(entry) + audio_path = get_audio_path(entry, audio_dir=audio_dir) + generation_text = get_generation_text(entry) + + # Check for cached segmentation and transcription data (unless force_recompute) + cached_segmentation, cached_transcription = get_cached_eval_data(entry) + use_cached = not args.force_recompute and cached_segmentation is not None and cached_transcription is not None + + # Get audio filename for logging + audio_filename = os.path.basename(audio_path) if audio_path else "unknown" + + if use_cached: + print(f"\nProcessing: {item_id}") + print(f" Audio: {audio_filename}") + print(" Using cached segmentation/transcription") + # Extract segments from cached data + user_segments = 
cached_segmentation.get("user_segments", []) + agent_segments = cached_segmentation.get("agent_segments", []) + use_vad_for_agent = cached_segmentation.get("used_audio_segmentation_for_agent", False) + + # Extract transcripts from cached data + user_transcripts = parse_cached_transcripts(cached_transcription.get("user", {})) + agent_transcripts = parse_cached_transcripts(cached_transcription.get("agent", {})) + + print(f" Cached: {len(user_segments)} user segments, {len(agent_segments)} agent segments") + + # Get original entry if this is a wrapper format + original_entry = entry.get("original_entry", entry) + debug_info = None # Will be retrieved later if needed + else: + if not audio_path or not os.path.exists(audio_path): + print(f"Skipping {item_id}: audio file not found at {audio_path}") + continue + + print(f"\nProcessing: {item_id}") + print(f" Audio: {audio_filename} ({audio_path})") + + # Load stereo audio: channel 0 = user, channel 1 = agent + audio, audio_sr = torchaudio.load(audio_path) + if audio.shape[0] < 2: + print(f" Warning: Expected stereo audio, got {audio.shape[0]} channel(s). 
Skipping.") + continue + + user_audio = audio[0:1, :] + agent_audio = audio[1:2, :] + + # Resample to 16kHz for VAD + user_audio_16k = torchaudio.functional.resample(user_audio, audio_sr, 16000) + agent_audio_16k = torchaudio.functional.resample(agent_audio, audio_sr, 16000) + + # Get estimated audio duration from debug_info for timestamp parsing + debug_info = get_debug_info(entry) + total_frames = debug_info.get("total_frames", 0) if debug_info else 0 + estimated_audio_duration = total_frames * FRAME_SIZE_SEC if total_frames > 0 else None + + # Get agent segments from timestamps or VAD fallback + agent_segments = [] + use_vad_for_agent = args.use_audio_segmentation + + if not use_vad_for_agent and generation_text: + agent_segments = parse_timestamped_text(generation_text, audio_duration=estimated_audio_duration) + if agent_segments: + print(f" Parsed {len(agent_segments)} agent segments from timestamps") + + if not agent_segments: + if args.use_audio_segmentation: + print(" Using VAD for agent segmentation (--use_audio_segmentation)") + else: + print(" No timestamps found, using VAD for agent segmentation") + agent_vad_results = get_speech_timestamps( + agent_audio_16k.to("cuda"), + vad_model, + sampling_rate=16000, + min_silence_duration_ms=args.vad_min_silence_duration_ms, + ) + agent_segments = [{"start": s["start"] / 16000, "end": s["end"] / 16000} for s in agent_vad_results] + use_vad_for_agent = True + print(f" VAD detected {len(agent_segments)} agent segments") + + # Get user segments via VAD + user_vad_results = get_speech_timestamps( + user_audio_16k.to("cuda"), + vad_model, + sampling_rate=16000, + min_silence_duration_ms=args.vad_min_silence_duration_ms, + ) + user_segments = [{"start": s["start"] / 16000, "end": s["end"] / 16000} for s in user_vad_results] + print(f" VAD detected {len(user_segments)} user segments") + + # Initialize transcripts (will be filled below if not cached) + user_transcripts = {} + agent_transcripts = {} + original_entry = 
entry + + # Get frame_alignment for speech quality metrics (debug_info already retrieved earlier) + if not debug_info: + debug_info = get_debug_info(entry) + frame_alignment = debug_info.get("frame_alignment", {}) if debug_info else {} + + # Get audio duration - from audio file or estimate from segments + audio_duration = 0.0 + if not use_cached and audio_path and os.path.exists(audio_path): + audio_info = torchaudio.info(audio_path) + audio_duration = audio_info.num_frames / audio_info.sample_rate + else: + # Estimate from segments + all_ends = [s["end"] for s in user_segments + agent_segments] + if all_ends: + audio_duration = max(all_ends) + + # Compute metrics + tt_metrics = compute_turn_taking_metrics( + agent_segments, + user_segments, + args.tt_latency_threshold_sec, + args.tt_precision_buffer_sec, + args.tt_recall_buffer_sec, + ) + tt_latency = tt_metrics["avg_latency"] + + success_barge_ins, failed_barge_ins = find_user_barge_ins( + user_segments, agent_segments, args.barge_in_threshold_sec + ) + barge_in_metrics = compute_barge_in_metrics(success_barge_ins, failed_barge_ins) + + end_time = args.end_time + if end_time is not None: + bc_failure = is_stopped_by_backchannel(agent_segments, end_time, args.barge_in_threshold_sec) + else: + bc_failure = [] + + # Store metrics + all_tt_latencies.append(tt_latency) + all_tt_precisions.append(tt_metrics["precision"]) + all_tt_recalls.append(tt_metrics["recall"]) + all_tt_f1s.append(tt_metrics["f1"]) + + if barge_in_metrics["has_barge_ins"]: + all_barge_in_success_rates.append(barge_in_metrics["success_rate"]) + if "avg_latency_ms" in barge_in_metrics: + all_barge_in_latencies.append(barge_in_metrics["avg_latency_ms"]) + + bc_accuracy = sum(1 for x in bc_failure if not x) / len(bc_failure) if bc_failure else 0 + all_bc_accuracies.append(bc_accuracy) + + # Transcriptions - skip if using cached data + if not use_cached and asr_model is not None: + # Always transcribe agent segments with ASR to get actual TTS output + 
# (The reference from frame_alignment is what model intended to say, + # the ASR transcription is what was actually spoken by TTS) + print(f" Transcribing {len(agent_segments)} agent segments with ASR...") + for seg in agent_segments: + transcript, _ = transcribe_segment(agent_audio, seg["start"], seg["end"], audio_sr, asr_model) + agent_transcripts[(seg["start"], seg["end"])] = transcript + + # Transcribe user segments with ASR + print(f" Transcribing {len(user_segments)} user segments...") + for seg in user_segments: + transcript, _ = transcribe_segment(user_audio, seg["start"], seg["end"], audio_sr, asr_model) + user_transcripts[(seg["start"], seg["end"])] = transcript + + # Compute speech quality metrics if frame_alignment is available + user_speech_metrics = compute_user_speech_wer( + user_segments, frame_alignment, user_transcripts, audio_duration, args.segment_buffer_sec + ) + agent_speech_metrics = compute_agent_speech_quality( + agent_segments, frame_alignment, agent_transcripts, args.segment_buffer_sec + ) + hallucination_metrics = compute_tts_hallucinations( + agent_segments, frame_alignment, agent_transcripts, args.segment_buffer_sec + ) + token_balance_metrics = compute_token_balance(frame_alignment) + + # Accumulate speech quality metrics + if user_speech_metrics.get("total_wer") is not None: + all_user_speech_wer.append(user_speech_metrics["total_wer"]) + if user_speech_metrics.get("out_of_bounds_word_ratio") is not None: + all_out_of_bounds_ratios.append(user_speech_metrics["out_of_bounds_word_ratio"]) + if agent_speech_metrics.get("total_wer") is not None: + all_agent_speech_wer.append(agent_speech_metrics["total_wer"]) + if agent_speech_metrics.get("total_cer") is not None: + all_agent_speech_cer.append(agent_speech_metrics["total_cer"]) + if hallucination_metrics.get("hallucination_rate") is not None: + all_hallucination_rates.append(hallucination_metrics["hallucination_rate"]) + + metrics_dict = { + "item_id": item_id, + "tt_latency": 
tt_latency, + "tt_accuracy": tt_metrics["f1"], + "tt_precision": tt_metrics["precision"], + "tt_recall": tt_metrics["recall"], + "tt_f1": tt_metrics["f1"], + "barge_in_metrics": barge_in_metrics, + "bc_failure": bc_failure, + "user_segments": user_segments, + "agent_segments": agent_segments, + "success_barge_ins": success_barge_ins, + "failed_barge_ins": failed_barge_ins, + "user_transcripts": user_transcripts, + "agent_transcripts": agent_transcripts, + "user_speech_metrics": user_speech_metrics, + "agent_speech_metrics": agent_speech_metrics, + "hallucination_metrics": hallucination_metrics, + "token_balance_metrics": token_balance_metrics, + } + + all_metrics_dicts.append(metrics_dict) + print_metrics(metrics_dict, verbose=args.verbose) + count += 1 + + # Build per-sample result for output + if args.save_per_sample_results: + # Convert tuple keys to string for JSON serialization + user_transcripts_json = {f"{k[0]:.3f}-{k[1]:.3f}": v for k, v in user_transcripts.items()} + agent_transcripts_json = {f"{k[0]:.3f}-{k[1]:.3f}": v for k, v in agent_transcripts.items()} + + sample_result = { + "original_entry": original_entry, + "eval_metrics": { + "turn_taking": { + "latency": tt_latency, + "precision": tt_metrics["precision"], + "recall": tt_metrics["recall"], + "f1": tt_metrics["f1"], + }, + "barge_in": { + "has_barge_ins": barge_in_metrics["has_barge_ins"], + "success_rate": barge_in_metrics.get("success_rate", 0), + "success_count": barge_in_metrics.get("success_count", 0), + "total_count": barge_in_metrics.get("total_count", 0), + "avg_latency_ms": barge_in_metrics.get("avg_latency_ms", None), + }, + "backchanneling": {"failures": bc_failure}, + "user_speech": { + "total_wer": user_speech_metrics.get("total_wer"), + "total_ref_words": user_speech_metrics.get("total_ref_words", 0), + "total_substitutions": user_speech_metrics.get("total_substitutions", 0), + "total_insertions": user_speech_metrics.get("total_insertions", 0), + "total_deletions": 
user_speech_metrics.get("total_deletions", 0), + "per_segment": user_speech_metrics.get("per_segment", []), + "out_of_bounds_words": user_speech_metrics.get("out_of_bounds_words", []), + "out_of_bounds_word_count": user_speech_metrics.get("out_of_bounds_word_count", 0), + "out_of_bounds_word_ratio": user_speech_metrics.get("out_of_bounds_word_ratio"), + }, + "agent_speech": { + "total_wer": agent_speech_metrics.get("total_wer"), + "total_cer": agent_speech_metrics.get("total_cer"), + "total_ref_words": agent_speech_metrics.get("total_ref_words", 0), + "total_ref_chars": agent_speech_metrics.get("total_ref_chars", 0), + "total_word_substitutions": agent_speech_metrics.get("total_word_substitutions", 0), + "total_word_insertions": agent_speech_metrics.get("total_word_insertions", 0), + "total_word_deletions": agent_speech_metrics.get("total_word_deletions", 0), + "total_char_substitutions": agent_speech_metrics.get("total_char_substitutions", 0), + "total_char_insertions": agent_speech_metrics.get("total_char_insertions", 0), + "total_char_deletions": agent_speech_metrics.get("total_char_deletions", 0), + "per_segment": agent_speech_metrics.get("per_segment", []), + "truncation_events": agent_speech_metrics.get("truncation_events", 0), + "truncated_words": agent_speech_metrics.get("truncated_words", []), + }, + "tts_hallucinations": { + "hallucination_rate": hallucination_metrics.get("hallucination_rate"), + "total_hallucinated": hallucination_metrics.get("total_hallucinated", 0), + "total_agent_words": hallucination_metrics.get("total_agent_words", 0), + "hallucinations": hallucination_metrics.get("hallucinations", []), + "per_segment": hallucination_metrics.get("per_segment", []), + }, + "token_balance": { + "agent_bos_count": token_balance_metrics.get("agent_bos_count", 0), + "agent_eos_count": token_balance_metrics.get("agent_eos_count", 0), + "agent_balance": token_balance_metrics.get("agent_balance", 0.0), + "user_bos_count": 
token_balance_metrics.get("user_bos_count", 0), + "user_eos_count": token_balance_metrics.get("user_eos_count", 0), + "user_balance": token_balance_metrics.get("user_balance", 0.0), + }, + }, + "segmentation": { + "user_segments": [{"start": s["start"], "end": s["end"]} for s in user_segments], + "agent_segments": [{"start": s["start"], "end": s["end"]} for s in agent_segments], + "used_audio_segmentation_for_agent": use_vad_for_agent, + }, + "transcription": {"user": user_transcripts_json, "agent": agent_transcripts_json}, + } + per_sample_results.append(sample_result) + + # Compute and print average metrics + _valid_tt_latencies = [x for x in all_tt_latencies if x != INF_LATENCY] + avg_metrics = { + "avg_tt_latency": sum(_valid_tt_latencies) / len(_valid_tt_latencies) if _valid_tt_latencies else 0, + "avg_tt_precision": sum(all_tt_precisions) / len(all_tt_precisions) * 100 if all_tt_precisions else 0, + "avg_tt_recall": sum(all_tt_recalls) / len(all_tt_recalls) * 100 if all_tt_recalls else 0, + "avg_tt_f1": sum(all_tt_f1s) / len(all_tt_f1s) * 100 if all_tt_f1s else 0, + "avg_barge_in_success_rate": sum(all_barge_in_success_rates) / len(all_barge_in_success_rates) + if all_barge_in_success_rates + else 0, + "avg_barge_in_latency": sum(all_barge_in_latencies) / len(all_barge_in_latencies) + if all_barge_in_latencies + else 0, + "avg_bc_accuracy": sum(all_bc_accuracies) / len(all_bc_accuracies) * 100 if all_bc_accuracies else 0, + "num_audios_evaluated": count, + # Speech quality metrics + "avg_user_speech_wer": sum(all_user_speech_wer) / len(all_user_speech_wer) * 100 + if all_user_speech_wer + else None, + "avg_out_of_bounds_ratio": sum(all_out_of_bounds_ratios) / len(all_out_of_bounds_ratios) + if all_out_of_bounds_ratios + else None, + "avg_agent_speech_wer": sum(all_agent_speech_wer) / len(all_agent_speech_wer) * 100 + if all_agent_speech_wer + else None, + "avg_agent_speech_cer": sum(all_agent_speech_cer) / len(all_agent_speech_cer) * 100 + if 
all_agent_speech_cer + else None, + "avg_hallucination_rate": sum(all_hallucination_rates) / len(all_hallucination_rates) * 100 + if all_hallucination_rates + else None, + } + + # Format optional metrics for display + user_wer_str = ( + f"{avg_metrics['avg_user_speech_wer']:.1f}%" if avg_metrics["avg_user_speech_wer"] is not None else "N/A" + ) + out_bounds_str = ( + f"{avg_metrics['avg_out_of_bounds_ratio']:.3f}" + if avg_metrics["avg_out_of_bounds_ratio"] is not None + else "N/A" + ) + agent_wer_str = ( + f"{avg_metrics['avg_agent_speech_wer']:.1f}%" if avg_metrics["avg_agent_speech_wer"] is not None else "N/A" + ) + agent_cer_str = ( + f"{avg_metrics['avg_agent_speech_cer']:.1f}%" if avg_metrics["avg_agent_speech_cer"] is not None else "N/A" + ) + halluc_str = ( + f"{avg_metrics['avg_hallucination_rate']:.1f}%" if avg_metrics["avg_hallucination_rate"] is not None else "N/A" + ) + + avg_metrics_str = f""" +{"=" * 50} +Average Metrics: +1. Turn-taking: + - Average latency: {avg_metrics["avg_tt_latency"] * 1000:.1f} ms + - Precision: {avg_metrics["avg_tt_precision"]:.1f}% + - Recall: {avg_metrics["avg_tt_recall"]:.1f}% + - F1: {avg_metrics["avg_tt_f1"]:.1f}% +2. User barge-in: + - Average success rate: {avg_metrics["avg_barge_in_success_rate"]:.1f}% + - Average latency: {avg_metrics["avg_barge_in_latency"]:.1f} ms +3. Back-channeling: + - Average accuracy: {avg_metrics["avg_bc_accuracy"]:.1f}% +4. User speech quality: + - Average WER: {user_wer_str} + - Out-of-bounds word ratio: {out_bounds_str} words/sec +5. Agent speech quality: + - Average WER: {agent_wer_str} + - Average CER: {agent_cer_str} + - TTS hallucination rate: {halluc_str} +6. 
Number of audios evaluated: {avg_metrics["num_audios_evaluated"]} +{"=" * 50}""" + + print(avg_metrics_str) + + if args.show_bottom_percentile: + print_bottom_percentile_utterances(all_metrics_dicts, percentile=args.percentile_threshold) + + # Save dataset-level metrics to JSON file (in results_dir) + if args.output_file: + # If not an absolute path, save in results_dir + if not os.path.isabs(args.output_file): + output_file_path = os.path.join(args.results_dir, args.output_file) + else: + output_file_path = args.output_file + + dataset_metrics = { + "dataset_metrics": { + "turn_taking": { + "avg_latency_ms": avg_metrics["avg_tt_latency"] * 1000, + "avg_precision": avg_metrics["avg_tt_precision"], + "avg_recall": avg_metrics["avg_tt_recall"], + "avg_f1": avg_metrics["avg_tt_f1"], + }, + "barge_in": { + "avg_success_rate": avg_metrics["avg_barge_in_success_rate"], + "avg_latency_ms": avg_metrics["avg_barge_in_latency"], + }, + "backchanneling": {"avg_accuracy": avg_metrics["avg_bc_accuracy"]}, + "user_speech": { + "avg_wer": avg_metrics["avg_user_speech_wer"], + "out_of_bounds_word_ratio": avg_metrics["avg_out_of_bounds_ratio"], + }, + "agent_speech": { + "avg_wer": avg_metrics["avg_agent_speech_wer"], + "avg_cer": avg_metrics["avg_agent_speech_cer"], + "hallucination_rate": avg_metrics["avg_hallucination_rate"], + }, + "num_samples_evaluated": avg_metrics["num_audios_evaluated"], + }, + "args": vars(args), + } + with open(output_file_path, "w") as f: + json.dump(dataset_metrics, f, indent=2) + print(f"\nDataset metrics saved to: {output_file_path}") + + # Save per-sample results to JSONL + if args.save_per_sample_results and per_sample_results: + output_jsonl_path = os.path.join(args.results_dir, "output_with_eval.jsonl") + with open(output_jsonl_path, "w") as f: + for result in per_sample_results: + f.write(json.dumps(result) + "\n") + print(f"Per-sample results saved to: {output_jsonl_path}") + + # Generate LLM judge input if requested + if 
def format_conversation_for_llm_judge(result: dict, use_full_agent: bool) -> str:
    """Render one sample's transcript as chronological "[Role]: text" lines.

    Agent turns are matched (within 100 ms of their start time) against the
    per-segment agent-speech metrics; when ``use_full_agent`` is True the
    agent's intended (reference) text is used, otherwise the actually sounded
    (hypothesis) text. Unmatched agent turns fall back to the raw transcript.
    """
    transcripts = result.get("transcription", {})
    segments = result.get("eval_metrics", {}).get("agent_speech", {}).get("per_segment", [])

    # start time -> (full/reference text, sounded/hypothesis text)
    seg_texts = {
        seg.get("start", 0): (seg.get("reference", ""), seg.get("hypothesis", ""))
        for seg in segments
    }

    def _start_of(key: str) -> float:
        # Keys look like "12.345-17.890"; the part before the dash is the start.
        return float(key.split("-")[0])

    turns = []
    for key, text in transcripts.get("user", {}).items():
        turns.append((_start_of(key), "User", text))

    for key, text in transcripts.get("agent", {}).items():
        begin = _start_of(key)
        # First agent segment whose start lies within 0.1 s of this turn.
        match = next(
            (texts for seg_start, texts in seg_texts.items() if abs(seg_start - begin) < 0.1),
            None,
        )
        chosen = None
        if match is not None:
            chosen = match[0] if use_full_agent else match[1]
        # Empty matched text also falls back to the raw transcript.
        turns.append((begin, "Agent", chosen or text))

    turns.sort(key=lambda t: t[0])
    return "\n".join(f"[{role}]: {text}" for _, role, text in turns)
def generate_llm_judge_input(per_sample_results: list, results_dir: str):
    """Write llm_judge_input.jsonl for nemo-skills LLM-judge generation.

    For every per-sample result two prompts are emitted: a "full" variant
    judging the agent's intended text (ignoring barge-in truncation) and a
    "sounded" variant judging what was actually played back.

    Args:
        per_sample_results: entries produced by the scoring loop; each holds
            "original_entry", "transcription" and "eval_metrics".
        results_dir: directory where llm_judge_input.jsonl is written.
    """
    import hashlib  # local import: only needed for the fallback item id

    output_entries = []

    for result in per_sample_results:
        original = result.get("original_entry", result)
        audio_path = original.get("audio_path", "")
        if audio_path:
            item_id = os.path.basename(audio_path)
        else:
            # Fall back to a *stable* digest of the entry. The previous
            # str(hash(...))[:8] depended on PYTHONHASHSEED (str hashes are
            # randomized per process), so item_ids changed between runs and
            # broke joins against cached judge output.
            digest = hashlib.md5(json.dumps(original, sort_keys=True).encode("utf-8")).hexdigest()
            item_id = digest[:8]

        # One judge prompt per evaluation flavor: "full" ignores barge-in
        # truncation, "sounded" reflects what the user actually heard.
        for subset, use_full in (("full", True), ("sounded", False)):
            conversation = format_conversation_for_llm_judge(result, use_full_agent=use_full)
            prompt = LLM_JUDGE_PROMPT_TEMPLATE.format(conversation=conversation)
            output_entries.append(
                {
                    "item_id": f"{item_id}_{subset}",
                    "category": "open",  # Required for AudioMetrics judge evaluation
                    "subset_for_metrics": subset,  # Group metrics by eval type
                    "messages": [{"role": "user", "content": prompt}],
                }
            )

    output_path = os.path.join(results_dir, "llm_judge_input.jsonl")
    with open(output_path, "w") as f:
        for entry in output_entries:
            f.write(json.dumps(entry) + "\n")

    print(f"LLM judge input saved to: {output_path} ({len(output_entries)} prompts)")
"none" else parse_float_list(x), + default=None, + help="End time of backchanneling. Format: '[1,10,15,20]' or '1,10,15,20' or None", + ) + parser.add_argument("--verbose", action="store_true", default=True, help="Print detailed segment information") + parser.add_argument( + "--vad_min_silence_duration_ms", + type=int, + default=1500, + help="Minimum silence duration in milliseconds for VAD", + ) + parser.add_argument( + "--segment_buffer_sec", + type=float, + default=0.5, + help="Buffer in seconds to extend segment boundaries for WER calculation and out-of-bounds detection", + ) + parser.add_argument( + "--disable_transcription", + action="store_true", + default=False, + help="Disable ASR transcription of user segments (enabled by default)", + ) + parser.add_argument( + "--asr_model_name", + type=str, + default="nvidia/parakeet-tdt-0.6b-v2", + help="Name of the ASR model for user transcription", + ) + parser.add_argument( + "--show_bottom_percentile", + action="store_true", + default=True, + help="Show analysis of utterances in the bottom percentile", + ) + parser.add_argument( + "--percentile_threshold", + type=float, + default=5.0, + help="Percentile threshold for identifying low-quality utterances", + ) + parser.add_argument( + "--use_audio_segmentation", + action="store_true", + default=False, + help="Use VAD+ASR for both channels instead of text timestamps. 
" + "This ignores <|t|> markers and derives all timing from audio analysis.", + ) + parser.add_argument( + "--output_file", + type=str, + default="metrics.json", + help="Filename for dataset-level metrics JSON (saved in results_dir)", + ) + parser.add_argument( + "--save_per_sample_results", + action="store_true", + default=False, + help="Save per-sample segmentation, ASR, and metrics to output_with_eval.jsonl", + ) + parser.add_argument( + "--force_recompute", + action="store_true", + default=False, + help="Force recompute segmentation and ASR even if cached results exist", + ) + parser.add_argument( + "--generate_llm_judge_input", + action="store_true", + default=False, + help="Generate llm_judge_input.jsonl for LLM-as-judge evaluation", + ) + return parser.parse_args() + + +if __name__ == "__main__": + args = parse_args() + main(args) diff --git a/nemo_skills/dataset/s2s_demo/scripts/run_s2s_demo_eval.py b/nemo_skills/dataset/s2s_demo/scripts/run_s2s_demo_eval.py new file mode 100644 index 0000000000..235c2c8128 --- /dev/null +++ b/nemo_skills/dataset/s2s_demo/scripts/run_s2s_demo_eval.py @@ -0,0 +1,238 @@ +""" +Run S2S demo evaluation: generation + conversation behavior scoring + LLM judge. 
def load_config(config_path: str) -> dict:
    """Load the YAML evaluation config into a plain dict."""
    with open(config_path, "r") as f:
        return yaml.safe_load(f)


def build_score_command(config: dict, benchmark: str, generate_llm_judge_input: bool = False) -> str:
    """Assemble the shell command running eval_conversation_behavior_v2.py.

    Numeric thresholds come from the optional ``scoring`` section of the
    config (falling back to the script defaults); boolean switches are
    appended only when enabled.
    """
    scoring = config.get("scoring", {})
    results_dir = f"{config['output_dir']}/eval-results/{benchmark}"
    script = "nemo_skills/dataset/s2s_demo/scripts/eval_conversation_behavior_v2.py"

    # Value-carrying options, in the order the command line expects them.
    numeric_options = [
        ("barge_in_threshold_sec", 1.5),
        ("tt_latency_threshold_sec", 1.5),
        ("tt_precision_buffer_sec", 0.5),
        ("tt_recall_buffer_sec", 0.5),
        ("vad_min_silence_duration_ms", 1500),
        ("segment_buffer_sec", 0.5),
    ]
    parts = [f"python {script}", f"--results_dir {results_dir}"]
    parts += [f"--{name} {scoring.get(name, default)}" for name, default in numeric_options]
    parts.append("--output_file metrics.json")

    # Boolean switches: appended only when truthy. verbose defaults to on.
    switches = [
        ("--verbose", scoring.get("verbose", True)),
        ("--disable_transcription", scoring.get("disable_transcription", False)),
        ("--save_per_sample_results", scoring.get("save_per_sample_results", False)),
        ("--force_recompute", scoring.get("force_recompute", False)),
        ("--generate_llm_judge_input", generate_llm_judge_input),
    ]
    parts += [flag for flag, enabled in switches if enabled]

    return " ".join(parts)
cmd_args.append("--disable_transcription") + if scoring.get("save_per_sample_results", False): + cmd_args.append("--save_per_sample_results") + if scoring.get("force_recompute", False): + cmd_args.append("--force_recompute") + if generate_llm_judge_input: + cmd_args.append("--generate_llm_judge_input") + + return " ".join(cmd_args) + + +def build_aggregate_command(config: dict, benchmark: str, llm_judge_output_dir: str) -> str: + """Build the aggregation command for aggregate_llm_judge.py.""" + eval_results_dir = f"{config['output_dir']}/eval-results/{benchmark}" + script_path = "nemo_skills/dataset/s2s_demo/scripts/aggregate_llm_judge.py" + + return ( + f"python {script_path} --results_dir {eval_results_dir} --llm_judge_output {llm_judge_output_dir}/output.jsonl" + ) + + +def run_s2s_demo_eval(config: dict): + """Run S2S demo evaluation pipeline.""" + benchmark = config.get("benchmark", "s2s_demo.demo_20251124") + expname = config.get("expname", "s2s_demo_eval") + dry_run = config.get("dry_run", False) + generation_only = config.get("generation_only", False) + scoring_only = config.get("scoring_only", False) + llm_judge_only = config.get("llm_judge_only", False) + + eval_results_path = f"{config['output_dir']}/eval-results/{benchmark}" + llm_judge_output_dir = f"{eval_results_path}/llm_judge" + + print(f"{'=' * 60}") + print("S2S Demo Evaluation") + print(f"{'=' * 60}") + print(f"Benchmark: {benchmark}") + print(f"Output: {config['output_dir']}") + + # Build extra args for hydra overrides + extra_args = [] + if config.get("max_samples"): + extra_args.append(f"++max_samples={config['max_samples']}") + if config.get("server_server_type"): + extra_args.append(f"++server.server_type={config['server_server_type']}") + extra_args_str = " ".join(extra_args) + + # Skip to LLM judge if llm_judge_only + if llm_judge_only: + scoring_only = True # Skip generation + + # Generation phase + if not scoring_only: + print("\n--- Running generation ---") + nemo_eval( + 
ctx=wrap_arguments(extra_args_str), + cluster=config["cluster"], + output_dir=config["output_dir"], + data_dir=config.get("data_dir"), + benchmarks=benchmark, + model=config["model"], + server_type=config.get("server_type", "vllm"), + server_gpus=config.get("server_gpus", 1), + server_nodes=config.get("server_nodes", 1), + server_args=config.get("server_args", ""), + server_entrypoint=config.get("server_entrypoint"), + server_container=config.get("server_container"), + partition=config.get("partition"), + num_chunks=config.get("num_chunks", 1), + expname=expname, + auto_summarize_results=False, + dry_run=dry_run, + ) + + # Scoring phase (with LLM judge input generation) + if not generation_only and not llm_judge_only: + print("\n--- Running scoring ---") + run_llm_judge = config.get("llm_judge", {}).get("enabled", False) + score_command = build_score_command(config, benchmark, generate_llm_judge_input=run_llm_judge) + + run_cmd( + ctx=wrap_arguments(""), + cluster=config["cluster"], + command=score_command, + container=config.get("scoring_container"), + partition=config.get("scoring_partition") or config.get("partition"), + num_gpus=config.get("scoring_gpus", 1), # VAD/ASR needs GPU + run_after=[expname] if not scoring_only else None, + expname=f"{expname}_score", + log_dir=f"{eval_results_path}/scoring-logs", + dry_run=dry_run, + ) + + # LLM Judge phase + llm_judge_config = config.get("llm_judge", {}) + if llm_judge_config.get("enabled", False) and not generation_only: + print("\n--- Running LLM judge ---") + llm_judge_input = f"{eval_results_path}/llm_judge_input.jsonl" + + # API base URL for LLM judge + base_url = llm_judge_config.get("base_url", "https://inference-api.nvidia.com/v1") + + from nemo_skills.pipeline.cli import generate + + generate( + ctx=wrap_arguments("++prompt_format=openai"), + cluster=config["cluster"], + output_dir=llm_judge_output_dir, + input_file=llm_judge_input, + model=llm_judge_config.get("model", "meta/llama-3.1-8b-instruct"), + 
server_address=base_url, # Pass API URL directly + server_type=llm_judge_config.get("server_type", "openai"), + server_gpus=0, # No GPU needed for API calls + partition=llm_judge_config.get("partition", "cpu"), # Use CPU partition for API calls + run_after=[f"{expname}_score"] if not llm_judge_only else None, + expname=f"{expname}_llm_judge", + log_dir=f"{eval_results_path}/llm-judge-logs", + dry_run=dry_run, + ) + + # Aggregation phase + print("\n--- Running LLM judge aggregation ---") + aggregate_command = build_aggregate_command(config, benchmark, llm_judge_output_dir) + + run_cmd( + ctx=wrap_arguments(""), + cluster=config["cluster"], + command=aggregate_command, + container=config.get("scoring_container"), + partition="cpu", # Use CPU partition for aggregation + num_gpus=0, + run_after=[f"{expname}_llm_judge"], + expname=f"{expname}_aggregate", + log_dir=f"{eval_results_path}/aggregate-logs", + dry_run=dry_run, + ) + + print(f"\n{'=' * 60}") + print("Done!") + print(f"{'=' * 60}") + + +def main(): + parser = argparse.ArgumentParser(description="S2S demo evaluation") + parser.add_argument("--config", required=True, help="Path to YAML config file") + + # CLI overrides + parser.add_argument("--cluster", help="Override cluster") + parser.add_argument("--partition", help="Override partition") + parser.add_argument("--model", help="Override model path") + parser.add_argument("--output_dir", help="Override output directory") + parser.add_argument("--benchmark", help="Override benchmark") + parser.add_argument("--max_samples", type=int, help="Override max_samples") + parser.add_argument("--num_chunks", type=int, help="Override num_chunks") + parser.add_argument("--dry_run", action="store_true", help="Print commands without executing") + parser.add_argument("--generation_only", action="store_true", help="Only run generation") + parser.add_argument("--scoring_only", action="store_true", help="Only run scoring") + parser.add_argument( + "--llm_judge_only", 
action="store_true", help="Only run LLM judge (skip generation and scoring)" + ) + + args = parser.parse_args() + config = load_config(args.config) + + # Apply CLI overrides + override_keys = ["cluster", "partition", "model", "output_dir", "benchmark", "max_samples", "num_chunks"] + for key in override_keys: + if getattr(args, key, None) is not None: + config[key] = getattr(args, key) + + if args.dry_run: + config["dry_run"] = True + if args.generation_only: + config["generation_only"] = True + if args.scoring_only: + config["scoring_only"] = True + if args.llm_judge_only: + config["llm_judge_only"] = True + + run_s2s_demo_eval(config) + + +if __name__ == "__main__": + main() diff --git a/nemo_skills/dataset/s2s_demo/scripts/s2s_demo_eval_config.yaml b/nemo_skills/dataset/s2s_demo/scripts/s2s_demo_eval_config.yaml new file mode 100644 index 0000000000..5ae04bf32b --- /dev/null +++ b/nemo_skills/dataset/s2s_demo/scripts/s2s_demo_eval_config.yaml @@ -0,0 +1,66 @@ +# S2S Demo Evaluation Config +# Usage: python run_s2s_demo_eval.py --config s2s_demo_eval_config.yaml + +# Cluster settings +cluster: oci_iad +partition: batch_block1,batch_block3,batch_block4 + +# Model and server +# model: /lustre/fsw/portfolios/convai/users/ecasanova/Checkpoints/Nemotron-VoiceChat-november/duplex-eartts-fp32-stt-22-november_stt_v2_fp32 +model: /lustre/fsw/portfolios/convai/users/ecasanova/Checkpoints/Nemotron-VoiceChat-november/duplex-eartts-2mim_sw_et_eos_dp_eos_dup_fp32-stt-3-december_stt_edresson_model_R_digits_norm_eip_0.1_EA_model_step_9005 +server_type: vllm +server_gpus: 1 +server_entrypoint: "-m nemo_skills.inference.server.serve_unified" +server_args: >- + --backend s2s_incremental + --config_path /lustre/fsw/portfolios/convai/users/ecasanova/S2S-Duplex-new-codebase/scripts/configs/inference/nanov2_demo_model_eartts_updated.yaml + --speaker_reference 
/lustre/fsw/portfolios/convai/users/ecasanova/S2S-full-duplex/inference_references/Emma_S3_A1_SC7_singleturntarget_21_channel_1_audio_in.wav + --ignore_system_prompt + --num_frames_per_inference 2 + --silence_padding_sec 0.0 + --output_frame_alignment + --session_artifacts_dir /lustre/fsw/portfolios/llmservice/users/vmendelev/tmp/s2s_demo_eval/artifacts + --code_path /lustre/fsw/portfolios/convai/users/kevinhu/S2S-Duplex-new-codebase/branches/NeMo-release_not_rebased +server_container: /lustre/fsw/portfolios/convai/users/ecasanova/docker_images/nemo_duplex_november_eartts.sqsh + +# Hydra overrides for generation +server_server_type: vllm_multimodal + +# Data and output +data_dir: /lustre/fsw/portfolios/llmservice/users/vmendelev/experiments/voicebench_test/data_dir +benchmark: s2s_demo.demo_20251124 +output_dir: /lustre/fsw/portfolios/llmservice/users/vmendelev/tmp/s2s_demo_eval_t4_c3 + +# Job settings +expname: s2s_demo_eval +num_chunks: 8 +# max_samples: 2 # Uncomment to limit samples for testing + +# Pipeline control (can also be set via CLI flags) +# generation_only: false # Only run generation, skip scoring +# scoring_only: false # Only run scoring, skip generation +# llm_judge_only: false # Only run LLM judge, skip generation and scoring + +# Scoring settings +# scoring_container: gitlab-master.nvidia.com/pzelasko/nemo_containers:25.04-pytorch2.7-28may25 +scoring_container: /lustre/fsw/portfolios/llmservice/users/pzelasko/containers/nemo-25.04-pytorch2.7-28may25.sqsh +scoring_partition: batch_block1,batch_block3,batch_block4 + +scoring: + barge_in_threshold_sec: 1.5 + tt_latency_threshold_sec: 1.5 + tt_precision_buffer_sec: 0.5 + tt_recall_buffer_sec: 0.5 + vad_min_silence_duration_ms: 1500 + verbose: true + disable_transcription: false + save_per_sample_results: true # Saves segmentation/ASR to output_with_eval.jsonl + force_recompute: false # Recompute segmentation/ASR even if cached + segment_buffer_sec: 1 + +# LLM Judge settings +llm_judge: + enabled: 
true + model: us/azure/openai/gpt-4.1 + server_type: openai + base_url: https://inference-api.nvidia.com/v1 diff --git a/nemo_skills/dataset/s2s_demo/scripts/s2s_demo_offline_config.yaml b/nemo_skills/dataset/s2s_demo/scripts/s2s_demo_offline_config.yaml new file mode 100644 index 0000000000..a50e17bb8b --- /dev/null +++ b/nemo_skills/dataset/s2s_demo/scripts/s2s_demo_offline_config.yaml @@ -0,0 +1,69 @@ +# S2S Demo Evaluation Config with offline backend +# Based on: /lustre/fsw/portfolios/llmservice/users/kevinhu/s2s/NeMo/scripts/training/iad/s2s/sdv2_hf/conv/nano_9b/inf/infer_nano9b_s2s.sh +# Usage: python run_s2s_demo_eval.py --config s2s_demo_offline_config.yaml + +# Cluster settings +cluster: oci_iad +partition: batch_block1,batch_block3,batch_block4 + +# Model checkpoint - latest IAD nano9b model (Jan 2026) +# STT checkpoint in HuggingFace format +model: /lustre/fsw/portfolios/llmservice/users/apasad/projects/nemo_s2s_merged_dec/exp_SFT-nano9b/IAD_nano9b_parakeet600m_from_PT_32k_64gpu_5e-5_PT0.7_SFT0.05_QA0.02_TEXT0.1_loss0.5_MCQ0.03_ASR0.01_sysp0.03_NoiseProb0.5_SNR-30-60_asr_dtc2_dst15_loss_text5.0_bos10.0_eos5.0_pad1.0_eosplacementsfix_nospecaug_ei0.1_ot8_TN_all_data_v3.2_ir/checkpoints_hf_step-12556-last.ckpt + +server_type: vllm +server_gpus: 1 + +# Use serve_unified with S2S offline backend +server_entrypoint: "-m nemo_skills.inference.server.serve_unified" +server_args: >- + --backend s2s + --config_path /lustre/fsw/portfolios/llmservice/users/kevinhu/projects/duplex-stt/config/inference/infer_nano_eartts_Fullduplexbench_Jan22_2026.yaml + --speaker_reference /lustre/fsw/portfolios/convai/users/ecasanova/S2S-full-duplex/inference_references/Emma_S3_A1_SC7_singleturntarget_21_channel_1_audio_in.wav + --tts_ckpt_path /lustre/fsw/portfolios/convai/users/ecasanova/Checkpoints/Duplex_EARTTS/eartts_rvq_cont_task_nanov2_tts_pretraining_wordlist_duplex_data_new_branch_4nodes_duplex_eartts_2_delay_4rd_stage_fp32_wd_1500_et_eos_dp_eos_dup_step_10004.ckpt + 
--extra_decoding_seconds 0 + --dtype float32 + --code_path /lustre/fsw/portfolios/convai/users/kevinhu/S2S-Duplex-new-codebase/branches/NeMo-release_not_rebased + +# Container with NeMo and S2S support +server_container: /lustre/fsw/portfolios/convai/users/ecasanova/docker_images/nemo_duplex_november_eartts.sqsh + +# Hydra overrides for generation +server_server_type: vllm_multimodal + +# Data and output +data_dir: /lustre/fsw/portfolios/llmservice/users/vmendelev/experiments/voicebench_test/data_dir +benchmark: s2s_demo.demo_20251124 +output_dir: /lustre/fsw/portfolios/llmservice/users/vmendelev/experiments/s2s_demo_offline/runs/demo_20251124 + +# Job settings +expname: s2s_demo_offline +num_chunks: 8 +# max_samples: 2 # Uncomment to limit samples for testing + +# Pipeline control (can also be set via CLI flags) +# generation_only: false # Only run generation, skip scoring +# scoring_only: false # Only run scoring, skip generation +# llm_judge_only: false # Only run LLM judge, skip generation and scoring + +# Scoring settings +scoring_container: /lustre/fsw/portfolios/llmservice/users/pzelasko/containers/nemo-25.04-pytorch2.7-28may25.sqsh +scoring_partition: batch_block1,batch_block3,batch_block4 + +scoring: + barge_in_threshold_sec: 1.5 + tt_latency_threshold_sec: 1.5 + tt_precision_buffer_sec: 0.5 + tt_recall_buffer_sec: 0.5 + vad_min_silence_duration_ms: 1500 + verbose: true + disable_transcription: false + save_per_sample_results: true # Saves segmentation/ASR to output_with_eval.jsonl + force_recompute: false # Recompute segmentation/ASR even if cached + segment_buffer_sec: 1 + +# LLM Judge settings +llm_judge: + enabled: true + model: us/azure/openai/gpt-4.1 + server_type: openai + base_url: https://inference-api.nvidia.com/v1 diff --git a/nemo_skills/dataset/voicebench/__init__.py b/nemo_skills/dataset/voicebench/__init__.py new file mode 100644 index 0000000000..0dc4c6d1ea --- /dev/null +++ b/nemo_skills/dataset/voicebench/__init__.py @@ -0,0 +1,35 @@ +# 
# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# VoiceBench - A benchmark for evaluating speech language models
# Source: https://huggingface.co/datasets/lmms-lab/voicebench

# Group label attached to this dataset collection.
DATASET_GROUP = "speechlm"
# NOTE(review): presumably marks this package as an umbrella over the
# per-subtest benchmarks listed below — confirm against the dataset loader.
IS_BENCHMARK_GROUP = True

# All VoiceBench subtests that can be run individually
# (values are empty dicts; per-subtest settings appear to live in each
# subtest's own generated __init__.py — see prepare.py's INIT_TEMPLATE)
BENCHMARKS = {
    "voicebench.bbh": {},
    "voicebench.alpacaeval": {},
    "voicebench.alpacaeval_full": {},
    "voicebench.alpacaeval_speaker": {},
    "voicebench.ifeval": {},
    "voicebench.openbookqa": {},
    "voicebench.advbench": {},
    "voicebench.commoneval": {},
    "voicebench.wildvoice": {},
    "voicebench.mtbench": {},
    "voicebench.mmsu": {},
    "voicebench.sd_qa": {},
}
# Subtest configurations with their split types and evaluation configs
#
# Per-subtest keys:
#   splits        — HF dataset splits to download; when there is more than one
#                   split, all are combined and each entry records its split in
#                   "subset_for_metrics" (see format_entry).
#   has_reference — entries carry a "reference" answer copied to expected_answer.
#   metrics_type / eval_args — substituted into INIT_TEMPLATE when generating
#                   each subtest's __init__.py.
#   extra_fields  — entry fields copied through for the evaluator.
#   multi_turn    — entries use a "turns" list (and audio1/audio2 clips)
#                   instead of a single "prompt"/"audio".
#   hf_name       — HuggingFace subset name when it differs from ours
#                   (presumably used at load time — confirm in the loader).
SUBTESTS = {
    # Simple subtests with "test" split
    "bbh": {
        "splits": ["test"],
        "has_reference": True,
        "metrics_type": "exact_match",
        "eval_args": "++eval_type=exact_match",
        "extra_fields": ["id"],  # Required for BBH evaluator to determine task type
    },
    "alpacaeval": {
        "splits": ["test"],
        "has_reference": False,
        "metrics_type": "llm_judge",
        "eval_args": "",
    },
    "alpacaeval_full": {
        "splits": ["test"],
        "has_reference": False,
        "metrics_type": "llm_judge",
        "eval_args": "",
    },
    "ifeval": {
        "splits": ["test"],
        "has_reference": False,
        # Use math metric (permissive) - VoiceBench official scoring handles actual ifeval
        "metrics_type": "math",
        "eval_args": "",
        "extra_fields": ["key", "instruction_id_list", "kwargs"],
    },
    "openbookqa": {
        "splits": ["test"],
        "has_reference": True,
        "metrics_type": "multichoice",
        "eval_args": "++eval_type=multichoice",
    },
    "advbench": {
        "splits": ["test"],
        "has_reference": False,
        "metrics_type": "llm_judge",
        "eval_args": "",
    },
    "commoneval": {
        "splits": ["test"],
        "has_reference": False,
        "metrics_type": "llm_judge",
        "eval_args": "",
    },
    "wildvoice": {
        "splits": ["test"],
        "has_reference": False,
        "metrics_type": "llm_judge",
        "eval_args": "",
    },
    # Multi-turn subtest
    "mtbench": {
        "splits": ["test"],
        "has_reference": True,
        "metrics_type": "llm_judge",
        "eval_args": "",
        "multi_turn": True,
        "extra_fields": ["question_id", "category", "turns"],
    },
    # Multi-split subtests (combine all splits into one)
    "mmsu": {
        "splits": [
            "law",
            "engineering",
            "other",
            "biology",
            "business",
            "economics",
            "health",
            "philosophy",
            "psychology",
            "history",
            "chemistry",
            "physics",
        ],
        "has_reference": True,
        "metrics_type": "multichoice",
        "eval_args": "++eval_type=multichoice",
        "extra_fields": ["question_id", "cot_content", "category", "src"],
    },
    "sd_qa": {
        "hf_name": "sd-qa",  # HF uses hyphen
        "splits": ["aus", "gbr", "ind_n", "ind_s", "irl", "kenya", "nga", "nzl", "phl", "usa", "zaf"],
        "has_reference": True,
        "metrics_type": "exact_match",
        "eval_args": "++eval_type=exact_match",
    },
    "alpacaeval_speaker": {
        # Split names encode TTS voice + speed/pitch/volume parameters.
        "splits": [
            "en_AU_Wavenet_A_1.0_0.0_0.0",
            "en_AU_Wavenet_B_1.0_0.0_0.0",
            "en_IN_Wavenet_A_1.0_0.0_0.0",
            "en_IN_Wavenet_B_1.0_0.0_0.0",
            "en_GB_Wavenet_A_1.0_0.0_0.0",
            "en_GB_Wavenet_B_1.0_0.0_0.0",
            "en_US_Wavenet_A_1.0_0.0_0.0",
            "en_US_Wavenet_C_1.0_0.0_0.0",
            "en_US_Wavenet_A_1.5_0.0_0.0",
            "en_US_Wavenet_A_2.0_0.0_0.0",
            "en_US_Wavenet_A_0.5_0.0_0.0",
        ],
        "has_reference": False,
        "metrics_type": "llm_judge",
        "eval_args": "",
    },
}
+ +METRICS_TYPE = "{metrics_type}" +GENERATION_ARGS = "++prompt_format=openai" +{eval_args} +""" + + +def save_audio(audio_data, audio_path): + """Save audio data to a WAV file.""" + audio_path.parent.mkdir(parents=True, exist_ok=True) + sf.write(str(audio_path), audio_data["array"], audio_data["sampling_rate"]) + + +def format_entry(entry, subtest_name, config, audio_dir, entry_idx, split_name=None, no_audio=False): + """Format a single entry for nemo-skills with OpenAI messages format. + + Creates three message variants in a single entry: + - messages: audio only (for speech-only evaluation) + - messages_text_audio: both text and audio + - messages_text: text only (for text-only comparison) + """ + # Get prompt text - MTBench uses 'turns' list instead of 'prompt' + if config.get("multi_turn") and "turns" in entry: + prompt_text = entry["turns"][0] # First turn is the prompt + else: + prompt_text = entry["prompt"] + + formatted = { + "problem": prompt_text, + } + + # IFEval requires "prompt" field for Google's evaluator + if subtest_name == "ifeval": + formatted["prompt"] = prompt_text + + # Add expected answer if available + if config.get("has_reference") and "reference" in entry: + formatted["expected_answer"] = entry["reference"] + + # Add extra fields if specified + for field in config.get("extra_fields", []): + if field in entry: + formatted[field] = entry[field] + + # Add subset_for_metrics for multi-split datasets + if split_name and len(config["splits"]) > 1: + formatted["subset_for_metrics"] = split_name + + # System message - use MCQ-specific prompt for multiple-choice subtests + MCQ_SUBTESTS = {"mmsu", "openbookqa", "bbh"} + if subtest_name in MCQ_SUBTESTS: + system_content = "Answer the following multiple choice question with an explanation for the answer." + else: + system_content = "You are a helpful assistant." 
+ system_message = {"role": "system", "content": system_content} + + # Text content (already extracted as prompt_text above) + content = prompt_text + + # Preserve turns for multi-turn datasets + if config.get("multi_turn") and "turns" in entry: + formatted["turns"] = entry["turns"] + + # Handle audio - save files and get audio info + audio_info = None + if not no_audio: + if config.get("multi_turn"): + # MTBench has audio1 and audio2 for two turns + audios = [] + for i, audio_key in enumerate(["audio1", "audio2"], 1): + if audio_key in entry and entry[audio_key] is not None: + audio_id = f"{subtest_name}_{entry_idx}_turn{i}" + audio_path = audio_dir / f"{audio_id}.wav" + save_audio(entry[audio_key], audio_path) + audios.append({"path": f"voicebench/data/{audio_id}.wav"}) + formatted[f"audio_path_{i}"] = f"data/{audio_id}.wav" + if audios: + audio_info = {"audios": audios} + else: + # Single audio + if "audio" in entry and entry["audio"] is not None: + audio_id = f"{subtest_name}_{entry_idx}" + if split_name: + audio_id = f"{subtest_name}_{split_name}_{entry_idx}" + audio_path = audio_dir / f"{audio_id}.wav" + save_audio(entry["audio"], audio_path) + audio_info = {"audio": {"path": f"voicebench/data/{audio_id}.wav"}} + formatted["audio_path"] = f"data/{audio_id}.wav" + + # Create three message variants: + + # 1. messages: audio only (empty content, with audio) + user_message_audio = {"role": "user", "content": ""} + if audio_info: + user_message_audio.update(audio_info) + formatted["messages"] = [system_message.copy(), user_message_audio] + + # 2. messages_text_audio: both text and audio + user_message_text_audio = {"role": "user", "content": content} + if audio_info: + user_message_text_audio.update(audio_info) + formatted["messages_text_audio"] = [system_message.copy(), user_message_text_audio] + + # 3. 
messages_text: text only (no audio) + user_message_text = {"role": "user", "content": content} + formatted["messages_text"] = [system_message.copy(), user_message_text] + + return formatted + + +def create_subtest_init(subtest_dir, config): + """Create __init__.py for a subtest directory.""" + eval_args_line = f'EVAL_ARGS = "{config["eval_args"]}"' if config["eval_args"] else "" + content = INIT_TEMPLATE.format( + metrics_type=config["metrics_type"], + eval_args=eval_args_line, + ) + with open(subtest_dir / "__init__.py", "w") as f: + f.write(content) + + +def process_subtest(subtest_name, config, data_dir, audio_dir, no_audio=False): + """Process a single subtest and save to JSONL. + + Each entry contains three message variants: + - messages: audio only (for speech-only evaluation) + - messages_text_audio: both text and audio + - messages_text: text only (for text-only comparison) + """ + hf_name = config.get("hf_name", subtest_name) + subtest_dir = data_dir / subtest_name + subtest_dir.mkdir(parents=True, exist_ok=True) + + output_file = subtest_dir / "test.jsonl" + entries = [] + entry_idx = 0 + + print(f"Processing {subtest_name}...") + + for split in tqdm(config["splits"], desc=" Loading splits"): + try: + dataset = load_dataset("lmms-lab/voicebench", hf_name, split=split, trust_remote_code=True) + for entry in dataset: + formatted = format_entry( + entry, + subtest_name, + config, + audio_dir, + entry_idx, + split_name=split if len(config["splits"]) > 1 else None, + no_audio=no_audio, + ) + entries.append(formatted) + entry_idx += 1 + except Exception as e: + print(f" Warning: Failed to load {subtest_name}/{split}: {e}") + + # Write JSONL + with open(output_file, "w", encoding="utf-8") as f: + for entry in entries: + f.write(json.dumps(entry) + "\n") + + # Create __init__.py + create_subtest_init(subtest_dir, config) + + print(f" Wrote {len(entries)} entries to {output_file}") + return len(entries) + + +def main(): + parser = 
argparse.ArgumentParser(description="Prepare VoiceBench dataset for nemo-skills") + parser.add_argument( + "--subtests", + nargs="+", + default=None, + help="Specific subtests to process (default: all)", + ) + parser.add_argument( + "--no-audio", + action="store_true", + help="Skip downloading and processing audio files", + ) + args = parser.parse_args() + + data_dir = Path(__file__).parent + audio_dir = data_dir / "data" + audio_dir.mkdir(parents=True, exist_ok=True) + + subtests_to_process = args.subtests if args.subtests else list(SUBTESTS.keys()) + + print(f"Processing {len(subtests_to_process)} subtests...") + if args.no_audio: + print("Skipping audio download (--no-audio)") + + total_entries = 0 + for subtest_name in subtests_to_process: + if subtest_name not in SUBTESTS: + print(f"Warning: Unknown subtest '{subtest_name}', skipping") + continue + + config = SUBTESTS[subtest_name] + count = process_subtest(subtest_name, config, data_dir, audio_dir, no_audio=args.no_audio) + total_entries += count + + print(f"\nDone! 
Processed {total_entries} total entries across {len(subtests_to_process)} subtests.")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/nemo_skills/dataset/voicebench/scripts/S2S_VOICECHAT_BACKEND.md b/nemo_skills/dataset/voicebench/scripts/S2S_VOICECHAT_BACKEND.md
new file mode 100644
index 0000000000..95caa1c8f4
--- /dev/null
+++ b/nemo_skills/dataset/voicebench/scripts/S2S_VOICECHAT_BACKEND.md
@@ -0,0 +1,125 @@
+# `s2s_voicechat` backend (NemotronVoiceChat offline inference)
+
+## What this backend is
+
+`s2s_voicechat` is a unified-server backend that mirrors the behavior of NeMo’s `nemotron_voicechat_infer.py` style offline inference:
+
+- Loads a **single OmegaConf YAML** via `--config_path`
+- Applies **script-like overrides** (S2S ckpt, TTS ckpt, speaker reference, boosts, extra decoding)
+- Instantiates `NemotronVoiceChat` from the resolved config
+- Runs `offline_inference(...)` for each request (supports **batched** inference)
+
+Default output is **text-only**. Audio output can be enabled with `--decode_audio`.
+
+Backend implementation: `recipes/multimodal/server/backends/s2s_voicechat_infer_backend.py`
+
+## What we changed recently (VoiceBench S2S eval)
+
+- **Sound-enabled full evaluation config**:
+  - `nemo_skills/dataset/voicebench/scripts/voicebench_s2s_voicechat_offline_sound_config.yaml`
+  - Adds `--decode_audio` and runs the full VoiceBench pipeline on all subsets (48 chunks).
+- **Server batching knob**:
+  - We pass `--batch_size 4` to `serve_unified` in the sound config to reduce request batching pressure.
+- **4-stage scoring output** (single `metrics.json` per subset):
+  - Stage 1: generation → `output.jsonl` (+ audio saved under `eval-results/voicebench.<subtest>/audio/`)
+  - Stage 2: agent-audio ASR/WER/CER → `output_asr.jsonl` + `agent_audio_metrics.json`
+  - Stage 3: VoiceBench scoring on generated text → `metrics.json` keys like `gpt`, `panda`, `acc`, `final`, ...
+ - Stage 4: VoiceBench scoring on ASR text → same `metrics.json`, but with `*_asr` suffix (e.g. `gpt_asr`, `panda_asr`, `acc_asr`, `final_asr`) + +## How it differs from `s2s_backend` + +`s2s_backend` (`--backend s2s`) is also NemotronVoiceChat-based, but it is more “nemo-skills shaped” and can build/patch a minimal config. + +Key differences: + +- **Config loading** + - `s2s_voicechat`: requires `--config_path` and uses it as the source of truth (closest to Kevin’s recipe). + - `s2s_backend`: can run with a minimal generated config; optional `config_path` is converted to a Python dict and then patched. + +- **Audio output** + - `s2s_voicechat`: supports **AUDIO_OUT** and can return audio when `--decode_audio` is set. + - `s2s_backend`: text output only (its `decode_audio` is `False` and not exposed as a server flag). + +- **Artifact saving** + - `s2s_voicechat`: optional per-request artifacts when `--save_artifacts --output_dir ...` are set. + - Writes under: `/artifacts///` + - Files: `input.wav`, `output.json`, and (if audio enabled) `output.wav` + - `s2s_backend`: no artifact writer. + +- **TTS override semantics (important)** + - `s2s_voicechat` matches NeMo `DuplexEARTTS` semantics: + - If `--tts_ckpt_path` is a **file** (e.g. `.ckpt`), it sets `model.speech_generation.model.pretrained_model` + - If `--tts_ckpt_path` is a **directory** (exported model), it sets `model.speech_generation.model.pretrained_tts_model` + - `s2s_backend` always sets `pretrained_model` when `tts_ckpt_path` is provided. 
+ +## CLI / server usage + +`s2s_voicechat` is served via `nemo_skills.inference.server.serve_unified`: + +```bash +python -m nemo_skills.inference.server.serve_unified \ + --model /path/to/s2s_stt_ckpt.ckpt \ + --num_gpus 1 \ + --port 8000 \ + --backend s2s_voicechat \ + --batch_size 4 \ + --config_path /path/to/nemotron_voicechat_omegaconf.yaml \ + --code_path /path/to/NeMo/source/tree \ + --speaker_reference /path/to/speaker_ref.wav \ + --tts_ckpt_path /path/to/tts.ckpt \ + --extra_decoding_seconds 20 \ + --dtype float32 \ + --output_dir /path/to/output_root \ + --save_artifacts + +# Add `--decode_audio` to enable audio output. +``` + +Notes: + +- `--code_path` must point at a NeMo tree that contains `nemo.collections.speechlm2...` with `NemotronVoiceChat`. +- `--model` is used as `model.stt.model.pretrained_s2s_model` (script-like override). +- `--decode_audio` is **off by default** (text-only). + +## VoiceBench configs in this repo + +- **Full VoiceBench (all subsets, 48 chunks)**: + - `nemo_skills/dataset/voicebench/scripts/voicebench_s2s_voicechat_offline_config.yaml` + - Text-only by default (the config comment shows how to enable audio). 
+ +- **Full VoiceBench (all subsets, 48 chunks, with audio + 4-stage scoring)**: + - `nemo_skills/dataset/voicebench/scripts/voicebench_s2s_voicechat_offline_sound_config.yaml` + +- **Smoke test (sd_qa, 10 samples, with audio)**: + - `nemo_skills/dataset/voicebench/scripts/voicebench_s2s_voicechat_sdqa_smoke10_sound.yaml` + +## Commands to run VoiceBench evaluation + +From repo root (ensure venv is active): + +```bash +source .venv/bin/activate + +# If you hit permission issues writing logs under $HOME, set HOME to a writable dir: +export HOME="$PWD/.tmp_home" +mkdir -p "$HOME" + +python nemo_skills/dataset/voicebench/scripts/generate_from_api_and_score_official.py \ + --config nemo_skills/dataset/voicebench/scripts/voicebench_s2s_voicechat_offline_config.yaml \ + --output_dir /lustre/.../runs/voicebench_$(date +%Y%m%d_%H%M%S) +``` + +## Exact commands we ran (copy/paste) + +Smoke test (sd_qa, 10 samples, audio enabled): + +```bash +cd /home/vmendelev/workspace/expressiveness/src/nemo-skills-s2s-eval && bash -lc 'set -euo pipefail; source .venv/bin/activate; export HOME="/home/vmendelev/workspace/expressiveness/src/nemo-skills-s2s-eval/.tmp_home"; mkdir -p "$HOME"; NEMO_SKILLS_DISABLE_UNCOMMITTED_CHANGES_CHECK=1 python nemo_skills/dataset/voicebench/scripts/generate_from_api_and_score_official.py --config nemo_skills/dataset/voicebench/scripts/voicebench_s2s_voicechat_sdqa_smoke10_sound.yaml --output_dir /lustre/fsw/portfolios/llmservice/users/vmendelev/experiments/voicebench_s2s_voicechat_smoke/runs/sdqa_smoke10_sound_postcommit_20260205_034004' +``` + +Full run (all VoiceBench subsets, 48 chunks, audio enabled, batch size 4): + +```bash +cd /home/vmendelev/workspace/expressiveness/src/nemo-skills-s2s-eval && bash -lc 'set -euo pipefail; source .venv/bin/activate; export HOME="/home/vmendelev/workspace/expressiveness/src/nemo-skills-s2s-eval/.tmp_home"; mkdir -p "$HOME"; NEMO_SKILLS_DISABLE_UNCOMMITTED_CHANGES_CHECK=1 python 
nemo_skills/dataset/voicebench/scripts/generate_from_api_and_score_official.py --config nemo_skills/dataset/voicebench/scripts/voicebench_s2s_voicechat_offline_sound_config.yaml --output_dir /lustre/fsw/portfolios/llmservice/users/vmendelev/experiments/voicebench_s2s_voicechat_offline/runs/voicebench_sound_bs4_20260205_140917' +``` + diff --git a/nemo_skills/dataset/voicebench/scripts/VOICEBENCH_S2S_EVAL.md b/nemo_skills/dataset/voicebench/scripts/VOICEBENCH_S2S_EVAL.md new file mode 100644 index 0000000000..6f090b43ef --- /dev/null +++ b/nemo_skills/dataset/voicebench/scripts/VOICEBENCH_S2S_EVAL.md @@ -0,0 +1,62 @@ +# VoiceBench S2S Offline Evaluation + +## Quick Start + +Run full VoiceBench evaluation with S2S offline backend: + +```bash +cd /home/vmendelev/workspace/expressiveness/src/nemo-skills-s2s-eval +source .venv/bin/activate +NEMO_SKILLS_DISABLE_UNCOMMITTED_CHANGES_CHECK=1 python nemo_skills/dataset/voicebench/scripts/generate_from_api_and_score_official.py \ + --config nemo_skills/dataset/voicebench/scripts/voicebench_s2s_offline_config.yaml +``` + +## Backend Overview + +The `s2s` backend (`recipes/multimodal/server/backends/s2s_backend.py`) provides offline Speech-to-Speech inference: + +- **Model**: NemotronVoiceChat (STT + LLM + TTS combined) +- **Input**: Audio files (WAV) +- **Output**: Text response (audio is controlled by backend; `s2s_voicechat` supports `--decode_audio`) +- **Batching**: Server request batcher supports `--batch_size` (configured in `serve_unified.py`) +- **Key parameter**: `extra_decoding_seconds=20` - additional audio generation time per sample + +## Configuration + +Main config: `voicebench_s2s_offline_config.yaml` + +Key settings: +- `cluster`: oci_iad +- `num_chunks`: 32-48 (parallel jobs) +- `server_args`: Backend-specific parameters including model paths, speaker reference, TTS checkpoint +- `subtests`: bbh, alpacaeval, ifeval, openbookqa, advbench, commoneval, wildvoice, mmsu, sd_qa, alpacaeval_speaker + +## Retry Failed 
Chunks
+
+If chunks fail (usually due to port collision), create a retry config pointing to the same `output_dir`:
+
+```yaml
+output_dir: /lustre/.../runs/voicebench_YYYYMMDD_HHMMSS  # Same as original
+subtests:
+  - <failed_subtest_name>
+```
+
+NeMo Skills automatically skips completed chunks and only runs missing ones.
+
+## Check Progress
+
+```bash
+ssh draco-oci-login-02.draco-oci-iad.nvidia.com '
+BASE="/lustre/fsw/portfolios/llmservice/users/vmendelev/experiments/voicebench_s2s_offline/runs/<run_name>/eval-results"
+for subtest in bbh alpacaeval ifeval openbookqa advbench commoneval wildvoice mmsu sd_qa alpacaeval_speaker; do
+  dir="$BASE/voicebench.$subtest"
+  [ -f "$dir/metrics.json" ] && echo "$subtest: SCORED" && cat "$dir/metrics.json"
+done
+'
+```
+
+## Known Issues
+
+1. **Port collision**: Jobs may fail with "address already in use" when multiple jobs land on same node. Solution: retry failed chunks.
+2. **Slow throughput**: ~0.4 samples/min per job due to `extra_decoding_seconds=20` (audio generation is near-real-time).
+3. **Special tokens in output**: Model outputs `<$X.XX$>` and `<|X.XX|>` timing tokens. These are cleaned in `convert_to_voicebench_format.py`.
diff --git a/nemo_skills/dataset/voicebench/scripts/convert_to_voicebench_format.py b/nemo_skills/dataset/voicebench/scripts/convert_to_voicebench_format.py
new file mode 100644
index 0000000000..d4730a98c2
--- /dev/null
+++ b/nemo_skills/dataset/voicebench/scripts/convert_to_voicebench_format.py
@@ -0,0 +1,236 @@
+# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Convert nemo-skills output format to VoiceBench format for official scoring.""" + +import argparse +import json +import re +from pathlib import Path + + +def clean_special_tokens(text: str) -> str: + """Remove special timing/frame tokens from S2S model output. + + The S2S model outputs special tokens like: + - <$X.XX$> - energy/confidence markers + - <|X.XX|> - timing/duration markers + + These should be stripped for clean text output used in evaluation. + """ + if not text: + return text + # Remove <$X.XX$> patterns (energy/confidence) + text = re.sub(r'<\$[\d.]+\$>', '', text) + # Remove <|X.XX|> patterns (timing) + text = re.sub(r'<\|[\d.]+\|>', '', text) + # Clean up extra whitespace + text = re.sub(r'\s+', ' ', text).strip() + return text + +# Mapping from VoiceBench subtests to evaluator types +SUBTEST_TO_EVALUATOR = { + "sd_qa": "qa", + "sd_qa_usa": "qa", # Same evaluator as sd_qa + "alpacaeval": "open", + "alpacaeval_full": "open", + "alpacaeval_speaker": "open", + "commoneval": "open", + "wildvoice": "open", + "mtbench": "open", + "advbench": "harm", + "ifeval": "ifeval", + "openbookqa": "mcq", + "mmsu": "mcq", + "bbh": "bbh", +} + +# Subtests that require GPT judge (api_judge.py) before evaluation +REQUIRES_GPT_JUDGE = { + "sd_qa", + "sd_qa_usa", # Same as sd_qa + "alpacaeval", + "alpacaeval_full", + "alpacaeval_speaker", + "commoneval", + "wildvoice", + "mtbench", +} + + +def infer_bbh_task_type(prompt: str) -> str: + """Infer the BBH task type from the prompt content. 
+ + The BBH evaluator uses the task type to determine how to extract answers. + Task types: navigate, sports_understanding, hyperbaton, web_of_lies + """ + prompt_lower = prompt.lower() + + # Navigate task - directional instructions + if any( + phrase in prompt_lower + for phrase in [ + "turn right", + "turn left", + "turn around", + "take steps", + "take 1 step", + "take 2 steps", + "return to the starting point", + ] + ): + return "navigate" + + # Sports understanding - plausibility of sports sentences + if "is the following sentence plausible" in prompt_lower and any( + sport in prompt_lower + for sport in [ + "touchdown", + "home run", + "corner kick", + "penalty kick", + "free throw", + "slam dunk", + "field goal", + "worked a full count", + "hit a triple", + "scored a goal", + "threw a curveball", + ] + ): + return "sports_understanding" + + # Hyperbaton - adjective order + if "adjective order" in prompt_lower or "which sentence has the correct adjective order" in prompt_lower: + return "hyperbaton" + + # Web of lies - truth-telling puzzles + if "tells the truth" in prompt_lower or "does .* tell the truth" in prompt_lower: + return "web_of_lies" + + # Default fallback - try to infer from Yes/No pattern + if "yes or no" in prompt_lower: + if "is the following sentence plausible" in prompt_lower: + return "sports_understanding" + return "navigate" + + return "unknown" + + +def convert_entry(entry: dict, entry_index: int = 0) -> dict: + """Convert a single nemo-skills entry to VoiceBench format. + + nemo-skills format: + - problem: the prompt/question + - prompt: original prompt (for ifeval, preserved from source) + - generation: model's response + - expected_answer: reference answer (if available) + - Additional fields: id, key, instruction_id_list, kwargs, etc. 
+ + VoiceBench format: + - prompt: the instruction/question + - response: model's output + - reference: expected answer (if available) + - id: identifier (for bbh) + - Additional fields preserved as-is + """ + # Prefer 'prompt' if it exists (e.g., for ifeval), otherwise use 'problem' + prompt_text = entry.get("prompt") or entry.get("problem", "") + # Clean special tokens from generation (S2S model outputs timing markers) + generation = clean_special_tokens(entry.get("generation", "")) + converted = { + "prompt": prompt_text, + "response": generation, + # Default reference to empty string if not present (e.g., for mtbench) + "reference": entry.get("expected_answer") or "", + } + + # Preserve additional fields needed by specific evaluators + # For BBH evaluator - needs 'id' field + if "id" in entry: + converted["id"] = entry["id"] + else: + # Infer task type for BBH if no id present + task_type = infer_bbh_task_type(prompt_text) + if task_type != "unknown": + converted["id"] = f"{task_type}_{entry_index}" + + # For IFEval evaluator - needs instruction_id_list and kwargs + if "instruction_id_list" in entry: + converted["instruction_id_list"] = entry["instruction_id_list"] + if "kwargs" in entry: + converted["kwargs"] = entry["kwargs"] + if "key" in entry: + converted["key"] = entry["key"] + + # Preserve subset_for_metrics if present + if "subset_for_metrics" in entry: + converted["subset_for_metrics"] = entry["subset_for_metrics"] + + return converted + + +def convert_file(input_path: str, output_path: str) -> int: + """Convert a nemo-skills JSONL file to VoiceBench format. + + Returns the number of entries converted. 
+ """ + entries = [] + with open(input_path, "r", encoding="utf-8") as f: + for idx, line in enumerate(f): + if line.strip(): + entry = json.loads(line) + converted = convert_entry(entry, entry_index=idx) + entries.append(converted) + + output_path = Path(output_path) + output_path.parent.mkdir(parents=True, exist_ok=True) + + with open(output_path, "w", encoding="utf-8") as f: + for entry in entries: + f.write(json.dumps(entry) + "\n") + + return len(entries) + + +def get_evaluator_type(subtest: str) -> str: + """Get the VoiceBench evaluator type for a given subtest.""" + return SUBTEST_TO_EVALUATOR.get(subtest, "open") + + +def requires_gpt_judge(subtest: str) -> bool: + """Check if a subtest requires GPT judge scoring before evaluation.""" + return subtest in REQUIRES_GPT_JUDGE + + +def main(): + parser = argparse.ArgumentParser(description="Convert nemo-skills output to VoiceBench format") + parser.add_argument("--input", "-i", required=True, help="Path to input nemo-skills JSONL file") + parser.add_argument("--output", "-o", required=True, help="Path to output VoiceBench JSONL file") + parser.add_argument("--subtest", help="Subtest name (for printing evaluator info)") + args = parser.parse_args() + + count = convert_file(args.input, args.output) + print(f"Converted {count} entries from {args.input} to {args.output}") + + if args.subtest: + evaluator = get_evaluator_type(args.subtest) + needs_judge = requires_gpt_judge(args.subtest) + print(f"Subtest: {args.subtest}") + print(f"Evaluator type: {evaluator}") + print(f"Requires GPT judge: {needs_judge}") + + +if __name__ == "__main__": + main() diff --git a/nemo_skills/dataset/voicebench/scripts/convert_to_voicebench_format_.py b/nemo_skills/dataset/voicebench/scripts/convert_to_voicebench_format_.py new file mode 100644 index 0000000000..436bfbac38 --- /dev/null +++ b/nemo_skills/dataset/voicebench/scripts/convert_to_voicebench_format_.py @@ -0,0 +1,146 @@ +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. 
All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Convert nemo-skills output format to VoiceBench format for official scoring.""" + +import argparse +import json +from pathlib import Path + +# Mapping from VoiceBench subtests to evaluator types +SUBTEST_TO_EVALUATOR = { + "sd_qa": "qa", + "alpacaeval": "open", + "alpacaeval_full": "open", + "alpacaeval_speaker": "open", + "commoneval": "open", + "wildvoice": "open", + "mtbench": "open", + "advbench": "harm", + "ifeval": "ifeval", + "openbookqa": "mcq", + "mmsu": "mcq", + "bbh": "bbh", +} + +# Subtests that require GPT judge (api_judge.py) before evaluation +REQUIRES_GPT_JUDGE = { + "sd_qa", + "alpacaeval", + "alpacaeval_full", + "alpacaeval_speaker", + "commoneval", + "wildvoice", + "mtbench", +} + + +def convert_entry(entry: dict) -> dict: + """Convert a single nemo-skills entry to VoiceBench format. + + nemo-skills format: + - problem: the prompt/question + - prompt: original prompt (for ifeval, preserved from source) + - generation: model's response + - expected_answer: reference answer (if available) + - Additional fields: id, key, instruction_id_list, kwargs, etc. 
+ + VoiceBench format: + - prompt: the instruction/question + - response: model's output + - reference: expected answer (if available) + - id: identifier (for bbh) + - Additional fields preserved as-is + """ + # Prefer 'prompt' if it exists (e.g., for ifeval), otherwise use 'problem' + converted = { + "prompt": entry.get("prompt") or entry.get("problem", ""), + "response": entry.get("generation", ""), + # Default reference to empty string if not present (e.g., for mtbench) + "reference": entry.get("expected_answer") or "", + } + + # Preserve additional fields needed by specific evaluators + # For BBH evaluator - needs 'id' field + if "id" in entry: + converted["id"] = entry["id"] + + # For IFEval evaluator - needs instruction_id_list and kwargs + if "instruction_id_list" in entry: + converted["instruction_id_list"] = entry["instruction_id_list"] + if "kwargs" in entry: + converted["kwargs"] = entry["kwargs"] + if "key" in entry: + converted["key"] = entry["key"] + + # Preserve subset_for_metrics if present + if "subset_for_metrics" in entry: + converted["subset_for_metrics"] = entry["subset_for_metrics"] + + return converted + + +def convert_file(input_path: str, output_path: str) -> int: + """Convert a nemo-skills JSONL file to VoiceBench format. + + Returns the number of entries converted. 
+ """ + entries = [] + with open(input_path, "r", encoding="utf-8") as f: + for line in f: + if line.strip(): + entry = json.loads(line) + converted = convert_entry(entry) + entries.append(converted) + + output_path = Path(output_path) + output_path.parent.mkdir(parents=True, exist_ok=True) + + with open(output_path, "w", encoding="utf-8") as f: + for entry in entries: + f.write(json.dumps(entry) + "\n") + + return len(entries) + + +def get_evaluator_type(subtest: str) -> str: + """Get the VoiceBench evaluator type for a given subtest.""" + return SUBTEST_TO_EVALUATOR.get(subtest, "open") + + +def requires_gpt_judge(subtest: str) -> bool: + """Check if a subtest requires GPT judge scoring before evaluation.""" + return subtest in REQUIRES_GPT_JUDGE + + +def main(): + parser = argparse.ArgumentParser(description="Convert nemo-skills output to VoiceBench format") + parser.add_argument("--input", "-i", required=True, help="Path to input nemo-skills JSONL file") + parser.add_argument("--output", "-o", required=True, help="Path to output VoiceBench JSONL file") + parser.add_argument("--subtest", help="Subtest name (for printing evaluator info)") + args = parser.parse_args() + + count = convert_file(args.input, args.output) + print(f"Converted {count} entries from {args.input} to {args.output}") + + if args.subtest: + evaluator = get_evaluator_type(args.subtest) + needs_judge = requires_gpt_judge(args.subtest) + print(f"Subtest: {args.subtest}") + print(f"Evaluator type: {evaluator}") + print(f"Requires GPT judge: {needs_judge}") + + +if __name__ == "__main__": + main() diff --git a/nemo_skills/dataset/voicebench/scripts/generate_from_api_and_score_official.py b/nemo_skills/dataset/voicebench/scripts/generate_from_api_and_score_official.py new file mode 100644 index 0000000000..d213b5663f --- /dev/null +++ b/nemo_skills/dataset/voicebench/scripts/generate_from_api_and_score_official.py @@ -0,0 +1,291 @@ +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. 
# All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
Generate VoiceBench responses using nemo-skills and score with official VoiceBench package.

Usage:
    python generate_from_api_and_score_official.py --config voicebench_eval_config.yaml
"""

import argparse
from pathlib import Path

import yaml
from convert_to_voicebench_format import REQUIRES_GPT_JUDGE, SUBTEST_TO_EVALUATOR

from nemo_skills.pipeline.cli import eval as nemo_eval
from nemo_skills.pipeline.cli import run_cmd, wrap_arguments

# Complete set of VoiceBench subtests supported by this pipeline; any subtest
# requested in the config that is not listed here is silently dropped.
ALL_SUBTESTS = [
    "advbench",
    "alpacaeval",
    "alpacaeval_full",
    "alpacaeval_speaker",
    "bbh",
    "commoneval",
    "ifeval",
    "mmsu",
    "mtbench",
    "openbookqa",
    "sd_qa",
    "sd_qa_usa",
    "wildvoice",
]


def load_config(config_path: str) -> dict:
    """Load a YAML config file into a plain dict."""
    with open(config_path, "r") as f:
        return yaml.safe_load(f)


def build_score_command(config: dict, subtest: str) -> str:
    """Build the scoring command to run via run_cmd.

    Uses run_voicebench_scoring.py to create output compatible with nemo-skills:
    - summarized-results/ directory with logs
    - metrics.json with evaluation results
    """
    eval_results_dir = f"{config['output_dir']}/eval-results/voicebench.{subtest}"
    # Default to the "open" evaluator when the subtest has no explicit mapping.
    evaluator = SUBTEST_TO_EVALUATOR.get(subtest, "open")
    needs_judge = subtest in REQUIRES_GPT_JUDGE
    voicebench_repo = config["voicebench_repo_path"]
    scoring_script = "nemo_skills/dataset/voicebench/scripts/run_voicebench_scoring.py"

    cmd_args = [
        f"python {scoring_script}",
        f"--eval_results_dir {eval_results_dir}",
        f"--voicebench_repo {voicebench_repo}",
        f"--subtest {subtest}",
        f"--evaluator {evaluator}",
    ]

    if needs_judge:
        cmd_args.append("--needs_judge")
    if config.get("api_type"):
        cmd_args.append(f"--api_type {config['api_type']}")
    if config.get("nvidia_model"):
        cmd_args.append(f"--nvidia_model {config['nvidia_model']}")

    return " ".join(cmd_args)


def build_agent_audio_asr_command(config: dict, subtest: str) -> str:
    """Build the agent-audio ASR + WER/CER command to run via run_cmd."""
    eval_results_dir = f"{config['output_dir']}/eval-results/voicebench.{subtest}"
    asr_script = "nemo_skills/dataset/voicebench/scripts/run_agent_audio_asr_metrics.py"
    asr_model = config.get("agent_audio_asr_model", "nvidia/parakeet-tdt-0.6b-v2")

    cmd_args = [
        f"python {asr_script}",
        f"--eval_results_dir {eval_results_dir}",
        f"--subtest {subtest}",
        "--input_jsonl output.jsonl",
        "--output_jsonl output_asr.jsonl",
        f"--asr_model {asr_model}",
    ]
    if config.get("agent_audio_force", False):
        cmd_args.append("--force")

    return " ".join(cmd_args)


def run_voicebench_eval(config: dict):
    """Run VoiceBench evaluation using direct Python calls.

    Per subtest, schedules up to four stages:
      1. generation (nemo_eval)
      2. agent-audio ASR + WER/CER (run_agent_audio_asr_metrics.py)
      3. VoiceBench scoring of generated text (output.jsonl)
      4. VoiceBench scoring of the agent ASR transcript (output_asr.jsonl)
    """

    # Parse subtests: "all", a comma-separated string, or an explicit list.
    subtests_cfg = config.get("subtests", "all")
    if subtests_cfg == "all":
        subtests = ALL_SUBTESTS
    elif isinstance(subtests_cfg, str):
        subtests = [s.strip() for s in subtests_cfg.split(",")]
    else:
        subtests = subtests_cfg

    subtests = [s for s in subtests if s in ALL_SUBTESTS]
    if not subtests:
        raise ValueError("No valid subtests specified")

    generation_only = config.get("generation_only", False)
    scoring_only = config.get("scoring_only", False)
    dry_run = config.get("dry_run", False)

    # The ASR stage must always run with 4-stage scoring: stage-4 scoring reads
    # output_asr.jsonl and `run_after`s this stage's expname, so disabling it
    # would break the dependency chain. (The previous conditional based on
    # `agent_audio_stage_enabled` / `--decode_audio` was dead code — its result
    # was unconditionally overwritten with True.)
    agent_audio_stage_enabled = True

    print(f"Processing {len(subtests)} subtests: {', '.join(subtests)}")
    print(f"Output directory: {config['output_dir']}")

    # Build base extra args for hydra overrides
    # Skip native evaluation for all subtests - VoiceBench scorer handles evaluation
    base_extra_args = ["++eval_type=null"]
    if config.get("max_samples"):
        base_extra_args.append(f"++max_samples={config['max_samples']}")
    if config.get("server_server_type"):
        base_extra_args.append(f"++server.server_type={config['server_server_type']}")
    if config.get("api_key_env_var"):
        base_extra_args.append(f"++server.api_key_env_var={config['api_key_env_var']}")

    for subtest in subtests:
        extra_args_str = " ".join(base_extra_args)
        print(f"\n{'=' * 60}")
        print(f"Processing subtest: {subtest}")
        print(f"{'=' * 60}")

        expname = f"{config.get('expname', 'voicebench')}_{subtest}"
        benchmark = f"voicebench.{subtest}"
        eval_results_path = f"{config['output_dir']}/eval-results/voicebench.{subtest}"
        eval_dir = Path(eval_results_path)
        output_jsonl = eval_dir / "output.jsonl"
        output_jsonl_done = eval_dir / "output.jsonl.done"

        # If output.jsonl and done markers exist, nemo-skills will skip scheduling any generation tasks.
        # In that case, downstream stages must not depend on `expname`.
        num_chunks = int(config.get("num_chunks", 1) or 1)
        if num_chunks > 1:
            chunk_done_ok = all((eval_dir / f"output_chunk_{i}.jsonl.done").exists() for i in range(num_chunks))
        else:
            chunk_done_ok = (eval_dir / "output_chunk_0.jsonl.done").exists() or output_jsonl_done.exists()
        generation_complete = output_jsonl.exists() and chunk_done_ok
        generation_submitted = False

        # Generation phase
        if not scoring_only:
            if generation_complete:
                print(f"\n--- Skipping generation (found {output_jsonl} and done markers) ---")
            else:
                print("\n--- Running generation ---")
                server_gpus = config.get("server_gpus", 1)
                # Use cpu_partition when not self-hosting (external API)
                partition = config.get("cpu_partition") if server_gpus == 0 else config.get("partition")
                gen_exp = nemo_eval(
                    ctx=wrap_arguments(extra_args_str),
                    cluster=config["cluster"],
                    output_dir=config["output_dir"],
                    benchmarks=benchmark,
                    model=config["model"],
                    server_type=config.get("server_type", "vllm"),
                    server_gpus=server_gpus,
                    server_address=config.get("server_address"),
                    num_chunks=config.get("num_chunks", 1),
                    server_container=config.get("server_container"),
                    server_entrypoint=config.get("server_entrypoint"),
                    data_dir=config.get("data_dir"),
                    server_args=config.get("server_args", ""),
                    installation_command=config.get("installation_command"),
                    partition=partition,
                    expname=expname,
                    auto_summarize_results=False,
                    dry_run=dry_run,
                )
                generation_submitted = gen_exp is not None

        # Agent-audio ASR + WER/CER phase
        agent_audio_expname = f"{expname}_agent_audio_asr"
        if not generation_only and agent_audio_stage_enabled:
            print("\n--- Running agent-audio ASR + WER/CER ---")
            asr_command = build_agent_audio_asr_command(config, subtest)
            run_cmd(
                ctx=wrap_arguments(""),
                cluster=config["cluster"],
                command=asr_command,
                container=config.get("server_container") or "nemo-skills",
                partition=config.get("partition"),
                num_gpus=1,
                run_after=[expname] if generation_submitted else None,
                expname=agent_audio_expname,
                installation_command=config.get("agent_audio_installation_command"),
                log_dir=f"{eval_results_path}/summarized-results",
                dry_run=dry_run,
            )

        # Scoring phase (Stage 3: generated text, Stage 4: agent ASR transcript)
        if not generation_only:
            # Stage 3: VoiceBench scoring on generated text (output.jsonl)
            print("\n--- Running scoring (generated text) ---")
            score_command_generated = f"{build_score_command(config, subtest)} --input_jsonl output.jsonl --metrics_variant generated"
            score_generated_expname = f"{expname}_score_generated"
            run_cmd(
                ctx=wrap_arguments(""),
                cluster=config["cluster"],
                command=score_command_generated,
                partition=config.get("cpu_partition") or config.get("partition"),
                run_after=[expname] if generation_submitted else None,
                expname=score_generated_expname,
                installation_command=config.get("scoring_installation_command"),
                log_dir=f"{eval_results_path}/summarized-results",
                dry_run=dry_run,
            )

            # Stage 4: VoiceBench scoring on agent ASR transcript (output_asr.jsonl)
            print("\n--- Running scoring (agent ASR) ---")
            score_command_asr = f"{build_score_command(config, subtest)} --input_jsonl output_asr.jsonl --metrics_variant asr"
            run_cmd(
                ctx=wrap_arguments(""),
                cluster=config["cluster"],
                command=score_command_asr,
                partition=config.get("cpu_partition") or config.get("partition"),
                run_after=[agent_audio_expname, score_generated_expname],
                expname=f"{expname}_score_asr",
                installation_command=config.get("scoring_installation_command"),
                log_dir=f"{eval_results_path}/summarized-results",
                dry_run=dry_run,
            )

    print(f"\n{'=' * 60}")
    print("Done!")
    print(f"{'=' * 60}")


def main():
    """CLI entry point: load the YAML config, apply overrides, run the eval."""
    parser = argparse.ArgumentParser(description="VoiceBench evaluation with official scoring")
    parser.add_argument("--config", required=True, help="Path to YAML config file")

    # CLI overrides
    parser.add_argument("--cluster", help="Override cluster")
    parser.add_argument("--partition", help="Override partition")
    parser.add_argument("--model", help="Override model")
    parser.add_argument("--output_dir", help="Override output directory")
    parser.add_argument("--subtests", help="Override subtests (comma-separated)")
    parser.add_argument("--max_samples", type=int, help="Override max_samples")
    parser.add_argument("--dry_run", action="store_true", help="Print commands without executing")
    parser.add_argument("--generation_only", action="store_true", help="Only run generation")
    parser.add_argument("--scoring_only", action="store_true", help="Only run scoring")

    args = parser.parse_args()

    config = load_config(args.config)

    # Apply CLI overrides
    for key in ["cluster", "partition", "model", "output_dir", "subtests", "max_samples"]:
        if getattr(args, key, None) is not None:
            config[key] = getattr(args, key)
    if args.dry_run:
        config["dry_run"] = True
    if args.generation_only:
        config["generation_only"] = True
    if args.scoring_only:
        config["scoring_only"] = True

    run_voicebench_eval(config)


if __name__ == "__main__":
    main()
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Stage 2 for VoiceBench S2S runs: agent-audio ASR + WER/CER. + +Reads nemo-skills generation output.jsonl, which may include: + - "generation": model-generated text + - "audio": {"path": "..."} pointing to agent audio (wav) produced by server + +Writes: + - output_asr.jsonl: same items, but with "generation" replaced by the agent-ASR transcript + and original generation saved as "generation_text". + - agent_audio_metrics.json: aggregated agent WER/CER in nemo-skills metric format. + +Segmentation + ASR logic is borrowed (lightly adapted) from: + nemo_skills/dataset/s2s_demo/scripts/eval_conversation_behavior_v2.py +""" + +from __future__ import annotations + +import argparse +import json +import os +import re +import tempfile +from dataclasses import dataclass +from pathlib import Path +from typing import Any, Dict, Iterable, List, Optional, Tuple + + +def _read_jsonl(path: Path) -> Iterable[Dict[str, Any]]: + with path.open("rt", encoding="utf-8") as f: + for line in f: + line = line.strip() + if not line: + continue + yield json.loads(line) + + +def _write_jsonl(path: Path, rows: Iterable[Dict[str, Any]]) -> None: + with path.open("wt", encoding="utf-8") as f: + for row in rows: + f.write(json.dumps(row, ensure_ascii=False) + "\n") + + +def _safe_mkdir(path: Path) -> None: + path.mkdir(parents=True, exist_ok=True) + + +def normalize_text_for_wer(text: str) -> str: + """A light normalization similar to s2s_demo evaluation code.""" + if not text: + return "" + text = text.lower() + # remove special timing tokens commonly produced by S2S models + text 
= re.sub(r"<\$\s*[\d.]+\s*\$>", " ", text) + text = re.sub(r"<\|\s*[\d.]+\s*\|>", " ", text) + # normalize punctuation -> spaces + text = re.sub(r"[^a-z0-9]+", " ", text) + text = re.sub(r"\s+", " ", text).strip() + return text + + +@dataclass +class EditCounts: + substitutions: int + insertions: int + deletions: int + ref_len: int + + +def _edit_distance_counts(ref: List[str], hyp: List[str]) -> EditCounts: + """Compute S/I/D counts for ref->hyp using DP.""" + n, m = len(ref), len(hyp) + # dp[i][j] = min edits for ref[:i] -> hyp[:j] + dp = [[0] * (m + 1) for _ in range(n + 1)] + back = [[None] * (m + 1) for _ in range(n + 1)] # type: ignore[var-annotated] + + for i in range(1, n + 1): + dp[i][0] = i + back[i][0] = "D" + for j in range(1, m + 1): + dp[0][j] = j + back[0][j] = "I" + + for i in range(1, n + 1): + for j in range(1, m + 1): + if ref[i - 1] == hyp[j - 1]: + dp[i][j] = dp[i - 1][j - 1] + back[i][j] = "M" + else: + sub = dp[i - 1][j - 1] + 1 + ins = dp[i][j - 1] + 1 + dele = dp[i - 1][j] + 1 + best = min(sub, ins, dele) + dp[i][j] = best + if best == sub: + back[i][j] = "S" + elif best == ins: + back[i][j] = "I" + else: + back[i][j] = "D" + + i, j = n, m + s = ins = d = 0 + while i > 0 or j > 0: + op = back[i][j] + if op in ("M", "S"): + if op == "S": + s += 1 + i -= 1 + j -= 1 + elif op == "I": + ins += 1 + j -= 1 + elif op == "D": + d += 1 + i -= 1 + else: + # Should not happen, but keep it safe. 
+ if i > 0: + d += 1 + i -= 1 + elif j > 0: + ins += 1 + j -= 1 + + return EditCounts(substitutions=s, insertions=ins, deletions=d, ref_len=n) + + +def compute_wer(ref_text: str, hyp_text: str) -> Tuple[Optional[float], EditCounts]: + ref_norm = normalize_text_for_wer(ref_text) + hyp_norm = normalize_text_for_wer(hyp_text) + ref_words = ref_norm.split() if ref_norm else [] + hyp_words = hyp_norm.split() if hyp_norm else [] + counts = _edit_distance_counts(ref_words, hyp_words) + if counts.ref_len == 0: + return (0.0 if not hyp_words else 1.0), counts + wer = (counts.substitutions + counts.insertions + counts.deletions) / counts.ref_len + return wer, counts + + +def compute_cer(ref_text: str, hyp_text: str) -> Tuple[Optional[float], EditCounts]: + ref_norm = normalize_text_for_wer(ref_text) + hyp_norm = normalize_text_for_wer(hyp_text) + ref_chars = list(ref_norm) if ref_norm else [] + hyp_chars = list(hyp_norm) if hyp_norm else [] + counts = _edit_distance_counts(ref_chars, hyp_chars) + if counts.ref_len == 0: + return (0.0 if not hyp_chars else 1.0), counts + cer = (counts.substitutions + counts.insertions + counts.deletions) / counts.ref_len + return cer, counts + + +def _load_audio_mono(path: str) -> Tuple["Any", int]: + import numpy as np + import soundfile as sf + + audio, sr = sf.read(path, always_2d=False) + audio = np.asarray(audio) + if audio.ndim == 2: + audio = audio.mean(axis=1) + audio = audio.astype("float32", copy=False) + return audio, int(sr) + + +def _resample_to_16k(audio, sr: int): + if sr == 16000: + return audio, 16000 + try: + import librosa + + return librosa.resample(audio, orig_sr=sr, target_sr=16000), 16000 + except Exception: + import torch + import torchaudio + + x = torch.from_numpy(audio).float().unsqueeze(0) + y = torchaudio.functional.resample(x, sr, 16000) + return y.squeeze(0).cpu().numpy(), 16000 + + +class LazySileroVAD: + def __init__(self): + self._vad_model = None + self._get_speech_timestamps = None + + def available(self) 
class LazySileroVAD:
    """Silero VAD wrapper that defers model download/loading until first use."""

    def __init__(self):
        self._vad_model = None
        self._get_speech_timestamps = None

    def available(self) -> bool:
        """Whether torch (required by Silero VAD) can be imported."""
        try:
            import torch  # noqa: F401
        except Exception:
            return False
        return True

    def _load(self):
        """Fetch the Silero VAD model via torch.hub (no-op when already loaded)."""
        if self._vad_model is not None:
            return
        import torch

        # Same entrypoint used in the s2s_demo eval code.
        model, utils = torch.hub.load("snakers4/silero-vad", model="silero_vad", force_reload=False)
        model = model.to("cuda" if torch.cuda.is_available() else "cpu")
        timestamps_fn, _, _, _, _ = utils
        self._vad_model = model
        self._get_speech_timestamps = timestamps_fn

    def speech_segments(self, audio_16k, sr: int) -> List[Tuple[float, float]]:
        """Return list of (start_sec, end_sec). Falls back to full segment on failure."""
        if sr != 16000:
            raise ValueError("VAD expects 16k audio")
        try:
            self._load()
            import torch

            waveform = torch.from_numpy(audio_16k).float()
            device = next(self._vad_model.parameters()).device  # type: ignore[union-attr]
            waveform = waveform.to(device)
            raw = self._get_speech_timestamps(waveform, self._vad_model, sampling_rate=16000)  # type: ignore[misc]
            spans = []
            for item in raw or []:
                begin = float(item.get("start", 0)) / 16000.0
                finish = float(item.get("end", 0)) / 16000.0
                if finish > begin:
                    spans.append((begin, finish))
            return spans
        except Exception:
            # no VAD available/cached; treat whole audio as one segment
            duration = float(len(audio_16k)) / 16000.0 if audio_16k is not None else 0.0
            return [(0.0, duration)] if duration > 0 else []


class LazyNeMoASR:
    """NeMo ASR wrapper that defers model instantiation until first transcription."""

    def __init__(self, model_name: str):
        self.model_name = model_name
        self._model = None

    def _load(self):
        """Instantiate the pretrained NeMo ASR model (no-op when already loaded)."""
        if self._model is not None:
            return
        import nemo.collections.asr as nemo_asr

        model = nemo_asr.models.ASRModel.from_pretrained(model_name=self.model_name)
        self._model = model.cuda() if hasattr(model, "cuda") else model

    def transcribe_segment(self, audio, sr: int, start_sec: float, end_sec: float) -> str:
        """Borrowed approach from s2s_demo: write temp wav then ASRModel.transcribe."""
        self._load()
        import numpy as np
        import soundfile as sf

        begin = max(0, int(start_sec * sr))
        finish = min(len(audio), int(end_sec * sr))
        if finish <= begin:
            return ""
        segment = np.asarray(audio[begin:finish], dtype="float32")
        if segment.size < int(0.02 * sr):  # too short
            return ""
        if sr != 16000:
            segment, _ = _resample_to_16k(segment, sr)
            sr = 16000

        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
            tmp_path = tmp.name
        try:
            sf.write(tmp_path, segment, sr, format="WAV")
            results = self._model.transcribe([tmp_path])  # type: ignore[union-attr]
            if not results:
                return ""
            first = results[0]
            # NeMo returns either a plain string or an object exposing .text
            if hasattr(first, "text"):
                return (first.text or "").strip()
            if isinstance(first, str):
                return first.strip()
            return str(first).strip()
        finally:
            try:
                os.remove(tmp_path)
            except Exception:
                pass
def main():
    """CLI driver: run agent-audio ASR over output.jsonl and aggregate WER/CER.

    Writes output_asr.jsonl (generation replaced by the ASR transcript, original
    kept as "generation_text") and agent_audio_metrics.json next to the input.
    """
    parser = argparse.ArgumentParser(description="Agent-audio ASR + WER/CER stage for VoiceBench runs")
    # NOTE: the "<subtest>" placeholders below were garbled (stripped) in the
    # previous revision of these help strings; restored here.
    parser.add_argument("--eval_results_dir", required=True, help="Path to eval-results/voicebench.<subtest>/")
    parser.add_argument("--subtest", required=True, help="Subtest name (for metrics key voicebench.<subtest>)")
    parser.add_argument("--input_jsonl", default="output.jsonl", help="Input jsonl filename inside eval_results_dir")
    parser.add_argument("--output_jsonl", default="output_asr.jsonl", help="Output jsonl filename inside eval_results_dir")
    parser.add_argument("--asr_model", default="nvidia/parakeet-tdt-0.6b-v2", help="NeMo ASR model name")
    parser.add_argument("--force", action="store_true", help="Overwrite outputs if they exist")
    args = parser.parse_args()

    eval_dir = Path(args.eval_results_dir)
    in_path = eval_dir / args.input_jsonl
    out_path = eval_dir / args.output_jsonl
    metrics_path = eval_dir / "agent_audio_metrics.json"

    # Idempotency guard: a prior run's outputs short-circuit unless --force.
    if (out_path.exists() or metrics_path.exists()) and not args.force:
        print("Agent-audio ASR stage already done (output_asr/agent_audio_metrics exists). Skipping.")
        return

    if not in_path.exists():
        raise FileNotFoundError(f"Missing input jsonl: {in_path}")

    # Aggregate counts
    total_word_sub = total_word_ins = total_word_del = total_ref_words = 0
    total_char_sub = total_char_ins = total_char_del = total_ref_chars = 0

    out_rows: List[Dict[str, Any]] = []

    rows_in = list(_read_jsonl(in_path))
    has_any_audio = False
    for row in rows_in:
        if isinstance(row.get("audio"), dict) and row["audio"].get("path"):
            has_any_audio = True
            break

    # Fast path: if no audio paths are present at all, emit output_asr.jsonl identical-in-text to output.jsonl
    # so downstream "ASR scoring" is still meaningful (it will match generated scoring).
    if not has_any_audio:
        for row in rows_in:
            generation_text = row.get("generation", "") or ""
            debug_info = row.get("debug_info")
            if not isinstance(debug_info, dict):
                debug_info = {}
            debug_info["agent_audio_missing"] = True
            debug_info["agent_audio_asr"] = None
            debug_info["agent_audio_wer"] = None
            debug_info["agent_audio_cer"] = None

            out_row = dict(row)
            out_row["generation_text"] = generation_text
            out_row["generation"] = generation_text
            out_row["debug_info"] = debug_info
            out_rows.append(out_row)

        _write_jsonl(out_path, out_rows)
        metrics = {
            f"voicebench.{args.subtest}": {
                "greedy": {
                    "agent_wer": None,
                    "agent_cer": None,
                    "agent_ref_words": 0,
                    "agent_ref_chars": 0,
                    "agent_word_substitutions": 0,
                    "agent_word_insertions": 0,
                    "agent_word_deletions": 0,
                    "agent_char_substitutions": 0,
                    "agent_char_insertions": 0,
                    "agent_char_deletions": 0,
                }
            }
        }
        with metrics_path.open("wt", encoding="utf-8") as f:
            json.dump(metrics, f, indent=2)
        print(f"No audio paths found in {in_path}; wrote passthrough {out_path} and {metrics_path}")
        return

    vad = LazySileroVAD()
    asr = LazyNeMoASR(args.asr_model)

    for row in rows_in:
        generation_text = row.get("generation", "") or ""
        audio_path = None
        if isinstance(row.get("audio"), dict):
            audio_path = row["audio"].get("path")

        debug_info = row.get("debug_info")
        if not isinstance(debug_info, dict):
            debug_info = {}

        agent_asr_text = ""
        agent_wer = None
        agent_cer = None
        segments: List[Tuple[float, float]] = []

        if audio_path and Path(audio_path).exists():
            audio, sr = _load_audio_mono(audio_path)
            audio16, _ = _resample_to_16k(audio, sr)

            # VAD segmentation (fallback to full audio if VAD fails)
            segments = vad.speech_segments(audio16, 16000) if vad.available() else [(0.0, len(audio16) / 16000.0)]
            if not segments:
                segments = [(0.0, float(len(audio)) / float(sr))] if sr > 0 and len(audio) > 0 else []

            parts: List[str] = []
            for (s, e) in segments:
                t = asr.transcribe_segment(audio, sr, s, e)
                if t:
                    parts.append(t)
            agent_asr_text = " ".join(parts).strip()

            # WER/CER of the agent transcript against the generated text.
            agent_wer, w_counts = compute_wer(generation_text, agent_asr_text)
            agent_cer, c_counts = compute_cer(generation_text, agent_asr_text)

            total_word_sub += w_counts.substitutions
            total_word_ins += w_counts.insertions
            total_word_del += w_counts.deletions
            total_ref_words += w_counts.ref_len

            total_char_sub += c_counts.substitutions
            total_char_ins += c_counts.insertions
            total_char_del += c_counts.deletions
            total_ref_chars += c_counts.ref_len
        else:
            debug_info["agent_audio_missing"] = True
            # Keep ASR output as generated text so ASR-scoring remains usable even when audio is missing.
            agent_asr_text = generation_text

        debug_info["agent_audio_asr"] = agent_asr_text
        debug_info["agent_audio_wer"] = agent_wer
        debug_info["agent_audio_cer"] = agent_cer
        if segments:
            debug_info["agent_audio_segments_sec"] = [{"start": s, "end": e} for (s, e) in segments]

        out_row = dict(row)
        out_row["generation_text"] = generation_text
        out_row["generation"] = agent_asr_text
        out_row["debug_info"] = debug_info
        out_rows.append(out_row)

    _write_jsonl(out_path, out_rows)

    agent_wer_total = None
    if total_ref_words > 0:
        agent_wer_total = (total_word_sub + total_word_ins + total_word_del) / total_ref_words
    agent_cer_total = None
    if total_ref_chars > 0:
        agent_cer_total = (total_char_sub + total_char_ins + total_char_del) / total_ref_chars

    metrics = {
        f"voicebench.{args.subtest}": {
            "greedy": {
                "agent_wer": agent_wer_total,
                "agent_cer": agent_cer_total,
                "agent_ref_words": total_ref_words,
                "agent_ref_chars": total_ref_chars,
                "agent_word_substitutions": total_word_sub,
                "agent_word_insertions": total_word_ins,
                "agent_word_deletions": total_word_del,
                "agent_char_substitutions": total_char_sub,
                "agent_char_insertions": total_char_ins,
                "agent_char_deletions": total_char_del,
            }
        }
    }
    with metrics_path.open("wt", encoding="utf-8") as f:
        json.dump(metrics, f, indent=2)
    print(f"Wrote {out_path} and {metrics_path}")


if __name__ == "__main__":
    main()
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
Run generation on an arbitrary input.jsonl, parallelized into num_chunks Slurm jobs.

Uses the s2s_voicechat backend via nemo-skills pipeline. Server/cluster settings
are read from a YAML config (same format as voicebench eval configs).

Usage:
    python run_generate.py \
        --config voicebench_s2s_voicechat_offline_sound_config.yaml \
        --input_file /path/to/input.jsonl \
        --output_dir /path/to/output \
        --num_chunks 48

Full run command:
cd <nemo-skills repo root>
. ./.venv/bin/activate && NEMO_SKILLS_DISABLE_UNCOMMITTED_CHANGES_CHECK=1 \
    python nemo_skills/dataset/voicebench/scripts/run_generate.py \
    --config nemo_skills/dataset/voicebench/scripts/run_generate_test_sdqa10.yaml \
    --input_file /lustre/fsw/portfolios/llmservice/users/vmendelev/experiments/voicebench_test/data_dir/voicebench/sd_qa/test_10.jsonl
"""

import argparse

import yaml

from nemo_skills.pipeline.generate import generate as nemo_generate


def wrap_arguments(arguments: str):
    """Returns a mock context object to allow using the cli entrypoints as functions.

    An empty/blank string maps to an empty argument list; a plain
    ``"".split(" ")`` would otherwise produce a spurious ``[""]`` token.
    """

    class MockContext:
        def __init__(self, args):
            self.args = args
            self.obj = None

    return MockContext(args=arguments.split(" ") if arguments else [])


def load_config(config_path: str) -> dict:
    """Load a YAML config file into a plain dict."""
    with open(config_path, "r") as f:
        return yaml.safe_load(f)


def main():
    """CLI entry point: load config, apply overrides, submit generation jobs."""
    parser = argparse.ArgumentParser(description="Run generation on an arbitrary input.jsonl")
    parser.add_argument("--config", required=True, help="Path to YAML config file with server/cluster settings")
    parser.add_argument("--input_file", required=True, help="Path to input.jsonl file")
    parser.add_argument("--output_dir", help="Override output directory from config")
    parser.add_argument("--num_chunks", type=int, help="Override num_chunks from config")
    parser.add_argument("--model", help="Override model path from config")
    parser.add_argument("--partition", help="Override partition from config")
    parser.add_argument("--expname", help="Override expname from config")
    parser.add_argument("--dry_run", action="store_true", help="Print commands without executing")

    args = parser.parse_args()

    config = load_config(args.config)

    # Apply CLI overrides
    if args.output_dir:
        config["output_dir"] = args.output_dir
    if args.num_chunks is not None:
        config["num_chunks"] = args.num_chunks
    if args.model:
        config["model"] = args.model
    if args.partition:
        config["partition"] = args.partition
    if args.expname:
        config["expname"] = args.expname
    if args.dry_run:
        config["dry_run"] = True

    if not config.get("output_dir"):
        raise ValueError("output_dir must be specified in config or via --output_dir")

    # Build hydra extra args
    extra_args = ["++prompt_format=openai"]
    if config.get("server_server_type"):
        extra_args.append(f"++server.server_type={config['server_server_type']}")
    if config.get("data_dir"):
        extra_args.append(f"++eval_config.data_dir={config['data_dir']}")
    if config.get("max_samples"):
        extra_args.append(f"++max_samples={config['max_samples']}")

    # Build mount_paths: data_dir is mounted as /dataset so absolute audio paths
    # in jsonl (e.g. /dataset/voicebench/data/foo.wav) resolve correctly.
    mount_paths = None
    if config.get("data_dir"):
        mount_paths = f"{config['data_dir']}:/dataset"
    extra_args_str = " ".join(extra_args)

    print(f"Input file: {args.input_file}")
    print(f"Output directory: {config['output_dir']}")
    print(f"Num chunks: {config.get('num_chunks', 1)}")
    print(f"Dry run: {config.get('dry_run', False)}")

    nemo_generate(
        ctx=wrap_arguments(extra_args_str),
        cluster=config["cluster"],
        input_file=args.input_file,
        output_dir=config["output_dir"],
        model=config["model"],
        server_type=config.get("server_type", "vllm"),
        server_gpus=config.get("server_gpus", 1),
        num_chunks=config.get("num_chunks", 1),
        server_container=config.get("server_container"),
        server_entrypoint=config.get("server_entrypoint"),
        server_args=config.get("server_args", ""),
        installation_command=config.get("installation_command"),
        partition=config.get("partition"),
        mount_paths=mount_paths,
        expname=config.get("expname", "generate"),
        dry_run=config.get("dry_run", False),
    )

    print("Done!")


if __name__ == "__main__":
    main()
+# +# Run: +# python nemo_skills/dataset/voicebench/scripts/run_generate.py \ +# --config nemo_skills/dataset/voicebench/scripts/run_generate_test_sdqa10.yaml \ +# --input_file /lustre/fsw/portfolios/llmservice/users/vmendelev/experiments/voicebench_test/data_dir/voicebench/sd_qa/test_10.jsonl + +cluster: oci_iad +partition: batch_block1,batch_block3,batch_block4 + +model: /lustre/fsw/portfolios/llmservice/users/apasad/projects/nemo_s2s_merged_dec/exp_SFT-nano9b/IAD_nano9b_parakeet600m_from_PT_32k_64gpu_5e-5_PT0.7_SFT0.05_QA0.02_TEXT0.1_loss0.5_MCQ0.03_ASR0.01_sysp0.03_NoiseProb0.5_SNR-30-60_asr_dtc2_dst15_loss_text5.0_bos10.0_eos5.0_pad1.0_eosplacementsfix_nospecaug_ei0.1_ot8_TN_all_data_v3.2_ir/checkpoints_hf_step-12556-last.ckpt + +server_type: vllm +server_gpus: 1 +num_chunks: 2 + +server_entrypoint: "-m nemo_skills.inference.server.serve_unified" + +# output_dir here is for server artefacts (used with --save_artifacts) +server_args: >- + --backend s2s_voicechat + --config_path /lustre/fsw/portfolios/llmservice/users/kevinhu/projects/duplex-stt/config/inference/infer_nano_eartts_Voicebench_Jan22_2026.yaml + --speaker_reference /lustre/fsw/portfolios/convai/users/ecasanova/S2S-full-duplex/inference_references/Emma_S3_A1_SC7_singleturntarget_21_channel_1_audio_in.wav + --tts_ckpt_path /lustre/fsw/portfolios/convai/users/ecasanova/Checkpoints/Duplex_EARTTS/eartts_rvq_cont_task_nanov2_tts_pretraining_wordlist_duplex_data_new_branch_4nodes_duplex_eartts_2_delay_4rd_stage_fp32_wd_1500_et_eos_dp_eos_dup_step_10004.ckpt + --extra_decoding_seconds 20 + --inference_pad_boost -0.5 + --inference_bos_boost 0.5 + --inference_eos_boost 0 + --ignore_system_prompt + --dtype float32 + --code_path /lustre/fsw/portfolios/convai/users/kevinhu/S2S-Duplex-new-codebase/branches/NeMo-release_not_rebased + --output_dir /lustre/fsw/portfolios/llmservice/users/vmendelev/experiments/run_generate_test/sdqa10_2chunks_v2_a2/server_artefacts + --save_artifacts + --decode_audio + --batch_size 2 
+ +server_container: /lustre/fsw/portfolios/convai/users/ecasanova/docker_images/nemo_duplex_november_eartts.sqsh +server_server_type: vllm_multimodal + +data_dir: /lustre/fsw/portfolios/llmservice/users/vmendelev/experiments/voicebench_test/data_dir +output_dir: /lustre/fsw/portfolios/llmservice/users/vmendelev/experiments/run_generate_test/sdqa10_2chunks_v2_a2 + +installation_command: "" +expname: run_generate_test_sdqa10 diff --git a/nemo_skills/dataset/voicebench/scripts/run_voicebench_scoring.py b/nemo_skills/dataset/voicebench/scripts/run_voicebench_scoring.py new file mode 100644 index 0000000000..45a569465f --- /dev/null +++ b/nemo_skills/dataset/voicebench/scripts/run_voicebench_scoring.py @@ -0,0 +1,233 @@ +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Run VoiceBench scoring with nemo-skills compatible output structure. 
+ +Creates: +- summarized-results/ directory +- metrics.json with evaluation results +""" + +import argparse +import json +import re +import subprocess +import sys +from pathlib import Path + + +def run_scoring( + eval_results_dir: str, + voicebench_repo: str, + subtest: str, + evaluator: str, + needs_judge: bool, + input_jsonl: str = "output.jsonl", + metrics_variant: str = "generated", + api_type: str = "openai", + nvidia_model: str = "meta/llama-3.1-70b-instruct", + force: bool = False, +): + """Run VoiceBench scoring and save results in nemo-skills format.""" + eval_results_dir = Path(eval_results_dir) + output_jsonl = eval_results_dir / input_jsonl + converted_jsonl = eval_results_dir / "voicebench_format.jsonl" + summarized_dir = eval_results_dir / "summarized-results" + metrics_file = eval_results_dir / "metrics.json" + agent_audio_metrics_file = eval_results_dir / "agent_audio_metrics.json" + + if metrics_variant not in ("generated", "asr"): + raise ValueError("metrics_variant must be one of: generated, asr") + # We always write into a single `greedy` dict; the ASR variant is stored with *_asr keys. + metrics_key = "greedy" + asr_suffix = "_asr" + + # Skip if this variant already exists (unless force is set) + if metrics_file.exists() and not force: + try: + with open(metrics_file) as f: + existing_metrics = json.load(f) + greedy = existing_metrics.get(f"voicebench.{subtest}", {}).get("greedy", {}) + if isinstance(greedy, dict): + if metrics_variant == "asr": + if any(k.endswith(asr_suffix) for k in greedy.keys()): + print( + f"Scoring already done for voicebench.{subtest} (ASR keys exist in metrics.json). Skipping." + ) + print("Use --force to re-run scoring.") + return 0 + else: + # Skip if we already have any non-agent, non-ASR VoiceBench metrics. 
+ has_generated_metrics = any( + (not k.startswith("agent_")) and (not k.endswith(asr_suffix)) for k in greedy.keys() + ) + if has_generated_metrics: + print( + f"Scoring already done for voicebench.{subtest} (generated metrics exist in metrics.json). Skipping." + ) + print("Use --force to re-run scoring.") + return 0 + except Exception: + # If metrics.json is malformed, fall back to recomputing. + pass + + # Create summarized-results directory + summarized_dir.mkdir(parents=True, exist_ok=True) + + # Step 1: Convert format + print(f"Converting {output_jsonl} to VoiceBench format...") + convert_script = Path(__file__).parent / "convert_to_voicebench_format.py" + cmd = f"python {convert_script} --input {output_jsonl} --output {converted_jsonl} --subtest {subtest}" + subprocess.run(cmd, shell=True, check=True) + + # Step 2: Run GPT judge if needed + if needs_judge: + print("Running GPT judge...") + api_judge_args = f"--src_file {converted_jsonl}" + if api_type: + api_judge_args += f" --api_type {api_type}" + if nvidia_model: + api_judge_args += f" --nvidia_model {nvidia_model}" + cmd = f"cd {voicebench_repo} && python api_judge.py {api_judge_args}" + subprocess.run(cmd, shell=True, check=True) + result_file = eval_results_dir / "result-voicebench_format.jsonl" + else: + result_file = converted_jsonl + + # Step 3: Run evaluate.py and capture metrics + print(f"Running evaluation with {evaluator} evaluator...") + cmd = f"cd {voicebench_repo} && python evaluate.py --src_file {result_file} --evaluator {evaluator}" + result = subprocess.run(cmd, shell=True, capture_output=True, text=True) + + # Print output + print(result.stdout) + print(result.stderr, file=sys.stderr) + + # Parse metrics from loguru output + # Format: 2025-12-07 09:11:15.950 | INFO | __main__:main:18 - {'panda': 30.0, 'gpt': 50.0} + metrics = {} + for line in result.stderr.split("\n"): + if "INFO" in line and "{" in line and "}" in line: + try: + match = re.search(r"\{[^}]+\}", line) + if match: + # Use 
ast.literal_eval for safety + import ast + + metrics = ast.literal_eval(match.group()) + except Exception: + print(f"Warning: Could not parse metrics from line: {line}", file=sys.stderr) + + # Rename ASR metrics keys to *_asr to keep a single structure: + # greedy.{panda,gpt,...} for generated text + # greedy.{panda_asr,gpt_asr,...} for ASR-scored text + if metrics_variant == "asr": + metrics = {f"{k}{asr_suffix}": v for k, v in metrics.items()} + + nemo_metrics: dict = {f"voicebench.{subtest}": {metrics_key: metrics}} + + # Merge agent-audio metrics (WER/CER) if present. + if agent_audio_metrics_file.exists(): + try: + with open(agent_audio_metrics_file) as f: + agent_metrics = json.load(f) + key = f"voicebench.{subtest}" + agent_greedy = agent_metrics.get(key, {}).get("greedy", {}) + if isinstance(agent_greedy, dict): + nemo_metrics[key][metrics_key].update(agent_greedy) + except Exception as e: + print(f"Warning: failed merging agent_audio_metrics.json: {e}", file=sys.stderr) + + # Merge with existing metrics.json if present (keep one greedy dict with both generated + *_asr keys). 
+ if metrics_file.exists(): + try: + with open(metrics_file) as f: + existing_metrics = json.load(f) + if isinstance(existing_metrics, dict): + key = f"voicebench.{subtest}" + existing_sub = existing_metrics.get(key, {}) + if not isinstance(existing_sub, dict): + existing_sub = {} + existing_greedy = existing_sub.get("greedy", {}) + if not isinstance(existing_greedy, dict): + existing_greedy = {} + + new_greedy = nemo_metrics.get(key, {}).get("greedy", {}) + if isinstance(new_greedy, dict): + existing_greedy.update(new_greedy) + + existing_sub["greedy"] = existing_greedy + existing_metrics[key] = existing_sub + nemo_metrics = existing_metrics + except Exception: + pass + with open(metrics_file, "w") as f: + json.dump(nemo_metrics, f, indent=2) + print(f"Metrics saved to {metrics_file}") + + # Also print metrics summary + print("\n" + "=" * 60) + print(f"RESULTS for voicebench.{subtest}") + print("=" * 60) + for k, v in metrics.items(): + print(f" {k}: {v}") + print("=" * 60) + + return result.returncode + + +def main(): + parser = argparse.ArgumentParser(description="Run VoiceBench scoring with nemo-skills output format") + parser.add_argument( + "--eval_results_dir", required=True, help="Path to eval-results/voicebench.{subtest}/ directory" + ) + parser.add_argument("--voicebench_repo", required=True, help="Path to VoiceBench repository") + parser.add_argument("--subtest", required=True, help="Subtest name") + parser.add_argument("--evaluator", required=True, help="Evaluator type (qa, open, harm, ifeval, mcq, bbh)") + parser.add_argument("--needs_judge", action="store_true", help="Whether to run GPT judge first") + parser.add_argument( + "--input_jsonl", + default="output.jsonl", + help="Which jsonl in eval_results_dir to score (e.g. 
output.jsonl or output_asr.jsonl)", + ) + parser.add_argument( + "--metrics_variant", + default="generated", + choices=["generated", "asr"], + help="Which scoring variant to compute (generated->panda/gpt keys, asr->panda_asr/gpt_asr keys)", + ) + parser.add_argument("--api_type", default="openai", choices=["openai", "nvidia"], help="API type for judge") + parser.add_argument("--nvidia_model", default="meta/llama-3.1-70b-instruct", help="Model for NVIDIA API") + parser.add_argument("--force", action="store_true", help="Force re-run scoring even if metrics.json exists") + + args = parser.parse_args() + + rc = run_scoring( + eval_results_dir=args.eval_results_dir, + voicebench_repo=args.voicebench_repo, + subtest=args.subtest, + evaluator=args.evaluator, + needs_judge=args.needs_judge, + input_jsonl=args.input_jsonl, + metrics_variant=args.metrics_variant, + api_type=args.api_type, + nvidia_model=args.nvidia_model, + force=args.force, + ) + sys.exit(rc) + + +if __name__ == "__main__": + main() diff --git a/nemo_skills/dataset/voicebench/scripts/run_voicebench_scoring_.py b/nemo_skills/dataset/voicebench/scripts/run_voicebench_scoring_.py new file mode 100644 index 0000000000..f016a8985e --- /dev/null +++ b/nemo_skills/dataset/voicebench/scripts/run_voicebench_scoring_.py @@ -0,0 +1,150 @@ +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +""" +Run VoiceBench scoring with nemo-skills compatible output structure. + +Creates: +- summarized-results/ directory +- metrics.json with evaluation results +""" + +import argparse +import json +import re +import subprocess +import sys +from pathlib import Path + + +def run_scoring( + eval_results_dir: str, + voicebench_repo: str, + subtest: str, + evaluator: str, + needs_judge: bool, + api_type: str = "openai", + nvidia_model: str = "meta/llama-3.1-70b-instruct", + force: bool = False, +): + """Run VoiceBench scoring and save results in nemo-skills format.""" + eval_results_dir = Path(eval_results_dir) + output_jsonl = eval_results_dir / "output.jsonl" + converted_jsonl = eval_results_dir / "voicebench_format.jsonl" + summarized_dir = eval_results_dir / "summarized-results" + metrics_file = eval_results_dir / "metrics.json" + + # Skip if already scored (unless force is set) + if metrics_file.exists() and not force: + print(f"Scoring already done for voicebench.{subtest} (metrics.json exists). 
Skipping.") + print("Use --force to re-run scoring.") + with open(metrics_file) as f: + existing_metrics = json.load(f) + print(f"Existing metrics: {json.dumps(existing_metrics, indent=2)}") + return 0 + + # Create summarized-results directory + summarized_dir.mkdir(parents=True, exist_ok=True) + + # Step 1: Convert format + print(f"Converting {output_jsonl} to VoiceBench format...") + convert_script = Path(__file__).parent / "convert_to_voicebench_format.py" + cmd = f"python {convert_script} --input {output_jsonl} --output {converted_jsonl} --subtest {subtest}" + subprocess.run(cmd, shell=True, check=True) + + # Step 2: Run GPT judge if needed + if needs_judge: + print("Running GPT judge...") + api_judge_args = f"--src_file {converted_jsonl}" + if api_type: + api_judge_args += f" --api_type {api_type}" + if nvidia_model: + api_judge_args += f" --nvidia_model {nvidia_model}" + cmd = f"cd {voicebench_repo} && python api_judge.py {api_judge_args}" + subprocess.run(cmd, shell=True, check=True) + result_file = eval_results_dir / "result-voicebench_format.jsonl" + else: + result_file = converted_jsonl + + # Step 3: Run evaluate.py and capture metrics + print(f"Running evaluation with {evaluator} evaluator...") + cmd = f"cd {voicebench_repo} && python evaluate.py --src_file {result_file} --evaluator {evaluator}" + result = subprocess.run(cmd, shell=True, capture_output=True, text=True) + + # Print output + print(result.stdout) + print(result.stderr, file=sys.stderr) + + # Parse metrics from loguru output + # Format: 2025-12-07 09:11:15.950 | INFO | __main__:main:18 - {'panda': 30.0, 'gpt': 50.0} + metrics = {} + for line in result.stderr.split("\n"): + if "INFO" in line and "{" in line and "}" in line: + try: + match = re.search(r"\{[^}]+\}", line) + if match: + # Use ast.literal_eval for safety + import ast + + metrics = ast.literal_eval(match.group()) + except Exception: + print(f"Warning: Could not parse metrics from line: {line}", file=sys.stderr) + + # Save 
metrics.json in nemo-skills format + nemo_metrics = {f"voicebench.{subtest}": {"greedy": metrics}} + with open(metrics_file, "w") as f: + json.dump(nemo_metrics, f, indent=2) + print(f"Metrics saved to {metrics_file}") + + # Also print metrics summary + print("\n" + "=" * 60) + print(f"RESULTS for voicebench.{subtest}") + print("=" * 60) + for k, v in metrics.items(): + print(f" {k}: {v}") + print("=" * 60) + + return result.returncode + + +def main(): + parser = argparse.ArgumentParser(description="Run VoiceBench scoring with nemo-skills output format") + parser.add_argument( + "--eval_results_dir", required=True, help="Path to eval-results/voicebench.{subtest}/ directory" + ) + parser.add_argument("--voicebench_repo", required=True, help="Path to VoiceBench repository") + parser.add_argument("--subtest", required=True, help="Subtest name") + parser.add_argument("--evaluator", required=True, help="Evaluator type (qa, open, harm, ifeval, mcq, bbh)") + parser.add_argument("--needs_judge", action="store_true", help="Whether to run GPT judge first") + parser.add_argument("--api_type", default="openai", choices=["openai", "nvidia"], help="API type for judge") + parser.add_argument("--nvidia_model", default="meta/llama-3.1-70b-instruct", help="Model for NVIDIA API") + parser.add_argument("--force", action="store_true", help="Force re-run scoring even if metrics.json exists") + + args = parser.parse_args() + + rc = run_scoring( + eval_results_dir=args.eval_results_dir, + voicebench_repo=args.voicebench_repo, + subtest=args.subtest, + evaluator=args.evaluator, + needs_judge=args.needs_judge, + api_type=args.api_type, + nvidia_model=args.nvidia_model, + force=args.force, + ) + sys.exit(rc) + + +if __name__ == "__main__": + main() diff --git a/nemo_skills/dataset/voicebench/scripts/vb_baseline_02mar_config.yaml b/nemo_skills/dataset/voicebench/scripts/vb_baseline_02mar_config.yaml new file mode 100644 index 0000000000..8ccfbe38ca --- /dev/null +++ 
b/nemo_skills/dataset/voicebench/scripts/vb_baseline_02mar_config.yaml @@ -0,0 +1,75 @@ +# VoiceBench evaluation - (i) BASELINE setup +# S2S incremental V2 backend, SPEECH output. +# No inference boosting, no force turn taking. +# Checkpoint: Mar 2 2026 (Feb 26 STT + Mar 3 TTS, Megan speaker) +# pad_audio_to_sec=40, ignore_system_prompt, buffer_size_frames=21 +# +# Run: +# python nemo_skills/dataset/voicebench/scripts/generate_from_api_and_score_official.py \ +# --config nemo_skills/dataset/voicebench/scripts/vb_baseline_02mar_config.yaml + +# Cluster settings +cluster: s2s_eval_oci_iad +partition: batch_block1,batch_block3,batch_block4 +cpu_partition: cpu + +model: /lustre/fsw/portfolios/convai/users/ecasanova/Checkpoints/Nemotron-VoiceChat-november/duplex-eartts-10min_sw_et_eos_dp_eos_dup_fp32_1delay_02_March_exp_17_afg_long_FT_Megan_msr_34k_steps-stt-AS9.1_11002_new_branch_load_fixed + +server_type: vllm +server_gpus: 1 +num_chunks: 48 + +server_entrypoint: "-m nemo_skills.inference.server.serve_unified" +server_args: >- + --backend s2s_incremental_v2 + --decode_audio + --ignore_system_prompt + --speaker_reference /lustre/fsw/portfolios/convai/users/ecasanova/Checkpoints/Mg_a_00759.wav + --num_frames_per_inference 3 + --engine_type vllm_llm_vllm_eartts + --use_perception_cache + --use_perception_cudagraph + --use_codec_cache + --buffer_size_frames 21 + --codec_token_history_size 60 + --pad_audio_to_sec 40 + --repetition_penalty 1.0 + --matmul_precision medium + --vllm_gpu_memory_utilization 0.35 + --vllm_max_model_len 8192 + --output_dir /lustre/fsw/portfolios/convai/users/mmkrtchyan/projects/speechLM/s2s/MAR_02_FIXED/baseline/vb_artifacts + --batch_size 2 + --code_path /lustre/fsw/portfolios/convai/users/mmkrtchyan/projects/speechLM/s2s/NeMo + --pip_install "hf-xet==1.1.9 huggingface-hub==0.34.4 nvidia-modelopt==0.33.1 nvidia-modelopt-core==0.33.1 tokenizers==0.22.0 transformers==4.56.0 lhotse==1.32.2 nv-one-logger-core==2.1.0 
nv-one-logger-pytorch-lightning-integration==2.1.0 nv-one-logger-training-telemetry==2.1.0 kaldialign==0.9.1" + +server_container: /lustre/fsw/portfolios/llmservice/users/erastorgueva/code/containers/triton25.05_s2svllm26.02.12.sqsh +server_server_type: vllm_multimodal + +# Paths +data_dir: /lustre/fsw/portfolios/convai/users/mmkrtchyan/projects/speechLM/s2s/voicebench_data_dir +output_dir: /lustre/fsw/portfolios/convai/users/mmkrtchyan/projects/speechLM/s2s/MAR_02_FIXED/baseline/vb +voicebench_repo_path: /lustre/fsw/portfolios/llmservice/users/vmendelev/code/VoiceBench + +subtests: + - sd_qa + - alpacaeval_full + - alpacaeval + - ifeval + - advbench + - commoneval + - wildvoice + - alpacaeval_speaker + +# Scoring settings +api_type: nvidia +nvidia_model: azure/openai/gpt-4o-mini +agent_audio_asr_model: nvidia/parakeet-tdt-1.1b + +installation_command: "" +agent_audio_installation_command: "ln -sf $(which python3) /usr/local/bin/python" +scoring_installation_command: "pip install sacrebleu qa_metrics" + +expname: vb_baseline_02mar +generation_only: false +scoring_only: false +dry_run: false diff --git a/nemo_skills/dataset/voicebench/scripts/vb_baseline_02mar_mcq_config.yaml b/nemo_skills/dataset/voicebench/scripts/vb_baseline_02mar_mcq_config.yaml new file mode 100644 index 0000000000..934542420a --- /dev/null +++ b/nemo_skills/dataset/voicebench/scripts/vb_baseline_02mar_mcq_config.yaml @@ -0,0 +1,70 @@ +# VoiceBench evaluation - (i) BASELINE setup - MCQ subtests +# S2S incremental V2 backend, SPEECH output. +# No inference boosting, no force turn taking. +# MCQ datasets (bbh, openbookqa, mmsu) require a system prompt. 
+# Checkpoint: Mar 2 2026 (Feb 26 STT + Mar 3 TTS, Megan speaker) +# +# Run: +# python nemo_skills/dataset/voicebench/scripts/generate_from_api_and_score_official.py \ +# --config nemo_skills/dataset/voicebench/scripts/vb_baseline_02mar_mcq_config.yaml + +# Cluster settings +cluster: s2s_eval_oci_iad +partition: batch_block1,batch_block3,batch_block4 +cpu_partition: cpu + +model: /lustre/fsw/portfolios/convai/users/ecasanova/Checkpoints/Nemotron-VoiceChat-november/duplex-eartts-10min_sw_et_eos_dp_eos_dup_fp32_1delay_02_March_exp_17_afg_long_FT_Megan_msr_34k_steps-stt-AS9.1_11002_new_branch_load_fixed + +server_type: vllm +server_gpus: 1 +num_chunks: 48 + +server_entrypoint: "-m nemo_skills.inference.server.serve_unified" +server_args: >- + --backend s2s_incremental_v2 + --decode_audio + --speaker_reference /lustre/fsw/portfolios/convai/users/ecasanova/Checkpoints/Mg_a_00759.wav + --num_frames_per_inference 3 + --engine_type vllm_llm_vllm_eartts + --use_perception_cache + --use_perception_cudagraph + --use_codec_cache + --buffer_size_frames 21 + --codec_token_history_size 60 + --pad_audio_to_sec 40 + --repetition_penalty 1.0 + --matmul_precision medium + --vllm_gpu_memory_utilization 0.35 + --vllm_max_model_len 8192 + --system_prompt "Answer the following multiple choice question with an explanation for the answer." 
+ --output_dir /lustre/fsw/portfolios/convai/users/mmkrtchyan/projects/speechLM/s2s/MAR_02_FIXED/baseline/vb_mcq_artifacts + --batch_size 2 + --code_path /lustre/fsw/portfolios/convai/users/mmkrtchyan/projects/speechLM/s2s/NeMo + --pip_install "hf-xet==1.1.9 huggingface-hub==0.34.4 nvidia-modelopt==0.33.1 nvidia-modelopt-core==0.33.1 tokenizers==0.22.0 transformers==4.56.0 lhotse==1.32.2 nv-one-logger-core==2.1.0 nv-one-logger-pytorch-lightning-integration==2.1.0 nv-one-logger-training-telemetry==2.1.0 kaldialign==0.9.1" + +server_container: /lustre/fsw/portfolios/llmservice/users/erastorgueva/code/containers/triton25.05_s2svllm26.02.12.sqsh +server_server_type: vllm_multimodal + +# Paths +data_dir: /lustre/fsw/portfolios/convai/users/mmkrtchyan/projects/speechLM/s2s/voicebench_data_dir +output_dir: /lustre/fsw/portfolios/convai/users/mmkrtchyan/projects/speechLM/s2s/MAR_02_FIXED/baseline/vb_mcq +voicebench_repo_path: /lustre/fsw/portfolios/llmservice/users/vmendelev/code/VoiceBench + +subtests: + - bbh + - openbookqa + - mmsu + +# Scoring settings +api_type: nvidia +nvidia_model: azure/openai/gpt-4o-mini +agent_audio_asr_model: nvidia/parakeet-tdt-1.1b + +installation_command: "" +agent_audio_installation_command: "ln -sf $(which python3) /usr/local/bin/python" +scoring_installation_command: "pip install sacrebleu qa_metrics" + +expname: vb_baseline_02mar_mcq +generation_only: false +scoring_only: false +dry_run: false diff --git a/nemo_skills/dataset/voicebench/scripts/vb_matched_demo_v1_02mar_config.yaml b/nemo_skills/dataset/voicebench/scripts/vb_matched_demo_v1_02mar_config.yaml new file mode 100644 index 0000000000..bc9d9a74e3 --- /dev/null +++ b/nemo_skills/dataset/voicebench/scripts/vb_matched_demo_v1_02mar_config.yaml @@ -0,0 +1,77 @@ +# VoiceBench evaluation - (ii) MATCHED_DEMO_V1 setup +# S2S incremental V2 backend, SPEECH output. 
+# Baseline + force_turn_taking + inference_user_pad_boost=0.8 +# Requires nemotron_h.py vLLM patch for boost params to take effect. +# Checkpoint: Mar 2 2026 (Feb 26 STT + Mar 3 TTS, Megan speaker) +# +# Run: +# python nemo_skills/dataset/voicebench/scripts/generate_from_api_and_score_official.py \ +# --config nemo_skills/dataset/voicebench/scripts/vb_matched_demo_v1_02mar_config.yaml + +# Cluster settings +cluster: s2s_eval_oci_iad +partition: batch_block1,batch_block3,batch_block4 +cpu_partition: cpu + +model: /lustre/fsw/portfolios/convai/users/ecasanova/Checkpoints/Nemotron-VoiceChat-november/duplex-eartts-10min_sw_et_eos_dp_eos_dup_fp32_1delay_02_March_exp_17_afg_long_FT_Megan_msr_34k_steps-stt-AS9.1_11002_new_branch_load_fixed + +server_type: vllm +server_gpus: 1 +num_chunks: 48 + +server_entrypoint: "-m nemo_skills.inference.server.serve_unified" +server_args: >- + --backend s2s_incremental_v2 + --decode_audio + --ignore_system_prompt + --speaker_reference /lustre/fsw/portfolios/convai/users/ecasanova/Checkpoints/Mg_a_00759.wav + --num_frames_per_inference 3 + --engine_type vllm_llm_vllm_eartts + --use_perception_cache + --use_perception_cudagraph + --use_codec_cache + --buffer_size_frames 21 + --codec_token_history_size 60 + --pad_audio_to_sec 40 + --repetition_penalty 1.0 + --matmul_precision medium + --vllm_gpu_memory_utilization 0.35 + --vllm_max_model_len 8192 + --force_turn_taking + --inference_user_pad_boost 0.8 + --output_dir /lustre/fsw/portfolios/convai/users/mmkrtchyan/projects/speechLM/s2s/MAR_02_FIXED/matched_demo_v1/vb_artifacts + --batch_size 2 + --code_path /lustre/fsw/portfolios/convai/users/mmkrtchyan/projects/speechLM/s2s/NeMo + --pip_install "hf-xet==1.1.9 huggingface-hub==0.34.4 nvidia-modelopt==0.33.1 nvidia-modelopt-core==0.33.1 tokenizers==0.22.0 transformers==4.56.0 lhotse==1.32.2 nv-one-logger-core==2.1.0 nv-one-logger-pytorch-lightning-integration==2.1.0 nv-one-logger-training-telemetry==2.1.0 kaldialign==0.9.1" + 
+server_container: /lustre/fsw/portfolios/llmservice/users/erastorgueva/code/containers/triton25.05_s2svllm26.02.12.sqsh +server_server_type: vllm_multimodal + +# Paths +data_dir: /lustre/fsw/portfolios/convai/users/mmkrtchyan/projects/speechLM/s2s/voicebench_data_dir +output_dir: /lustre/fsw/portfolios/convai/users/mmkrtchyan/projects/speechLM/s2s/MAR_02_FIXED/matched_demo_v1/vb +voicebench_repo_path: /lustre/fsw/portfolios/llmservice/users/vmendelev/code/VoiceBench + +subtests: + - sd_qa + - alpacaeval_full + - alpacaeval + - ifeval + - advbench + - commoneval + - wildvoice + - alpacaeval_speaker + +# Scoring settings +api_type: nvidia +nvidia_model: azure/openai/gpt-4o-mini +agent_audio_asr_model: nvidia/parakeet-tdt-1.1b + +installation_command: "mkdir -p /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models && cp /lustre/fsw/portfolios/convai/users/mmkrtchyan/projects/speechLM/s2s/nemotron_h.py /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/nemotron_h.py" +agent_audio_installation_command: "ln -sf $(which python3) /usr/local/bin/python" +scoring_installation_command: "pip install sacrebleu qa_metrics" + +expname: vb_matched_demo_v1_02mar +generation_only: false +scoring_only: false +dry_run: false diff --git a/nemo_skills/dataset/voicebench/scripts/vb_matched_demo_v1_02mar_mcq_config.yaml b/nemo_skills/dataset/voicebench/scripts/vb_matched_demo_v1_02mar_mcq_config.yaml new file mode 100644 index 0000000000..398b79f9ff --- /dev/null +++ b/nemo_skills/dataset/voicebench/scripts/vb_matched_demo_v1_02mar_mcq_config.yaml @@ -0,0 +1,73 @@ +# VoiceBench evaluation - (ii) MATCHED_DEMO_V1 setup - MCQ subtests +# S2S incremental V2 backend, SPEECH output. +# Baseline + force_turn_taking + inference_user_pad_boost=0.8 +# MCQ datasets (bbh, openbookqa, mmsu) require a system prompt. +# Requires nemotron_h.py vLLM patch for boost params to take effect. 
+# Checkpoint: Mar 2 2026 (Feb 26 STT + Mar 3 TTS, Megan speaker) +# +# Run: +# python nemo_skills/dataset/voicebench/scripts/generate_from_api_and_score_official.py \ +# --config nemo_skills/dataset/voicebench/scripts/vb_matched_demo_v1_02mar_mcq_config.yaml + +# Cluster settings +cluster: s2s_eval_oci_iad +partition: batch_block1,batch_block3,batch_block4 +cpu_partition: cpu + +model: /lustre/fsw/portfolios/convai/users/ecasanova/Checkpoints/Nemotron-VoiceChat-november/duplex-eartts-10min_sw_et_eos_dp_eos_dup_fp32_1delay_02_March_exp_17_afg_long_FT_Megan_msr_34k_steps-stt-AS9.1_11002_new_branch_load_fixed + +server_type: vllm +server_gpus: 1 +num_chunks: 48 + +server_entrypoint: "-m nemo_skills.inference.server.serve_unified" +server_args: >- + --backend s2s_incremental_v2 + --decode_audio + --speaker_reference /lustre/fsw/portfolios/convai/users/ecasanova/Checkpoints/Mg_a_00759.wav + --num_frames_per_inference 3 + --engine_type vllm_llm_vllm_eartts + --use_perception_cache + --use_perception_cudagraph + --use_codec_cache + --buffer_size_frames 21 + --codec_token_history_size 60 + --pad_audio_to_sec 40 + --repetition_penalty 1.0 + --matmul_precision medium + --vllm_gpu_memory_utilization 0.35 + --vllm_max_model_len 8192 + --force_turn_taking + --inference_user_pad_boost 0.8 + --system_prompt "Answer the following multiple choice question with an explanation for the answer." 
+ --output_dir /lustre/fsw/portfolios/convai/users/mmkrtchyan/projects/speechLM/s2s/MAR_02_FIXED/matched_demo_v1/vb_mcq_artifacts + --batch_size 2 + --code_path /lustre/fsw/portfolios/convai/users/mmkrtchyan/projects/speechLM/s2s/NeMo + --pip_install "hf-xet==1.1.9 huggingface-hub==0.34.4 nvidia-modelopt==0.33.1 nvidia-modelopt-core==0.33.1 tokenizers==0.22.0 transformers==4.56.0 lhotse==1.32.2 nv-one-logger-core==2.1.0 nv-one-logger-pytorch-lightning-integration==2.1.0 nv-one-logger-training-telemetry==2.1.0 kaldialign==0.9.1" + +server_container: /lustre/fsw/portfolios/llmservice/users/erastorgueva/code/containers/triton25.05_s2svllm26.02.12.sqsh +server_server_type: vllm_multimodal + +# Paths +data_dir: /lustre/fsw/portfolios/convai/users/mmkrtchyan/projects/speechLM/s2s/voicebench_data_dir +output_dir: /lustre/fsw/portfolios/convai/users/mmkrtchyan/projects/speechLM/s2s/MAR_02_FIXED/matched_demo_v1/vb_mcq +voicebench_repo_path: /lustre/fsw/portfolios/llmservice/users/vmendelev/code/VoiceBench + +subtests: + - bbh + - openbookqa + - mmsu + +# Scoring settings +api_type: nvidia +nvidia_model: azure/openai/gpt-4o-mini +agent_audio_asr_model: nvidia/parakeet-tdt-1.1b + +installation_command: "mkdir -p /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models && cp /lustre/fsw/portfolios/convai/users/mmkrtchyan/projects/speechLM/s2s/nemotron_h.py /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/nemotron_h.py" +agent_audio_installation_command: "ln -sf $(which python3) /usr/local/bin/python" +scoring_installation_command: "pip install sacrebleu qa_metrics" + +expname: vb_matched_demo_v1_02mar_mcq +generation_only: false +scoring_only: false +dry_run: false diff --git a/nemo_skills/dataset/voicebench/scripts/vb_matched_demo_v2_02mar_config.yaml b/nemo_skills/dataset/voicebench/scripts/vb_matched_demo_v2_02mar_config.yaml new file mode 100644 index 0000000000..7cdbfc8c03 --- /dev/null +++ 
b/nemo_skills/dataset/voicebench/scripts/vb_matched_demo_v2_02mar_config.yaml @@ -0,0 +1,79 @@ +# VoiceBench evaluation - (iii) MATCHED_DEMO_V2 setup +# S2S incremental V2 backend, SPEECH output. +# matched_demo_v1 + top_p=0.5 + repetition_penalty=1.1 + temperature=0.3 +# Requires nemotron_h.py vLLM patch for boost params to take effect. +# Checkpoint: Mar 2 2026 (Feb 26 STT + Mar 3 TTS, Megan speaker) +# +# Run: +# python nemo_skills/dataset/voicebench/scripts/generate_from_api_and_score_official.py \ +# --config nemo_skills/dataset/voicebench/scripts/vb_matched_demo_v2_02mar_config.yaml + +# Cluster settings +cluster: s2s_eval_oci_iad +partition: batch_block1,batch_block3,batch_block4 +cpu_partition: cpu + +model: /lustre/fsw/portfolios/convai/users/ecasanova/Checkpoints/Nemotron-VoiceChat-november/duplex-eartts-10min_sw_et_eos_dp_eos_dup_fp32_1delay_02_March_exp_17_afg_long_FT_Megan_msr_34k_steps-stt-AS9.1_11002_new_branch_load_fixed + +server_type: vllm +server_gpus: 1 +num_chunks: 48 + +server_entrypoint: "-m nemo_skills.inference.server.serve_unified" +server_args: >- + --backend s2s_incremental_v2 + --decode_audio + --ignore_system_prompt + --speaker_reference /lustre/fsw/portfolios/convai/users/ecasanova/Checkpoints/Mg_a_00759.wav + --num_frames_per_inference 3 + --engine_type vllm_llm_vllm_eartts + --use_perception_cache + --use_perception_cudagraph + --use_codec_cache + --buffer_size_frames 21 + --codec_token_history_size 60 + --pad_audio_to_sec 40 + --matmul_precision medium + --vllm_gpu_memory_utilization 0.35 + --vllm_max_model_len 8192 + --force_turn_taking + --inference_user_pad_boost 0.8 + --top_p 0.5 + --repetition_penalty 1.1 + --temperature 0.3 + --output_dir /lustre/fsw/portfolios/convai/users/mmkrtchyan/projects/speechLM/s2s/MAR_02_FIXED/matched_demo_v2/vb_artifacts + --batch_size 2 + --code_path /lustre/fsw/portfolios/convai/users/mmkrtchyan/projects/speechLM/s2s/NeMo + --pip_install "hf-xet==1.1.9 huggingface-hub==0.34.4 
nvidia-modelopt==0.33.1 nvidia-modelopt-core==0.33.1 tokenizers==0.22.0 transformers==4.56.0 lhotse==1.32.2 nv-one-logger-core==2.1.0 nv-one-logger-pytorch-lightning-integration==2.1.0 nv-one-logger-training-telemetry==2.1.0 kaldialign==0.9.1" + +server_container: /lustre/fsw/portfolios/llmservice/users/erastorgueva/code/containers/triton25.05_s2svllm26.02.12.sqsh +server_server_type: vllm_multimodal + +# Paths +data_dir: /lustre/fsw/portfolios/convai/users/mmkrtchyan/projects/speechLM/s2s/voicebench_data_dir +output_dir: /lustre/fsw/portfolios/convai/users/mmkrtchyan/projects/speechLM/s2s/MAR_02_FIXED/matched_demo_v2/vb +voicebench_repo_path: /lustre/fsw/portfolios/llmservice/users/vmendelev/code/VoiceBench + +subtests: + - sd_qa + - alpacaeval_full + - alpacaeval + - ifeval + - advbench + - commoneval + - wildvoice + - alpacaeval_speaker + +# Scoring settings +api_type: nvidia +nvidia_model: azure/openai/gpt-4o-mini +agent_audio_asr_model: nvidia/parakeet-tdt-1.1b + +installation_command: "mkdir -p /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models && cp /lustre/fsw/portfolios/convai/users/mmkrtchyan/projects/speechLM/s2s/nemotron_h.py /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/nemotron_h.py" +agent_audio_installation_command: "ln -sf $(which python3) /usr/local/bin/python" +scoring_installation_command: "pip install sacrebleu qa_metrics" + +expname: vb_matched_demo_v2_02mar +generation_only: false +scoring_only: false +dry_run: false diff --git a/nemo_skills/dataset/voicebench/scripts/vb_matched_demo_v2_02mar_mcq_config.yaml b/nemo_skills/dataset/voicebench/scripts/vb_matched_demo_v2_02mar_mcq_config.yaml new file mode 100644 index 0000000000..ecc3549fbe --- /dev/null +++ b/nemo_skills/dataset/voicebench/scripts/vb_matched_demo_v2_02mar_mcq_config.yaml @@ -0,0 +1,75 @@ +# VoiceBench evaluation - (iii) MATCHED_DEMO_V2 setup - MCQ subtests +# S2S incremental V2 backend, SPEECH output. 
+# matched_demo_v1 + top_p=0.5 + repetition_penalty=1.1 + temperature=0.3 +# MCQ datasets (bbh, openbookqa, mmsu) require a system prompt. +# Requires nemotron_h.py vLLM patch for boost params to take effect. +# Checkpoint: Mar 2 2026 (Feb 26 STT + Mar 3 TTS, Megan speaker) +# +# Run: +# python nemo_skills/dataset/voicebench/scripts/generate_from_api_and_score_official.py \ +# --config nemo_skills/dataset/voicebench/scripts/vb_matched_demo_v2_02mar_mcq_config.yaml + +# Cluster settings +cluster: s2s_eval_oci_iad +partition: batch_block1,batch_block3,batch_block4 +cpu_partition: cpu + +model: /lustre/fsw/portfolios/convai/users/ecasanova/Checkpoints/Nemotron-VoiceChat-november/duplex-eartts-10min_sw_et_eos_dp_eos_dup_fp32_1delay_02_March_exp_17_afg_long_FT_Megan_msr_34k_steps-stt-AS9.1_11002_new_branch_load_fixed + +server_type: vllm +server_gpus: 1 +num_chunks: 48 + +server_entrypoint: "-m nemo_skills.inference.server.serve_unified" +server_args: >- + --backend s2s_incremental_v2 + --decode_audio + --speaker_reference /lustre/fsw/portfolios/convai/users/ecasanova/Checkpoints/Mg_a_00759.wav + --num_frames_per_inference 3 + --engine_type vllm_llm_vllm_eartts + --use_perception_cache + --use_perception_cudagraph + --use_codec_cache + --buffer_size_frames 21 + --codec_token_history_size 60 + --pad_audio_to_sec 40 + --matmul_precision medium + --vllm_gpu_memory_utilization 0.35 + --vllm_max_model_len 8192 + --force_turn_taking + --inference_user_pad_boost 0.8 + --top_p 0.5 + --repetition_penalty 1.1 + --temperature 0.3 + --system_prompt "Answer the following multiple choice question with an explanation for the answer." 
+ --output_dir /lustre/fsw/portfolios/convai/users/mmkrtchyan/projects/speechLM/s2s/MAR_02_FIXED/matched_demo_v2/vb_mcq_artifacts + --batch_size 2 + --code_path /lustre/fsw/portfolios/convai/users/mmkrtchyan/projects/speechLM/s2s/NeMo + --pip_install "hf-xet==1.1.9 huggingface-hub==0.34.4 nvidia-modelopt==0.33.1 nvidia-modelopt-core==0.33.1 tokenizers==0.22.0 transformers==4.56.0 lhotse==1.32.2 nv-one-logger-core==2.1.0 nv-one-logger-pytorch-lightning-integration==2.1.0 nv-one-logger-training-telemetry==2.1.0 kaldialign==0.9.1" + +server_container: /lustre/fsw/portfolios/llmservice/users/erastorgueva/code/containers/triton25.05_s2svllm26.02.12.sqsh +server_server_type: vllm_multimodal + +# Paths +data_dir: /lustre/fsw/portfolios/convai/users/mmkrtchyan/projects/speechLM/s2s/voicebench_data_dir +output_dir: /lustre/fsw/portfolios/convai/users/mmkrtchyan/projects/speechLM/s2s/MAR_02_FIXED/matched_demo_v2/vb_mcq +voicebench_repo_path: /lustre/fsw/portfolios/llmservice/users/vmendelev/code/VoiceBench + +subtests: + - bbh + - openbookqa + - mmsu + +# Scoring settings +api_type: nvidia +nvidia_model: azure/openai/gpt-4o-mini +agent_audio_asr_model: nvidia/parakeet-tdt-1.1b + +installation_command: "mkdir -p /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models && cp /lustre/fsw/portfolios/convai/users/mmkrtchyan/projects/speechLM/s2s/nemotron_h.py /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/nemotron_h.py" +agent_audio_installation_command: "ln -sf $(which python3) /usr/local/bin/python" +scoring_installation_command: "pip install sacrebleu qa_metrics" + +expname: vb_matched_demo_v2_02mar_mcq +generation_only: false +scoring_only: false +dry_run: false diff --git a/nemo_skills/dataset/voicebench/scripts/voicebench_eval_config.yaml b/nemo_skills/dataset/voicebench/scripts/voicebench_eval_config.yaml new file mode 100644 index 0000000000..267c4eeb3f --- /dev/null +++ b/nemo_skills/dataset/voicebench/scripts/voicebench_eval_config.yaml @@ 
-0,0 +1,45 @@ +# VoiceBench evaluation configuration +# Copy and modify this file for your setup + +# Cluster settings +cluster: oci_iad +partition: batch_block1,batch_block3,batch_block4 +cpu_partition: cpu # For scoring jobs (no GPU needed) + +# Model and server +model: Qwen/Qwen2.5-Omni-3B +server_type: vllm +server_gpus: 1 +num_chunks: 1 +server_container: /lustre/fsw/portfolios/llmservice/users/nkarpov/workspace/containers/vllm-openai-audio.sqsh +server_args: "" # e.g., "--hf_token YOUR_TOKEN" + +# Paths +data_dir: /lustre/fsw/portfolios/llmservice/users/vmendelev/experiments/voicebench_test/data_dir +output_dir: /lustre/fsw/portfolios/llmservice/users/vmendelev/experiments/voicebench_demo/runs/voicebench +voicebench_repo_path: /lustre/fsw/portfolios/llmservice/users/vmendelev/code/VoiceBench + +# Subtests to evaluate (list or "all") +# Available: advbench, alpacaeval, alpacaeval_full, alpacaeval_speaker, bbh, +# commoneval, ifeval, mmsu, mtbench, openbookqa, sd_qa, wildvoice +subtests: + - sd_qa # QA with PEDANT + GPT judge + + +# Scoring settings +api_type: nvidia +nvidia_model: azure/openai/gpt-4o-mini + +# Installation commands +installation_command: "" # For generation phase (if needed) +scoring_installation_command: "pip install sacrebleu qa_metrics" # For scoring phase + +# Experiment settings +expname: voicebench +max_samples: 10 # Set to integer for testing (e.g., 10) + +# Run modes +generation_only: false +scoring_only: false +dry_run: false + diff --git a/nemo_skills/dataset/voicebench/scripts/voicebench_eval_config_gemini.yaml b/nemo_skills/dataset/voicebench/scripts/voicebench_eval_config_gemini.yaml new file mode 100644 index 0000000000..d81a8a72b3 --- /dev/null +++ b/nemo_skills/dataset/voicebench/scripts/voicebench_eval_config_gemini.yaml @@ -0,0 +1,49 @@ +# VoiceBench evaluation configuration +# Copy and modify this file for your setup + +# Cluster settings +cluster: oci_iad +partition: batch_block1,batch_block3,batch_block4 +cpu_partition: 
cpu # For scoring jobs (no GPU needed) + +# Model and server +model: gcp/google/gemini-2.5-pro +server_server_type: vllm # Model class with audio handling +server_type: openai # Pipeline routing (CPU partition, external API) +server_gpus: 0 # Not self-hosting, using external API +num_chunks: 1 +server_address: https://inference-api.nvidia.com/v1 +api_key_env_var: NVIDIA_API_KEY # Tell VLLMModel which env var has the API key +# server_container: not needed for external API +server_args: "" + +# Paths +data_dir: /lustre/fsw/portfolios/llmservice/users/vmendelev/experiments/voicebench_test/data_dir +output_dir: /lustre/fsw/portfolios/llmservice/users/vmendelev/experiments/voicebench_demo/runs/voicebench +voicebench_repo_path: /lustre/fsw/portfolios/llmservice/users/vmendelev/code/VoiceBench + +# Subtests to evaluate (list or "all") +# Available: advbench, alpacaeval, alpacaeval_full, alpacaeval_speaker, bbh, +# commoneval, ifeval, mmsu, mtbench, openbookqa, sd_qa, wildvoice +subtests: + - sd_qa # QA with PEDANT + GPT judge + + + +# Scoring settings +api_type: nvidia +nvidia_model: azure/openai/gpt-4o-mini + +# Installation commands +installation_command: "" # For generation phase (if needed) +scoring_installation_command: "pip install sacrebleu qa_metrics" # For scoring phase + +# Experiment settings +expname: voicebench +max_samples: 10 # Set to integer for testing (e.g., 10) + +# Run modes +generation_only: false +scoring_only: false +dry_run: false + diff --git a/nemo_skills/dataset/voicebench/scripts/voicebench_rescore_initial.yaml b/nemo_skills/dataset/voicebench/scripts/voicebench_rescore_initial.yaml new file mode 100644 index 0000000000..f6dac73413 --- /dev/null +++ b/nemo_skills/dataset/voicebench/scripts/voicebench_rescore_initial.yaml @@ -0,0 +1,38 @@ +# Rescore initial run subtests (bbh, ifeval) with cleaned tokens +# Run: python generate_from_api_and_score_official.py --config voicebench_rescore_initial.yaml + +cluster: oci_iad +partition: 
batch_block1,batch_block3,batch_block4 +cpu_partition: cpu + +# Dummy model (not used for scoring_only) +model: /lustre/fsw/portfolios/llmservice/users/apasad/projects/nemo_s2s_merged_dec/exp_SFT-nano9b/IAD_nano9b_parakeet600m_from_PT_32k_64gpu_5e-5_PT0.7_SFT0.05_QA0.02_TEXT0.1_loss0.5_MCQ0.03_ASR0.01_sysp0.03_NoiseProb0.5_SNR-30-60_asr_dtc2_dst15_loss_text5.0_bos10.0_eos5.0_pad1.0_eosplacementsfix_nospecaug_ei0.1_ot8_TN_all_data_v3.2_ir/checkpoints_hf_step-12556-last.ckpt + +server_type: vllm +server_gpus: 1 +num_chunks: 1 + +# Paths - use existing output directory +data_dir: /lustre/fsw/portfolios/llmservice/users/vmendelev/experiments/voicebench_test/data_dir +output_dir: /lustre/fsw/portfolios/llmservice/users/vmendelev/experiments/voicebench_s2s_offline/runs/voicebench_20260203_111554 +voicebench_repo_path: /lustre/fsw/portfolios/llmservice/users/vmendelev/code/VoiceBench + +# Only rescore these subtests +subtests: + - bbh + - ifeval + +# Scoring settings +api_type: nvidia +nvidia_model: azure/openai/gpt-4o-mini + +# Installation commands +installation_command: "" +scoring_installation_command: "pip install sacrebleu qa_metrics" + +expname: voicebench_rescore_initial + +# SCORING ONLY - skip generation +generation_only: false +scoring_only: true +dry_run: false diff --git a/nemo_skills/dataset/voicebench/scripts/voicebench_rescore_remaining.yaml b/nemo_skills/dataset/voicebench/scripts/voicebench_rescore_remaining.yaml new file mode 100644 index 0000000000..a9b6423eea --- /dev/null +++ b/nemo_skills/dataset/voicebench/scripts/voicebench_rescore_remaining.yaml @@ -0,0 +1,37 @@ +# Rescore remaining run subtest (sd_qa) with cleaned tokens +# Run: python generate_from_api_and_score_official.py --config voicebench_rescore_remaining.yaml + +cluster: oci_iad +partition: batch_block1,batch_block3,batch_block4 +cpu_partition: cpu + +# Dummy model (not used for scoring_only) +model: 
/lustre/fsw/portfolios/llmservice/users/apasad/projects/nemo_s2s_merged_dec/exp_SFT-nano9b/IAD_nano9b_parakeet600m_from_PT_32k_64gpu_5e-5_PT0.7_SFT0.05_QA0.02_TEXT0.1_loss0.5_MCQ0.03_ASR0.01_sysp0.03_NoiseProb0.5_SNR-30-60_asr_dtc2_dst15_loss_text5.0_bos10.0_eos5.0_pad1.0_eosplacementsfix_nospecaug_ei0.1_ot8_TN_all_data_v3.2_ir/checkpoints_hf_step-12556-last.ckpt + +server_type: vllm +server_gpus: 1 +num_chunks: 1 + +# Paths - use existing output directory +data_dir: /lustre/fsw/portfolios/llmservice/users/vmendelev/experiments/voicebench_test/data_dir +output_dir: /lustre/fsw/portfolios/llmservice/users/vmendelev/experiments/voicebench_s2s_offline/runs/voicebench_remaining_20260203_123112 +voicebench_repo_path: /lustre/fsw/portfolios/llmservice/users/vmendelev/code/VoiceBench + +# Only rescore sd_qa +subtests: + - sd_qa + +# Scoring settings +api_type: nvidia +nvidia_model: azure/openai/gpt-4o-mini + +# Installation commands +installation_command: "" +scoring_installation_command: "pip install sacrebleu qa_metrics" + +expname: voicebench_rescore_remaining + +# SCORING ONLY - skip generation +generation_only: false +scoring_only: true +dry_run: false diff --git a/nemo_skills/dataset/voicebench/scripts/voicebench_s2s_incremental_v2_sound_02mar_config.yaml b/nemo_skills/dataset/voicebench/scripts/voicebench_s2s_incremental_v2_sound_02mar_config.yaml new file mode 100644 index 0000000000..1339c8e828 --- /dev/null +++ b/nemo_skills/dataset/voicebench/scripts/voicebench_s2s_incremental_v2_sound_02mar_config.yaml @@ -0,0 +1,79 @@ +# (P1) VoiceBench evaluation - S2S incremental V2 backend, SPEECH output. +# Full audio decoding with codec cache for clean output. +# Uses NemotronVoicechatInferenceWrapper with vLLM + perception/codec caches. +# Checkpoint: Mar 2 2026 (Feb 26 STT + Mar 3 TTS, Megan speaker) +# No system prompt, no force_turn_taking, pad_audio_to_sec=40. +# Produces both metrics_variant=generated (text) and metrics_variant=asr (speech) scores. 
+# +# Run: +# python nemo_skills/dataset/voicebench/scripts/generate_from_api_and_score_official.py \ +# --config nemo_skills/dataset/voicebench/scripts/voicebench_s2s_incremental_v2_sound_02mar_config.yaml + +# Cluster settings +cluster: s2s_eval_oci_iad +partition: batch_block1,batch_block3,batch_block4 +cpu_partition: cpu + +model: /lustre/fsw/portfolios/convai/users/ecasanova/Checkpoints/Nemotron-VoiceChat-november/duplex-eartts-10min_sw_et_eos_dp_eos_dup_fp32_1delay_02_March_exp_17_afg_long_FT_Megan_msr_34k_steps-stt-AS9.1_11002_new_branch_load_fixed + +server_type: vllm +server_gpus: 1 +num_chunks: 48 + +server_entrypoint: "-m nemo_skills.inference.server.serve_unified" +server_args: >- + --backend s2s_incremental_v2 + --decode_audio + --ignore_system_prompt + --speaker_reference /lustre/fsw/portfolios/convai/users/ecasanova/Checkpoints/Mg_a_00759.wav + --num_frames_per_inference 3 + --engine_type vllm_llm_vllm_eartts + --use_perception_cache + --use_perception_cudagraph + --use_codec_cache + --buffer_size_frames 21 + --codec_token_history_size 60 + --pad_audio_to_sec 40 + --repetition_penalty 1.0 + --matmul_precision medium + --vllm_gpu_memory_utilization 0.35 + --vllm_max_model_len 8192 + --output_dir /lustre/fsw/portfolios/convai/users/mmkrtchyan/projects/speechLM/s2s/MAR_02_FIXED/voicebench_incremental_v2_sound_02mar_no_sys_prompt_artifacts + --batch_size 2 + --code_path /lustre/fsw/portfolios/convai/users/mmkrtchyan/projects/speechLM/s2s/NeMo + --pip_install "hf-xet==1.1.9 huggingface-hub==0.34.4 nvidia-modelopt==0.33.1 nvidia-modelopt-core==0.33.1 tokenizers==0.22.0 transformers==4.56.0 lhotse==1.32.2 nv-one-logger-core==2.1.0 nv-one-logger-pytorch-lightning-integration==2.1.0 nv-one-logger-training-telemetry==2.1.0 kaldialign==0.9.1" + +server_container: /lustre/fsw/portfolios/llmservice/users/erastorgueva/code/containers/triton25.05_s2svllm26.02.12.sqsh +server_server_type: vllm_multimodal + +# Paths +data_dir: 
/lustre/fsw/portfolios/convai/users/mmkrtchyan/projects/speechLM/s2s/voicebench_data_dir +output_dir: /lustre/fsw/portfolios/convai/users/mmkrtchyan/projects/speechLM/s2s/MAR_02_FIXED/voicebench_incremental_v2_sound_02mar_no_sys_prompt +voicebench_repo_path: /lustre/fsw/portfolios/llmservice/users/vmendelev/code/VoiceBench + +subtests: + - sd_qa + - alpacaeval_full + - alpacaeval + - ifeval + - advbench + - commoneval + - wildvoice + - alpacaeval_speaker + # - openbookqa + # - mmsu + # - bbh + +# Scoring settings +api_type: nvidia +nvidia_model: azure/openai/gpt-4o-mini +agent_audio_asr_model: nvidia/parakeet-tdt-1.1b + +installation_command: "" +agent_audio_installation_command: "ln -sf $(which python3) /usr/local/bin/python" +scoring_installation_command: "pip install sacrebleu qa_metrics" + +expname: voicebench_s2s_incremental_v2_sound_02mar_no_sys_prompt +generation_only: false +scoring_only: false +dry_run: false diff --git a/nemo_skills/dataset/voicebench/scripts/voicebench_s2s_incremental_v2_sound_26feb_config.yaml b/nemo_skills/dataset/voicebench/scripts/voicebench_s2s_incremental_v2_sound_26feb_config.yaml new file mode 100644 index 0000000000..3f961032ba --- /dev/null +++ b/nemo_skills/dataset/voicebench/scripts/voicebench_s2s_incremental_v2_sound_26feb_config.yaml @@ -0,0 +1,78 @@ +# (P1) VoiceBench evaluation - S2S incremental V2 backend, SPEECH output. +# Full audio decoding with codec cache for clean output. +# Uses NemotronVoicechatInferenceWrapper with vLLM + perception/codec caches. 
+# Checkpoint: Feb 26 2026 (legally friendly personaplex dataset) +# +# Run: +# python nemo_skills/dataset/voicebench/scripts/generate_from_api_and_score_official.py \ +# --config nemo_skills/dataset/voicebench/scripts/voicebench_s2s_incremental_v2_sound_26feb_config.yaml + +# Cluster settings +cluster: s2s_eval_oci_iad +partition: batch_block1,batch_block3,batch_block4 +cpu_partition: cpu + +model: /lustre/fsw/portfolios/convai/users/ecasanova/Checkpoints/Nemotron-VoiceChat-november/duplex-eartts-10min_sw_et_eos_dp_eos_dup_fp32_1delay_26_Feb_exp_13_afg_14k_steps-stt-AS9.1_11002_new_branch_load_fixed + +server_type: vllm +server_gpus: 1 +num_chunks: 48 + +server_entrypoint: "-m nemo_skills.inference.server.serve_unified" +server_args: >- + --backend s2s_incremental_v2 + --decode_audio + --speaker_reference /lustre/fsw/portfolios/convai/users/ecasanova/Checkpoints/Mg_a_00759.wav + --num_frames_per_inference 3 + --engine_type vllm_llm_vllm_eartts + --use_perception_cache + --use_perception_cudagraph + --use_codec_cache + --buffer_size_frames 20 + --codec_token_history_size 60 + --pad_to_duration_secs 40 + --repetition_penalty 1.0 + --force_turn_taking + --force_turn_taking_threshold 40 + --force_turn_taking_pad_window 25 + --matmul_precision medium + --vllm_gpu_memory_utilization 0.35 + --vllm_max_model_len 8192 + --system_prompt "You are an AI assistant developed by NVIDIA. Your name is NVIDIA VoiceChat. Your job is to be helpful and have engaging conversations in English. Maintain a warm tone. Keep the dialogue open and ongoing. You must provide diverse responses, rephrase answers if user asks the same questions." 
+ --output_dir /lustre/fsw/portfolios/convai/users/mmkrtchyan/projects/speechLM/s2s/FEB_26/voicebench_incremental_v2_sound_26feb_artifacts + --batch_size 2 + --code_path /lustre/fsw/portfolios/convai/users/mmkrtchyan/projects/speechLM/s2s/NeMo + --pip_install "hf-xet==1.1.9 huggingface-hub==0.34.4 nvidia-modelopt==0.33.1 nvidia-modelopt-core==0.33.1 tokenizers==0.22.0 transformers==4.56.0 lhotse==1.32.2 nv-one-logger-core==2.1.0 nv-one-logger-pytorch-lightning-integration==2.1.0 nv-one-logger-training-telemetry==2.1.0 kaldialign==0.9.1" + +server_container: /lustre/fsw/portfolios/llmservice/users/erastorgueva/code/containers/triton25.05_s2svllm26.02.12.sqsh +server_server_type: vllm_multimodal + +# Paths +data_dir: /lustre/fsw/portfolios/convai/users/mmkrtchyan/projects/speechLM/s2s/voicebench_data_dir +output_dir: /lustre/fsw/portfolios/convai/users/mmkrtchyan/projects/speechLM/s2s/FEB_26/voicebench_incremental_v2_sound_26feb +voicebench_repo_path: /lustre/fsw/portfolios/llmservice/users/vmendelev/code/VoiceBench + +subtests: + - openbookqa + # - sd_qa + # - mmsu + # - alpacaeval_full + # - alpacaeval + # - ifeval + # - advbench + # - commoneval + # - wildvoice + # - alpacaeval_speaker + # - bbh + +# Scoring settings +api_type: nvidia +nvidia_model: azure/openai/gpt-4o-mini + +installation_command: "" +scoring_installation_command: "pip install sacrebleu qa_metrics" + +expname: voicebench_s2s_incremental_v2_sound_26feb +generation_only: false +scoring_only: false +dry_run: false diff --git a/nemo_skills/dataset/voicebench/scripts/voicebench_s2s_incremental_v2_sound_config.yaml b/nemo_skills/dataset/voicebench/scripts/voicebench_s2s_incremental_v2_sound_config.yaml new file mode 100644 index 0000000000..94bb2f1870 --- /dev/null +++ b/nemo_skills/dataset/voicebench/scripts/voicebench_s2s_incremental_v2_sound_config.yaml @@ -0,0 +1,77 @@ +# (P1) VoiceBench evaluation - S2S incremental V2 backend, SPEECH output. 
+# Full audio decoding with codec cache for clean output. +# Uses NemotronVoicechatInferenceWrapper with vLLM + perception/codec caches. +# +# Run: +# python nemo_skills/dataset/voicebench/scripts/generate_from_api_and_score_official.py \ +# --config nemo_skills/dataset/voicebench/scripts/voicebench_s2s_incremental_v2_sound_config.yaml + +# Cluster settings +cluster: s2s_eval_oci_iad +partition: batch_block1,batch_block3,batch_block4 +cpu_partition: cpu + +model: /lustre/fsw/portfolios/convai/users/ecasanova/Checkpoints/Nemotron-VoiceChat-november/duplex-eartts-2mim_sw_et_eos_dp_eos_dup_fp32_1delay_20_Feb_exp_3.1_afg_40k_steps-stt-AS7.8_11460_new_branch_load_fixed + +server_type: vllm +server_gpus: 1 +num_chunks: 48 + +server_entrypoint: "-m nemo_skills.inference.server.serve_unified" +server_args: >- + --backend s2s_incremental_v2 + --decode_audio + --speaker_reference /lustre/fsw/portfolios/convai/users/ecasanova/Checkpoints/Mg_a_00759.wav + --num_frames_per_inference 3 + --engine_type vllm_llm_vllm_eartts + --use_perception_cache + --use_perception_cudagraph + --use_codec_cache + --buffer_size_frames 20 + --codec_token_history_size 60 + --pad_to_duration_secs 40 + --repetition_penalty 1.0 + --force_turn_taking + --force_turn_taking_threshold 40 + --force_turn_taking_pad_window 25 + --matmul_precision medium + --vllm_gpu_memory_utilization 0.35 + --vllm_max_model_len 8192 + --system_prompt "You are an AI assistant developed by NVIDIA. Your name is NVIDIA VoiceChat. Your job is to be helpful and have engaging conversations in English. Maintain a warm tone. Keep the dialogue open and ongoing. You must provide diverse responses, rephrase answers if user asks the same questions." 
+ --output_dir /lustre/fsw/portfolios/convai/users/mmkrtchyan/projects/speechLM/s2s/voicebench_incremental_v2_sound_artifacts + --batch_size 2 + --code_path /lustre/fsw/portfolios/convai/users/mmkrtchyan/projects/speechLM/s2s/NeMo + --pip_install "hf-xet==1.1.9 huggingface-hub==0.34.4 nvidia-modelopt==0.33.1 nvidia-modelopt-core==0.33.1 tokenizers==0.22.0 transformers==4.56.0 lhotse==1.32.2 nv-one-logger-core==2.1.0 nv-one-logger-pytorch-lightning-integration==2.1.0 nv-one-logger-training-telemetry==2.1.0 kaldialign==0.9.1" + +server_container: /lustre/fsw/portfolios/llmservice/users/erastorgueva/code/containers/triton25.05_s2svllm26.02.12.sqsh +server_server_type: vllm_multimodal + +# Paths +data_dir: /lustre/fsw/portfolios/convai/users/mmkrtchyan/projects/speechLM/s2s/voicebench_data_dir +output_dir: /lustre/fsw/portfolios/convai/users/mmkrtchyan/projects/speechLM/s2s/voicebench_incremental_v2_sound +voicebench_repo_path: /lustre/fsw/portfolios/llmservice/users/vmendelev/code/VoiceBench + +subtests: + - sd_qa + - mmsu + - alpacaeval_full + - alpacaeval + - ifeval + - advbench + - commoneval + - wildvoice + - alpacaeval_speaker + - openbookqa + - bbh + +# Scoring settings +api_type: nvidia +nvidia_model: azure/openai/gpt-4o-mini + +installation_command: "" +scoring_installation_command: "pip install sacrebleu qa_metrics" + +expname: voicebench_s2s_incremental_v2_sound +generation_only: false +scoring_only: false +dry_run: false diff --git a/nemo_skills/dataset/voicebench/scripts/voicebench_s2s_incremental_v2_text_02mar_config.yaml b/nemo_skills/dataset/voicebench/scripts/voicebench_s2s_incremental_v2_text_02mar_config.yaml new file mode 100644 index 0000000000..5672faa6d3 --- /dev/null +++ b/nemo_skills/dataset/voicebench/scripts/voicebench_s2s_incremental_v2_text_02mar_config.yaml @@ -0,0 +1,75 @@ +# (P0) VoiceBench evaluation - S2S incremental V2 backend, TEXT output only. +# No audio decoding -- faster inference, text-only scoring. 
+# Uses NemotronVoicechatInferenceWrapper with vLLM + perception cache. +# Checkpoint: Mar 2 2026 (Feb 26 STT + Mar 3 TTS, Megan speaker) +# NOTE(review): --force_turn_taking IS passed in server_args below, contradicting the +# previous 'force_turn_taking OFF' description in this header -- confirm which is intended. +# +# Run: +# python nemo_skills/dataset/voicebench/scripts/generate_from_api_and_score_official.py \ +# --config nemo_skills/dataset/voicebench/scripts/voicebench_s2s_incremental_v2_text_02mar_config.yaml + +# Cluster settings +cluster: s2s_eval_oci_iad +partition: batch_block1,batch_block3,batch_block4 +cpu_partition: cpu + +model: /lustre/fsw/portfolios/convai/users/ecasanova/Checkpoints/Nemotron-VoiceChat-november/duplex-eartts-10min_sw_et_eos_dp_eos_dup_fp32_1delay_02_March_exp_17_afg_long_FT_Megan_msr_34k_steps-stt-AS9.1_11002_new_branch_load_fixed + +server_type: vllm +server_gpus: 1 +num_chunks: 48 + +server_entrypoint: "-m nemo_skills.inference.server.serve_unified" +server_args: >- + --backend s2s_incremental_v2 + --no_decode_audio + --speaker_reference /lustre/fsw/portfolios/convai/users/ecasanova/Checkpoints/Mg_a_00759.wav + --num_frames_per_inference 3 + --engine_type vllm_llm_vllm_eartts + --use_perception_cache + --use_perception_cudagraph + --buffer_size_frames 20 + --codec_token_history_size 60 + --pad_to_duration_secs 40 + --repetition_penalty 1.0 + --force_turn_taking + --matmul_precision medium + --vllm_gpu_memory_utilization 0.35 + --vllm_max_model_len 8192 + --system_prompt "You are an AI assistant developed by NVIDIA. Your name is NVIDIA VoiceChat. Your job is to be helpful and have engaging conversations in English. Maintain a warm tone. Keep the dialogue open and ongoing. You must provide diverse responses, rephrase answers if user asks the same questions." 
+ --output_dir /lustre/fsw/portfolios/convai/users/mmkrtchyan/projects/speechLM/s2s/MAR_02/voicebench_incremental_v2_text_02mar_artifacts + --batch_size 2 + --code_path /lustre/fsw/portfolios/convai/users/mmkrtchyan/projects/speechLM/s2s/NeMo + --pip_install "hf-xet==1.1.9 huggingface-hub==0.34.4 nvidia-modelopt==0.33.1 nvidia-modelopt-core==0.33.1 tokenizers==0.22.0 transformers==4.56.0 lhotse==1.32.2 nv-one-logger-core==2.1.0 nv-one-logger-pytorch-lightning-integration==2.1.0 nv-one-logger-training-telemetry==2.1.0 kaldialign==0.9.1" + +server_container: /lustre/fsw/portfolios/llmservice/users/erastorgueva/code/containers/triton25.05_s2svllm26.02.12.sqsh +server_server_type: vllm_multimodal + +# Paths +data_dir: /lustre/fsw/portfolios/convai/users/mmkrtchyan/projects/speechLM/s2s/voicebench_data_dir +output_dir: /lustre/fsw/portfolios/convai/users/mmkrtchyan/projects/speechLM/s2s/MAR_02/voicebench_incremental_v2_text_02mar +voicebench_repo_path: /lustre/fsw/portfolios/llmservice/users/vmendelev/code/VoiceBench + +subtests: + - sd_qa + - mmsu + - alpacaeval_full + - alpacaeval + - ifeval + - advbench + - commoneval + - wildvoice + - alpacaeval_speaker + - openbookqa + - bbh + +# Scoring settings +api_type: nvidia +nvidia_model: azure/openai/gpt-4o-mini + +installation_command: "" +scoring_installation_command: "pip install sacrebleu qa_metrics" + +expname: voicebench_s2s_incremental_v2_text_02mar +generation_only: false +scoring_only: false +dry_run: false diff --git a/nemo_skills/dataset/voicebench/scripts/voicebench_s2s_incremental_v2_text_26feb_config.yaml b/nemo_skills/dataset/voicebench/scripts/voicebench_s2s_incremental_v2_text_26feb_config.yaml new file mode 100644 index 0000000000..9e0f393399 --- /dev/null +++ b/nemo_skills/dataset/voicebench/scripts/voicebench_s2s_incremental_v2_text_26feb_config.yaml @@ -0,0 +1,77 @@ +# (P0) VoiceBench evaluation - S2S incremental V2 backend, TEXT output only. +# No audio decoding -- faster inference, text-only scoring. 
+# Uses NemotronVoicechatInferenceWrapper with vLLM + perception cache. +# Checkpoint: Feb 26 2026 (legally friendly personaplex dataset) +# +# Run: +# python nemo_skills/dataset/voicebench/scripts/generate_from_api_and_score_official.py \ +# --config nemo_skills/dataset/voicebench/scripts/voicebench_s2s_incremental_v2_text_26feb_config.yaml + +# Cluster settings +cluster: s2s_eval_oci_iad +partition: batch_block1,batch_block3,batch_block4 +cpu_partition: cpu + +model: /lustre/fsw/portfolios/convai/users/ecasanova/Checkpoints/Nemotron-VoiceChat-november/duplex-eartts-10min_sw_et_eos_dp_eos_dup_fp32_1delay_26_Feb_exp_13_afg_14k_steps-stt-AS9.1_11002_new_branch_load_fixed + +server_type: vllm +server_gpus: 1 +num_chunks: 48 + +server_entrypoint: "-m nemo_skills.inference.server.serve_unified" +server_args: >- + --backend s2s_incremental_v2 + --no_decode_audio + --speaker_reference /lustre/fsw/portfolios/convai/users/ecasanova/Checkpoints/Mg_a_00759.wav + --num_frames_per_inference 3 + --engine_type vllm_llm_vllm_eartts + --use_perception_cache + --use_perception_cudagraph + --buffer_size_frames 20 + --codec_token_history_size 60 + --pad_to_duration_secs 40 + --repetition_penalty 1.0 + --force_turn_taking + --force_turn_taking_threshold 40 + --force_turn_taking_pad_window 25 + --matmul_precision medium + --vllm_gpu_memory_utilization 0.35 + --vllm_max_model_len 8192 + --system_prompt "You are an AI assistant developed by NVIDIA. Your name is NVIDIA VoiceChat. Your job is to be helpful and have engaging conversations in English. Maintain a warm tone. Keep the dialogue open and ongoing. You must provide diverse responses, rephrase answers if user asks the same questions." 
+ --output_dir /lustre/fsw/portfolios/convai/users/mmkrtchyan/projects/speechLM/s2s/FEB_26/voicebench_incremental_v2_text_26feb_artifacts + --batch_size 2 + --code_path /lustre/fsw/portfolios/convai/users/mmkrtchyan/projects/speechLM/s2s/NeMo + --pip_install "hf-xet==1.1.9 huggingface-hub==0.34.4 nvidia-modelopt==0.33.1 nvidia-modelopt-core==0.33.1 tokenizers==0.22.0 transformers==4.56.0 lhotse==1.32.2 nv-one-logger-core==2.1.0 nv-one-logger-pytorch-lightning-integration==2.1.0 nv-one-logger-training-telemetry==2.1.0 kaldialign==0.9.1" + +server_container: /lustre/fsw/portfolios/llmservice/users/erastorgueva/code/containers/triton25.05_s2svllm26.02.12.sqsh +server_server_type: vllm_multimodal + +# Paths +data_dir: /lustre/fsw/portfolios/convai/users/mmkrtchyan/projects/speechLM/s2s/voicebench_data_dir +output_dir: /lustre/fsw/portfolios/convai/users/mmkrtchyan/projects/speechLM/s2s/FEB_26/voicebench_incremental_v2_text_26feb +voicebench_repo_path: /lustre/fsw/portfolios/llmservice/users/vmendelev/code/VoiceBench + +subtests: + - sd_qa + - mmsu + - alpacaeval_full + - alpacaeval + - ifeval + - advbench + - commoneval + - wildvoice + - alpacaeval_speaker + - openbookqa + - bbh + +# Scoring settings +api_type: nvidia +nvidia_model: azure/openai/gpt-4o-mini + +installation_command: "" +scoring_installation_command: "pip install sacrebleu qa_metrics" + +expname: voicebench_s2s_incremental_v2_text_26feb +generation_only: false +scoring_only: false +dry_run: false diff --git a/nemo_skills/dataset/voicebench/scripts/voicebench_s2s_incremental_v2_text_config.yaml b/nemo_skills/dataset/voicebench/scripts/voicebench_s2s_incremental_v2_text_config.yaml new file mode 100644 index 0000000000..c42e16f3d9 --- /dev/null +++ b/nemo_skills/dataset/voicebench/scripts/voicebench_s2s_incremental_v2_text_config.yaml @@ -0,0 +1,76 @@ +# (P0) VoiceBench evaluation - S2S incremental V2 backend, TEXT output only. +# No audio decoding -- faster inference, text-only scoring. 
+# Uses NemotronVoicechatInferenceWrapper with vLLM + perception cache. +# +# Run: +# python nemo_skills/dataset/voicebench/scripts/generate_from_api_and_score_official.py \ +# --config nemo_skills/dataset/voicebench/scripts/voicebench_s2s_incremental_v2_text_config.yaml + +# Cluster settings +cluster: s2s_eval_oci_iad +partition: batch_block1,batch_block3,batch_block4 +cpu_partition: cpu + +model: /lustre/fsw/portfolios/convai/users/ecasanova/Checkpoints/Nemotron-VoiceChat-november/duplex-eartts-2mim_sw_et_eos_dp_eos_dup_fp32_1delay_20_Feb_exp_3.1_afg_40k_steps-stt-AS7.8_11460_new_branch_load_fixed + +server_type: vllm +server_gpus: 1 +num_chunks: 48 + +server_entrypoint: "-m nemo_skills.inference.server.serve_unified" +server_args: >- + --backend s2s_incremental_v2 + --no_decode_audio + --speaker_reference /lustre/fsw/portfolios/convai/users/ecasanova/Checkpoints/Mg_a_00759.wav + --num_frames_per_inference 3 + --engine_type vllm_llm_vllm_eartts + --use_perception_cache + --use_perception_cudagraph + --buffer_size_frames 20 + --codec_token_history_size 60 + --pad_to_duration_secs 40 + --repetition_penalty 1.0 + --force_turn_taking + --force_turn_taking_threshold 40 + --force_turn_taking_pad_window 25 + --matmul_precision medium + --vllm_gpu_memory_utilization 0.35 + --vllm_max_model_len 8192 + --system_prompt "You are an AI assistant developed by NVIDIA. Your name is NVIDIA VoiceChat. Your job is to be helpful and have engaging conversations in English. Maintain a warm tone. Keep the dialogue open and ongoing. You must provide diverse responses, rephrase answers if user asks the same questions." 
+ --output_dir /lustre/fsw/portfolios/convai/users/mmkrtchyan/projects/speechLM/s2s/voicebench_incremental_v2_text_artifacts + --batch_size 2 + --code_path /lustre/fsw/portfolios/convai/users/mmkrtchyan/projects/speechLM/s2s/NeMo + --pip_install "hf-xet==1.1.9 huggingface-hub==0.34.4 nvidia-modelopt==0.33.1 nvidia-modelopt-core==0.33.1 tokenizers==0.22.0 transformers==4.56.0 lhotse==1.32.2 nv-one-logger-core==2.1.0 nv-one-logger-pytorch-lightning-integration==2.1.0 nv-one-logger-training-telemetry==2.1.0 kaldialign==0.9.1" + +server_container: /lustre/fsw/portfolios/llmservice/users/erastorgueva/code/containers/triton25.05_s2svllm26.02.12.sqsh +server_server_type: vllm_multimodal + +# Paths +data_dir: /lustre/fsw/portfolios/convai/users/mmkrtchyan/projects/speechLM/s2s/voicebench_data_dir +output_dir: /lustre/fsw/portfolios/convai/users/mmkrtchyan/projects/speechLM/s2s/voicebench_incremental_v2_text +voicebench_repo_path: /lustre/fsw/portfolios/llmservice/users/vmendelev/code/VoiceBench + +subtests: + - sd_qa + - mmsu + - alpacaeval_full + - alpacaeval + - ifeval + - advbench + - commoneval + - wildvoice + - alpacaeval_speaker + - openbookqa + - bbh + +# Scoring settings +api_type: nvidia +nvidia_model: azure/openai/gpt-4o-mini + +installation_command: "" +scoring_installation_command: "pip install sacrebleu qa_metrics" + +expname: voicebench_s2s_incremental_v2_text +generation_only: false +scoring_only: false +dry_run: false diff --git a/nemo_skills/dataset/voicebench/scripts/voicebench_s2s_offline_config.yaml b/nemo_skills/dataset/voicebench/scripts/voicebench_s2s_offline_config.yaml new file mode 100644 index 0000000000..905081924c --- /dev/null +++ b/nemo_skills/dataset/voicebench/scripts/voicebench_s2s_offline_config.yaml @@ -0,0 +1,69 @@ +# VoiceBench evaluation configuration for S2S offline backend +# Based on: /lustre/fsw/portfolios/llmservice/users/kevinhu/s2s/NeMo/scripts/training/iad/s2s/sdv2_hf/conv/nano_9b/inf/infer_nano9b_s2s.sh +# Run: python 
generate_from_api_and_score_official.py --config voicebench_s2s_offline_config.yaml + +# Cluster settings +cluster: oci_iad +partition: batch_block1,batch_block3,batch_block4 +cpu_partition: cpu # For scoring jobs (no GPU needed) + +# Model checkpoint - latest IAD nano9b model (Jan 2026) +# STT checkpoint in HuggingFace format +model: /lustre/fsw/portfolios/llmservice/users/apasad/projects/nemo_s2s_merged_dec/exp_SFT-nano9b/IAD_nano9b_parakeet600m_from_PT_32k_64gpu_5e-5_PT0.7_SFT0.05_QA0.02_TEXT0.1_loss0.5_MCQ0.03_ASR0.01_sysp0.03_NoiseProb0.5_SNR-30-60_asr_dtc2_dst15_loss_text5.0_bos10.0_eos5.0_pad1.0_eosplacementsfix_nospecaug_ei0.1_ot8_TN_all_data_v3.2_ir/checkpoints_hf_step-12556-last.ckpt + +server_type: vllm +server_gpus: 1 +num_chunks: 32 # Recommended for voicebench due to large test sets like MMSU + +# Use serve_unified with S2S offline backend +server_entrypoint: "-m nemo_skills.inference.server.serve_unified" +server_args: >- + --backend s2s + --config_path /lustre/fsw/portfolios/llmservice/users/kevinhu/projects/duplex-stt/config/inference/infer_nano_eartts_Voicebench_Jan22_2026.yaml + --speaker_reference /lustre/fsw/portfolios/convai/users/ecasanova/S2S-full-duplex/inference_references/Emma_S3_A1_SC7_singleturntarget_21_channel_1_audio_in.wav + --tts_ckpt_path /lustre/fsw/portfolios/convai/users/ecasanova/Checkpoints/Duplex_EARTTS/eartts_rvq_cont_task_nanov2_tts_pretraining_wordlist_duplex_data_new_branch_4nodes_duplex_eartts_2_delay_4rd_stage_fp32_wd_1500_et_eos_dp_eos_dup_step_10004.ckpt + --extra_decoding_seconds 20 + --dtype float32 + --code_path /lustre/fsw/portfolios/convai/users/kevinhu/S2S-Duplex-new-codebase/branches/NeMo-release_not_rebased + +# Container with NeMo and S2S support +server_container: /lustre/fsw/portfolios/convai/users/ecasanova/docker_images/nemo_duplex_november_eartts.sqsh + +# Use vllm_multimodal to save audio to files instead of base64 in jsonl +server_server_type: vllm_multimodal + +# Paths +data_dir: 
/lustre/fsw/portfolios/llmservice/users/vmendelev/experiments/voicebench_test/data_dir +output_dir: /lustre/fsw/portfolios/llmservice/users/vmendelev/experiments/voicebench_s2s_offline/runs/voicebench +voicebench_repo_path: /lustre/fsw/portfolios/llmservice/users/vmendelev/code/VoiceBench + +# Subtests to evaluate (all except mtbench which is multi-turn) +subtests: + - bbh + - alpacaeval + - alpacaeval_full + - ifeval + - openbookqa + - advbench + - commoneval + - wildvoice + - mmsu + - sd_qa + - alpacaeval_speaker + +# Scoring settings +api_type: nvidia +nvidia_model: azure/openai/gpt-4o-mini + +# Installation commands +installation_command: "" +scoring_installation_command: "pip install sacrebleu qa_metrics" + +# Experiment settings +expname: voicebench_s2s_offline +# max_samples: 10 # Uncomment to limit samples for testing + +# Run modes +generation_only: false +scoring_only: false +dry_run: false diff --git a/nemo_skills/dataset/voicebench/scripts/voicebench_s2s_offline_no_sysprompt.yaml b/nemo_skills/dataset/voicebench/scripts/voicebench_s2s_offline_no_sysprompt.yaml new file mode 100644 index 0000000000..7a2bacd6c0 --- /dev/null +++ b/nemo_skills/dataset/voicebench/scripts/voicebench_s2s_offline_no_sysprompt.yaml @@ -0,0 +1,70 @@ +# VoiceBench evaluation configuration for S2S offline backend - NO SYSTEM PROMPT +# Same as voicebench_s2s_offline_config.yaml but with --ignore_system_prompt +# Run: python generate_from_api_and_score_official.py --config voicebench_s2s_offline_no_sysprompt.yaml + +# Cluster settings +cluster: oci_iad +partition: batch_block1,batch_block3,batch_block4 +cpu_partition: cpu # For scoring jobs (no GPU needed) + +# Model checkpoint - latest IAD nano9b model (Jan 2026) +# STT checkpoint in HuggingFace format +model: 
/lustre/fsw/portfolios/llmservice/users/apasad/projects/nemo_s2s_merged_dec/exp_SFT-nano9b/IAD_nano9b_parakeet600m_from_PT_32k_64gpu_5e-5_PT0.7_SFT0.05_QA0.02_TEXT0.1_loss0.5_MCQ0.03_ASR0.01_sysp0.03_NoiseProb0.5_SNR-30-60_asr_dtc2_dst15_loss_text5.0_bos10.0_eos5.0_pad1.0_eosplacementsfix_nospecaug_ei0.1_ot8_TN_all_data_v3.2_ir/checkpoints_hf_step-12556-last.ckpt + +server_type: vllm +server_gpus: 1 +num_chunks: 32 # Recommended for voicebench due to large test sets like MMSU + +# Use serve_unified with S2S offline backend - WITH --ignore_system_prompt +server_entrypoint: "-m nemo_skills.inference.server.serve_unified" +server_args: >- + --backend s2s + --config_path /lustre/fsw/portfolios/llmservice/users/kevinhu/projects/duplex-stt/config/inference/infer_nano_eartts_Voicebench_Jan22_2026.yaml + --speaker_reference /lustre/fsw/portfolios/convai/users/ecasanova/S2S-full-duplex/inference_references/Emma_S3_A1_SC7_singleturntarget_21_channel_1_audio_in.wav + --tts_ckpt_path /lustre/fsw/portfolios/convai/users/ecasanova/Checkpoints/Duplex_EARTTS/eartts_rvq_cont_task_nanov2_tts_pretraining_wordlist_duplex_data_new_branch_4nodes_duplex_eartts_2_delay_4rd_stage_fp32_wd_1500_et_eos_dp_eos_dup_step_10004.ckpt + --extra_decoding_seconds 20 + --dtype float32 + --ignore_system_prompt + --code_path /lustre/fsw/portfolios/convai/users/kevinhu/S2S-Duplex-new-codebase/branches/NeMo-release_not_rebased + +# Container with NeMo and S2S support +server_container: /lustre/fsw/portfolios/convai/users/ecasanova/docker_images/nemo_duplex_november_eartts.sqsh + +# Use vllm_multimodal to save audio to files instead of base64 in jsonl +server_server_type: vllm_multimodal + +# Paths +data_dir: /lustre/fsw/portfolios/llmservice/users/vmendelev/experiments/voicebench_test/data_dir +output_dir: /lustre/fsw/portfolios/llmservice/users/vmendelev/experiments/voicebench_s2s_offline/runs/voicebench_no_sysprompt +voicebench_repo_path: /lustre/fsw/portfolios/llmservice/users/vmendelev/code/VoiceBench 
+ +# Subtests to evaluate (all except mtbench which is multi-turn) +subtests: + - bbh + - alpacaeval + - alpacaeval_full + - ifeval + - openbookqa + - advbench + - commoneval + - wildvoice + - mmsu + - sd_qa + - alpacaeval_speaker + +# Scoring settings +api_type: nvidia +nvidia_model: us/azure/openai/gpt-5-mini + +# Installation commands +installation_command: "" +scoring_installation_command: "pip install sacrebleu qa_metrics" + +# Experiment settings +expname: voicebench_s2s_offline_no_sysprompt +# max_samples: 10 # Uncomment to limit samples for testing + +# Run modes +generation_only: false +scoring_only: false +dry_run: false diff --git a/nemo_skills/dataset/voicebench/scripts/voicebench_s2s_offline_remaining.yaml b/nemo_skills/dataset/voicebench/scripts/voicebench_s2s_offline_remaining.yaml new file mode 100644 index 0000000000..530b1db252 --- /dev/null +++ b/nemo_skills/dataset/voicebench/scripts/voicebench_s2s_offline_remaining.yaml @@ -0,0 +1,63 @@ +# VoiceBench evaluation configuration for S2S offline backend - REMAINING SUBTESTS +# Run: python generate_from_api_and_score_official.py --config voicebench_s2s_offline_remaining.yaml + +# Cluster settings +cluster: oci_iad +partition: batch_block1,batch_block3,batch_block4 +cpu_partition: cpu # For scoring jobs (no GPU needed) + +# Model checkpoint - latest IAD nano9b model (Jan 2026) +# STT checkpoint in HuggingFace format +model: /lustre/fsw/portfolios/llmservice/users/apasad/projects/nemo_s2s_merged_dec/exp_SFT-nano9b/IAD_nano9b_parakeet600m_from_PT_32k_64gpu_5e-5_PT0.7_SFT0.05_QA0.02_TEXT0.1_loss0.5_MCQ0.03_ASR0.01_sysp0.03_NoiseProb0.5_SNR-30-60_asr_dtc2_dst15_loss_text5.0_bos10.0_eos5.0_pad1.0_eosplacementsfix_nospecaug_ei0.1_ot8_TN_all_data_v3.2_ir/checkpoints_hf_step-12556-last.ckpt + +server_type: vllm +server_gpus: 1 +num_chunks: 32 # Recommended for voicebench due to large test sets like MMSU + +# Use serve_unified with S2S offline backend +server_entrypoint: "-m 
nemo_skills.inference.server.serve_unified" +server_args: >- + --backend s2s + --config_path /lustre/fsw/portfolios/llmservice/users/kevinhu/projects/duplex-stt/config/inference/infer_nano_eartts_Voicebench_Jan22_2026.yaml + --speaker_reference /lustre/fsw/portfolios/convai/users/ecasanova/S2S-full-duplex/inference_references/Emma_S3_A1_SC7_singleturntarget_21_channel_1_audio_in.wav + --tts_ckpt_path /lustre/fsw/portfolios/convai/users/ecasanova/Checkpoints/Duplex_EARTTS/eartts_rvq_cont_task_nanov2_tts_pretraining_wordlist_duplex_data_new_branch_4nodes_duplex_eartts_2_delay_4rd_stage_fp32_wd_1500_et_eos_dp_eos_dup_step_10004.ckpt + --extra_decoding_seconds 20 + --dtype float32 + --code_path /lustre/fsw/portfolios/convai/users/kevinhu/S2S-Duplex-new-codebase/branches/NeMo-release_not_rebased + +# Container with NeMo and S2S support +server_container: /lustre/fsw/portfolios/convai/users/ecasanova/docker_images/nemo_duplex_november_eartts.sqsh + +# Use vllm_multimodal to save audio to files instead of base64 in jsonl +server_server_type: vllm_multimodal + +# Paths +data_dir: /lustre/fsw/portfolios/llmservice/users/vmendelev/experiments/voicebench_test/data_dir +output_dir: /lustre/fsw/portfolios/llmservice/users/vmendelev/experiments/voicebench_s2s_offline/runs/voicebench_remaining +voicebench_repo_path: /lustre/fsw/portfolios/llmservice/users/vmendelev/code/VoiceBench + +# REMAINING subtests only (the ones that didn't complete) +subtests: + - openbookqa + - advbench + - commoneval + - wildvoice + - mmsu + - sd_qa + - alpacaeval_speaker + +# Scoring settings +api_type: nvidia +nvidia_model: azure/openai/gpt-4o-mini + +# Installation commands +installation_command: "" +scoring_installation_command: "pip install sacrebleu qa_metrics" + +# Experiment settings +expname: voicebench_s2s_offline_remaining + +# Run modes +generation_only: false +scoring_only: false +dry_run: false diff --git a/nemo_skills/dataset/voicebench/scripts/voicebench_s2s_offline_sdqa_test.yaml 
b/nemo_skills/dataset/voicebench/scripts/voicebench_s2s_offline_sdqa_test.yaml new file mode 100644 index 0000000000..c8de23c2a8 --- /dev/null +++ b/nemo_skills/dataset/voicebench/scripts/voicebench_s2s_offline_sdqa_test.yaml @@ -0,0 +1,59 @@ +# VoiceBench evaluation configuration for S2S offline backend - SD_QA test only +# Based on: /lustre/fsw/portfolios/llmservice/users/kevinhu/s2s/NeMo/scripts/training/iad/s2s/sdv2_hf/conv/nano_9b/inf/infer_nano9b_s2s.sh +# Run: python generate_from_api_and_score_official.py --config voicebench_s2s_offline_sdqa_test.yaml + +# Cluster settings +cluster: oci_iad +partition: batch_block1,batch_block3,batch_block4 +cpu_partition: cpu # For scoring jobs (no GPU needed) + +# Model checkpoint - latest IAD nano9b model (Jan 2026) +# STT checkpoint in HuggingFace format +model: /lustre/fsw/portfolios/llmservice/users/apasad/projects/nemo_s2s_merged_dec/exp_SFT-nano9b/IAD_nano9b_parakeet600m_from_PT_32k_64gpu_5e-5_PT0.7_SFT0.05_QA0.02_TEXT0.1_loss0.5_MCQ0.03_ASR0.01_sysp0.03_NoiseProb0.5_SNR-30-60_asr_dtc2_dst15_loss_text5.0_bos10.0_eos5.0_pad1.0_eosplacementsfix_nospecaug_ei0.1_ot8_TN_all_data_v3.2_ir/checkpoints_hf_step-12556-last.ckpt + +server_type: vllm +server_gpus: 1 +num_chunks: 1 # Small test, single chunk is enough + +# Use serve_unified with S2S offline backend +server_entrypoint: "-m nemo_skills.inference.server.serve_unified" +server_args: >- + --backend s2s + --config_path /lustre/fsw/portfolios/llmservice/users/kevinhu/projects/duplex-stt/config/inference/infer_nano_eartts_Voicebench_Jan22_2026.yaml + --speaker_reference /lustre/fsw/portfolios/convai/users/ecasanova/S2S-full-duplex/inference_references/Emma_S3_A1_SC7_singleturntarget_21_channel_1_audio_in.wav + --tts_ckpt_path /lustre/fsw/portfolios/convai/users/ecasanova/Checkpoints/Duplex_EARTTS/eartts_rvq_cont_task_nanov2_tts_pretraining_wordlist_duplex_data_new_branch_4nodes_duplex_eartts_2_delay_4rd_stage_fp32_wd_1500_et_eos_dp_eos_dup_step_10004.ckpt + 
--extra_decoding_seconds 20 + --dtype float32 + --code_path /lustre/fsw/portfolios/convai/users/kevinhu/S2S-Duplex-new-codebase/branches/NeMo-release_not_rebased + +# Container with NeMo and S2S support +server_container: /lustre/fsw/portfolios/convai/users/ecasanova/docker_images/nemo_duplex_november_eartts.sqsh + +# Use vllm_multimodal to save audio to files instead of base64 in jsonl +server_server_type: vllm_multimodal + +# Paths +data_dir: /lustre/fsw/portfolios/llmservice/users/vmendelev/experiments/voicebench_test/data_dir +output_dir: /lustre/fsw/portfolios/llmservice/users/vmendelev/experiments/voicebench_s2s_offline/runs/voicebench_sdqa_test +voicebench_repo_path: /lustre/fsw/portfolios/llmservice/users/vmendelev/code/VoiceBench + +# Subtests to evaluate - SD_QA only for quick test +subtests: + - sd_qa + +# Scoring settings +api_type: nvidia +nvidia_model: us/azure/openai/gpt-5-mini + +# Installation commands +installation_command: "" +scoring_installation_command: "pip install sacrebleu qa_metrics" + +# Experiment settings +expname: voicebench_sdqa_test +max_samples: 10 # Limit to 10 samples for quick testing + +# Run modes +generation_only: true +scoring_only: false +dry_run: false diff --git a/nemo_skills/dataset/voicebench/scripts/voicebench_s2s_session_full_config.yaml b/nemo_skills/dataset/voicebench/scripts/voicebench_s2s_session_full_config.yaml new file mode 100644 index 0000000000..ff57296bb5 --- /dev/null +++ b/nemo_skills/dataset/voicebench/scripts/voicebench_s2s_session_full_config.yaml @@ -0,0 +1,69 @@ +# VoiceBench full evaluation configuration for S2S session backend +# Run: python generate_from_api_and_score_official.py --config voicebench_s2s_session_full_config.yaml + +# Cluster settings +cluster: oci_iad +partition: batch_block1,batch_block3,batch_block4 +cpu_partition: cpu # For scoring jobs (no GPU needed) + +# Model and server - using serve_unified with S2S session backend +# dec 12 +# model: 
/lustre/fsw/portfolios/convai/users/ecasanova/Checkpoints/Nemotron-VoiceChat-november/duplex-eartts-2mim_sw_et_eos_dp_eos_dup_fp32-stt-3-december_stt_edresson_model_R_digits_norm_eip_0.1_EA_model_step_9005 +# dec 4 +# model: /lustre/fsw/portfolios/convai/users/ecasanova/Checkpoints/Nemotron-VoiceChat-november/duplex-eartts-fp32-stt-3-december_stt_edresson_model_R_digits_norm_eip_0.1_EA_model_step_9005 +# Nov +model: /lustre/fsw/portfolios/convai/users/ecasanova/Checkpoints/Nemotron-VoiceChat-november/duplex-eartts-fp32-stt-22-november_stt_v2_fp32 +server_type: vllm +server_gpus: 1 +num_chunks: 1 + +# Use serve_unified with S2S session backend (no silence padding) +server_entrypoint: "-m nemo_skills.inference.server.serve_unified" +server_args: >- + --backend s2s_session + --config_path /lustre/fsw/portfolios/convai/users/ecasanova/S2S-Duplex-new-codebase/scripts/configs/inference/nanov2_demo_model_eartts_updated.yaml + --speaker_reference /lustre/fsw/portfolios/convai/users/ecasanova/S2S-full-duplex/inference_references/Emma_S3_A1_SC7_singleturntarget_21_channel_1_audio_in.wav + --ignore_system_prompt + --num_frames_per_inference 2 + --silence_padding_sec 0.0 + --output_frame_alignment + --session_artifacts_dir /lustre/fsw/portfolios/llmservice/users/vmendelev/tmp/voicebench_s2s_session_full/artifacts + --code_path /lustre/fsw/portfolios/convai/users/kevinhu/S2S-Duplex-new-codebase/branches/NeMo-release_not_rebased + +# Container with NeMo and S2S support +server_container: /lustre/fsw/portfolios/convai/users/ecasanova/docker_images/nemo_duplex_november_eartts.sqsh + +# Use vllm_multimodal to save audio to files instead of base64 in jsonl +server_server_type: vllm_multimodal + +# Paths +data_dir: /lustre/fsw/portfolios/llmservice/users/vmendelev/experiments/voicebench_test/data_dir +output_dir: /lustre/fsw/portfolios/llmservice/users/vmendelev/experiments/voicebench_demo/runs/voicebench +voicebench_repo_path: 
/lustre/fsw/portfolios/llmservice/users/vmendelev/code/VoiceBench + +# Subtests to evaluate +# - openbookqa +# - mmsu +# - alpacaeval +# - commoneval +# - sd_qa_usa +# - advbench +subtests: + - sd_qa_usa + +# Scoring settings +api_type: nvidia +nvidia_model: azure/openai/gpt-4o-mini + +# Installation commands +installation_command: "" +scoring_installation_command: "pip install sacrebleu qa_metrics" + +# Experiment settings +expname: voicebench +max_samples: 10 # Limits samples for testing; remove or comment out for a full run + +# Run modes +generation_only: false +scoring_only: false +dry_run: false diff --git a/nemo_skills/dataset/voicebench/scripts/voicebench_s2s_voicechat_offline_config.yaml b/nemo_skills/dataset/voicebench/scripts/voicebench_s2s_voicechat_offline_config.yaml new file mode 100644 index 0000000000..c6d6c19a45 --- /dev/null +++ b/nemo_skills/dataset/voicebench/scripts/voicebench_s2s_voicechat_offline_config.yaml @@ -0,0 +1,74 @@ +# VoiceBench evaluation configuration for NemotronVoiceChat offline inference (s2s_voicechat backend). +# +# This config keeps the same model/config artifacts as the standard S2S offline config, +# but uses the new `s2s_voicechat` backend which instantiates NemotronVoiceChat from a resolved +# OmegaConf YAML (mirroring `nemotron_voicechat_infer.py`) and can optionally write artifacts +# under `output_dir/`. +# +# Run: +# python nemo_skills/dataset/voicebench/scripts/generate_from_api_and_score_official.py \ +# --config nemo_skills/dataset/voicebench/scripts/voicebench_s2s_voicechat_offline_config.yaml + +# Cluster settings +cluster: s2s_eval_oci_iad +partition: batch_block1,batch_block3,batch_block4 +cpu_partition: cpu + +# Model checkpoint (HF-format ckpt). Passed as --model to serve_unified and used as pretrained_s2s_model override. 
+model: /lustre/fsw/portfolios/llmservice/users/apasad/projects/nemo_s2s_merged_dec/exp_SFT-nano9b/IAD_nano9b_parakeet600m_from_PT_32k_64gpu_5e-5_PT0.7_SFT0.05_QA0.02_TEXT0.1_loss0.5_MCQ0.03_ASR0.01_sysp0.03_NoiseProb0.5_SNR-30-60_asr_dtc2_dst15_loss_text5.0_bos10.0_eos5.0_pad1.0_eosplacementsfix_nospecaug_ei0.1_ot8_TN_all_data_v3.2_ir/checkpoints_hf_step-12556-last.ckpt + +server_type: vllm +server_gpus: 1 +num_chunks: 48 + +server_entrypoint: "-m nemo_skills.inference.server.serve_unified" +server_args: >- + --backend s2s_voicechat + --config_path /lustre/fsw/portfolios/llmservice/users/kevinhu/projects/duplex-stt/config/inference/infer_nano_eartts_Voicebench_Jan22_2026.yaml + --speaker_reference /lustre/fsw/portfolios/convai/users/ecasanova/S2S-full-duplex/inference_references/Emma_S3_A1_SC7_singleturntarget_21_channel_1_audio_in.wav + --tts_ckpt_path /lustre/fsw/portfolios/convai/users/ecasanova/Checkpoints/Duplex_EARTTS/eartts_rvq_cont_task_nanov2_tts_pretraining_wordlist_duplex_data_new_branch_4nodes_duplex_eartts_2_delay_4rd_stage_fp32_wd_1500_et_eos_dp_eos_dup_step_10004.ckpt + --extra_decoding_seconds 20 + --inference_pad_boost -0.5 + --inference_bos_boost 0.5 + --inference_eos_boost 0 + --ignore_system_prompt + --dtype float32 + --code_path /lustre/fsw/portfolios/convai/users/kevinhu/S2S-Duplex-new-codebase/branches/NeMo-release_not_rebased + --output_dir /lustre/fsw/portfolios/convai/users/mmkrtchyan/projects/speechLM/s2s/voicebench_s2s_voicechat_offline_20s + --save_artifacts + --decode_audio + --batch_size 2 + +server_container: /lustre/fsw/portfolios/convai/users/ecasanova/docker_images/nemo_duplex_november_eartts.sqsh +server_server_type: vllm_multimodal + +# Paths +data_dir: /lustre/fsw/portfolios/llmservice/users/vmendelev/experiments/voicebench_test/data_dir +output_dir: /lustre/fsw/portfolios/convai/users/mmkrtchyan/projects/speechLM/s2s/voicebench_s2s_voicechat_offline_20s +voicebench_repo_path: 
/lustre/fsw/portfolios/llmservice/users/vmendelev/code/VoiceBench + +subtests: + - bbh + - alpacaeval + - alpacaeval_full + - ifeval + - openbookqa + - advbench + - commoneval + - wildvoice + - mmsu + - sd_qa + - alpacaeval_speaker + +# Scoring settings +api_type: nvidia +nvidia_model: azure/openai/gpt-4o-mini + +installation_command: "" +scoring_installation_command: "pip install sacrebleu qa_metrics" + +expname: voicebench_s2s_voicechat_offline +generation_only: false +scoring_only: false +dry_run: false + diff --git a/nemo_skills/dataset/voicebench/scripts/voicebench_s2s_voicechat_offline_sound_config.yaml b/nemo_skills/dataset/voicebench/scripts/voicebench_s2s_voicechat_offline_sound_config.yaml new file mode 100644 index 0000000000..8089ddfb77 --- /dev/null +++ b/nemo_skills/dataset/voicebench/scripts/voicebench_s2s_voicechat_offline_sound_config.yaml @@ -0,0 +1,69 @@ +# Full VoiceBench offline evaluation using s2s_voicechat backend, with audio output enabled. +# +# This is a copy of `voicebench_s2s_voicechat_offline_config.yaml` but adds `--decode_audio` +# (matching the smoke config `voicebench_s2s_voicechat_sdqa_smoke10_sound.yaml`) and uses a new +# output directory so generation runs end-to-end. +# +# Run: +# python nemo_skills/dataset/voicebench/scripts/generate_from_api_and_score_official.py \ +# --config nemo_skills/dataset/voicebench/scripts/voicebench_s2s_voicechat_offline_sound_config.yaml + +# Cluster settings +cluster: oci_iad +partition: batch_block1,batch_block3,batch_block4 +cpu_partition: cpu + +# Model checkpoint (HF-format ckpt). Passed as --model to serve_unified and used as pretrained_s2s_model override. 
+model: /lustre/fsw/portfolios/llmservice/users/apasad/projects/nemo_s2s_merged_dec/exp_SFT-nano9b/IAD_nano9b_parakeet600m_from_PT_32k_64gpu_5e-5_PT0.7_SFT0.05_QA0.02_TEXT0.1_loss0.5_MCQ0.03_ASR0.01_sysp0.03_NoiseProb0.5_SNR-30-60_asr_dtc2_dst15_loss_text5.0_bos10.0_eos5.0_pad1.0_eosplacementsfix_nospecaug_ei0.1_ot8_TN_all_data_v3.2_ir/checkpoints_hf_step-12556-last.ckpt + +server_type: vllm +server_gpus: 1 +num_chunks: 48 + +server_entrypoint: "-m nemo_skills.inference.server.serve_unified" +server_args: >- + --backend s2s_voicechat + --batch_size 4 + --config_path /lustre/fsw/portfolios/llmservice/users/kevinhu/projects/duplex-stt/config/inference/infer_nano_eartts_Voicebench_Jan22_2026.yaml + --speaker_reference /lustre/fsw/portfolios/convai/users/ecasanova/S2S-full-duplex/inference_references/Emma_S3_A1_SC7_singleturntarget_21_channel_1_audio_in.wav + --tts_ckpt_path /lustre/fsw/portfolios/convai/users/ecasanova/Checkpoints/Duplex_EARTTS/eartts_rvq_cont_task_nanov2_tts_pretraining_wordlist_duplex_data_new_branch_4nodes_duplex_eartts_2_delay_4rd_stage_fp32_wd_1500_et_eos_dp_eos_dup_step_10004.ckpt + --extra_decoding_seconds 20 + --dtype float32 + --code_path /lustre/fsw/portfolios/convai/users/kevinhu/S2S-Duplex-new-codebase/branches/NeMo-release_not_rebased + --output_dir /lustre/fsw/portfolios/llmservice/users/vmendelev/experiments/voicebench_s2s_voicechat_offline/runs/voicebench_sound_20260205_131219 + --save_artifacts + --decode_audio + +server_container: /lustre/fsw/portfolios/convai/users/ecasanova/docker_images/nemo_duplex_november_eartts.sqsh +server_server_type: vllm_multimodal + +# Paths +data_dir: /lustre/fsw/portfolios/llmservice/users/vmendelev/experiments/voicebench_test/data_dir +output_dir: /lustre/fsw/portfolios/llmservice/users/vmendelev/experiments/voicebench_s2s_voicechat_offline/runs/voicebench_sound_20260205_131219 +voicebench_repo_path: /lustre/fsw/portfolios/llmservice/users/vmendelev/code/VoiceBench + +subtests: + - bbh + - alpacaeval + 
- alpacaeval_full + - ifeval + - openbookqa + - advbench + - commoneval + - wildvoice + - mmsu + - sd_qa + - alpacaeval_speaker + +# Scoring settings +api_type: nvidia +nvidia_model: azure/openai/gpt-4o-mini + +installation_command: "" +scoring_installation_command: "pip install sacrebleu qa_metrics" + +expname: voicebench_s2s_voicechat_offline_sound +generation_only: false +scoring_only: false +dry_run: false + diff --git a/nemo_skills/dataset/voicebench/scripts/voicebench_s2s_voicechat_offline_stt11460_sw_config.yaml b/nemo_skills/dataset/voicebench/scripts/voicebench_s2s_voicechat_offline_stt11460_sw_config.yaml new file mode 100644 index 0000000000..013f458fac --- /dev/null +++ b/nemo_skills/dataset/voicebench/scripts/voicebench_s2s_voicechat_offline_stt11460_sw_config.yaml @@ -0,0 +1,73 @@ +# VoiceBench evaluation configuration for NemotronVoiceChat offline inference (s2s_voicechat backend). +# STT ckpt: checkpoints_hf_11460 (fix_seq_mask variant) +# S2S ckpt: /lustre/fsw/portfolios/convai/users/ecasanova/Checkpoints/Nemotron-VoiceChat-november/duplex-eartts-2mim_sw_et_eos_dp_eos_dup_fp32_1delay_main_branch-stt-AS7.8_11460_old_branch_generated_fixed/ +# TTS ckpt: sliding window checkpoint (delay_1, step_24011) +# Ref: https://docs.google.com/document/d/1tLJlb5Fi8ECdoLwbnjQ2lM5JIj8OeVo_2PeoDzK9jQE/edit?tab=t.0#heading=h.d57shsy818cf +# +# Run: +# python nemo_skills/dataset/voicebench/scripts/generate_from_api_and_score_official.py \ +# --config nemo_skills/dataset/voicebench/scripts/voicebench_s2s_voicechat_offline_stt11460_sw_config.yaml + +# Cluster settings +cluster: s2s_eval_oci_iad +partition: batch_block1,batch_block3,batch_block4 +cpu_partition: cpu + +# Model checkpoint (HF-format ckpt). Passed as --model to serve_unified and used as pretrained_s2s_model override. 
+model: /lustre/fsw/portfolios/llmservice/users/apasad/projects/nemo_s2s_merged_dec/exp_SFT-nano9b/IAD_nano9b_parakeet600m_from_PT_32k_64gpu_5e-5_PT0.5_SFT0.15_QA0.02_TEXT0.1_loss0.5_MCQ0.03_prompt2_ASR0.01_fillerlong_offset2_sysp0.05_NoiseDefault_asr_dtc2_dst15_lossDefault_ei0.033_ot8_TN_all_data_v3.2_ir_fix_seq_mask/checkpoints_hf_11460 + +server_type: vllm +server_gpus: 1 +num_chunks: 32 + +server_entrypoint: "-m nemo_skills.inference.server.serve_unified" +server_args: >- + --backend s2s_voicechat + --config_path /lustre/fsw/portfolios/llmservice/users/kevinhu/projects/duplex-stt/config/inference/infer_nano_eartts_Voicebench_Jan22_2026.yaml + --speaker_reference /lustre/fsw/portfolios/convai/users/ecasanova/S2S-full-duplex/inference_references/Emma_S3_A1_SC7_singleturntarget_21_channel_1_audio_in.wav + --tts_ckpt_path /lustre/fsw/portfolios/convai/users/ecasanova/Checkpoints/Duplex_EARTTS/eartts_rvq_cont_task_nanov2_tts_pretraining_wordlist_duplex_data_new_branch_new_tok_4nodes_duplex_eartts_nemo_main_delay_1_step_24011-last.ckpt + --extra_decoding_seconds 20 + --inference_pad_boost -0.5 + --inference_bos_boost 0.5 + --ignore_system_prompt + --inference_eos_boost 0 + --dtype float32 + --code_path /lustre/fsw/portfolios/convai/users/kevinhu/S2S-Duplex-new-codebase/branches/NeMo-release_not_rebased + --output_dir /lustre/fsw/portfolios/convai/users/mmkrtchyan/projects/speechLM/s2s/voicebench_stt11460_sw_voicechat_offline + --save_artifacts + --decode_audio + --batch_size 2 + +server_container: /lustre/fsw/portfolios/convai/users/ecasanova/docker_images/nemo_duplex_november_eartts.sqsh +server_server_type: vllm_multimodal + +# Paths +data_dir: /lustre/fsw/portfolios/convai/users/mmkrtchyan/projects/speechLM/s2s/voicebench_data_dir +output_dir: /lustre/fsw/portfolios/convai/users/mmkrtchyan/projects/speechLM/s2s/voicebench_stt11460_sw_voicechat_offline_mcq +voicebench_repo_path: /lustre/fsw/portfolios/llmservice/users/vmendelev/code/VoiceBench + +subtests: + - mmsu 
+ - alpacaeval_full + - alpacaeval + - ifeval + - advbench + - commoneval + - wildvoice + - sd_qa + - alpacaeval_speaker + - openbookqa + - bbh + + +# Scoring settings +api_type: nvidia +nvidia_model: azure/openai/gpt-4o-mini + +installation_command: "" +scoring_installation_command: "pip install sacrebleu qa_metrics" + +expname: voicebench_stt11460_sw_s2s_voicechat_offline +generation_only: false +scoring_only: false +dry_run: false diff --git a/nemo_skills/dataset/voicebench/scripts/voicebench_s2s_voicechat_sdqa_smoke10.yaml b/nemo_skills/dataset/voicebench/scripts/voicebench_s2s_voicechat_sdqa_smoke10.yaml new file mode 100644 index 0000000000..20e4b0c1d6 --- /dev/null +++ b/nemo_skills/dataset/voicebench/scripts/voicebench_s2s_voicechat_sdqa_smoke10.yaml @@ -0,0 +1,49 @@ +# 10-sample smoke test for VoiceBench sd_qa using s2s_voicechat backend. +# +# Run: +# python nemo_skills/dataset/voicebench/scripts/generate_from_api_and_score_official.py \ +# --config nemo_skills/dataset/voicebench/scripts/voicebench_s2s_voicechat_sdqa_smoke10.yaml + +cluster: oci_iad +partition: batch_block1,batch_block3,batch_block4 +cpu_partition: cpu + +model: /lustre/fsw/portfolios/llmservice/users/apasad/projects/nemo_s2s_merged_dec/exp_SFT-nano9b/IAD_nano9b_parakeet600m_from_PT_32k_64gpu_5e-5_PT0.7_SFT0.05_QA0.02_TEXT0.1_loss0.5_MCQ0.03_ASR0.01_sysp0.03_NoiseProb0.5_SNR-30-60_asr_dtc2_dst15_loss_text5.0_bos10.0_eos5.0_pad1.0_eosplacementsfix_nospecaug_ei0.1_ot8_TN_all_data_v3.2_ir/checkpoints_hf_step-12556-last.ckpt + +server_type: vllm +server_gpus: 1 +num_chunks: 1 + +server_entrypoint: "-m nemo_skills.inference.server.serve_unified" +server_args: >- + --backend s2s_voicechat + --config_path /lustre/fsw/portfolios/llmservice/users/kevinhu/projects/duplex-stt/config/inference/infer_nano_eartts_Voicebench_Jan22_2026.yaml + --speaker_reference /lustre/fsw/portfolios/convai/users/ecasanova/S2S-full-duplex/inference_references/Emma_S3_A1_SC7_singleturntarget_21_channel_1_audio_in.wav + 
--tts_ckpt_path /lustre/fsw/portfolios/convai/users/ecasanova/Checkpoints/Duplex_EARTTS/eartts_rvq_cont_task_nanov2_tts_pretraining_wordlist_duplex_data_new_branch_4nodes_duplex_eartts_2_delay_4rd_stage_fp32_wd_1500_et_eos_dp_eos_dup_step_10004.ckpt + --extra_decoding_seconds 20 + --dtype float32 + --code_path /lustre/fsw/portfolios/convai/users/kevinhu/S2S-Duplex-new-codebase/branches/NeMo-release_not_rebased + --output_dir /lustre/fsw/portfolios/llmservice/users/vmendelev/experiments/voicebench_s2s_voicechat_smoke/runs/sdqa_smoke10 + --save_artifacts + +server_container: /lustre/fsw/portfolios/convai/users/ecasanova/docker_images/nemo_duplex_november_eartts.sqsh +server_server_type: vllm_multimodal + +data_dir: /lustre/fsw/portfolios/llmservice/users/vmendelev/experiments/voicebench_test/data_dir +output_dir: /lustre/fsw/portfolios/llmservice/users/vmendelev/experiments/voicebench_s2s_voicechat_smoke/runs/sdqa_smoke10 +voicebench_repo_path: /lustre/fsw/portfolios/llmservice/users/vmendelev/code/VoiceBench + +subtests: [sd_qa] +max_samples: 10 + +api_type: nvidia +nvidia_model: azure/openai/gpt-4o-mini + +installation_command: "" +scoring_installation_command: "pip install sacrebleu qa_metrics" + +expname: voicebench_s2s_voicechat_sdqa_smoke10 +generation_only: false +scoring_only: false +dry_run: false + diff --git a/nemo_skills/dataset/voicebench/scripts/voicebench_s2s_voicechat_sdqa_smoke10_sound.yaml b/nemo_skills/dataset/voicebench/scripts/voicebench_s2s_voicechat_sdqa_smoke10_sound.yaml new file mode 100644 index 0000000000..4888686e05 --- /dev/null +++ b/nemo_skills/dataset/voicebench/scripts/voicebench_s2s_voicechat_sdqa_smoke10_sound.yaml @@ -0,0 +1,50 @@ +# 10-sample smoke test for VoiceBench sd_qa using s2s_voicechat backend, with audio output enabled. 
+# +# Run: +# python nemo_skills/dataset/voicebench/scripts/generate_from_api_and_score_official.py \ +# --config nemo_skills/dataset/voicebench/scripts/voicebench_s2s_voicechat_sdqa_smoke10_sound.yaml + +cluster: oci_iad +partition: batch_block1,batch_block3,batch_block4 +cpu_partition: cpu + +model: /lustre/fsw/portfolios/llmservice/users/apasad/projects/nemo_s2s_merged_dec/exp_SFT-nano9b/IAD_nano9b_parakeet600m_from_PT_32k_64gpu_5e-5_PT0.7_SFT0.05_QA0.02_TEXT0.1_loss0.5_MCQ0.03_ASR0.01_sysp0.03_NoiseProb0.5_SNR-30-60_asr_dtc2_dst15_loss_text5.0_bos10.0_eos5.0_pad1.0_eosplacementsfix_nospecaug_ei0.1_ot8_TN_all_data_v3.2_ir/checkpoints_hf_step-12556-last.ckpt + +server_type: vllm +server_gpus: 1 +num_chunks: 1 + +server_entrypoint: "-m nemo_skills.inference.server.serve_unified" +server_args: >- + --backend s2s_voicechat + --config_path /lustre/fsw/portfolios/llmservice/users/kevinhu/projects/duplex-stt/config/inference/infer_nano_eartts_Voicebench_Jan22_2026.yaml + --speaker_reference /lustre/fsw/portfolios/convai/users/ecasanova/S2S-full-duplex/inference_references/Emma_S3_A1_SC7_singleturntarget_21_channel_1_audio_in.wav + --tts_ckpt_path /lustre/fsw/portfolios/convai/users/ecasanova/Checkpoints/Duplex_EARTTS/eartts_rvq_cont_task_nanov2_tts_pretraining_wordlist_duplex_data_new_branch_4nodes_duplex_eartts_2_delay_4rd_stage_fp32_wd_1500_et_eos_dp_eos_dup_step_10004.ckpt + --extra_decoding_seconds 20 + --dtype float32 + --code_path /lustre/fsw/portfolios/convai/users/kevinhu/S2S-Duplex-new-codebase/branches/NeMo-release_not_rebased + --output_dir /lustre/fsw/portfolios/llmservice/users/vmendelev/experiments/voicebench_s2s_voicechat_smoke/runs/sdqa_smoke10_sound + --save_artifacts + --decode_audio + +server_container: /lustre/fsw/portfolios/convai/users/ecasanova/docker_images/nemo_duplex_november_eartts.sqsh +server_server_type: vllm_multimodal + +data_dir: /lustre/fsw/portfolios/llmservice/users/vmendelev/experiments/voicebench_test/data_dir +output_dir: 
/lustre/fsw/portfolios/llmservice/users/vmendelev/experiments/voicebench_s2s_voicechat_smoke/runs/sdqa_smoke10_sound +voicebench_repo_path: /lustre/fsw/portfolios/llmservice/users/vmendelev/code/VoiceBench + +subtests: [sd_qa] +max_samples: 10 + +api_type: nvidia +nvidia_model: azure/openai/gpt-4o-mini + +installation_command: "" +scoring_installation_command: "pip install sacrebleu qa_metrics" + +expname: voicebench_s2s_voicechat_sdqa_smoke10_sound +generation_only: false +scoring_only: false +dry_run: false + diff --git a/nemo_skills/dataset/voicebench/scripts/voicebench_s2s_voicechat_sdqa_smoke10_sound_score_asr.yaml b/nemo_skills/dataset/voicebench/scripts/voicebench_s2s_voicechat_sdqa_smoke10_sound_score_asr.yaml new file mode 100644 index 0000000000..46933eff1e --- /dev/null +++ b/nemo_skills/dataset/voicebench/scripts/voicebench_s2s_voicechat_sdqa_smoke10_sound_score_asr.yaml @@ -0,0 +1,57 @@ +# 10-sample smoke test for VoiceBench sd_qa using s2s_voicechat backend, with audio output enabled, +# and VoiceBench scoring performed on the agent ASR transcript (output_asr.jsonl). 
+# +# Run: +# python nemo_skills/dataset/voicebench/scripts/generate_from_api_and_score_official.py \ +# --config nemo_skills/dataset/voicebench/scripts/voicebench_s2s_voicechat_sdqa_smoke10_sound_score_asr.yaml \ +# --output_dir /lustre/.../sdqa_smoke10_sound_ + +cluster: oci_iad +partition: batch_block1,batch_block3,batch_block4 +cpu_partition: cpu + +model: /lustre/fsw/portfolios/llmservice/users/apasad/projects/nemo_s2s_merged_dec/exp_SFT-nano9b/IAD_nano9b_parakeet600m_from_PT_32k_64gpu_5e-5_PT0.7_SFT0.05_QA0.02_TEXT0.1_loss0.5_MCQ0.03_ASR0.01_sysp0.03_NoiseProb0.5_SNR-30-60_asr_dtc2_dst15_loss_text5.0_bos10.0_eos5.0_pad1.0_eosplacementsfix_nospecaug_ei0.1_ot8_TN_all_data_v3.2_ir/checkpoints_hf_step-12556-last.ckpt + +server_type: vllm +server_gpus: 1 +num_chunks: 1 + +server_entrypoint: "-m nemo_skills.inference.server.serve_unified" +server_args: >- + --backend s2s_voicechat + --config_path /lustre/fsw/portfolios/llmservice/users/kevinhu/projects/duplex-stt/config/inference/infer_nano_eartts_Voicebench_Jan22_2026.yaml + --speaker_reference /lustre/fsw/portfolios/convai/users/ecasanova/S2S-full-duplex/inference_references/Emma_S3_A1_SC7_singleturntarget_21_channel_1_audio_in.wav + --tts_ckpt_path /lustre/fsw/portfolios/convai/users/ecasanova/Checkpoints/Duplex_EARTTS/eartts_rvq_cont_task_nanov2_tts_pretraining_wordlist_duplex_data_new_branch_4nodes_duplex_eartts_2_delay_4rd_stage_fp32_wd_1500_et_eos_dp_eos_dup_step_10004.ckpt + --extra_decoding_seconds 20 + --dtype float32 + --code_path /lustre/fsw/portfolios/convai/users/kevinhu/S2S-Duplex-new-codebase/branches/NeMo-release_not_rebased + --output_dir /lustre/fsw/portfolios/llmservice/users/vmendelev/experiments/voicebench_s2s_voicechat_smoke/runs/sdqa_smoke10_sound + --save_artifacts + --decode_audio + +server_container: /lustre/fsw/portfolios/convai/users/ecasanova/docker_images/nemo_duplex_november_eartts.sqsh +server_server_type: vllm_multimodal + +data_dir: 
/lustre/fsw/portfolios/llmservice/users/vmendelev/experiments/voicebench_test/data_dir +output_dir: /lustre/fsw/portfolios/llmservice/users/vmendelev/experiments/voicebench_s2s_voicechat_smoke/runs/sdqa_smoke10_sound +voicebench_repo_path: /lustre/fsw/portfolios/llmservice/users/vmendelev/code/VoiceBench + +subtests: [sd_qa] +max_samples: 10 + +# New pipeline knobs +voicebench_scoring_input: agent_asr # score output_asr.jsonl instead of output.jsonl +agent_audio_stage_enabled: true +agent_audio_force: true # recompute output_asr.jsonl/agent_audio_metrics.json if removed + +api_type: nvidia +nvidia_model: azure/openai/gpt-4o-mini + +installation_command: "" +scoring_installation_command: "pip install sacrebleu qa_metrics" + +expname: voicebench_s2s_voicechat_sdqa_smoke10_sound +generation_only: false +scoring_only: false +dry_run: false + diff --git a/nemo_skills/evaluation/evaluator/audio.py b/nemo_skills/evaluation/evaluator/audio.py index c212666311..f0bd6c952f 100644 --- a/nemo_skills/evaluation/evaluator/audio.py +++ b/nemo_skills/evaluation/evaluator/audio.py @@ -124,7 +124,7 @@ def evaluate_asr_pc(reference: str, hypothesis: str, normalize_standard_wer: boo def preprocess_asr_text(text: str) -> str: """Apply Whisper-style normalization: lowercase, normalize, remove brackets.""" - from whisper.normalizers import EnglishTextNormalizer + from whisper_normalizer.english import EnglishTextNormalizer text = text.lower() text = EnglishTextNormalizer()(text) @@ -134,20 +134,39 @@ def preprocess_asr_text(text: str) -> str: def preprocess_hf_leaderboard(text: str) -> str: - """Apply HuggingFace leaderboard normalization: lowercase, remove punctuation, normalize unicode.""" - import unicodedata + """Apply HuggingFace Open ASR Leaderboard normalization using Whisper's EnglishTextNormalizer. 
- text = unicodedata.normalize("NFC", text) - text = text.lower() - text = re.sub(r"[^\w\s]", "", text) + This matches the official HF leaderboard (github.com/huggingface/open_asr_leaderboard) + which uses whisper_normalizer.english.EnglishTextNormalizer for both references and hypotheses. + """ + from whisper_normalizer.english import EnglishTextNormalizer + + text = EnglishTextNormalizer()(text) text = re.sub(r"\s+", " ", text).strip() return text -def evaluate_asr(reference: str, hypothesis: str, apply_normalization: bool = True) -> dict[str, Any]: - """Evaluate ASR: computes WER with optional Whisper normalization.""" +def _wer_with_counts(ref: str, hyp: str) -> dict[str, Any]: + """Compute WER and return both the score and raw error/reference counts for corpus-level aggregation.""" import jiwer + wer_score = jiwer.wer(ref, hyp) + measures = jiwer.process_words(ref, hyp) + wer_errors = measures.substitutions + measures.deletions + measures.insertions + wer_ref_words = measures.substitutions + measures.deletions + measures.hits + + return { + "wer": wer_score, + "wer_errors": wer_errors, + "wer_ref_words": wer_ref_words, + "wer_substitutions": measures.substitutions, + "wer_insertions": measures.insertions, + "wer_deletions": measures.deletions, + } + + +def evaluate_asr(reference: str, hypothesis: str, apply_normalization: bool = True) -> dict[str, Any]: + """Evaluate ASR: computes WER with optional Whisper normalization.""" if apply_normalization: ref = preprocess_asr_text(reference) hyp = preprocess_asr_text(hypothesis) @@ -160,18 +179,13 @@ def evaluate_asr(reference: str, hypothesis: str, apply_normalization: bool = Tr if not hyp: hyp = "empty" - wer_score = jiwer.wer(ref, hyp) - - return { - "wer": wer_score, - "is_correct": wer_score < 0.5, - } + result = _wer_with_counts(ref, hyp) + result["is_correct"] = result["wer"] < 0.5 + return result def evaluate_asr_leaderboard(reference: str, hypothesis: str) -> dict[str, Any]: """Evaluate ASR with HuggingFace 
leaderboard preprocessing for direct comparison.""" - import jiwer - ref = preprocess_hf_leaderboard(reference) hyp = preprocess_hf_leaderboard(hypothesis) @@ -180,12 +194,9 @@ def evaluate_asr_leaderboard(reference: str, hypothesis: str) -> dict[str, Any]: if not hyp: hyp = "empty" - wer_score = jiwer.wer(ref, hyp) - - return { - "wer": wer_score, - "is_correct": wer_score < 0.5, - } + result = _wer_with_counts(ref, hyp) + result["is_correct"] = result["wer"] < 0.5 + return result def evaluate_translation(reference: str, hypothesis: str) -> dict[str, Any]: diff --git a/nemo_skills/evaluation/metrics/audio_metrics.py b/nemo_skills/evaluation/metrics/audio_metrics.py index 95a133833d..98e2b77cb8 100644 --- a/nemo_skills/evaluation/metrics/audio_metrics.py +++ b/nemo_skills/evaluation/metrics/audio_metrics.py @@ -66,6 +66,13 @@ def __init__(self, compute_no_answer: bool = True, max_k: int = 1): self.per_scores = [] self.bleu_scores = [] + # Corpus-level WER accumulators (total errors / total ref words) + self.wer_total_errors = 0 + self.wer_total_ref_words = 0 + self.wer_total_substitutions = 0 + self.wer_total_insertions = 0 + self.wer_total_deletions = 0 + # Extended metrics self.cer_scores = [] self.hallucination_scores = [] @@ -190,6 +197,12 @@ def update(self, predictions): for pred in predictions: if "wer" in pred and pred["wer"] is not None: self.wer_scores.append(pred["wer"]) + if "wer_errors" in pred and "wer_ref_words" in pred: + self.wer_total_errors += pred["wer_errors"] + self.wer_total_ref_words += pred["wer_ref_words"] + self.wer_total_substitutions += pred.get("wer_substitutions", 0) + self.wer_total_insertions += pred.get("wer_insertions", 0) + self.wer_total_deletions += pred.get("wer_deletions", 0) if "wer_c" in pred and pred["wer_c"] is not None: self.wer_c_scores.append(pred["wer_c"]) if "wer_pc" in pred and pred["wer_pc"] is not None: @@ -252,6 +265,14 @@ def get_metrics(self): # Add existing metrics: WER, PnC, and BLEU if available (convert to 
percentages and round to 2 decimals) if self.wer_scores: agg_metrics["wer"] = round(100.0 * sum(self.wer_scores) / len(self.wer_scores), 2) + if self.wer_total_ref_words > 0: + agg_metrics["corpus_wer"] = round( + 100.0 * self.wer_total_errors / self.wer_total_ref_words, 2 + ) + agg_metrics["corpus_substitutions"] = self.wer_total_substitutions + agg_metrics["corpus_insertions"] = self.wer_total_insertions + agg_metrics["corpus_deletions"] = self.wer_total_deletions + agg_metrics["corpus_ref_words"] = self.wer_total_ref_words if self.wer_c_scores: agg_metrics["wer_c"] = round(100.0 * sum(self.wer_c_scores) / len(self.wer_c_scores), 2) if self.wer_pc_scores: @@ -317,6 +338,12 @@ def metrics_to_print(self): # Add existing metrics if they were computed if self.wer_scores: base_metrics["wer"] = as_percentage + if self.wer_total_ref_words > 0: + base_metrics["corpus_wer"] = as_percentage + base_metrics["corpus_substitutions"] = as_int + base_metrics["corpus_insertions"] = as_int + base_metrics["corpus_deletions"] = as_int + base_metrics["corpus_ref_words"] = as_int if self.wer_c_scores: base_metrics["wer_c"] = as_percentage if self.wer_pc_scores: diff --git a/nemo_skills/evaluation/metrics/mmau_pro_metrics.py b/nemo_skills/evaluation/metrics/mmau_pro_metrics.py index f079049cc1..000dbcf13f 100644 --- a/nemo_skills/evaluation/metrics/mmau_pro_metrics.py +++ b/nemo_skills/evaluation/metrics/mmau_pro_metrics.py @@ -13,14 +13,52 @@ # limitations under the License. import logging +import re + +import numpy as np from nemo_skills.evaluation.metrics.base import BaseMetrics, as_int, as_percentage -from nemo_skills.evaluation.metrics.utils import is_correct_judgement from nemo_skills.utils import get_logger_name LOG = logging.getLogger(get_logger_name(__file__)) +def extract_multicriteria_scores(judgement_text: str) -> dict[str, float]: + """Extract multi-criteria scores (1-5 scale) from LLM judge evaluation. 
+ + Expected format: + CORRECTNESS: [score] - [justification] + RELEVANCE: [score] - [justification] + COMPLETENESS: [score] - [justification] + CLARITY: [score] - [justification] + OVERALL: [score] - [overall assessment] + + Returns: + Dictionary with keys: correctness, relevance, completeness, clarity, overall + Defaults to 3.0 if score not found. + """ + scores = {} + + patterns = { + "correctness": r"CORRECTNESS:\s*(\d+(?:\.\d+)?)", + "relevance": r"RELEVANCE:\s*(\d+(?:\.\d+)?)", + "completeness": r"COMPLETENESS:\s*(\d+(?:\.\d+)?)", + "clarity": r"CLARITY:\s*(\d+(?:\.\d+)?)", + "overall": r"OVERALL:\s*(\d+(?:\.\d+)?)", + } + + for criterion, pattern in patterns.items(): + match = re.search(pattern, judgement_text, re.IGNORECASE) + scores[criterion] = float(match.group(1)) if match else 3.0 + + # Fallback: compute overall if missing or still 3.0 + if "overall" not in scores or scores["overall"] == 3.0: + criteria_scores = [scores.get(k, 3.0) for k in ["correctness", "relevance", "completeness", "clarity"]] + scores["overall"] = sum(criteria_scores) / len(criteria_scores) + + return scores + + class MMAUProMetrics(BaseMetrics): """Metrics class for MMAU-Pro benchmark (all subgroups).""" @@ -28,16 +66,24 @@ def __init__(self, compute_no_answer: bool = True, max_k: int = 1): super().__init__(compute_no_answer=compute_no_answer) self.max_k = max_k + # Track multi-criteria scores for open-ended questions (1-5 scale) + self.multicriteria_scores = { + "correctness": [], + "relevance": [], + "completeness": [], + "clarity": [], + "overall": [], + } + def _get_score_dict(self, prediction: dict) -> dict[str, bool | int | float]: """Extract correctness scores from prediction.""" score_dict = {} - # Open-ended: extract from judge result + # Open-ended: use LLM judge correctness score >= 3 as correct if "judgement" in prediction: - judge_result = is_correct_judgement(prediction["judgement"]) - score_dict["judge_correct"] = judge_result - score_dict["correct"] = judge_result 
-        # Closed-form and instruction following: use is_correct
+            multicriteria = extract_multicriteria_scores(prediction["judgement"])
+            score_dict["correct"] = multicriteria.get("correctness", 3.0) >= 3.0
+        # Closed-form / instruction-following: use binary correctness
         elif "is_correct" in prediction:
             score_dict["correct"] = prediction["is_correct"]
         else:
@@ -58,24 +104,61 @@ def get_incorrect_sample(self, prediction: dict) -> dict:
     def update(self, predictions):
         """Update metrics with new predictions."""
         super().update(predictions)
+
-        predicted_answers = [pred.get("generation", None).strip() or None for pred in predictions]
+        predicted_answers = [(pred.get("generation") or "").strip() or None for pred in predictions]
         self._compute_pass_at_k(predictions=predictions, predicted_answers=predicted_answers)
         self._compute_majority_at_k(predictions=predictions, predicted_answers=predicted_answers)
+
+        # Collect multi-criteria scores for open-ended questions
+        for pred in predictions:
+            if "judgement" in pred:
+                multicriteria = extract_multicriteria_scores(pred["judgement"])
+                for criterion in self.multicriteria_scores:
+                    self.multicriteria_scores[criterion].append(multicriteria.get(criterion, 3.0))
+
     def get_metrics(self):
         """Get computed metrics."""
         metrics_dict = super().get_metrics()
+
         for agg_mode, agg_metrics in metrics_dict.items():
-            # Ensure avg_tokens is always present for MMAU-Pro
+            # Ensure avg_tokens is present
             if "avg_tokens" not in agg_metrics:
                 agg_metrics["avg_tokens"] = 0
             if "no_answer" in agg_metrics:
                 agg_metrics["no_answer"] = agg_metrics["no_answer"] / 2.0
-            # Set success_rate from correct or judge_correct
-            if "judge_correct" in agg_metrics:
-                agg_metrics["success_rate"] = agg_metrics["judge_correct"]
+
+            # Add multi-criteria averages for open-ended (convert 1-5 scale to percentage)
+            if self.multicriteria_scores["overall"]:
+                for criterion in self.multicriteria_scores:
+                    scores = self.multicriteria_scores[criterion]
+                    if scores:
+                        # Convert 1-5 scale to 0-100 percentage scale
+                        avg_score = np.mean(scores)
+                        std_score = np.std(scores)
+                        
agg_metrics[f"avg_{criterion}"] = (avg_score / 5.0) * 100 + agg_metrics[f"std_{criterion}"] = (std_score / 5.0) * 100 + + # Set correct and success_rate to avg_correctness for open-ended + agg_metrics["correct"] = agg_metrics["avg_correctness"] + agg_metrics["success_rate"] = agg_metrics["avg_correctness"] + + # Calculate good/poor response rates based on overall >= 4 or <= 2 + overall_scores = self.multicriteria_scores["overall"] + good_responses = sum(1 for score in overall_scores if score >= 4.0) + poor_responses = sum(1 for score in overall_scores if score <= 2.0) + + agg_metrics["good_response_rate"] = (good_responses / len(overall_scores)) * 100 + agg_metrics["poor_response_rate"] = (poor_responses / len(overall_scores)) * 100 + + # For closed-form / instruction-following: use binary correctness elif "correct" in agg_metrics: agg_metrics["success_rate"] = agg_metrics["correct"] + + # Round all numeric values to 2 decimal places + for key, value in agg_metrics.items(): + if isinstance(value, float) and not isinstance(value, bool): + agg_metrics[key] = round(value, 2) + return metrics_dict def metrics_to_print(self): @@ -87,5 +170,20 @@ def metrics_to_print(self): } if self.compute_no_answer: base_metrics["no_answer"] = as_percentage + + # Add multi-criteria metrics for open-ended questions (now in percentage format) + if self.multicriteria_scores["overall"]: + base_metrics.update( + { + "avg_overall": as_percentage, + "avg_correctness": as_percentage, + "avg_relevance": as_percentage, + "avg_completeness": as_percentage, + "avg_clarity": as_percentage, + "good_response_rate": as_percentage, + "poor_response_rate": as_percentage, + } + ) + base_metrics["num_entries"] = as_int return base_metrics diff --git a/nemo_skills/inference/generate.py b/nemo_skills/inference/generate.py index 136375db46..e368aaee4d 100644 --- a/nemo_skills/inference/generate.py +++ b/nemo_skills/inference/generate.py @@ -399,8 +399,19 @@ def setup_prompt(self): def setup_llm(self): 
self.sandbox = get_sandbox(**self.cfg.sandbox) if self.cfg.sandbox is not None else None + self.data_dir = None + if "data_dir" in self.cfg.eval_config and not isinstance(self.cfg.eval_config.get("data_dir"), type(None)): + self.data_dir = self.cfg.eval_config["data_dir"] + + output_dir = str(Path(self.cfg.output_file).parent) if self.cfg.code_execution: - llm = get_code_execution_model(**self.cfg.server, tokenizer=self.tokenizer, sandbox=self.sandbox) + llm = get_code_execution_model( + **self.cfg.server, + tokenizer=self.tokenizer, + sandbox=self.sandbox, + data_dir=self.data_dir or "", + output_dir=output_dir, + ) elif self.cfg.tool_modules is not None: llm = get_tool_calling_model( **self.cfg.server, @@ -409,9 +420,13 @@ def setup_llm(self): schema_overrides=self.cfg.schema_overrides, tokenizer=self.tokenizer, additional_config={"sandbox": self.cfg.sandbox}, + data_dir=self.data_dir or "", + output_dir=output_dir, ) else: - llm = get_model(**self.cfg.server, tokenizer=self.tokenizer) + llm = get_model( + **self.cfg.server, tokenizer=self.tokenizer, data_dir=self.data_dir or "", output_dir=output_dir + ) if self.cfg.parallel_thinking.mode is not None: # We don't want to override these key variables which overlap with self.cfg @@ -545,6 +560,20 @@ def dump_outputs(self, outputs, data_points, fout): for output in outputs: fout.write(json.dumps(output) + "\n") + def drop_binary_data(self, output): + """Remove binary data (like base64 audio) from messages to keep output files smaller.""" + for message in output["messages"]: + # Skip if content is not a list (e.g., string content in system messages) + if not isinstance(message.get("content"), list): + continue + + # Filter out audio_url and input_audio items from list-style content + message["content"] = [ + content + for content in message["content"] + if content.get("type") not in ("audio_url", "input_audio") + ] + async def postprocess_single_output(self, output, original_data_point): # to make it easier to follow 
up with other generations and limit accidental errors, we are adding # all of the original data to the output file alongside the new generations @@ -560,6 +589,9 @@ async def postprocess_single_output(self, output, original_data_point): for key in output: original_data_point.pop(key, None) output.update(original_data_point) + + self.drop_binary_data(output) + if self.cfg.parse_reasoning: parse_reasoning( output, diff --git a/nemo_skills/inference/model/__init__.py b/nemo_skills/inference/model/__init__.py index 164d92fcc8..595d8fd3ee 100644 --- a/nemo_skills/inference/model/__init__.py +++ b/nemo_skills/inference/model/__init__.py @@ -39,6 +39,7 @@ # Utilities from .vllm import VLLMModel +from .vllm_multimodal import VLLMMultimodalModel # Model implementations @@ -51,6 +52,7 @@ "azureopenai": AzureOpenAIModel, "gemini": GeminiModel, "vllm": VLLMModel, + "vllm_multimodal": VLLMMultimodalModel, "sglang": SGLangModel, "tts_nim": TTSNIMModel, "asr_nim": ASRNIMModel, diff --git a/nemo_skills/inference/model/base.py b/nemo_skills/inference/model/base.py index 9318bfb475..117096b4c7 100644 --- a/nemo_skills/inference/model/base.py +++ b/nemo_skills/inference/model/base.py @@ -75,9 +75,14 @@ def __init__( enable_soft_fail: bool = False, context_limit_retry_strategy: str | None = None, num_special_tokens_budget: int = 100, + # Directory paths for data and output + data_dir: str = "", + output_dir: str | None = None, ): self._tunnel = None self.model_name_or_path = model + self.data_dir = data_dir + self.output_dir = output_dir self.server_host = host self.server_port = port self.ssh_server = ssh_server diff --git a/nemo_skills/inference/model/vllm.py b/nemo_skills/inference/model/vllm.py index e9a2146520..ab22d095e6 100644 --- a/nemo_skills/inference/model/vllm.py +++ b/nemo_skills/inference/model/vllm.py @@ -12,7 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+import base64 import logging +import os import requests @@ -24,10 +26,14 @@ LOG = logging.getLogger(get_logger_name(__file__)) -class VLLMModel(BaseModel): - def __init__(self, **kwargs): - super().__init__(**kwargs) +def audio_file_to_base64(audio_file_path: str): + """Encodes an audio file into a base64 string.""" + with open(audio_file_path, "rb") as audio_file: + audio_content = audio_file.read() + return base64.b64encode(audio_content).decode("utf-8") + +class VLLMModel(BaseModel): def _get_tokenizer_endpoint(self): """ Returns the tokenizer endpoint if available, otherwise returns None. @@ -99,6 +105,35 @@ def _build_completion_request_params( "extra_body": self._build_request_body(top_k, min_p, repetition_penalty, extra_body=extra_body), } + def content_text_to_list(self, message): + if "audio" in message or "audios" in message: + content = message["content"] + if isinstance(content, str): + message["content"] = [{"type": "text", "text": content}] if content else [] + elif isinstance(content, list): + message["content"] = content + else: + raise TypeError(str(content)) + + if "audio" in message: + audio = message.pop("audio") # Remove the original audio key + audio_path = os.path.join(self.data_dir, audio["path"]) + base64_audio = audio_file_to_base64(audio_path) + # Detect format from file extension + audio_format = os.path.splitext(audio_path)[1].lstrip('.').lower() or "wav" + # OpenAI input_audio format + audio_message = {"type": "input_audio", "input_audio": {"data": base64_audio, "format": audio_format}} + message["content"].append(audio_message) + elif "audios" in message: + audios = message.pop("audios") # Remove the original audios key + for audio in audios: + audio_path = os.path.join(self.data_dir, audio["path"]) + base64_audio = audio_file_to_base64(audio_path) + audio_format = os.path.splitext(audio_path)[1].lstrip('.').lower() or "wav" + audio_message = {"type": "input_audio", "input_audio": {"data": base64_audio, "format": audio_format}} + 
message["content"].append(audio_message) + return message + def _build_chat_request_params( self, messages: list[dict], @@ -117,6 +152,7 @@ def _build_chat_request_params( tools: list[dict] | None = None, extra_body: dict = None, ) -> dict: + messages = [self.content_text_to_list(message) for message in messages] request = { "messages": messages, "max_tokens": tokens_to_generate, diff --git a/nemo_skills/inference/model/vllm_multimodal.py b/nemo_skills/inference/model/vllm_multimodal.py new file mode 100644 index 0000000000..0569c9efd9 --- /dev/null +++ b/nemo_skills/inference/model/vllm_multimodal.py @@ -0,0 +1,110 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import base64 +import json +import logging +import os +import re + +from nemo_skills.utils import get_logger_name + +from .vllm import VLLMModel + +LOG = logging.getLogger(get_logger_name(__file__)) + +# Pattern to extract debug_info from content +DEBUG_INFO_PATTERN = re.compile(r"\n?(.*?)", re.DOTALL) + + +class VLLMMultimodalModel(VLLMModel): + """VLLMModel with support for saving audio responses to disk. + + When the server returns audio in the response, this model will: + 1. Save the audio bytes to a file in output_dir/audio/ + 2. 
Replace the base64 data with the file path in the result + """ + + def __init__(self, **kwargs): + super().__init__(**kwargs) + self.output_audio_dir = None + if self.output_dir: + self.output_audio_dir = os.path.join(self.output_dir, "audio") + os.makedirs(self.output_audio_dir, exist_ok=True) + LOG.info(f"Audio responses will be saved to: {self.output_audio_dir}") + + def _parse_chat_completion_response(self, response, include_response: bool = False, **kwargs) -> dict: + """Parse chat completion response and save any audio to disk.""" + result = super()._parse_chat_completion_response(response, include_response=include_response, **kwargs) + + # Extract debug_info from content (embedded as JSON in tags) + if "generation" in result and result["generation"]: + match = DEBUG_INFO_PATTERN.search(result["generation"]) + if match: + try: + result["debug_info"] = json.loads(match.group(1)) + # Strip debug_info from generation + result["generation"] = DEBUG_INFO_PATTERN.sub("", result["generation"]) + except json.JSONDecodeError: + LOG.warning("Failed to parse debug_info JSON from content") + + choice = response.choices[0] + if hasattr(choice.message, "audio") and choice.message.audio: + audio_result = self._process_audio_response(choice.message.audio, response.id) + result["audio"] = audio_result + + # Strip audio data from serialized_output to avoid duplication + if "serialized_output" in result: + for item in result["serialized_output"]: + if isinstance(item, dict) and "audio" in item: + # Keep only metadata, remove base64 data + if isinstance(item["audio"], dict) and "data" in item["audio"]: + del item["audio"]["data"] + # Also strip debug_info from serialized content + if isinstance(item, dict) and "content" in item and item["content"]: + item["content"] = DEBUG_INFO_PATTERN.sub("", item["content"]) + + return result + + def _process_audio_response(self, audio_data, response_id: str) -> dict: + """Process audio data: save to file and return metadata with path.""" + 
audio_info = {
+            "format": getattr(audio_data, "format", "wav"),
+            "sample_rate": getattr(audio_data, "sample_rate", 22050),
+            "transcript": getattr(audio_data, "transcript", None),
+        }
+
+        audio_base64 = getattr(audio_data, "data", None)
+        if not audio_base64:
+            return audio_info
+
+        if self.output_audio_dir:
+            try:
+                audio_bytes = base64.b64decode(audio_base64)
+                filename = f"{response_id}.{audio_info['format']}"
+                filepath = os.path.join(self.output_audio_dir, filename)
+
+                with open(filepath, "wb") as f:
+                    f.write(audio_bytes)
+
+                audio_info["path"] = filepath
+                audio_info["size_bytes"] = len(audio_bytes)
+                LOG.info(f"Saved audio: {filepath} ({len(audio_bytes)} bytes)")
+            except Exception as e:
+                LOG.warning(f"Failed to save audio: {e}")
+                audio_info["data"] = audio_base64
+        else:
+            audio_info["data"] = audio_base64
+
+        return audio_info
diff --git a/nemo_skills/inference/server/serve_unified.py b/nemo_skills/inference/server/serve_unified.py
new file mode 100644
index 0000000000..ddec36469e
--- /dev/null
+++ b/nemo_skills/inference/server/serve_unified.py
@@ -0,0 +1,780 @@
+#!/usr/bin/env python3
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+CLI wrapper for the Unified NeMo Inference Server.
+
+This module provides a command-line interface compatible with nemo-skills
+server deployment patterns. It translates standard vllm-style CLI arguments
+to the unified server configuration.
+ +Usage via NeMo-Skills: + + # SALM backend (speech-augmented language model) + ns eval \\ + --server_type vllm \\ + --server_gpus 1 \\ + --model /path/to/model \\ + --server_entrypoint "-m nemo_skills.inference.server.serve_unified" \\ + --server_args "--backend salm" + + # TTS backend (text-to-speech) + ns eval \\ + --server_type vllm \\ + --server_gpus 1 \\ + --model /path/to/tts_model \\ + --server_entrypoint "-m nemo_skills.inference.server.serve_unified" \\ + --server_args "--backend tts --codec_model /path/to/codec" + + # S2S backend (speech-to-speech) + ns eval \\ + --server_type vllm \\ + --server_gpus 1 \\ + --model /path/to/s2s_model \\ + --server_entrypoint "-m nemo_skills.inference.server.serve_unified" \\ + --server_args "--backend s2s" + +Environment Variables: + UNIFIED_SERVER_HOST: Server host (default: 0.0.0.0) + UNIFIED_SERVER_PORT: Server port (default: 8000) + UNIFIED_SERVER_BACKEND: Backend type (default: salm) + UNIFIED_SERVER_MODEL_PATH: Path to model + UNIFIED_SERVER_CODEC_MODEL_PATH: Path to codec model + UNIFIED_SERVER_BATCH_SIZE: Batch size (default: 8) + UNIFIED_SERVER_BATCH_TIMEOUT: Batch timeout (default: 0.1) + DEBUG: Enable debug mode +""" + +import argparse +import inspect +import os +import shutil +import sys +from typing import Optional + + +def setup_pythonpath(code_path: Optional[str] = None): + """Set up PYTHONPATH for NeMo and the unified server. 
+ + Args: + code_path: Single path or colon-separated paths to add to PYTHONPATH + """ + paths_to_add = [] + + # Add explicit code path(s) if provided (supports colon-separated paths) + if code_path: + for path in code_path.split(":"): + if path and path not in paths_to_add: + paths_to_add.append(path) + + # Add recipes path for unified server imports + # Look for the recipes directory relative to this file + this_dir = os.path.dirname(os.path.abspath(__file__)) + + # Try to find ns_eval root (go up from nemo_skills/inference/server/) + ns_eval_root = os.path.dirname(os.path.dirname(os.path.dirname(this_dir))) + if os.path.exists(os.path.join(ns_eval_root, "recipes")): + paths_to_add.append(ns_eval_root) + + # Also check /nemo_run/code pattern used in containers + if os.path.exists("/nemo_run/code"): + paths_to_add.append("/nemo_run/code") + + # Update PYTHONPATH + current_path = os.environ.get("PYTHONPATH", "") + for path in paths_to_add: + if path not in current_path.split(":"): + current_path = f"{path}:{current_path}" if current_path else path + + os.environ["PYTHONPATH"] = current_path + + # Also add to sys.path for immediate imports + for path in paths_to_add: + if path not in sys.path: + sys.path.insert(0, path) + + +def apply_safetensors_patch(hack_path: Optional[str]): + """Apply safetensors patch if provided (for some NeMo models).""" + if not hack_path or not os.path.exists(hack_path): + return + + try: + import safetensors.torch as st_torch + + dest_path = inspect.getfile(st_torch) + os.makedirs(os.path.dirname(dest_path), exist_ok=True) + shutil.copyfile(hack_path, dest_path) + print(f"[serve_unified] Applied safetensors patch: {hack_path} -> {dest_path}") + except Exception as e: + print(f"[serve_unified] Warning: Failed to apply safetensors patch: {e}") + + +def main(): + parser = argparse.ArgumentParser( + description="Unified NeMo Inference Server CLI wrapper", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=__doc__, + ) + + # 
Standard vllm-style arguments (for nemo-skills compatibility) + parser.add_argument("--model", required=True, help="Path to the model") + parser.add_argument("--num_gpus", type=int, default=1, help="Number of GPUs to use") + parser.add_argument("--port", type=int, default=8000, help="Server port") + + # Backend selection + parser.add_argument( + "--backend", + default="salm", + choices=["salm", "tts", "s2s", "s2s_voicechat", "s2s_incremental", "s2s_incremental_v2", "s2s_session"], + help="Backend type: salm (speech-augmented LM), tts (text-to-speech), s2s (speech-to-speech offline), s2s_voicechat (NemotronVoiceChat offline, YAML-driven), s2s_incremental (frame-by-frame processing), s2s_incremental_v2 (V2 frame-by-frame with NeMo wrapper, vLLM, caches), s2s_session (session-aware multi-turn)", + ) + + # Backend-specific model paths + parser.add_argument("--codec_model", default=None, help="Path to codec model (required for TTS, optional for S2S)") + + # Server configuration + parser.add_argument("--host", default="0.0.0.0", help="Server host") + parser.add_argument("--batch_size", type=int, default=8, help="Maximum batch size") + parser.add_argument( + "--batch_timeout", type=float, default=0.1, help="Batch timeout in seconds (0 for no batching delay)" + ) + + # Generation defaults + parser.add_argument("--max_new_tokens", type=int, default=512, help="Max tokens to generate") + parser.add_argument("--temperature", type=float, default=1.0, help="Generation temperature") + parser.add_argument("--top_p", type=float, default=1.0, help="Top-p sampling") + + # Model configuration + parser.add_argument("--device", default="cuda", help="Device to use") + parser.add_argument("--dtype", default="bfloat16", help="Model dtype") + + # Backend-specific options + parser.add_argument("--prompt_format", default=None, help="Prompt format (SALM backend)") + parser.add_argument( + "--phoneme_input_type", default="predicted", help="Phoneme input type: predicted or gt (TTS backend)" + ) 
+ parser.add_argument( + "--decoder_only_model", action="store_true", help="Use decoder-only model architecture (TTS backend)" + ) + parser.add_argument("--use_local_transformer", action="store_true", help="Use local transformer (TTS backend)") + parser.add_argument("--top_k", type=int, default=None, help="Top-k sampling (TTS backend)") + + # Environment setup + parser.add_argument("--code_path", default=None, help="Path to NeMo source code to add to PYTHONPATH") + parser.add_argument("--hack_path", default=None, help="Path to safetensors/torch.py patch file") + + # S2S backend options + parser.add_argument( + "--ignore_system_prompt", + action="store_true", + help="Ignore system prompts from requests (for models that don't support them)", + ) + parser.add_argument( + "--silence_padding_sec", + type=float, + default=5.0, + help="Seconds of silence to append after audio (legacy, prefer --extra_decoding_seconds)", + ) + parser.add_argument( + "--extra_decoding_seconds", + type=float, + default=0.0, + help="Extra decoding time in seconds (0 for FDB, 20 for Voicebench)", + ) + parser.add_argument( + "--tts_ckpt_path", + default=None, + help="Path to TTS checkpoint (s2s offline backend)", + ) + parser.add_argument( + "--inference_pad_boost", + type=float, + default=0.0, + help="Boost for PAD token logits during inference", + ) + parser.add_argument( + "--inference_bos_boost", + type=float, + default=0.0, + help="Boost for BOS token logits during inference", + ) + parser.add_argument( + "--inference_eos_boost", + type=float, + default=0.0, + help="Boost for EOS token logits during inference", + ) + parser.add_argument( + "--inference_user_pad_boost", + type=float, + default=None, + help="Boost for ASR PAD token logits during inference (requires nemotron_h.py patch for vLLM)", + ) + parser.add_argument( + "--inference_user_bos_boost", + type=float, + default=None, + help="Boost for ASR BOS token logits during inference (requires nemotron_h.py patch for vLLM)", + ) + 
parser.add_argument( + "--inference_user_eos_boost", + type=float, + default=None, + help="Boost for ASR EOS token logits during inference (requires nemotron_h.py patch for vLLM)", + ) + + # s2s_voicechat (nemotron_voicechat_infer-like) options + parser.add_argument( + "--decode_audio", + action="store_true", + help="Enable audio decoding/output (s2s_voicechat backend; default: text-only)", + ) + parser.add_argument( + "--output_dir", + default=None, + help="Base output directory for artifacts (s2s_voicechat backend)", + ) + parser.add_argument( + "--save_artifacts", + action="store_true", + help="Save per-request artifacts under output_dir (s2s_voicechat backend)", + ) + parser.add_argument( + "--trim_leading_silence", + action="store_true", + help="Trim leading silence from response audio to reduce FDB-reported latency (s2s_voicechat)", + ) + parser.add_argument( + "--trim_leading_silence_padding_sec", + type=float, + default=0.01, + help="Seconds to keep before first speech when trimming (default: 0.01)", + ) + parser.add_argument( + "--merge_user_channel", + action="store_true", + help="Merge user (input) + model (pred) into two-channel WAV like NeMo ResultsLogger (for FDB)", + ) + + # S2S Incremental backend options + parser.add_argument( + "--config_path", + default=None, + help="Path to YAML config file (s2s_incremental backend)", + ) + parser.add_argument( + "--llm_checkpoint_path", + default=None, + help="Path to LLM checkpoint (s2s_incremental backend)", + ) + parser.add_argument( + "--tts_checkpoint_path", + default=None, + help="Path to TTS checkpoint (s2s_incremental backend)", + ) + parser.add_argument( + "--speaker_reference", + default=None, + help="Path to speaker reference audio for TTS (s2s_incremental backend)", + ) + parser.add_argument( + "--num_frames_per_inference", + type=int, + default=1, + help="Frames per inference step (s2s_incremental backend)", + ) + parser.add_argument( + "--no_decode_audio", + action="store_true", + help="Disable 
audio output (s2s_incremental backend)", + ) + parser.add_argument( + "--response_end_detection_mode", + type=str, + default="audio_energy", + choices=["audio_energy", "eos"], + help="Response end detection mode: audio_energy (TTS silence) or eos (consecutive PAD tokens)", + ) + parser.add_argument( + "--eos_detection_window", + type=int, + default=10, + help="Number of consecutive PAD tokens to detect end of response (used when mode=eos)", + ) + + # S2S Incremental V2 backend options + parser.add_argument( + "--engine_type", + type=str, + default="native", + choices=["native", "vllm_llm", "vllm_eartts", "vllm_llm_vllm_eartts"], + help="Inference engine type (s2s_incremental_v2 backend)", + ) + parser.add_argument( + "--use_perception_cache", + action="store_true", + help="Enable cache-aware streaming for perception encoder (s2s_incremental_v2)", + ) + parser.add_argument( + "--use_perception_cudagraph", + action="store_true", + help="Enable CUDA graph-accelerated perception encoder (s2s_incremental_v2)", + ) + parser.add_argument( + "--use_codec_cache", + action="store_true", + help="Incremental codec decode to remove clicking (s2s_incremental_v2)", + ) + parser.add_argument( + "--buffer_size_frames", + type=int, + default=None, + help="Number of frames in audio buffer (s2s_incremental_v2, default: 20 w/ perception cache, 71 without)", + ) + parser.add_argument( + "--codec_token_history_size", + type=int, + default=60, + help="Sliding-window buffer size; ignored when use_codec_cache is on (s2s_incremental_v2)", + ) + parser.add_argument( + "--pad_audio_to_sec", + "--pad_to_duration_secs", + type=float, + default=None, + dest="pad_to_duration_secs", + help="Pad input audio to this duration in seconds (s2s_incremental_v2)", + ) + parser.add_argument( + "--system_prompt", + type=str, + default=None, + help="System prompt for the model (s2s_incremental_v2)", + ) + parser.add_argument( + "--tts_system_prompt", + type=str, + default=None, + help="TTS system prompt to 
condition generation style (s2s_incremental_v2)", + ) + parser.add_argument( + "--repetition_penalty", + type=float, + default=1.0, + help="Repetition penalty (s2s_incremental_v2)", + ) + parser.add_argument( + "--force_turn_taking", + action="store_true", + help="Enable forced turn-taking (s2s_incremental_v2)", + ) + parser.add_argument( + "--force_turn_taking_threshold", + type=int, + default=40, + help="Threshold for forced turn-taking (s2s_incremental_v2)", + ) + parser.add_argument( + "--force_turn_taking_pad_window", + type=int, + default=25, + help="Pad window for forced turn-taking (s2s_incremental_v2)", + ) + parser.add_argument( + "--matmul_precision", + type=str, + default="medium", + help="torch float32 matmul precision (s2s_incremental_v2)", + ) + parser.add_argument( + "--vllm_gpu_memory_utilization", + type=float, + default=0.35, + help="GPU memory utilization for vLLM engines (s2s_incremental_v2)", + ) + parser.add_argument( + "--vllm_max_model_len", + type=int, + default=8192, + help="Max model sequence length for vLLM engines (s2s_incremental_v2)", + ) + parser.add_argument( + "--merge_user_channel_v2", + action="store_true", + help="Return dual-channel (user+agent) WAV in response (s2s_incremental_v2, for FDB)", + ) + parser.add_argument( + "--use_asr_as_response", + action="store_true", + help="Use ASR channel (user transcription) as primary response text instead of agent text (for ASR evaluation)", + ) + + # Session management options (s2s_session backend) + parser.add_argument( + "--session_ttl", + type=float, + default=300.0, + help="Session time-to-live in seconds (s2s_session backend)", + ) + parser.add_argument( + "--max_sessions", + type=int, + default=100, + help="Maximum number of concurrent sessions (s2s_session backend)", + ) + parser.add_argument( + "--session_artifacts_dir", + type=str, + default=None, + help="Directory to save session artifacts (input/output audio, JSON). 
Default: /tmp/s2s_sessions", + ) + parser.add_argument( + "--no_save_session_artifacts", + action="store_true", + help="Disable saving session artifacts to disk", + ) + parser.add_argument( + "--output_frame_alignment", + action="store_true", + help="Include per-frame alignment data in debug output (user/agent/ASR per frame)", + ) + + # Debug + parser.add_argument("--debug", action="store_true", help="Enable debug mode") + + # Pre-server pip install (runs inside the server process before model loading) + parser.add_argument( + "--pip_install", + type=str, + default=None, + help="Space-separated pip packages to install before starting the server " + "(e.g. 'lhotse==1.32.2 transformers==4.56.0')", + ) + + # Parse known args, allowing extra args to be passed through + args, extra_args = parser.parse_known_args() + + # Run pre-server pip install if requested + if args.pip_install: + import subprocess + + pip_cmd = f"pip install {args.pip_install}" + print(f"[serve_unified] Running: {pip_cmd}") + result = subprocess.run(pip_cmd, shell=True) + if result.returncode != 0: + print("[serve_unified] pip install failed, exiting.") + sys.exit(1) + print("[serve_unified] pip install completed.") + + # Setup environment + setup_pythonpath(args.code_path) + apply_safetensors_patch(args.hack_path) + + # Set environment variables + os.environ["UNIFIED_SERVER_HOST"] = args.host + os.environ["UNIFIED_SERVER_PORT"] = str(args.port) + os.environ["UNIFIED_SERVER_BACKEND"] = args.backend + os.environ["UNIFIED_SERVER_MODEL_PATH"] = args.model + os.environ["UNIFIED_SERVER_BATCH_SIZE"] = str(args.batch_size) + os.environ["UNIFIED_SERVER_BATCH_TIMEOUT"] = str(args.batch_timeout) + os.environ["UNIFIED_SERVER_MAX_NEW_TOKENS"] = str(args.max_new_tokens) + os.environ["UNIFIED_SERVER_TEMPERATURE"] = str(args.temperature) + os.environ["UNIFIED_SERVER_TOP_P"] = str(args.top_p) + + if args.codec_model: + os.environ["UNIFIED_SERVER_CODEC_MODEL_PATH"] = args.codec_model + + if args.debug: + 
os.environ["DEBUG"] = "1" + + # Set CUDA devices + os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(str(i) for i in range(args.num_gpus)) + + # Build extra config for backend-specific options + extra_config = {} + + if args.prompt_format: + extra_config["prompt_format"] = args.prompt_format + + if args.backend == "tts": + extra_config["decoder_only_model"] = args.decoder_only_model + extra_config["phoneme_input_type"] = args.phoneme_input_type + extra_config["use_local_transformer"] = args.use_local_transformer + if args.top_k: + extra_config["top_k"] = args.top_k + + # S2S backend options + if args.backend in ("s2s", "s2s_voicechat", "s2s_incremental", "s2s_incremental_v2", "s2s_session"): + extra_config["ignore_system_prompt"] = args.ignore_system_prompt + if args.silence_padding_sec != 5.0: + extra_config["silence_padding_sec"] = args.silence_padding_sec + + # S2S offline backend specific options + if args.backend == "s2s": + if args.extra_decoding_seconds: + extra_config["extra_decoding_seconds"] = args.extra_decoding_seconds + if args.config_path: + extra_config["config_path"] = args.config_path + if args.tts_ckpt_path: + extra_config["tts_ckpt_path"] = args.tts_ckpt_path + if args.speaker_reference: + extra_config["speaker_reference"] = args.speaker_reference + if args.code_path: + extra_config["code_path"] = args.code_path + if args.inference_pad_boost: + extra_config["inference_pad_boost"] = args.inference_pad_boost + if args.inference_bos_boost: + extra_config["inference_bos_boost"] = args.inference_bos_boost + if args.inference_eos_boost: + extra_config["inference_eos_boost"] = args.inference_eos_boost + + # s2s_voicechat backend specific options + if args.backend == "s2s_voicechat": + if args.extra_decoding_seconds: + extra_config["extra_decoding_seconds"] = args.extra_decoding_seconds + if args.config_path: + extra_config["config_path"] = args.config_path + if args.tts_ckpt_path: + extra_config["tts_ckpt_path"] = args.tts_ckpt_path + if 
args.speaker_reference: + extra_config["speaker_reference"] = args.speaker_reference + if args.code_path: + extra_config["code_path"] = args.code_path + if args.inference_pad_boost: + extra_config["inference_pad_boost"] = args.inference_pad_boost + if args.inference_bos_boost: + extra_config["inference_bos_boost"] = args.inference_bos_boost + if args.inference_eos_boost: + extra_config["inference_eos_boost"] = args.inference_eos_boost + if args.decode_audio: + extra_config["decode_audio"] = True + if args.output_dir: + extra_config["output_dir"] = args.output_dir + if args.save_artifacts: + extra_config["save_artifacts"] = True + if args.trim_leading_silence: + extra_config["trim_leading_silence"] = True + if args.trim_leading_silence_padding_sec != 0.01: + extra_config["trim_leading_silence_padding_sec"] = args.trim_leading_silence_padding_sec + if args.merge_user_channel: + extra_config["merge_user_channel"] = True + + # S2S Incremental/Session backend options (shared config) + if args.backend in ("s2s_incremental", "s2s_session"): + if args.config_path: + extra_config["config_path"] = args.config_path + if args.llm_checkpoint_path: + extra_config["llm_checkpoint_path"] = args.llm_checkpoint_path + if args.tts_checkpoint_path: + extra_config["tts_checkpoint_path"] = args.tts_checkpoint_path + if args.speaker_reference: + extra_config["speaker_reference"] = args.speaker_reference + if args.num_frames_per_inference != 1: + extra_config["num_frames_per_inference"] = args.num_frames_per_inference + if args.no_decode_audio: + extra_config["decode_audio"] = False + # Response end detection (text-only mode uses eos) + extra_config["response_end_detection_mode"] = args.response_end_detection_mode + extra_config["eos_detection_window"] = args.eos_detection_window + # Artifacts and alignment (available for both backends) + if args.session_artifacts_dir: + extra_config["session_artifacts_dir"] = args.session_artifacts_dir + extra_config["save_session_artifacts"] = not 
args.no_save_session_artifacts + extra_config["output_frame_alignment"] = args.output_frame_alignment + + # S2S Incremental V2 backend options + if args.backend == "s2s_incremental_v2": + if args.llm_checkpoint_path: + extra_config["llm_checkpoint_path"] = args.llm_checkpoint_path + if args.tts_checkpoint_path: + extra_config["tts_checkpoint_path"] = args.tts_checkpoint_path + if args.speaker_reference: + extra_config["speaker_reference"] = args.speaker_reference + extra_config["num_frames_per_inference"] = args.num_frames_per_inference if args.num_frames_per_inference != 1 else 3 + extra_config["engine_type"] = args.engine_type + extra_config["use_perception_cache"] = args.use_perception_cache + extra_config["use_perception_cudagraph"] = args.use_perception_cudagraph + extra_config["use_codec_cache"] = args.use_codec_cache + extra_config["codec_token_history_size"] = args.codec_token_history_size + extra_config["repetition_penalty"] = args.repetition_penalty + extra_config["top_p"] = args.top_p + extra_config["temperature"] = args.temperature + if args.inference_pad_boost: + extra_config["inference_pad_boost"] = args.inference_pad_boost + if args.inference_bos_boost: + extra_config["inference_bos_boost"] = args.inference_bos_boost + if args.inference_eos_boost: + extra_config["inference_eos_boost"] = args.inference_eos_boost + if args.inference_user_pad_boost is not None: + extra_config["inference_user_pad_boost"] = args.inference_user_pad_boost + if args.inference_user_bos_boost is not None: + extra_config["inference_user_bos_boost"] = args.inference_user_bos_boost + if args.inference_user_eos_boost is not None: + extra_config["inference_user_eos_boost"] = args.inference_user_eos_boost + extra_config["force_turn_taking"] = args.force_turn_taking + extra_config["force_turn_taking_threshold"] = args.force_turn_taking_threshold + extra_config["force_turn_taking_pad_window"] = args.force_turn_taking_pad_window + extra_config["matmul_precision"] = 
args.matmul_precision + if args.buffer_size_frames is not None: + extra_config["buffer_size_frames"] = args.buffer_size_frames + else: + extra_config["buffer_size_frames"] = 20 if args.use_perception_cache else 71 + if args.pad_to_duration_secs is not None: + extra_config["pad_to_duration_secs"] = args.pad_to_duration_secs + extra_config["silence_padding_sec"] = 0.0 + if args.system_prompt: + extra_config["system_prompt"] = args.system_prompt + if args.tts_system_prompt: + extra_config["tts_system_prompt"] = args.tts_system_prompt + if args.decode_audio: + extra_config["decode_audio"] = True + if args.no_decode_audio: + extra_config["decode_audio"] = False + if args.output_dir: + extra_config["session_artifacts_dir"] = args.output_dir + elif args.session_artifacts_dir: + extra_config["session_artifacts_dir"] = args.session_artifacts_dir + extra_config["save_session_artifacts"] = not args.no_save_session_artifacts + if args.merge_user_channel_v2: + extra_config["merge_user_channel"] = True + if args.use_asr_as_response: + extra_config["use_asr_as_response"] = True + # Build vLLM configs when using a vLLM engine + if "vllm" in args.engine_type: + model_path = args.model + llm_path = args.llm_checkpoint_path or args.model + extra_config["vllm_llm_config"] = { + "model_path": model_path, + "max_model_len": args.vllm_max_model_len, + "gpu_memory_utilization": args.vllm_gpu_memory_utilization, + "dtype": "bfloat16", + "engine_path": None, + "pretrained_llm": llm_path, + } + extra_config["vllm_tts_config"] = { + "model_path": model_path, + "max_model_len": args.vllm_max_model_len, + "gpu_memory_utilization": args.vllm_gpu_memory_utilization, + "dtype": "float32", + "engine_path": None, + "pretrained_llm": None, + "skip_tokenizer_init": True, + } + + # S2S Session backend options + if args.backend == "s2s_session": + extra_config["session_ttl"] = args.session_ttl + extra_config["max_sessions"] = args.max_sessions + + # Print configuration + print("=" * 60) + 
print("[serve_unified] Starting Unified NeMo Inference Server") + print("=" * 60) + print(f" Backend: {args.backend}") + print(f" Model: {args.model}") + if args.codec_model: + print(f" Codec Model: {args.codec_model}") + print(f" Port: {args.port}") + print(f" GPUs: {args.num_gpus}") + print(f" Batch Size: {args.batch_size}") + print(f" Batch Timeout: {args.batch_timeout}s") + print(f" Device: {args.device}") + print(f" Dtype: {args.dtype}") + if args.backend == "s2s": + if args.config_path: + print(f" Config Path: {args.config_path}") + if args.tts_ckpt_path: + print(f" TTS Checkpoint: {args.tts_ckpt_path}") + if args.speaker_reference: + print(f" Speaker Reference: {args.speaker_reference}") + print(f" Extra Decoding Seconds: {args.extra_decoding_seconds}") + print(f" Inference Boosts: pad={args.inference_pad_boost}, bos={args.inference_bos_boost}, eos={args.inference_eos_boost}") + if args.backend in ("s2s_incremental", "s2s_session"): + if args.config_path: + print(f" Config Path: {args.config_path}") + if args.llm_checkpoint_path: + print(f" LLM Checkpoint: {args.llm_checkpoint_path}") + if args.speaker_reference: + print(f" Speaker Reference: {args.speaker_reference}") + print(f" Frames per Inference: {args.num_frames_per_inference}") + print(f" Decode Audio: {not args.no_decode_audio}") + print(f" Response End Mode: {args.response_end_detection_mode}") + if args.response_end_detection_mode == "eos": + print(f" EOS Detection Window: {args.eos_detection_window} frames") + print(f" Save Artifacts: {not args.no_save_session_artifacts}") + if args.session_artifacts_dir: + print(f" Artifacts Dir: {args.session_artifacts_dir}") + else: + print(" Artifacts Dir: /tmp/s2s_sessions (default)") + print(f" Output Frame Alignment: {args.output_frame_alignment}") + if args.backend == "s2s_incremental_v2": + print(f" Engine Type: {args.engine_type}") + print(f" Perception Cache: {args.use_perception_cache}") + print(f" Perception CUDAGraph: 
{args.use_perception_cudagraph}") + print(f" Codec Cache: {args.use_codec_cache}") + print(f" Buffer Size Frames: {extra_config.get('buffer_size_frames')}") + print(f" Frames per Inference: {extra_config.get('num_frames_per_inference')}") + print(f" Decode Audio: {extra_config.get('decode_audio', True)}") + if args.llm_checkpoint_path: + print(f" LLM Checkpoint: {args.llm_checkpoint_path}") + if args.speaker_reference: + print(f" Speaker Reference: {args.speaker_reference}") + if args.pad_to_duration_secs: + print(f" Pad to Duration: {args.pad_to_duration_secs}s") + if args.system_prompt: + print(f" System Prompt: {args.system_prompt[:80]}...") + print(f" Force Turn Taking: {args.force_turn_taking}") + print(f" Save Artifacts: {not args.no_save_session_artifacts}") + if args.backend == "s2s_session": + print(f" Session TTL: {args.session_ttl}s") + print(f" Max Sessions: {args.max_sessions}") + if extra_config: + print(f" Extra Config: {extra_config}") + print("=" * 60) + + # Import and run the unified server + try: + import uvicorn + + from recipes.multimodal.server.unified_server import create_app + + app = create_app( + backend_type=args.backend, + model_path=args.model, + codec_model_path=args.codec_model or "", + batch_size=args.batch_size, + batch_timeout=args.batch_timeout, + device=args.device, + dtype=args.dtype, + extra_config=extra_config if extra_config else None, + ) + + uvicorn.run(app, host=args.host, port=args.port, log_level="info") + + except ImportError as e: + print(f"[serve_unified] Error: Failed to import unified server: {e}") + print("[serve_unified] Make sure the recipes.multimodal.server package is in PYTHONPATH") + sys.exit(1) + except Exception as e: + print(f"[serve_unified] Error: {e}") + import traceback + + traceback.print_exc() + sys.exit(1) + + +if __name__ == "__main__": + main() diff --git a/nemo_skills/pipeline/utils/generation.py b/nemo_skills/pipeline/utils/generation.py index cd576053c1..4b45ed8a39 100644 --- 
a/nemo_skills/pipeline/utils/generation.py +++ b/nemo_skills/pipeline/utils/generation.py @@ -446,6 +446,9 @@ def configure_client( - server_address: Address of the server. - extra_arguments: Updated extra arguments for the command. """ + # Check if user already specified server.server_type in extra_arguments + user_specified_server_type = "++server.server_type=" in extra_arguments + if server_gpus: # we need to host the model server_port = get_free_port(strategy="random") if get_random_port else 5000 assert server_gpus is not None, "Need to specify server_gpus if hosting the model" @@ -462,14 +465,17 @@ def configure_client( } if server_container: server_config["container"] = server_container + # Only add server_type if user didn't specify it (allows vllm_multimodal override) + server_type_arg = "" if user_specified_server_type else f"++server.server_type={server_type} " extra_arguments = ( - f"{extra_arguments} ++server.server_type={server_type} ++server.host=127.0.0.1 " + f"{extra_arguments} {server_type_arg}++server.host=127.0.0.1 " f"++server.port={server_port} ++server.model={model} " ) else: # model is hosted elsewhere server_config = None + # Only add server_type if user didn't specify it + server_type_arg = "" if user_specified_server_type else f"++server.server_type={server_type} " extra_arguments = ( - f"{extra_arguments} ++server.server_type={server_type} " - f"++server.base_url={server_address} ++server.model={model} " + f"{extra_arguments} {server_type_arg}++server.base_url={server_address} ++server.model={model} " ) return server_config, server_address, extra_arguments diff --git a/nemo_skills/prompt/config/judge/mmau-pro.yaml b/nemo_skills/prompt/config/judge/mmau-pro.yaml new file mode 100644 index 0000000000..5339e4ab0d --- /dev/null +++ b/nemo_skills/prompt/config/judge/mmau-pro.yaml @@ -0,0 +1,30 @@ +# Judge prompt configuration for Speech/Audio Language Model evaluation +# Used for evaluating open-ended responses in MMAU-Pro benchmark +# Uses 
multi-criteria scoring on 1-5 scale + +user: |- + You are an expert evaluator for audio and speech-related questions. Please evaluate the quality of a model's response to a question. + + Question: {question} + + Reference Answer: {expected_answer} + + Model Response: {generation} + + Please evaluate the model response on the following criteria and provide scores from 1-5 (where 5 is best): + + 1. **Correctness**: How factually accurate is the response compared to the reference? + 2. **Relevance**: How well does the response address the specific question asked? + 3. **Completeness**: Does the response cover all important aspects mentioned in the reference? + 4. **Clarity**: How clear and well-structured is the response? + + For each criterion, provide: + - A score from 1-5 + - A brief justification (1-2 sentences) + + Format your response as: + CORRECTNESS: [score] - [justification] + RELEVANCE: [score] - [justification] + COMPLETENESS: [score] - [justification] + CLARITY: [score] - [justification] + OVERALL: [average score] - [overall assessment] diff --git a/nemo_skills/prompt/config/judge/speechlm.yaml b/nemo_skills/prompt/config/judge/speechlm.yaml deleted file mode 100644 index 4862558145..0000000000 --- a/nemo_skills/prompt/config/judge/speechlm.yaml +++ /dev/null @@ -1,28 +0,0 @@ -# Judge prompt configuration for Speech/Audio Language Model evaluation -# Used for evaluating open-ended responses in MMAU-Pro benchmark -# Follows nemo-skills standard Yes/No judgement pattern - -user: |- - You are an expert evaluator for audio and speech-related questions. Please evaluate whether the model's response correctly answers the question. - - Question: {question} - - Reference Answer: {expected_answer} - - Model Response: {generation} - - Your task is to determine if the model's response is correct based on the reference answer. Consider: - - 1. **Factual Accuracy**: Is the information in the response factually correct? - 2. 
**Relevance**: Does the response address the specific question asked? - 3. **Completeness**: Does the response cover the key points from the reference answer? - - Please first explain your reasoning in 2-3 sentences, then provide your final judgement. - - Your final judgement must be either "Yes" or "No": - - "Yes" if the model response is correct and adequately answers the question - - "No" if the model response is incorrect, irrelevant, or inadequate - - Format your response as: - Reasoning: [Your explanation] - Judgement: [Yes or No] diff --git a/recipes/multimodal/server/__init__.py b/recipes/multimodal/server/__init__.py new file mode 100644 index 0000000000..89a349346e --- /dev/null +++ b/recipes/multimodal/server/__init__.py @@ -0,0 +1,38 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Unified NeMo Inference Server package. 
+ +Provides a pluggable FastAPI server that supports multiple NeMo model backends: +- SALM: Speech-Augmented Language Model (text output from text/audio input) +- TTS: Text-to-Speech (audio output from text input) +- S2S: Speech-to-Speech (text+audio output from audio input) +""" + +from .backends import ( + BackendConfig, + GenerationRequest, + GenerationResult, + InferenceBackend, + get_backend, +) + +__all__ = [ + "InferenceBackend", + "GenerationRequest", + "GenerationResult", + "BackendConfig", + "get_backend", +] diff --git a/recipes/multimodal/server/backends/__init__.py b/recipes/multimodal/server/backends/__init__.py new file mode 100644 index 0000000000..b098407fd6 --- /dev/null +++ b/recipes/multimodal/server/backends/__init__.py @@ -0,0 +1,83 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Backend implementations for the Unified NeMo Inference Server. 
# Registry of available backends: backend name -> (module name, class name).
# Backend modules are imported lazily in get_backend() so that a missing
# optional dependency for one backend does not break the others.
BACKEND_REGISTRY = {
    "salm": ("salm_backend", "SALMBackend"),
    "tts": ("tts_backend", "TTSBackend"),
    "s2s": ("s2s_backend", "S2SBackend"),
    "s2s_voicechat": ("s2s_voicechat_infer_backend", "S2SVoiceChatInferBackend"),
    "s2s_incremental": ("s2s_incremental_backend", "S2SIncrementalBackend"),
    "s2s_incremental_v2": ("s2s_incremental_backend_v2", "S2SIncrementalBackendV2"),
    "s2s_session": ("s2s_session_backend", "S2SSessionBackend"),
}


def list_backends() -> list:
    """Return list of available backend names."""
    return list(BACKEND_REGISTRY.keys())


def get_backend(backend_name: str) -> type:
    """
    Get backend class by name with lazy loading.

    Args:
        backend_name: One of the names returned by list_backends()
            (e.g. 'salm', 'tts', 's2s', 's2s_voicechat', 's2s_incremental',
            's2s_incremental_v2', 's2s_session')

    Returns:
        Backend class (not instance)

    Raises:
        ValueError: If backend name is unknown
        ImportError: If backend dependencies are not available
    """
    if backend_name not in BACKEND_REGISTRY:
        available = ", ".join(BACKEND_REGISTRY.keys())
        raise ValueError(f"Unknown backend: '{backend_name}'. Available backends: {available}")

    module_name, class_name = BACKEND_REGISTRY[backend_name]

    # Imported here (not at module top) so that merely listing backends does
    # not pull in heavy model dependencies.
    import importlib

    try:
        module = importlib.import_module(f".{module_name}", package=__name__)
        return getattr(module, class_name)
    except ImportError as e:
        raise ImportError(
            f"Failed to import backend '{backend_name}'. Make sure required dependencies are installed. Error: {e}"
        ) from e
class Modality(str, Enum):
    """Supported input/output modalities."""

    TEXT = "text"
    AUDIO_IN = "audio_in"
    AUDIO_OUT = "audio_out"


@dataclass
class BackendConfig:
    """Base configuration for all backends."""

    # Path to the model checkpoint/directory to load.
    model_path: str
    device: str = "cuda"
    dtype: str = "bfloat16"

    # Generation defaults (can be overridden per-request).
    max_new_tokens: int = 512
    temperature: float = 1.0
    top_p: float = 1.0
    top_k: Optional[int] = None

    # Additional model-specific configs passed through untouched.
    extra_config: Dict[str, Any] = field(default_factory=dict)

    @classmethod
    def from_dict(cls, d: Dict[str, Any]) -> "BackendConfig":
        """Create config from dictionary, extracting known fields.

        Keys matching declared dataclass fields populate those fields; all
        other keys are collected into ``extra_config``.

        NOTE(review): an explicit "extra_config" key in *d* is dropped (it is
        a known field but excluded from both buckets) — confirm this is the
        intended behavior before passing pre-built extra_config dicts here.
        """
        known_fields = {f.name for f in cls.__dataclass_fields__.values()}
        known = {k: v for k, v in d.items() if k in known_fields and k != "extra_config"}
        extra = {k: v for k, v in d.items() if k not in known_fields}
        return cls(**known, extra_config=extra)


@dataclass
class GenerationRequest:
    """
    A single generation request.

    Supports text and/or audio inputs depending on the backend's capabilities.
    """

    # Text inputs
    text: Optional[str] = None
    system_prompt: Optional[str] = None
    user_prompt: Optional[str] = None

    # Audio input (raw bytes or file path)
    audio_bytes: Optional[bytes] = None
    audio_path: Optional[str] = None
    sample_rate: int = 16000

    # Multi-turn audio inputs (list of audio bytes or paths)
    audio_bytes_list: Optional[List[bytes]] = None
    audio_paths: Optional[List[str]] = None

    # Generation parameters (None means "use the backend default")
    max_new_tokens: Optional[int] = None
    temperature: Optional[float] = None
    top_p: Optional[float] = None
    top_k: Optional[int] = None
    seed: Optional[int] = None

    # Additional backend-specific parameters
    extra_params: Dict[str, Any] = field(default_factory=dict)

    # Request tracking
    request_id: Optional[str] = None


@dataclass
class GenerationResult:
    """
    Result from a generation request.

    Contains text output and optionally audio output, plus metadata.
    """

    # Text output (agent response)
    text: str = ""

    # ASR text output (user speech transcription from ASR channel)
    asr_text: Optional[str] = None

    # Audio output (raw bytes, can be encoded to base64 for JSON)
    audio_bytes: Optional[bytes] = None
    audio_sample_rate: int = 16000
    audio_format: str = "wav"

    # Metadata
    request_id: Optional[str] = None
    num_tokens_generated: int = 0
    generation_time_ms: float = 0.0

    # Debug info (optional, backend-specific)
    debug_info: Optional[Dict[str, Any]] = None

    # Error handling: a non-None error marks the result as failed
    error: Optional[str] = None

    def is_success(self) -> bool:
        """Return True when no error was recorded for this result."""
        return self.error is None


class InferenceBackend(ABC):
    """
    Abstract base class for inference backends.

    Implementations must provide:
    - load_model(): Initialize the model from config
    - generate(): Run inference on a batch of requests
    - supported_modalities: What input/output types are supported

    The unified server uses this interface to handle any backend uniformly.
    """

    def __init__(self, config: BackendConfig):
        """
        Initialize the backend with configuration.

        Args:
            config: Backend configuration including model path and generation defaults
        """
        self.config = config
        self._model = None
        self._is_loaded = False

    @property
    @abstractmethod
    def name(self) -> str:
        """Return the backend name (e.g., 'salm', 'tts', 's2s')."""
        pass

    @property
    @abstractmethod
    def supported_modalities(self) -> Set[Modality]:
        """
        Return the set of supported modalities.

        Examples:
        - SALM: {TEXT, AUDIO_IN} - text output from text/audio input
        - TTS: {TEXT, AUDIO_OUT} - audio output from text input
        - S2S: {TEXT, AUDIO_IN, AUDIO_OUT} - audio+text output from audio input
        """
        pass

    @abstractmethod
    def load_model(self) -> None:
        """
        Load and initialize the model.

        Should set self._model and self._is_loaded = True on success.
        Called once during server startup.

        Raises:
            RuntimeError: If model loading fails
        """
        pass

    @abstractmethod
    def generate(self, requests: List[GenerationRequest]) -> List[GenerationResult]:
        """
        Run inference on a batch of requests.

        Args:
            requests: List of generation requests to process

        Returns:
            List of generation results, one per request (same order)

        Note:
            - Implementations should handle batching internally
            - Each result should have request_id matching the input
            - On error, set result.error instead of raising
        """
        pass

    @property
    def is_loaded(self) -> bool:
        """Check if the model is loaded and ready."""
        return self._is_loaded

    def health_check(self) -> Dict[str, Any]:
        """
        Return health status information.

        Override to add backend-specific health info.
        """
        return {
            "backend": self.name,
            "model_loaded": self._is_loaded,
            "model_path": self.config.model_path,
            "device": self.config.device,
            "modalities": [m.value for m in self.supported_modalities],
        }

    def get_generation_params(self, request: GenerationRequest) -> Dict[str, Any]:
        """
        Get effective generation parameters, merging request with config defaults.

        A request value of None means "use the backend default". Explicit
        falsy values such as temperature=0.0 or top_k=0 are honored — the
        previous ``request.x or config.x`` merge silently discarded them,
        making greedy decoding (temperature 0) impossible to request.
        """

        def _pick(req_value: Any, default: Any) -> Any:
            return default if req_value is None else req_value

        return {
            "max_new_tokens": _pick(request.max_new_tokens, self.config.max_new_tokens),
            "temperature": _pick(request.temperature, self.config.temperature),
            "top_p": _pick(request.top_p, self.config.top_p),
            "top_k": _pick(request.top_k, self.config.top_k),
        }

    def validate_request(self, request: GenerationRequest) -> Optional[str]:
        """
        Validate a request against supported modalities.

        Returns:
            Error message if invalid, None if valid
        """
        modalities = self.supported_modalities

        has_text_input = request.text is not None
        # Multi-turn list fields count as audio input too; previously a
        # request carrying only audio_paths/audio_bytes_list was wrongly
        # rejected as having no input at all.
        has_audio_input = (
            request.audio_bytes is not None
            or request.audio_path is not None
            or bool(request.audio_bytes_list)
            or bool(request.audio_paths)
        )

        # Check input modalities
        if has_audio_input and Modality.AUDIO_IN not in modalities:
            return f"Backend '{self.name}' does not support audio input"

        if not has_text_input and not has_audio_input:
            return "Request must have either text or audio input"

        return None
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Speech-to-Speech (S2S) offline backend using NemotronVoiceChat. + +Based on inference pattern from: +/lustre/fsw/portfolios/llmservice/users/kevinhu/s2s/NeMo/scripts/training/iad/s2s/sdv2_hf/conv/nano_9b/inf/infer_nano9b_s2s.sh +Config: infer_nano_eartts_Fullduplexbench_Jan22_2026.yaml + +This backend takes audio input and produces frame-synchronized TEXT output. +Set decode_audio=False for text-only output; with decode_audio=True (the default) it also returns audio. + +Output format: +- "text": Agent's generated response text +- "asr_hyps": User's transcribed text (via ASR scoring) +- "tokens_text": Raw text token IDs (frame-synchronized) + +Key parameters matching the latest inference recipe: +- extra_decoding_seconds: Additional decoding time (default 0 for FDB, 20 for Voicebench) +- inference_pad_boost, inference_bos_boost, inference_eos_boost: Token logit adjustments +""" + +import io +import os +import random +import re +import tempfile +import time +from dataclasses import dataclass, field +from typing import Any, Dict, List, Optional, Set, Tuple + +import numpy as np +import soundfile as sf +import torch + +from .base import ( + BackendConfig, + GenerationRequest, + GenerationResult, + InferenceBackend, + Modality, +) + + +@dataclass +class S2SConfig(BackendConfig): + """S2S-specific configuration matching the latest inference recipe.""" + + # Frame-based processing parameters + frame_length: float = 0.08 # 80ms frames + source_sample_rate: int = 16000 + target_sample_rate: int = 22050 # TTS sample rate + + # Role configuration + input_roles: List[str] =
field(default_factory=lambda: ["user", "User"]) + output_roles: List[str] = field(default_factory=lambda: ["agent", "Assistant", "assistant", "Agent"]) + + # Model behavior + predict_user_text: bool = True # Also transcribe user speech + decode_audio: bool = True # When True, also synthesize and return audio output + + # Extra decoding time - duplex models need additional time to generate response + # Default 0 for Fullduplexbench, 20 for Voicebench + extra_decoding_seconds: float = 0.0 + + # Legacy parameter (kept for backward compatibility) + silence_padding_sec: float = 5.0 + + # Inference boost parameters (matching the latest recipe) + inference_pad_boost: float = 0.0 + inference_bos_boost: float = 0.0 + inference_eos_boost: float = 0.0 + + # Config path for model configuration YAML + config_path: Optional[str] = None + + # Speaker reference for TTS (required if decode_audio=True) + speaker_reference: Optional[str] = None + + # Checkpoint paths + stt_ckpt_path: Optional[str] = None # Separate STT checkpoint + tts_ckpt_path: Optional[str] = None # Separate TTS checkpoint + + # Code path to add to PYTHONPATH + code_path: Optional[str] = None + + @classmethod + def from_dict(cls, d: Dict[str, Any]) -> "S2SConfig": + """Create S2S config from dictionary.""" + known_fields = { + "model_path", + "device", + "dtype", + "max_new_tokens", + "temperature", + "top_p", + "top_k", + "frame_length", + "source_sample_rate", + "target_sample_rate", + "input_roles", + "output_roles", + "predict_user_text", + "decode_audio", + "extra_decoding_seconds", + "silence_padding_sec", + "inference_pad_boost", + "inference_bos_boost", + "inference_eos_boost", + "config_path", + "speaker_reference", + "stt_ckpt_path", + "tts_ckpt_path", + "code_path", + } + known = {k: v for k, v in d.items() if k in known_fields} + extra = {k: v for k, v in d.items() if k not in known_fields} + return cls(**known, extra_config=extra) + + +class S2SBackend(InferenceBackend): + """ + Speech-to-Text inference backend using
NemotronVoiceChat. + + This model processes audio frame-by-frame and generates synchronized text. + With decode_audio=False, it produces text output only (the default is True). + + Supports: + - Audio input (required) + - Text output (agent response, synchronized with audio frames) + - Optionally: User transcription (predict_user_text=True) + + The model generates text tokens at each audio frame, allowing for + real-time synchronization between input speech and generated response. + """ + + @property + def name(self) -> str: + return "s2s" + + @property + def supported_modalities(self) -> Set[Modality]: + return {Modality.TEXT, Modality.AUDIO_IN} + + def __init__(self, config: BackendConfig): + # Convert to S2S-specific config if needed + if isinstance(config, S2SConfig): + self.s2s_config = config + else: + self.s2s_config = S2SConfig.from_dict( + { + **{ + k: getattr(config, k) + for k in ["model_path", "device", "dtype", "max_new_tokens", "temperature", "top_p", "top_k"] + }, + **config.extra_config, + } + ) + + super().__init__(self.s2s_config) + + self._tokenizer = None + self._model_config = None + + def _add_code_path(self): + """Add code path to PYTHONPATH if specified.""" + import sys + + code_path = self.s2s_config.code_path + if code_path and code_path not in sys.path: + sys.path.insert(0, code_path) + print(f"[S2SBackend] Added {code_path} to PYTHONPATH") + + def _load_model_config(self) -> Dict[str, Any]: + """Load model configuration from YAML file.""" + from omegaconf import OmegaConf + + config_path = self.s2s_config.config_path + if config_path and os.path.exists(config_path): + print(f"[S2SBackend] Loading config from {config_path}") + cfg = OmegaConf.load(config_path) + return OmegaConf.to_container(cfg, resolve=True) + return None + + def _build_model_config(self) -> Dict[str, Any]: + """Build model configuration dict for NemotronVoiceChat.""" + # Start with loaded config or create minimal config + if self._model_config: + cfg = self._model_config + else: +
# Build minimal config + cfg = { + "model": { + "stt": { + "model": { + "pretrained_s2s_model": self.config.model_path, + "predict_user_text": self.s2s_config.predict_user_text, + "inference_pad_boost": self.s2s_config.inference_pad_boost, + "inference_bos_boost": self.s2s_config.inference_bos_boost, + "inference_eos_boost": self.s2s_config.inference_eos_boost, + }, + "data": { + "source_sample_rate": self.s2s_config.source_sample_rate, + "target_sample_rate": self.s2s_config.target_sample_rate, + }, + "exp_manager": {"explicit_log_dir": "/tmp/s2s_inference"}, + }, + "speech_generation": { + "model": {}, + "data": { + "source_sample_rate": self.s2s_config.target_sample_rate, + "target_sample_rate": self.s2s_config.target_sample_rate, + }, + "exp_manager": {"explicit_log_dir": "/tmp/s2s_inference"}, + }, + "inference_speaker_reference": self.s2s_config.speaker_reference + or "/lustre/fsw/portfolios/convai/users/ecasanova/S2S-full-duplex/inference_references/Emma_S3_A1_SC7_singleturntarget_21_channel_1_audio_in.wav", + "extra_decoding_seconds": self.s2s_config.extra_decoding_seconds, + }, + "data": { + "frame_length": self.s2s_config.frame_length, + "source_sample_rate": self.s2s_config.source_sample_rate, + "target_sample_rate": self.s2s_config.target_sample_rate, + "input_roles": self.s2s_config.input_roles, + "output_roles": self.s2s_config.output_roles, + }, + "exp_manager": {"explicit_log_dir": "/tmp/s2s_inference"}, + } + + # Override with CLI parameters + if self.s2s_config.stt_ckpt_path: + cfg["model"]["stt"]["model"]["pretrained_s2s_model"] = self.s2s_config.stt_ckpt_path + elif self.config.model_path: + cfg.setdefault("model", {}).setdefault("stt", {}).setdefault("model", {})[ + "pretrained_s2s_model" + ] = self.config.model_path + + if self.s2s_config.tts_ckpt_path: + cfg["model"]["speech_generation"]["model"]["pretrained_model"] = self.s2s_config.tts_ckpt_path + + # Apply inference boosts + stt_model_cfg = cfg.get("model", {}).get("stt", {}).get("model", 
{}) + if self.s2s_config.inference_pad_boost: + stt_model_cfg["inference_pad_boost"] = self.s2s_config.inference_pad_boost + if self.s2s_config.inference_bos_boost: + stt_model_cfg["inference_bos_boost"] = self.s2s_config.inference_bos_boost + if self.s2s_config.inference_eos_boost: + stt_model_cfg["inference_eos_boost"] = self.s2s_config.inference_eos_boost + + # Apply extra_decoding_seconds + if self.s2s_config.extra_decoding_seconds: + cfg["model"]["extra_decoding_seconds"] = self.s2s_config.extra_decoding_seconds + + return cfg + + def load_model(self) -> None: + """Load the NemotronVoiceChat model.""" + print(f"[S2SBackend] Loading S2S model from {self.config.model_path}...") + + # Add code path if specified + self._add_code_path() + + try: + # Load config first + self._model_config = self._load_model_config() + model_config = self._build_model_config() + + from nemo.collections.speechlm2.models.nemotron_voicechat import NemotronVoiceChat + + print("[S2SBackend] Using NemotronVoiceChat model") + self._model = NemotronVoiceChat(model_config) + + # Move to device and set eval mode + self._model = self._model.eval() + + # Handle dtype + dtype = getattr(torch, self.config.dtype, torch.bfloat16) + if hasattr(self._model, "to"): + try: + self._model = self._model.to(dtype) + except Exception as e: + print(f"[S2SBackend] Warning: Could not convert to {dtype}: {e}") + + # Move to device + self._model = self._model.to(self.config.device) + + # Cache tokenizer + self._tokenizer = self._model.stt_model.tokenizer + + self._is_loaded = True + + print("[S2SBackend] Model loaded successfully") + print(f" Model path: {self.config.model_path}") + print(f" Device: {self.config.device}") + print(f" decode_audio: {self.s2s_config.decode_audio}") + print(f" Frame length: {self.s2s_config.frame_length}s") + print(f" Source sample rate: {self.s2s_config.source_sample_rate}") + print(f" Extra decoding seconds: {self.s2s_config.extra_decoding_seconds}") + print(f" Inference boosts: 
pad={self.s2s_config.inference_pad_boost}, " + f"bos={self.s2s_config.inference_bos_boost}, eos={self.s2s_config.inference_eos_boost}") + + except ImportError as e: + raise RuntimeError( + f"Failed to import NemotronVoiceChat. Make sure NeMo with speechlm2 " + f"collection is installed. Error: {e}" + ) + except Exception as e: + import traceback + + traceback.print_exc() + raise RuntimeError(f"Failed to load S2S model: {e}") + + def generate(self, requests: List[GenerationRequest]) -> List[GenerationResult]: + """Generate text responses from audio inputs with batching support.""" + if not self._is_loaded: + return [GenerationResult(error="Model not loaded", request_id=r.request_id) for r in requests] + + if not requests: + return [] + + start_time = time.time() + temp_files = [] + results: List[GenerationResult] = [None] * len(requests) + valid_indices = [] + audio_list = [] + system_prompts = [] + + try: + # Step 1: Load and preprocess all audio and collect system prompts + for i, req in enumerate(requests): + try: + audio = self._load_and_preprocess_audio(req, temp_files) + audio_list.append(audio) + valid_indices.append(i) + # Collect system prompt (may be None) + system_prompts.append(req.system_prompt) + except Exception as e: + import traceback + + traceback.print_exc() + results[i] = GenerationResult(error=str(e), request_id=req.request_id) + + if not audio_list: + return results + + # Step 2: Pad to max length and create batch tensor + max_len = max(a.shape[0] for a in audio_list) + batch_size = len(audio_list) + + batch_tensor = torch.zeros(batch_size, max_len, dtype=torch.float32) + lengths = torch.zeros(batch_size, dtype=torch.long) + + for i, audio in enumerate(audio_list): + batch_tensor[i, : len(audio)] = torch.from_numpy(audio).float() + lengths[i] = len(audio) + + # Move to device + batch_tensor = batch_tensor.to(self.config.device) + lengths = lengths.to(self.config.device) + + # Step 2.5: Tokenize system prompts if any are provided + 
prompt_tokens, prompt_token_lens = self._tokenize_system_prompts(system_prompts, batch_size) + + # Set seed if provided + first_seed = next((r.seed for r in requests if r.seed is not None), None) + if first_seed is not None: + self._set_seed(first_seed) + + # Calculate input_pad_len from extra_decoding_seconds + input_pad_len = int(self.s2s_config.extra_decoding_seconds * self.s2s_config.source_sample_rate) + + # Step 3: Run batched inference + with torch.no_grad(): + outputs = self._model.offline_inference( + input_signal=batch_tensor, + input_signal_lens=lengths, + prompt_tokens=prompt_tokens, + prompt_token_lens=prompt_token_lens, + decode_audio=self.s2s_config.decode_audio, + input_pad_len=input_pad_len, + ) + + # Diagnostic: when decode_audio=True, log whether model returned audio + if self.s2s_config.decode_audio: + out_keys = list(outputs.keys()) if isinstance(outputs, dict) else type(outputs).__name__ + has_audio = isinstance(outputs, dict) and outputs.get("audio") is not None + print(f"[S2SBackend] decode_audio=True | outputs keys: {out_keys} | has 'audio': {has_audio}") + + # Step 4: Parse outputs back to individual results + for batch_idx, req_idx in enumerate(valid_indices): + try: + result = self._parse_batch_output(outputs, batch_idx, requests[req_idx]) + results[req_idx] = result + except Exception as e: + import traceback + + traceback.print_exc() + results[req_idx] = GenerationResult(error=str(e), request_id=requests[req_idx].request_id) + + elapsed_ms = (time.time() - start_time) * 1000 + + # Update timing info + for result in results: + if result is not None and result.is_success(): + result.generation_time_ms = elapsed_ms / len(requests) + + finally: + # Clean up temp files + for path in temp_files: + if os.path.exists(path): + os.unlink(path) + + return results + + def _load_and_preprocess_audio(self, request: GenerationRequest, temp_files: List[str]) -> np.ndarray: + """Load and preprocess audio from a request.""" + if not request.audio_bytes 
and not request.audio_path: + raise ValueError("Audio input is required for S2S backend") + + # Handle audio input + audio_path = request.audio_path + if request.audio_bytes: + temp_file = tempfile.NamedTemporaryFile(suffix=".wav", delete=False) + temp_file.write(request.audio_bytes) + temp_file.close() + audio_path = temp_file.name + temp_files.append(audio_path) + + # Load audio + audio, sr = sf.read(audio_path) + + # Ensure mono + if audio.ndim > 1: + audio = audio.mean(axis=1) + + # Resample if needed + if sr != self.s2s_config.source_sample_rate: + try: + import librosa + + audio = librosa.resample(audio, orig_sr=sr, target_sr=self.s2s_config.source_sample_rate) + except ImportError: + import torchaudio + + audio_tensor = torch.from_numpy(audio).float().unsqueeze(0) + audio_tensor = torchaudio.functional.resample(audio_tensor, sr, self.s2s_config.source_sample_rate) + audio = audio_tensor.squeeze(0).numpy() + + # Legacy: add silence padding if extra_decoding_seconds is not set but silence_padding_sec is + if self.s2s_config.extra_decoding_seconds <= 0 and self.s2s_config.silence_padding_sec > 0: + audio = self._add_silence_padding(audio, self.s2s_config.source_sample_rate) + + return audio + + def _clean_special_tokens(self, text: str) -> str: + """Remove special timing/frame tokens from model output. + + The S2S model outputs special tokens like: + - <$X.XX$> - energy/confidence markers + - <|X.XX|> - timing/duration markers + + These should be stripped for clean text output. 
+ """ + if not text: + return text + # Remove <$X.XX$> patterns (energy/confidence) + text = re.sub(r'<\$[\d.]+\$>', '', text) + # Remove <|X.XX|> patterns (timing) + text = re.sub(r'<\|[\d.]+\|>', '', text) + # Clean up extra whitespace + text = re.sub(r'\s+', ' ', text).strip() + return text + + def _parse_batch_output( + self, outputs: Dict[str, Any], batch_idx: int, request: GenerationRequest + ) -> GenerationResult: + """Parse output for a specific batch index.""" + # Extract text output + text_output = "" + if "text" in outputs and outputs["text"]: + if isinstance(outputs["text"], list) and len(outputs["text"]) > batch_idx: + text_output = outputs["text"][batch_idx] + elif not isinstance(outputs["text"], list): + text_output = outputs["text"] + + # Clean special tokens from output + text_output = self._clean_special_tokens(text_output) + + # Count tokens if available + num_tokens = 0 + if "tokens_text" in outputs and outputs["tokens_text"] is not None: + try: + tokens = outputs["tokens_text"][batch_idx] + if hasattr(tokens, "cpu"): + tokens = tokens.cpu() + num_tokens = len(tokens) if hasattr(tokens, "__len__") else tokens.shape[0] + except (IndexError, TypeError): + pass + + # Audio output (when decode_audio=True); same contract as s2s_voicechat_infer_backend + out_audio_bytes = None + out_sr = int(self.s2s_config.target_sample_rate) + if self.s2s_config.decode_audio: + audio_out = outputs.get("audio") + audio_len = outputs.get("audio_len") + if audio_out is None and batch_idx == 0: + print("[S2SBackend] decode_audio=True but outputs has no 'audio' key; model may not return audio.") + if audio_out is not None: + try: + wav = audio_out[batch_idx] + if hasattr(wav, "detach"): + wav = wav.detach().float().cpu().numpy() + wav = np.asarray(wav).squeeze() + if audio_len is not None: + try: + n = int(audio_len[batch_idx].item()) + wav = wav[:n] + except Exception: + pass + max_val = float(np.max(np.abs(wav))) if wav.size else 0.0 + if max_val > 0: + wav = wav / 
max_val * 0.95 + buf = io.BytesIO() + sf.write(buf, wav, out_sr, format="WAV") + out_audio_bytes = buf.getvalue() + except Exception as e: + print(f"[S2SBackend] Warning: failed encoding audio for {request.request_id}: {e}") + + return GenerationResult( + text=text_output, + audio_bytes=out_audio_bytes, + audio_sample_rate=out_sr, + request_id=request.request_id, + num_tokens_generated=num_tokens, + ) + + def _tokenize_system_prompts( + self, system_prompts: List[Optional[str]], batch_size: int + ) -> Tuple[Optional[torch.Tensor], Optional[torch.Tensor]]: + """Tokenize system prompts for the batch. + + Args: + system_prompts: List of system prompts (may contain None values) + batch_size: Batch size + + Returns: + Tuple of (prompt_tokens, prompt_token_lens) or (None, None) if no prompts + """ + # Check if any system prompts are provided + if not any(p for p in system_prompts): + return None, None + + if self._tokenizer is None: + print("[S2SBackend] Warning: No tokenizer available, skipping system prompts") + return None, None + + # Tokenize all prompts + tokenized = [] + for prompt in system_prompts: + if prompt: + # Use tokenizer to convert text to token IDs + if hasattr(self._tokenizer, "text_to_ids"): + tokens = self._tokenizer.text_to_ids(prompt) + elif hasattr(self._tokenizer, "encode"): + tokens = self._tokenizer.encode(prompt) + else: + tokens = [] + tokenized.append(tokens) + else: + tokenized.append([]) + + # Get max length and pad + max_prompt_len = max(len(t) for t in tokenized) if tokenized else 0 + if max_prompt_len == 0: + return None, None + + # Get pad token ID + if hasattr(self._tokenizer, "pad_id"): + pad_id = self._tokenizer.pad_id + elif hasattr(self._tokenizer, "pad_token_id"): + pad_id = self._tokenizer.pad_token_id + else: + pad_id = 0 + + # Create padded tensor + prompt_tokens = torch.full((batch_size, max_prompt_len), pad_id, dtype=torch.long) + prompt_token_lens = torch.zeros(batch_size, dtype=torch.long) + + for i, tokens in 
enumerate(tokenized): + if tokens: + prompt_tokens[i, : len(tokens)] = torch.tensor(tokens, dtype=torch.long) + prompt_token_lens[i] = len(tokens) + + # Move to device + prompt_tokens = prompt_tokens.to(self.config.device) + prompt_token_lens = prompt_token_lens.to(self.config.device) + + return prompt_tokens, prompt_token_lens + + def _set_seed(self, seed: int) -> None: + """Set random seeds for reproducibility.""" + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + if torch.cuda.is_available(): + torch.cuda.manual_seed(seed) + torch.cuda.manual_seed_all(seed) + + def _add_silence_padding(self, audio: np.ndarray, sample_rate: int) -> np.ndarray: + """Append silence to audio (legacy, prefer extra_decoding_seconds).""" + if self.s2s_config.silence_padding_sec <= 0: + return audio + silence_samples = int(self.s2s_config.silence_padding_sec * sample_rate) + return np.concatenate([audio, np.zeros(silence_samples, dtype=audio.dtype)]) + + def validate_request(self, request: GenerationRequest) -> Optional[str]: + """Validate S2S request.""" + if not request.audio_bytes and not request.audio_path: + return "Audio input is required for S2S backend" + return None + + def health_check(self) -> Dict[str, Any]: + """Return health status with S2S-specific info.""" + base = super().health_check() + if self._is_loaded: + base.update( + { + "frame_length": self.s2s_config.frame_length, + "source_sample_rate": self.s2s_config.source_sample_rate, + "extra_decoding_seconds": self.s2s_config.extra_decoding_seconds, + "inference_pad_boost": self.s2s_config.inference_pad_boost, + "inference_bos_boost": self.s2s_config.inference_bos_boost, + "inference_eos_boost": self.s2s_config.inference_eos_boost, + "decode_audio": self.s2s_config.decode_audio, + } + ) + return base diff --git a/recipes/multimodal/server/backends/s2s_incremental_backend.py b/recipes/multimodal/server/backends/s2s_incremental_backend.py new file mode 100644 index 0000000000..f0e728f9e5 --- /dev/null 
+++ b/recipes/multimodal/server/backends/s2s_incremental_backend.py @@ -0,0 +1,1122 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Incremental Speech-to-Speech (S2S) backend using NemotronVoiceChat. + +This backend processes audio frame-by-frame (80ms frames), simulating real-time +streaming behavior. It produces both text output (agent response + ASR) and +audio output via TTS. + +Based on: niva_s2s/niva/core/s2s/inference_streaming_realtime.py +Config: nanov2_demo_model_eartts_updated.yaml +""" + +import io +import json +import os +import shutil +import tempfile +import time +from dataclasses import dataclass +from datetime import datetime +from typing import Any, Dict, List, Optional, Set + +import numpy as np +import torch +import torchaudio +from omegaconf import DictConfig, OmegaConf +from transformers import DynamicCache + +from .base import ( + BackendConfig, + GenerationRequest, + GenerationResult, + InferenceBackend, + Modality, +) + +# Streaming parameters +SAMPLE_RATE = 16000 +FRAME_SIZE_SEC = 0.08 # 80ms per frame +FRAME_SIZE_SAMPLES = int(SAMPLE_RATE * FRAME_SIZE_SEC) # 1280 samples +TTS_SAMPLE_RATE = 22050 + +# Default hyper-parameters +DEFAULT_BUFFER_SIZE_FRAMES = 70 +DEFAULT_NUM_FRAMES_PER_INFERENCE = 1 +DEFAULT_CODEC_TOKEN_HISTORY_SIZE = 60 + + +@dataclass +class S2SIncrementalConfig(BackendConfig): + """Configuration for incremental S2S backend.""" + + # Config file path 
(YAML) + config_path: Optional[str] = None + + # Model paths (can override config) + llm_checkpoint_path: Optional[str] = None + tts_checkpoint_path: Optional[str] = None + speaker_reference: Optional[str] = None + + # Frame processing + buffer_size_frames: int = DEFAULT_BUFFER_SIZE_FRAMES + num_frames_per_inference: int = DEFAULT_NUM_FRAMES_PER_INFERENCE + codec_token_history_size: int = DEFAULT_CODEC_TOKEN_HISTORY_SIZE + silence_padding_sec: float = 5.0 + + # Turn-taking + force_turn_taking: bool = True + force_turn_taking_threshold: int = 40 + force_turn_taking_pad_window: int = 25 + + # Audio decoding + decode_audio: bool = True + + # Session artifacts saving + save_session_artifacts: bool = True # Whether to save input/output artifacts per session + session_artifacts_dir: str = "/tmp/s2s_sessions" # Directory to save session artifacts + + # Per-frame alignment output + output_frame_alignment: bool = False # Whether to include per-frame alignment in debug output + + # Response end detection (for session backend) + response_end_detection_mode: str = "audio_energy" # "audio_energy" or "eos" + audio_energy_threshold: float = 0.01 # RMS threshold for audio energy detection + audio_energy_window_sec: float = 0.5 # Window size for audio energy calculation + max_response_duration_sec: float = 30.0 # Maximum response duration before forced stop + eos_detection_window: int = 10 # Consecutive PAD tokens to detect EOS + + @classmethod + def from_dict(cls, d: Dict[str, Any]) -> "S2SIncrementalConfig": + """Create config from dictionary.""" + known_fields = { + "model_path", + "device", + "dtype", + "max_new_tokens", + "temperature", + "top_p", + "top_k", + "config_path", + "llm_checkpoint_path", + "tts_checkpoint_path", + "speaker_reference", + "buffer_size_frames", + "num_frames_per_inference", + "codec_token_history_size", + "silence_padding_sec", + "force_turn_taking", + "force_turn_taking_threshold", + "force_turn_taking_pad_window", + "decode_audio", + 
"save_session_artifacts", + "session_artifacts_dir", + "output_frame_alignment", + "response_end_detection_mode", + "audio_energy_threshold", + "audio_energy_window_sec", + "max_response_duration_sec", + "eos_detection_window", + } + known = {k: v for k, v in d.items() if k in known_fields} + extra = {k: v for k, v in d.items() if k not in known_fields} + return cls(**known, extra_config=extra) + + +class S2SIncrementalBackend(InferenceBackend): + """ + Incremental Speech-to-Speech backend using NemotronVoiceChat. + + Processes audio frame-by-frame and generates synchronized text + audio output. + """ + + @property + def name(self) -> str: + return "s2s_incremental" + + @property + def supported_modalities(self) -> Set[Modality]: + return {Modality.TEXT, Modality.AUDIO_IN, Modality.AUDIO_OUT} + + def __init__(self, config: BackendConfig): + if isinstance(config, S2SIncrementalConfig): + self.inc_config = config + else: + self.inc_config = S2SIncrementalConfig.from_dict( + { + **{ + k: getattr(config, k) + for k in ["model_path", "device", "dtype", "max_new_tokens", "temperature", "top_p", "top_k"] + }, + **config.extra_config, + } + ) + + super().__init__(self.inc_config) + + self._tokenizer = None + self._model_cfg = None + + # TTS state + self.first_context_subword_id = None + self.generation_config = None + self.first_tts_code_input = None + self.first_tts_past_key_values_input = None + self.target_sample_rate = TTS_SAMPLE_RATE + self.target_fps = None + + def _resolve_dtype(self, compute_dtype): + """Resolve dtype string to torch dtype.""" + if isinstance(compute_dtype, torch.dtype): + return compute_dtype + if compute_dtype is None: + return torch.bfloat16 + if isinstance(compute_dtype, str): + mapping = { + "bfloat16": torch.bfloat16, + "bf16": torch.bfloat16, + "float16": torch.float16, + "fp16": torch.float16, + "float32": torch.float32, + "fp32": torch.float32, + } + return mapping.get(compute_dtype.lower(), torch.bfloat16) + return torch.bfloat16 + + def 
_load_and_merge_configs(self): + """Load and merge configurations from checkpoint and YAML config.""" + import json + + model_path = self.inc_config.tts_checkpoint_path or self.config.model_path + llm_path = self.inc_config.llm_checkpoint_path or self.config.model_path + + # Load nano's config (for LLM, perception) + nano_config_file = os.path.join(llm_path, "config.json") + print(f"[S2SIncremental] Loading nano config: {nano_config_file}") + with open(nano_config_file, "r") as f: + nano_cfg_dict = json.load(f) + nano_cfg = DictConfig(nano_cfg_dict) + + # Load eartts's config (for TTS) if different path + if model_path != llm_path: + eartts_config_file = os.path.join(model_path, "config.json") + print(f"[S2SIncremental] Loading eartts config: {eartts_config_file}") + with open(eartts_config_file, "r") as f: + eartts_cfg_dict = json.load(f) + eartts_cfg = DictConfig(eartts_cfg_dict) + + # Merge TTS config + if "model" in eartts_cfg and "speech_generation" in eartts_cfg.model: + nano_cfg.model.speech_generation = eartts_cfg.model.speech_generation + if "data" not in nano_cfg: + nano_cfg.data = eartts_cfg.data + + # Set speaker reference + speaker_ref = self.inc_config.speaker_reference + if not speaker_ref and self.inc_config.config_path: + # Try to get from YAML config + yaml_cfg = OmegaConf.load(self.inc_config.config_path) + speaker_ref = yaml_cfg.get("model", {}).get("inference_speaker_reference") + + if speaker_ref: + if "model" not in nano_cfg: + nano_cfg.model = {} + nano_cfg.model.inference_speaker_reference = speaker_ref + + return nano_cfg + + def load_model(self) -> None: + """Load the NemotronVoiceChat model with TTS support.""" + from safetensors.torch import load_file + + print(f"[S2SIncremental] Loading model from {self.config.model_path}...") + + try: + from nemo.collections.speechlm2.models.nemotron_voicechat import NemotronVoiceChat + from nemo.collections.speechlm2.parts.pretrained import set_model_dict_for_partial_init + except ImportError as e: + 
raise RuntimeError( + f"Failed to import NemotronVoiceChat. Make sure NeMo with speechlm2 " + f"collection is installed. Error: {e}" + ) + + # Set precision settings + torch.backends.cudnn.allow_tf32 = True + torch.backends.cuda.matmul.allow_tf32 = True + torch.set_float32_matmul_precision("high") + + # Load and merge configs + cfg = self._load_and_merge_configs() + self._model_cfg = cfg + + # Don't use pretrained paths - we'll load weights manually + cfg.model.stt.model.pretrained_s2s_model = None + if hasattr(cfg.model, "speech_generation") and hasattr(cfg.model.speech_generation, "model"): + cfg.model.speech_generation.model.pretrained_model = None + + cfg_dict = OmegaConf.to_container(cfg, resolve=True) + + # Initialize model structure + print("[S2SIncremental] Initializing model structure...") + self._model = NemotronVoiceChat(cfg_dict) + + # Load LLM + perception weights + model_path = self.config.model_path + llm_path = self.inc_config.llm_checkpoint_path or model_path + tts_path = self.inc_config.tts_checkpoint_path or model_path + + if llm_path: + safetensors_path = os.path.join(llm_path, "model.safetensors") + if os.path.exists(safetensors_path): + print(f"[S2SIncremental] Loading LLM weights from: {llm_path}") + nano_state_dict = load_file(safetensors_path) + + # Filter out TTS weights + tts_keys = ["tts_model.", "speech_generation."] + nano_filtered = { + k: v for k, v in nano_state_dict.items() if not any(k.startswith(prefix) for prefix in tts_keys) + } + + nano_filtered = set_model_dict_for_partial_init(nano_filtered, self._model.state_dict()) + self._model.load_state_dict(nano_filtered, strict=False) + + # Load TTS weights (always load, even if from same path as LLM since we filtered them out above) + if tts_path: + safetensors_path = os.path.join(tts_path, "model.safetensors") + if os.path.exists(safetensors_path): + print(f"[S2SIncremental] Loading TTS weights from: {tts_path}") + tts_state_dict = load_file(safetensors_path) + + tts_keys_filter = 
["tts_model."] + tts_only = { + k: v for k, v in tts_state_dict.items() if any(k.startswith(prefix) for prefix in tts_keys_filter) + } + print(f"[S2SIncremental] Loading {len(tts_only)} TTS parameters") + + self._model.load_state_dict(tts_only, strict=False) + + # Setup model + self.dtype = self._resolve_dtype(self.config.dtype) + self._model.to(self.config.device) + self._model.eval() + + # Convert S2S components to configured dtype (keep TTS in float32) + print(f"[S2SIncremental] Converting S2S components to {self.dtype}") + self._model.stt_model.llm = self._model.stt_model.llm.to(self.dtype) + self._model.stt_model.lm_head = self._model.stt_model.lm_head.to(self.dtype) + self._model.stt_model.embed_tokens = self._model.stt_model.embed_tokens.to(self.dtype) + self._model.stt_model.asr_head = self._model.stt_model.asr_head.to(self.dtype) + self._model.stt_model.embed_asr_tokens = self._model.stt_model.embed_asr_tokens.to(self.dtype) + + self._model.on_train_epoch_start() + self._tokenizer = self._model.stt_model.tokenizer + + # Get TTS info + if hasattr(self._model, "tts_model") and self.inc_config.decode_audio: + self.target_fps = self._model.tts_model.target_fps + self.target_sample_rate = self._model.tts_model.target_sample_rate + print(f"[S2SIncremental] TTS: fps={self.target_fps}, sample_rate={self.target_sample_rate}") + self._prepare_tts_initial_state() + + self._is_loaded = True + print("[S2SIncremental] Model loaded successfully") + + def _get_bos_embedding(self): + """Get beginning of sequence embedding.""" + text_bos = torch.full((1,), fill_value=self._model.stt_model.text_pad_id, device=self.config.device) + input_embeds = self._model.stt_model.embed_tokens(text_bos) + return input_embeds.to(dtype=self.dtype) + + def _get_asr_bos_embedding(self): + """Get ASR BOS embedding.""" + text_bos = torch.full((1,), fill_value=self._model.stt_model.text_pad_id, device=self.config.device) + input_embeds = self._model.stt_model.embed_asr_tokens(text_bos) + return 
input_embeds.to(dtype=self.dtype) + + def _clone_cache(self, cache): + """Deep clone cache structures.""" + if cache is None: + return None + if isinstance(cache, torch.Tensor): + return cache.detach().clone() + if isinstance(cache, (list, tuple)): + return type(cache)(self._clone_cache(x) for x in cache) + if isinstance(cache, dict): + return {k: self._clone_cache(v) for k, v in cache.items()} + if hasattr(cache, "__dict__"): + import copy + + return copy.deepcopy(cache) + return cache + + def _decode_single_token(self, token_id: int, pad_id: int) -> str: + """Decode a single token to text.""" + try: + # Use ids_to_tokens which properly handles special tokens + tokens = self._tokenizer.ids_to_tokens([token_id]) + if tokens: + token_str = tokens[0] + # Replace Ġ with space for readability + token_str = token_str.replace("Ġ", " ") + return token_str + return f"" + except Exception: + return f"" + + def _init_frame_alignment(self) -> Dict[str, list]: + """Initialize frame alignment as dict of lists for space efficiency.""" + return { + "frame_idx": [], + "user_stream": [], + "agent_stream_token": [], + "agent_stream_decoded": [], + "asr_stream_token": [], + "asr_stream_decoded": [], + "is_tts_stop": [], + } + + def _append_frame_alignment( + self, + frame_alignment: Dict[str, list], + frame_idx: int, + phase: str, + gen_text: torch.Tensor, + gen_asr_text: torch.Tensor, + pad_id: int, + is_tts_stop: bool = False, + ) -> None: + """Append per-frame alignment information to dict of lists.""" + agent_token = gen_text[0, frame_idx].item() if frame_idx < gen_text.shape[1] else pad_id + asr_token = gen_asr_text[0, frame_idx].item() if frame_idx < gen_asr_text.shape[1] else pad_id + frame_alignment["frame_idx"].append(frame_idx) + frame_alignment["user_stream"].append(phase) + frame_alignment["agent_stream_token"].append(agent_token) + frame_alignment["agent_stream_decoded"].append(self._decode_single_token(agent_token, pad_id)) + 
frame_alignment["asr_stream_token"].append(asr_token) + frame_alignment["asr_stream_decoded"].append(self._decode_single_token(asr_token, pad_id)) + frame_alignment["is_tts_stop"].append(is_tts_stop) + + def _get_artifacts_dir(self, request_id: str) -> Optional[str]: + """Get or create artifacts directory for this request.""" + if not self.inc_config.save_session_artifacts: + return None + base_dir = self.inc_config.session_artifacts_dir + artifacts_dir = os.path.join(base_dir, request_id) + os.makedirs(artifacts_dir, exist_ok=True) + return artifacts_dir + + def _save_artifacts( + self, + artifacts_dir: str, + input_audio_path: str, + output_text: str, + output_audio_bytes: Optional[bytes], + debug_info: Dict[str, Any], + generation_time_ms: float, + ) -> Dict[str, str]: + """Save input/output artifacts to disk.""" + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + + # Copy input audio + input_dest = os.path.join(artifacts_dir, f"{timestamp}_input.wav") + shutil.copy2(input_audio_path, input_dest) + + # Save output audio + output_audio_path = None + if output_audio_bytes: + output_audio_path = os.path.join(artifacts_dir, f"{timestamp}_output.wav") + with open(output_audio_path, "wb") as f: + f.write(output_audio_bytes) + + # Save output JSON + output_json_path = os.path.join(artifacts_dir, f"{timestamp}_output.json") + with open(output_json_path, "w") as f: + json.dump( + { + "timestamp": timestamp, + "text": output_text, + "audio_path": output_audio_path, + "debug_info": debug_info, + "generation_time_ms": generation_time_ms, + }, + f, + indent=2, + ) + + return {"artifacts_dir": artifacts_dir, "input_path": input_dest, "output_path": output_audio_path} + + def _generate_dual_channel_audio( + self, + artifacts_dir: str, + input_audio_path: str, + output_audio_bytes: Optional[bytes], + ) -> Optional[str]: + """Generate 2-channel audio (user=ch0, agent=ch1).""" + import soundfile as sf + + if not output_audio_bytes: + return None + + output_sr = 
TTS_SAMPLE_RATE + + # Load user audio + try: + user_audio, user_sr = sf.read(input_audio_path) + if user_sr != output_sr: + import scipy.signal + + user_audio = scipy.signal.resample(user_audio, int(len(user_audio) * output_sr / user_sr)) + if len(user_audio.shape) > 1: + user_audio = user_audio[:, 0] + except Exception as e: + print(f"[S2SIncremental] Error reading user audio: {e}") + return None + + # Load agent audio + try: + agent_audio, agent_sr = sf.read(io.BytesIO(output_audio_bytes)) + if agent_sr != output_sr: + import scipy.signal + + agent_audio = scipy.signal.resample(agent_audio, int(len(agent_audio) * output_sr / agent_sr)) + if len(agent_audio.shape) > 1: + agent_audio = agent_audio[:, 0] + except Exception as e: + print(f"[S2SIncremental] Error reading agent audio: {e}") + return None + + # Create 2-channel audio (zero-padded to max length) + max_len = max(len(user_audio), len(agent_audio)) + stereo = np.zeros((max_len, 2), dtype=np.float32) + stereo[: len(user_audio), 0] = user_audio + stereo[: len(agent_audio), 1] = agent_audio + + # Normalize + max_val = np.abs(stereo).max() + if max_val > 0: + stereo = stereo / max_val * 0.95 + + output_path = os.path.join(artifacts_dir, "dual_channel.wav") + sf.write(output_path, stereo, output_sr) + print(f"[S2SIncremental] Generated dual-channel audio: {output_path}") + return output_path + + def _prepare_tts_initial_state(self): + """Prepare TTS warmup state with speaker reference.""" + from nemo.collections.audio.parts.utils.resampling import resample + from nemo.collections.speechlm2.parts.precision import fp32_precision + + if not hasattr(self._model, "tts_model"): + return + + speaker_ref = None + if self._model_cfg and hasattr(self._model_cfg, "model"): + speaker_ref = self._model_cfg.model.get("inference_speaker_reference") + if not speaker_ref: + speaker_ref = self.inc_config.speaker_reference + + if not speaker_ref: + print("[S2SIncremental] Warning: No speaker reference, TTS disabled") + return + + 
print(f"[S2SIncremental] Preparing TTS with speaker: {speaker_ref}") + + with fp32_precision(): + speaker_audio, speaker_sr = torchaudio.load(speaker_ref) + speaker_audio = resample(speaker_audio, speaker_sr, self._model.tts_model.target_sample_rate) + + speaker_audio = speaker_audio.to(self.config.device) + speaker_audio_lens = torch.tensor([speaker_audio.size(1)], device=self.config.device).long() + + self._model.tts_model.set_init_inputs( + speaker_audio=speaker_audio, + speaker_audio_lens=speaker_audio_lens, + ) + init_inputs = self._model.tts_model.get_init_inputs(B=1) + + self.generation_config = self._model.tts_model._get_generation_config(guidance_enabled=True) + init_inputs.update({"use_cache": True, "past_key_values": None, "guidance_enabled": True}) + + # Debug: print generation config + print(f"[S2SIncremental] TTS generation_config: {self.generation_config}") + + with torch.no_grad(): + outputs = self._model.tts_model.tts_model(**init_inputs) + code = init_inputs["code"][:, -1:] + + self.first_context_subword_id = init_inputs["subword_ids"][:, -1].unsqueeze(-1) + self.first_tts_code_input = code.detach().clone() + self.first_tts_past_key_values_input = self._clone_cache(outputs.past_key_values) + + # Debug: print TTS state shapes + print(f"[S2SIncremental] first_context_subword_id shape: {self.first_context_subword_id.shape}") + print(f"[S2SIncremental] first_context_subword_id value: {self.first_context_subword_id}") + print(f"[S2SIncremental] first_tts_code_input shape: {self.first_tts_code_input.shape}") + print(f"[S2SIncremental] first_tts_code_input value: {self.first_tts_code_input}") + print(f"[S2SIncremental] codec_silence_tokens: {self._model.tts_model.codec_silence_tokens}") + print(f"[S2SIncremental] codec_token_history_size: {self.inc_config.codec_token_history_size}") + + print("[S2SIncremental] TTS warmup state prepared") + + def _samples_per_audio_output_frame(self): + """Calculate samples per audio output frame.""" + rate = 
self.target_sample_rate or TTS_SAMPLE_RATE + return int(float(rate) * FRAME_SIZE_SEC) + + def _update_audio_buffer(self, audio_buffer, buffer_fill_level, new_audio, buffer_size_samples): + """Update sliding window audio buffer.""" + if new_audio.shape[1] == 0: + current_buffer = audio_buffer[:, :buffer_fill_level] + return audio_buffer, buffer_fill_level, current_buffer + + remaining = new_audio + + if buffer_fill_level < buffer_size_samples and remaining.shape[1] > 0: + warmup_take = min(buffer_size_samples - buffer_fill_level, remaining.shape[1]) + if warmup_take > 0: + audio_buffer[:, buffer_fill_level : buffer_fill_level + warmup_take] = remaining[:, :warmup_take] + buffer_fill_level += warmup_take + remaining = remaining[:, warmup_take:] + + if remaining.shape[1] > 0: + if remaining.shape[1] >= buffer_size_samples: + audio_buffer = remaining[:, -buffer_size_samples:] + else: + audio_buffer = torch.cat([audio_buffer[:, remaining.shape[1] :], remaining], dim=1) + buffer_fill_level = buffer_size_samples + + current_buffer = ( + audio_buffer if buffer_fill_level == buffer_size_samples else audio_buffer[:, :buffer_fill_level] + ) + return audio_buffer, buffer_fill_level, current_buffer + + def _maybe_apply_forced_turn_taking(self, t, gen_text, gen_asr): + """Apply forced turn-taking rules based on ASR tokens.""" + if not self.inc_config.force_turn_taking: + return + + threshold = self.inc_config.force_turn_taking_threshold + pad_window_steps = self.inc_config.force_turn_taking_pad_window + + B = gen_text.size(0) + for batch_idx in range(B): + lookback_start = max(0, t - threshold) + agent_text_window = gen_text[batch_idx, lookback_start:t] + + if t < pad_window_steps: + continue + + pad_lookback_start = t - pad_window_steps + asr_recent_tokens = gen_asr[batch_idx, pad_lookback_start:t] + has_pad_window = ( + (asr_recent_tokens == self._model.stt_model.text_pad_id).all() if len(asr_recent_tokens) > 0 else False + ) + + if has_pad_window and pad_lookback_start > 0: + 
token_before_window = gen_asr[batch_idx, pad_lookback_start - 1] + has_pad_window = token_before_window != self._model.stt_model.text_pad_id + elif has_pad_window and pad_lookback_start == 0: + has_pad_window = False + + if has_pad_window: + if not (agent_text_window == self._model.stt_model.text_bos_id).any(): + gen_text[batch_idx, t] = self._model.stt_model.text_bos_id + + def infer_one_step( + self, + audio_input, + num_frames_per_inference, + frame_idx, + gen_text, + audio_toks_buffer, + input_embeds_history, + dynamic_cache, + embedding_position, + past_key_values, + code, + subword_mask, + gen_asr_text, + ): + """Process one inference step (potentially multiple frames).""" + from nemo.collections.speechlm2.parts.precision import fp32_precision + + use_cache = dynamic_cache is not None + batch_size = gen_text.shape[0] + device = self.config.device + + predicted_tokens = torch.empty((batch_size, num_frames_per_inference), dtype=gen_text.dtype, device=device) + asr_predicted_tokens = torch.empty((batch_size, num_frames_per_inference), dtype=gen_text.dtype, device=device) + tts_silence_mask = torch.ones((batch_size, num_frames_per_inference), dtype=torch.bool, device=device) + + # Perception step + buffer_len = torch.tensor([audio_input.shape[1]], dtype=torch.long, device=device) + source_encoded, _, _ = self._model.stt_model.perception( + input_signal=audio_input, + input_signal_length=buffer_len, + return_encoder_emb=True, + ) + source_encoded = source_encoded.to(self.dtype) + total_encoded_frames = source_encoded.shape[1] + + if embedding_position < 0: + newest_frame_index = total_encoded_frames + embedding_position + else: + newest_frame_index = embedding_position + + base_frame_index = newest_frame_index - (num_frames_per_inference - 1) + base_frame_index = max(base_frame_index, 0) + + new_input_embeds = [] + decode_audio = self.inc_config.decode_audio and hasattr(self._model, "tts_model") + + for chunk_offset in range(num_frames_per_inference): + 
current_frame_idx = frame_idx + chunk_offset + current_frame_index = min(base_frame_index + chunk_offset, total_encoded_frames - 1) + current_frame_embedding = source_encoded[:, current_frame_index : current_frame_index + 1, :] + + current_input_emb = current_frame_embedding.clone() + current_input_emb *= self._model.stt_model.cfg.get("duplex_nano_channel_weight", 1.0) + + if current_frame_idx == 0: + current_input_emb += self._get_bos_embedding() + current_input_emb += self._get_asr_bos_embedding() + else: + last_token_emb = self._model.stt_model.embed_tokens(gen_text[:, current_frame_idx - 1]) + current_input_emb += last_token_emb + last_asr_token_emb = self._model.stt_model.embed_asr_tokens(gen_asr_text[:, current_frame_idx - 1]) + current_input_emb += last_asr_token_emb + + # Forward pass + if use_cache: + ans = self._model.stt_model(current_input_emb, cache=dynamic_cache) + dynamic_cache = ans["cache"] + else: + new_input_embeds.append(current_input_emb) + full_input_embeds = torch.cat(input_embeds_history + new_input_embeds, dim=1) + ans = self._model.stt_model(full_input_embeds, cache=None) + + # Sample tokens + predicted_token = ans["text_logits"][:, -1].argmax(dim=-1) + asr_predicted_token = ans["asr_logits"][:, -1].argmax(dim=-1) + + gen_text[:, current_frame_idx] = predicted_token + predicted_tokens[:, chunk_offset] = predicted_token + gen_asr_text[:, current_frame_idx] = asr_predicted_token + asr_predicted_tokens[:, chunk_offset] = asr_predicted_token + + # Apply turn-taking + self._maybe_apply_forced_turn_taking(current_frame_idx, gen_text, gen_asr_text) + predicted_tokens[:, chunk_offset] = gen_text[:, current_frame_idx] + + # TTS step + if decode_audio and self.generation_config is not None: + current_subword_id = gen_text[:, current_frame_idx].unsqueeze(-1) + + if current_frame_idx == 0: + prev_subword_id = self.first_context_subword_id + else: + prev_subword_id = gen_text[:, current_frame_idx - 1].unsqueeze(-1) + + current_subword_mask = 
subword_mask[:, current_frame_idx].unsqueeze(-1) + + # Debug TTS inputs for first few frames + if current_frame_idx < 3: + print( + f"[DEBUG TTS frame {current_frame_idx}] current_subword_id: {current_subword_id.item()}, prev_subword_id: {prev_subword_id.item()}" + ) + print(f"[DEBUG TTS frame {current_frame_idx}] current_subword_mask: {current_subword_mask}") + print( + f"[DEBUG TTS frame {current_frame_idx}] prev_audio_tokens shape: {code.shape}, values: {code[0, 0, :5]}" + ) + + code, past_key_values = self._model.tts_model.infer_codes_one_step( + current_subword_id=current_subword_id, + prev_subword_id=prev_subword_id, + current_subword_mask=current_subword_mask, + prev_audio_tokens=code, + past_key_values=past_key_values, + guidance_enabled=True, + generation_config=self.generation_config, + ignore_eos_flag_stop=True, + ) + + # Debug TTS output for first few frames + if current_frame_idx < 3: + print(f"[DEBUG TTS frame {current_frame_idx}] NEW code: {code[0, 0, :5]}") + + audio_toks_buffer = torch.cat([audio_toks_buffer[:, 1:], code], dim=1) + + # Handle silence on EOS + if self._model.cfg.get("inference_force_speech_silence_on_eos", None): + silence_codes = self._model.tts_model.codec_silence_tokens.view(1, 1, -1).expand(code.shape) + code = torch.where( + current_subword_id.unsqueeze(-1) == self._model.tts_model.text_eos_id, + silence_codes, + code, + ) + + # Mark whether this frame's generated codec tokens correspond to silence + # (shape of `code` is [B, 1, C]; silence_codes is broadcast-compatible). 
+ try: + silence_codes = self._model.tts_model.codec_silence_tokens.view(1, 1, -1).expand(code.shape) + tts_silence_mask[:, chunk_offset] = (code == silence_codes).all(dim=-1).squeeze(1) + except Exception: + # If something goes wrong (unexpected shapes), keep default True (treat as silence) + tts_silence_mask[:, chunk_offset] = True + + # Decode audio + decoded_audio_new = None + if decode_audio and audio_toks_buffer is not None: + samples_per_frame = self._samples_per_audio_output_frame() + len_audio_toks_buffer = torch.tensor( + [self.inc_config.codec_token_history_size], dtype=torch.long, device=device + ) + + # Debug: print audio_toks_buffer info + if frame_idx == 0: + print(f"[DEBUG] audio_toks_buffer shape: {audio_toks_buffer.shape}") + print(f"[DEBUG] audio_toks_buffer dtype: {audio_toks_buffer.dtype}") + print(f"[DEBUG] audio_toks_buffer sample values: {audio_toks_buffer[0, :3, :5]}") + + with fp32_precision(), torch.no_grad(): + decoded_audio, _ = self._model.tts_model.audio_codec.decode(audio_toks_buffer, len_audio_toks_buffer) + + # Debug: print decoded audio info + if frame_idx == 0: + print(f"[DEBUG] decoded_audio shape: {decoded_audio.shape}") + print(f"[DEBUG] decoded_audio dtype: {decoded_audio.dtype}") + print(f"[DEBUG] decoded_audio min/max: {decoded_audio.min():.4f} / {decoded_audio.max():.4f}") + + decoded_audio_new = decoded_audio[:, :, -samples_per_frame * num_frames_per_inference :] + + # Convert tokens to text + predicted_text_strs = [] + for predicted_tok_ids_b in predicted_tokens: + toks = self._tokenizer.ids_to_tokens(predicted_tok_ids_b.tolist()) + toks = [t.replace("", "").replace("Ġ", " ") for t in toks] + predicted_text_strs.append("".join(toks)) + + asr_predicted_text_strs = [] + for asr_tok_ids_b in asr_predicted_tokens: + toks = self._tokenizer.ids_to_tokens(asr_tok_ids_b.tolist()) + toks = [t.replace("", "").replace("Ġ", " ") for t in toks] + asr_predicted_text_strs.append("".join(toks)) + + return { + "predicted_text_tokens": 
predicted_tokens, + "asr_predicted_text_tokens": asr_predicted_tokens, + "audio_toks_buffer": audio_toks_buffer, + "decoded_audio_new": decoded_audio_new, + "predicted_text_strs": predicted_text_strs, + "asr_predicted_text_strs": asr_predicted_text_strs, + "tts_silence_mask": tts_silence_mask, + "input_embeds_history": input_embeds_history + new_input_embeds if not use_cache else input_embeds_history, + "dynamic_cache": dynamic_cache if use_cache else None, + "past_key_values": past_key_values, + "code": code, + } + + @torch.no_grad() + def inference_realtime_streaming(self, audio_path: str, num_frames_per_inference: int = None): + """ + Perform incremental streaming inference on audio file. + + Args: + audio_path: Path to input audio file + num_frames_per_inference: Frames to process per step (default: 1) + + Returns: + Dict with 'text', 'asr_text', 'audio' outputs + """ + import librosa + from nemo.collections.speechlm2.models.duplex_s2s_model import tokens_to_str + + if num_frames_per_inference is None: + num_frames_per_inference = self.inc_config.num_frames_per_inference + + device = self.config.device + buffer_size_frames = self.inc_config.buffer_size_frames + buffer_size_samples = buffer_size_frames * FRAME_SIZE_SAMPLES + + # Load audio + audio_signal, sr = librosa.load(audio_path, sr=SAMPLE_RATE) + + # Add silence padding + if self.inc_config.silence_padding_sec > 0: + silence_samples = int(self.inc_config.silence_padding_sec * SAMPLE_RATE) + audio_signal = np.concatenate([audio_signal, np.zeros(silence_samples)]) + + total_samples = len(audio_signal) + + # Calculate frames + total_frames_maybe = int(np.ceil(total_samples / FRAME_SIZE_SAMPLES)) + num_inference_steps = total_frames_maybe // num_frames_per_inference + if total_frames_maybe % num_frames_per_inference != 0: + num_inference_steps += 1 + total_frames = num_inference_steps * num_frames_per_inference + + # Pad audio + padded_samples = num_inference_steps * num_frames_per_inference * 
FRAME_SIZE_SAMPLES + if padded_samples > total_samples: + audio_signal = np.pad(audio_signal, (0, padded_samples - total_samples)) + + # Audio must be float32 for AudioPreprocessor accuracy + audio_tensor = torch.tensor(audio_signal, dtype=torch.float32, device=device).unsqueeze(0) + + # Check cache support + use_cache = "Nemotron" not in self._model.stt_model.cfg.pretrained_llm + + # Initialize buffers (float32 for audio preprocessing) + audio_buffer = torch.zeros(1, buffer_size_samples, dtype=torch.float32, device=device) + buffer_fill_level = 0 + + if use_cache: + llm_cache = DynamicCache() + else: + llm_cache = None + input_embeds_history = [] + + # Initialize TTS state + decode_audio = self.inc_config.decode_audio and hasattr(self._model, "tts_model") + code = None + past_key_values = None + subword_mask = None + audio_toks_buffer = None + + if decode_audio: + audio_toks_buffer = ( + self._model.tts_model.codec_silence_tokens.view(1, 1, -1) + .expand(-1, self.inc_config.codec_token_history_size, -1) + .to(device) + ) + + if self.first_tts_past_key_values_input is not None: + past_key_values = self._clone_cache(self.first_tts_past_key_values_input) + code = self.first_tts_code_input.detach().clone() + subword_mask = torch.ones(1, total_frames, device=device, dtype=torch.bool) + + gen_text = torch.full((1, total_frames), self._model.stt_model.text_pad_id, device=device, dtype=torch.long) + gen_asr_text = torch.full( + (1, total_frames), self._model.stt_model.text_pad_id, device=device, dtype=torch.long + ) + + audio_segments = [] + frame_alignment = self._init_frame_alignment() if self.inc_config.output_frame_alignment else None + pad_id = self._model.stt_model.text_pad_id + + # Frame-by-frame processing + frame_idx = 0 + while frame_idx < total_frames: + slice_start = frame_idx * FRAME_SIZE_SAMPLES + slice_n_samples = num_frames_per_inference * FRAME_SIZE_SAMPLES + new_audio = audio_tensor[:, slice_start : slice_start + slice_n_samples] + + audio_buffer, 
buffer_fill_level, current_buffer = self._update_audio_buffer( + audio_buffer, buffer_fill_level, new_audio, buffer_size_samples + ) + + result = self.infer_one_step( + audio_input=current_buffer, + num_frames_per_inference=num_frames_per_inference, + frame_idx=frame_idx, + gen_text=gen_text, + audio_toks_buffer=audio_toks_buffer if decode_audio else None, + input_embeds_history=input_embeds_history if not use_cache else [], + dynamic_cache=llm_cache if use_cache else None, + embedding_position=-1, + past_key_values=past_key_values if decode_audio else None, + code=code if decode_audio else None, + subword_mask=subword_mask if decode_audio else None, + gen_asr_text=gen_asr_text, + ) + + if not use_cache: + input_embeds_history = result["input_embeds_history"] + llm_cache = result["dynamic_cache"] + + if decode_audio: + audio_toks_buffer = result["audio_toks_buffer"] + if result["decoded_audio_new"] is not None: + audio_segments.append(result["decoded_audio_new"]) + past_key_values = result["past_key_values"] + code = result["code"] + + # Collect frame alignment + if frame_alignment is not None: + for i in range(num_frames_per_inference): + fi = frame_idx + i + if fi < total_frames: + self._append_frame_alignment(frame_alignment, fi, "user_turn", gen_text, gen_asr_text, pad_id) + + frame_idx += num_frames_per_inference + + # Prepare outputs + gen_text = gen_text[:, :total_frames] + gen_asr_text = gen_asr_text[:, :total_frames] + lengths = torch.tensor([total_frames], dtype=torch.long, device=device) + + text_output = tokens_to_str( + gen_text, + lengths, + tokenizer=self._tokenizer, + pad_id=self._model.stt_model.text_pad_id, + eval_text_turn_taking=True, + ) + asr_text_output = tokens_to_str( + gen_asr_text, + lengths, + tokenizer=self._tokenizer, + pad_id=self._model.stt_model.text_pad_id, + eval_text_turn_taking=True, + ) + + output_audio = None + if audio_segments: + output_audio = torch.cat(audio_segments, dim=-1) + print(f"[DEBUG] Final output_audio shape: 
{output_audio.shape}") + print(f"[DEBUG] Final output_audio min/max: {output_audio.min():.4f} / {output_audio.max():.4f}") + print(f"[DEBUG] Final output_audio mean/std: {output_audio.mean():.6f} / {output_audio.std():.4f}") + print(f"[DEBUG] Number of audio segments: {len(audio_segments)}") + + debug_info = {"total_frames": total_frames} + if frame_alignment is not None: + debug_info["frame_alignment"] = frame_alignment + + return { + "text": text_output, + "asr_text": asr_text_output, + "audio": output_audio, + "tokens_text": gen_text, + "tokens_len": lengths, + "debug_info": debug_info, + "input_audio_path": audio_path, + } + + def generate(self, requests: List[GenerationRequest]) -> List[GenerationResult]: + """Generate text + audio responses from audio inputs.""" + if not self._is_loaded: + return [GenerationResult(error="Model not loaded", request_id=r.request_id) for r in requests] + + if not requests: + return [] + + results = [] + + for req in requests: + start_time = time.time() + temp_file_path = None + + try: + # Get audio input + audio_path = req.audio_path + if req.audio_bytes: + temp_file = tempfile.NamedTemporaryFile(suffix=".wav", delete=False) + temp_file.write(req.audio_bytes) + temp_file.close() + temp_file_path = temp_file.name + audio_path = temp_file_path + + if not audio_path: + results.append( + GenerationResult( + error="Audio input required for s2s_incremental backend", request_id=req.request_id + ) + ) + continue + + # Run inference + output = self.inference_realtime_streaming( + audio_path=audio_path, + num_frames_per_inference=self.inc_config.num_frames_per_inference, + ) + + # Encode audio to bytes + audio_bytes = None + if output["audio"] is not None: + audio_np = output["audio"].float().cpu().numpy().squeeze() + max_val = np.abs(audio_np).max() + if max_val > 0: + audio_np = audio_np / max_val * 0.95 + wav_buffer = io.BytesIO() + import soundfile as sf + + sf.write(wav_buffer, audio_np, self.target_sample_rate, format="WAV") + 
audio_bytes = wav_buffer.getvalue() + + elapsed_ms = (time.time() - start_time) * 1000 + output_text = output["text"][0] if output["text"] else "" + debug_info = output.get("debug_info", {}) + + # Save artifacts if enabled + request_id = req.request_id or datetime.now().strftime("%Y%m%d_%H%M%S") + artifacts_dir = self._get_artifacts_dir(request_id) + response_audio_bytes = audio_bytes + response_sample_rate = self.target_sample_rate + + if artifacts_dir: + self._save_artifacts(artifacts_dir, audio_path, output_text, audio_bytes, debug_info, elapsed_ms) + # Generate dual-channel audio and use it as the response + dual_path = self._generate_dual_channel_audio(artifacts_dir, audio_path, audio_bytes) + if dual_path: + debug_info["dual_channel_audio_path"] = dual_path + # Read dual-channel audio to return to client + with open(dual_path, "rb") as f: + response_audio_bytes = f.read() + response_sample_rate = TTS_SAMPLE_RATE # Dual-channel uses TTS sample rate + + results.append( + GenerationResult( + text=output_text, + audio_bytes=response_audio_bytes, + audio_sample_rate=response_sample_rate, + request_id=req.request_id, + generation_time_ms=elapsed_ms, + debug_info=debug_info, + ) + ) + + except Exception as e: + import traceback + + traceback.print_exc() + results.append(GenerationResult(error=str(e), request_id=req.request_id)) + + finally: + if temp_file_path and os.path.exists(temp_file_path): + os.unlink(temp_file_path) + + return results + + def validate_request(self, request: GenerationRequest) -> Optional[str]: + """Validate request for incremental S2S.""" + if not request.audio_bytes and not request.audio_path: + return "Audio input is required for s2s_incremental backend" + return None + + def health_check(self) -> Dict[str, Any]: + """Return health status.""" + base = super().health_check() + if self._is_loaded: + base.update( + { + "buffer_size_frames": self.inc_config.buffer_size_frames, + "num_frames_per_inference": 
self.inc_config.num_frames_per_inference, + "decode_audio": self.inc_config.decode_audio, + "target_sample_rate": self.target_sample_rate, + "tts_enabled": self.generation_config is not None, + } + ) + return base diff --git a/recipes/multimodal/server/backends/s2s_incremental_backend_v2.py b/recipes/multimodal/server/backends/s2s_incremental_backend_v2.py new file mode 100644 index 0000000000..6112b2f6d1 --- /dev/null +++ b/recipes/multimodal/server/backends/s2s_incremental_backend_v2.py @@ -0,0 +1,651 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +V2 Incremental Speech-to-Speech (S2S) backend. + +Wraps NemotronVoicechatInferenceWrapper from the NeMo inference pipeline +instead of manually implementing model loading and inference. This enables: + - vLLM acceleration for both LLM and EarTTS + - Perception cache (incremental encoder) + - Codec cache (incremental audio decoding, no clicking) + - System prompt prefill + - Proper token-to-text BPE decoding + +All model initialization, weight loading, and core inference logic is +delegated to the wrapper -- this backend only adapts the InferenceBackend +server interface around it. 
+""" + +import io +import json +import os +import re +import shutil +import tempfile +import time +from dataclasses import dataclass, field +from datetime import datetime +from typing import Any, Dict, List, Optional, Set + +import numpy as np +import torch +from omegaconf import DictConfig, OmegaConf + +from .base import ( + BackendConfig, + GenerationRequest, + GenerationResult, + InferenceBackend, + Modality, +) + +SAMPLE_RATE = 16000 +FRAME_SIZE_SEC = 0.08 +FRAME_SIZE_SAMPLES = int(SAMPLE_RATE * FRAME_SIZE_SEC) +TTS_SAMPLE_RATE = 22050 + +DEFAULT_BUFFER_SIZE_FRAMES = 71 +DEFAULT_NUM_FRAMES_PER_INFERENCE = 1 +DEFAULT_CODEC_TOKEN_HISTORY_SIZE = 600 + + +@dataclass +class S2SIncrementalV2Config(BackendConfig): + """Configuration for V2 incremental S2S backend. + + Extends the base BackendConfig with parameters for the NeMo inference + pipeline wrapper, including vLLM, perception/codec caches, and prompts. + """ + + config_path: Optional[str] = None + + llm_checkpoint_path: Optional[str] = None + tts_checkpoint_path: Optional[str] = None + speaker_reference: Optional[str] = None + + buffer_size_frames: int = DEFAULT_BUFFER_SIZE_FRAMES + num_frames_per_inference: int = DEFAULT_NUM_FRAMES_PER_INFERENCE + codec_token_history_size: int = DEFAULT_CODEC_TOKEN_HISTORY_SIZE + + # Padding: pad_to_duration_secs takes precedence; if unset, silence_padding_sec + # seconds of silence are appended (matching V1 behavior). 
+ silence_padding_sec: float = 5.0 + pad_to_duration_secs: Optional[float] = None + + force_turn_taking: bool = False + force_turn_taking_threshold: int = 40 + force_turn_taking_pad_window: int = 25 + + decode_audio: bool = True + merge_user_channel: bool = False + use_asr_as_response: bool = False + + save_session_artifacts: bool = True + session_artifacts_dir: str = "/tmp/s2s_sessions" + + output_frame_alignment: bool = False + + response_end_detection_mode: str = "audio_energy" + audio_energy_threshold: float = 0.01 + audio_energy_window_sec: float = 0.5 + max_response_duration_sec: float = 30.0 + eos_detection_window: int = 10 + + engine_type: str = "native" + + use_perception_cache: bool = False + use_perception_cudagraph: bool = False + + use_codec_cache: bool = True + + repetition_penalty: float = 1.0 + top_p: float = 1.0 + temperature: float = 1.0 + + inference_pad_boost: Optional[float] = None + inference_bos_boost: Optional[float] = None + inference_eos_boost: Optional[float] = None + inference_user_pad_boost: Optional[float] = None + inference_user_bos_boost: Optional[float] = None + inference_user_eos_boost: Optional[float] = None + + system_prompt: Optional[str] = None + tts_system_prompt: Optional[str] = None + + vllm_llm_config: Optional[Dict[str, Any]] = field(default_factory=lambda: None) + vllm_tts_config: Optional[Dict[str, Any]] = field(default_factory=lambda: None) + + matmul_precision: str = "high" + + @classmethod + def from_dict(cls, d: Dict[str, Any]) -> "S2SIncrementalV2Config": + known_fields = {f.name for f in cls.__dataclass_fields__.values() if f.name != "extra_config"} + known = {k: v for k, v in d.items() if k in known_fields} + extra = {k: v for k, v in d.items() if k not in known_fields} + return cls(**known, extra_config=extra) + + +class S2SIncrementalBackendV2(InferenceBackend): + """ + V2 Incremental Speech-to-Speech backend. 
+ + Wraps ``NemotronVoicechatInferenceWrapper`` for model loading, + frame-level inference, and audio decoding. Adds the + ``InferenceBackend`` interface required by the unified server. + """ + + @property + def name(self) -> str: + return "s2s_incremental_v2" + + @property + def supported_modalities(self) -> Set[Modality]: + return {Modality.TEXT, Modality.AUDIO_IN, Modality.AUDIO_OUT} + + def __init__(self, config: BackendConfig): + if isinstance(config, S2SIncrementalV2Config): + self.v2_config = config + else: + self.v2_config = S2SIncrementalV2Config.from_dict( + { + **{ + k: getattr(config, k) + for k in [ + "model_path", + "device", + "dtype", + "max_new_tokens", + "temperature", + "top_p", + "top_k", + ] + }, + **config.extra_config, + } + ) + + # Alias so code written against V1's inc_config keeps working + self.inc_config = self.v2_config + + super().__init__(self.v2_config) + + self._wrapper = None + self._tokenizer = None + self._model_cfg = None + self.dtype = None + + self.first_context_subword_id = None + self.generation_config = None + self.first_tts_code_input = None + self.first_tts_past_key_values_input = None + self.target_sample_rate = TTS_SAMPLE_RATE + self.target_fps = None + + # ------------------------------------------------------------------ + # Wrapper config builder + # ------------------------------------------------------------------ + def _build_wrapper_config(self) -> DictConfig: + """Translate ``S2SIncrementalV2Config`` into the ``DictConfig`` + expected by ``NemotronVoicechatInferenceWrapper``.""" + cfg = self.v2_config + model_path = cfg.tts_checkpoint_path or cfg.model_path + llm_path = cfg.llm_checkpoint_path or cfg.model_path + + d: Dict[str, Any] = { + "model_path": model_path, + "llm_checkpoint_path": llm_path, + "speaker_reference": cfg.speaker_reference, + "buffer_size_frames": cfg.buffer_size_frames, + "decode_audio": cfg.decode_audio, + "codec_token_history_size": cfg.codec_token_history_size, + "engine_type": 
cfg.engine_type, + "use_perception_cache": cfg.use_perception_cache, + "use_perception_cudagraph": cfg.use_perception_cudagraph, + "use_codec_cache": cfg.use_codec_cache, + "top_p": cfg.top_p, + "repetition_penalty": cfg.repetition_penalty, + "temperature": cfg.temperature, + "tts_system_prompt": cfg.tts_system_prompt, + "compute_dtype": cfg.dtype, + "device": cfg.device, + "force_turn_taking": cfg.force_turn_taking, + "force_turn_taking_threshold": cfg.force_turn_taking_threshold, + "force_turn_taking_pad_window": cfg.force_turn_taking_pad_window, + } + + for boost_key in ( + "inference_pad_boost", + "inference_bos_boost", + "inference_eos_boost", + "inference_user_pad_boost", + "inference_user_bos_boost", + "inference_user_eos_boost", + ): + val = getattr(cfg, boost_key, None) + if val is not None: + d[boost_key] = val + + if cfg.vllm_llm_config: + d["vllm_llm_config"] = cfg.vllm_llm_config + if cfg.vllm_tts_config: + d["vllm_tts_config"] = cfg.vllm_tts_config + + return OmegaConf.create(d) + + # ------------------------------------------------------------------ + # Model loading -- delegates entirely to the wrapper + # ------------------------------------------------------------------ + def load_model(self) -> None: + from nemo.collections.speechlm2.inference.model_wrappers.nemotron_voicechat_inference_wrapper import ( + NemotronVoicechatInferenceWrapper, + ) + + print(f"[S2SIncrementalV2] Loading model (engine={self.v2_config.engine_type})...") + + torch.set_float32_matmul_precision(self.v2_config.matmul_precision) + + model_cfg = self._build_wrapper_config() + self._wrapper = NemotronVoicechatInferenceWrapper(model_cfg=model_cfg) + + # Expose attributes expected by the server and session backend + self._model = self._wrapper.model + self._tokenizer = self._wrapper.tokenizer + self._model_cfg = self._wrapper.model_cfg + self.dtype = self._wrapper.dtype + self.target_sample_rate = getattr(self._wrapper, "target_sample_rate", TTS_SAMPLE_RATE) + self.target_fps = 
getattr(self._wrapper, "target_fps", None) + + self.first_context_subword_id = self._wrapper.first_context_subword_id + self.generation_config = self._wrapper.generation_config + self.first_tts_code_input = self._wrapper.first_tts_code_input + self.first_tts_past_key_values_input = self._wrapper.first_tts_past_key_values_input + + self._is_loaded = True + print("[S2SIncrementalV2] Model loaded successfully") + + # ------------------------------------------------------------------ + # Core inference -- thin delegates to wrapper + # ------------------------------------------------------------------ + def infer_one_step( + self, + audio_input, + num_frames_per_chunk=None, + num_frames_per_inference=None, + frame_idx=0, + gen_text=None, + audio_toks_buffer=None, + input_embeds_history=None, + dynamic_cache=None, + embedding_position=None, # accepted for V1 compat, not forwarded + past_key_values=None, + code=None, + subword_mask=None, + gen_asr_text=None, + request_id=None, + perception_cache=None, + has_prompt=False, + codec_cache=None, + ): + """Delegate to the wrapper's ``infer_one_step``. + + Accepts both ``num_frames_per_chunk`` (new) and + ``num_frames_per_inference`` (V1 compat) -- the former takes + precedence. 
+ """ + nfpc = num_frames_per_chunk or num_frames_per_inference or self.v2_config.num_frames_per_inference + return self._wrapper.infer_one_step( + audio_input=audio_input, + num_frames_per_chunk=nfpc, + frame_idx=frame_idx, + gen_text=gen_text, + audio_toks_buffer=audio_toks_buffer, + input_embeds_history=input_embeds_history if input_embeds_history is not None else [], + dynamic_cache=dynamic_cache, + past_key_values=past_key_values, + code=code, + subword_mask=subword_mask, + gen_asr_text=gen_asr_text, + request_id=request_id, + perception_cache=perception_cache, + has_prompt=has_prompt, + codec_cache=codec_cache, + ) + + def _compute_pad_audio_sec(self, audio_path: str) -> Optional[float]: + """Resolve the effective ``pad_audio_to_sec`` value for the wrapper.""" + if self.v2_config.pad_to_duration_secs is not None: + return float(self.v2_config.pad_to_duration_secs) + if self.v2_config.silence_padding_sec > 0: + import librosa + + duration = librosa.get_duration(filename=audio_path) + return duration + self.v2_config.silence_padding_sec + return None + + @torch.no_grad() + def inference_realtime_streaming( + self, + audio_path: str, + num_frames_per_inference: int = None, + request_id: Optional[str] = None, + system_prompt: Optional[str] = None, + ) -> Dict[str, Any]: + """Run streaming inference on an audio file. + + Delegates entirely to the wrapper's + ``inference_realtime_streaming``, translating V1-style + parameters (``silence_padding_sec``) to V2-style + (``pad_audio_to_sec``). 
+ """ + nfpc = num_frames_per_inference or self.v2_config.num_frames_per_inference + pad_to = self._compute_pad_audio_sec(audio_path) + sys_prompt = system_prompt or self.v2_config.system_prompt + + result = self._wrapper.inference_realtime_streaming( + audio_path=audio_path, + num_frames_per_chunk=nfpc, + request_id=request_id, + pad_audio_to_sec=pad_to, + system_prompt=sys_prompt, + ) + + result["input_audio_path"] = audio_path + total = result.get("tokens_len", torch.tensor([0]))[0].item() + result.setdefault("debug_info", {"total_frames": total}) + + return result + + # ------------------------------------------------------------------ + # Server interface + # ------------------------------------------------------------------ + def generate(self, requests: List[GenerationRequest]) -> List[GenerationResult]: + if not self._is_loaded: + return [GenerationResult(error="Model not loaded", request_id=r.request_id) for r in requests] + if not requests: + return [] + + results = [] + for req in requests: + start_time = time.time() + temp_file_path = None + + try: + audio_path = req.audio_path + if req.audio_bytes: + temp_file = tempfile.NamedTemporaryFile(suffix=".wav", delete=False) + temp_file.write(req.audio_bytes) + temp_file.close() + temp_file_path = temp_file.name + audio_path = temp_file_path + + if not audio_path: + results.append( + GenerationResult( + error="Audio input required for s2s_incremental_v2 backend", + request_id=req.request_id, + ) + ) + continue + + output = self.inference_realtime_streaming( + audio_path=audio_path, + num_frames_per_inference=self.v2_config.num_frames_per_inference, + request_id=req.request_id, + system_prompt=req.system_prompt, + ) + + audio_bytes = None + out_sr = self.target_sample_rate + if output.get("audio") is not None: + import soundfile as sf + + wav = output["audio"].float().cpu().numpy().squeeze() + + # Trim to padded input duration (input + extra padding) at target SR. 
+ # Matches s2s_voicechat_infer_backend: trim to (input_len + padding) / source_sr * target_sr. + # This removes frame-alignment overshoot while keeping the full response. + pad_to = self._compute_pad_audio_sec(audio_path) + if pad_to is not None: + per_sample_pred_len = int(pad_to * out_sr) + if 0 < per_sample_pred_len < len(wav): + wav = wav[:per_sample_pred_len] + + max_val = float(np.abs(wav).max()) if wav.size else 0.0 + if max_val > 0: + wav = wav / max_val * 0.95 + + if self.v2_config.merge_user_channel: + merged = self._merge_user_model_audio(audio_path, wav, out_sr) + buf = io.BytesIO() + sf.write(buf, merged, out_sr, format="WAV") + audio_bytes = buf.getvalue() + else: + buf = io.BytesIO() + sf.write(buf, wav, out_sr, format="WAV") + audio_bytes = buf.getvalue() + + elapsed_ms = (time.time() - start_time) * 1000 + output_text = output["text"][0] if output.get("text") else "" + asr_text = output["asr_text"][0] if output.get("asr_text") else None + debug_info = output.get("debug_info", {}) + + if self.v2_config.use_asr_as_response and asr_text: + cleaned = asr_text + cleaned = re.sub(r"<[\$|][^>]*[\$|]>", "", cleaned) + cleaned = cleaned.replace("^", "") + cleaned = re.sub(r"\s+", " ", cleaned).strip() + output_text = cleaned + + request_id_key = req.request_id or datetime.now().strftime("%Y%m%d_%H%M%S") + artifacts_dir = self._get_artifacts_dir(request_id_key) + if artifacts_dir: + self._save_artifacts(artifacts_dir, audio_path, output_text, audio_bytes, debug_info, elapsed_ms) + + results.append( + GenerationResult( + text=output_text, + asr_text=asr_text, + audio_bytes=audio_bytes, + audio_sample_rate=out_sr, + request_id=req.request_id, + generation_time_ms=elapsed_ms, + debug_info=debug_info, + ) + ) + + except Exception as e: + import traceback + + traceback.print_exc() + results.append(GenerationResult(error=str(e), request_id=req.request_id)) + + finally: + if temp_file_path and os.path.exists(temp_file_path): + os.unlink(temp_file_path) + + return 
results + + # ------------------------------------------------------------------ + # Validation & health + # ------------------------------------------------------------------ + def validate_request(self, request: GenerationRequest) -> Optional[str]: + if not request.audio_bytes and not request.audio_path: + return "Audio input is required for s2s_incremental_v2 backend" + return None + + def health_check(self) -> Dict[str, Any]: + base = super().health_check() + if self._is_loaded: + base.update( + { + "buffer_size_frames": self.v2_config.buffer_size_frames, + "num_frames_per_inference": self.v2_config.num_frames_per_inference, + "decode_audio": self.v2_config.decode_audio, + "target_sample_rate": self.target_sample_rate, + "tts_enabled": self.generation_config is not None, + "engine_type": self.v2_config.engine_type, + "use_perception_cache": self.v2_config.use_perception_cache, + "use_codec_cache": self.v2_config.use_codec_cache, + } + ) + return base + + # ------------------------------------------------------------------ + # Wrapper delegates for session backend compatibility + # ------------------------------------------------------------------ + def _update_audio_buffer(self, audio_buffer, buffer_fill_level, new_audio, buffer_size_samples): + return self._wrapper._update_audio_buffer(audio_buffer, buffer_fill_level, new_audio, buffer_size_samples) + + def _clone_cache(self, cache): + return self._wrapper._clone_cache(cache) + + def _get_bos_embedding(self): + return self._wrapper._get_bos_embedding() + + def _get_asr_bos_embedding(self): + return self._wrapper._get_asr_bos_embedding() + + def _samples_per_audio_output_frame(self): + return self._wrapper._samples_per_audio_output_frame() + + def abort_request(self, request_id: Optional[str] = None) -> bool: + if self._wrapper is not None: + return self._wrapper.abort_request(request_id) + return False + + # ------------------------------------------------------------------ + # Artifact / dual-channel helpers 
(server-side I/O, not in wrapper) + # ------------------------------------------------------------------ + def _get_artifacts_dir(self, request_id: str) -> Optional[str]: + if not self.v2_config.save_session_artifacts: + return None + base_dir = self.v2_config.session_artifacts_dir + artifacts_dir = os.path.join(base_dir, request_id) + os.makedirs(artifacts_dir, exist_ok=True) + return artifacts_dir + + def _save_artifacts( + self, + artifacts_dir: str, + input_audio_path: str, + output_text: str, + output_audio_bytes: Optional[bytes], + debug_info: Dict[str, Any], + generation_time_ms: float, + ) -> Dict[str, str]: + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + + input_dest = os.path.join(artifacts_dir, f"{timestamp}_input.wav") + shutil.copy2(input_audio_path, input_dest) + + output_audio_path = None + if output_audio_bytes: + output_audio_path = os.path.join(artifacts_dir, f"{timestamp}_output.wav") + with open(output_audio_path, "wb") as f: + f.write(output_audio_bytes) + + output_json_path = os.path.join(artifacts_dir, f"{timestamp}_output.json") + with open(output_json_path, "w") as f: + json.dump( + { + "timestamp": timestamp, + "text": output_text, + "audio_path": output_audio_path, + "debug_info": debug_info, + "generation_time_ms": generation_time_ms, + }, + f, + indent=2, + ) + + return {"artifacts_dir": artifacts_dir, "input_path": input_dest, "output_path": output_audio_path} + + def _merge_user_model_audio( + self, + input_audio_path: str, + pred_audio: np.ndarray, + pred_sr: int, + ) -> np.ndarray: + """Merge user (input) and model (pred) into a two-channel array on one timeline. + + Mirrors s2s_voicechat_infer_backend._merge_user_model_audio: + channel 0 = user, channel 1 = pred, same length (padded), + so FDB ASR (--stereo) gets a single timeline. 
+ """ + import soundfile as sf + + user_audio, user_sr = sf.read(input_audio_path) + user = np.asarray(user_audio, dtype=np.float64).squeeze() + pred = np.asarray(pred_audio, dtype=np.float64).squeeze() + if user.ndim > 1: + user = user.mean(axis=1) + if pred.ndim > 1: + pred = pred.mean(axis=1) + if user_sr != pred_sr: + try: + import librosa + user = librosa.resample(user, orig_sr=user_sr, target_sr=pred_sr) + except ImportError: + import scipy.signal + user = scipy.signal.resample(user, int(len(user) * pred_sr / user_sr)) + T1, T2 = user.shape[0], pred.shape[0] + max_len = max(T1, T2) + user_pad = np.pad(user, (0, max_len - T1), mode="constant", constant_values=0) + pred_pad = np.pad(pred, (0, max_len - T2), mode="constant", constant_values=0) + merged = np.stack([user_pad, pred_pad], axis=1).astype(np.float32) + return merged + + # ------------------------------------------------------------------ + # Frame alignment utilities (kept for debug/session compat) + # ------------------------------------------------------------------ + def _decode_single_token(self, token_id: int, pad_id: int) -> str: + try: + tokens = self._tokenizer.ids_to_tokens([token_id]) + if tokens: + return tokens[0].replace("\u0120", " ") + return f"" + except Exception: + return f"" + + def _init_frame_alignment(self) -> Dict[str, list]: + return { + "frame_idx": [], + "user_stream": [], + "agent_stream_token": [], + "agent_stream_decoded": [], + "asr_stream_token": [], + "asr_stream_decoded": [], + "is_tts_stop": [], + } + + def _append_frame_alignment( + self, + frame_alignment: Dict[str, list], + frame_idx: int, + phase: str, + gen_text: torch.Tensor, + gen_asr_text: torch.Tensor, + pad_id: int, + is_tts_stop: bool = False, + ) -> None: + agent_token = gen_text[0, frame_idx].item() if frame_idx < gen_text.shape[1] else pad_id + asr_token = gen_asr_text[0, frame_idx].item() if frame_idx < gen_asr_text.shape[1] else pad_id + frame_alignment["frame_idx"].append(frame_idx) + 
frame_alignment["user_stream"].append(phase) + frame_alignment["agent_stream_token"].append(agent_token) + frame_alignment["agent_stream_decoded"].append(self._decode_single_token(agent_token, pad_id)) + frame_alignment["asr_stream_token"].append(asr_token) + frame_alignment["asr_stream_decoded"].append(self._decode_single_token(asr_token, pad_id)) + frame_alignment["is_tts_stop"].append(is_tts_stop) diff --git a/recipes/multimodal/server/backends/s2s_session_backend.py b/recipes/multimodal/server/backends/s2s_session_backend.py new file mode 100644 index 0000000000..deb377af41 --- /dev/null +++ b/recipes/multimodal/server/backends/s2s_session_backend.py @@ -0,0 +1,1039 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Session-aware Speech-to-Speech (S2S) backend. + +Extends S2SIncrementalBackend to support multi-turn conversations by +persisting LLM KV cache and other state between HTTP requests. 
+""" + +import io +import json +import os +import shutil +import tempfile +import time +from datetime import datetime +from typing import Any, Dict, List, Optional + +import numpy as np +import scipy.signal +import torch + +from ..session_manager import SessionState, TurnData +from .base import GenerationRequest, GenerationResult +from .s2s_incremental_backend import ( + FRAME_SIZE_SAMPLES, + FRAME_SIZE_SEC, + SAMPLE_RATE, + TTS_SAMPLE_RATE, + S2SIncrementalBackend, +) + + +class S2SSessionBackend(S2SIncrementalBackend): + """ + Session-aware S2S backend that persists state between requests. + + Extends S2SIncrementalBackend to: + - Accept session state from SessionManager + - Restore LLM cache, frame index, and buffers from session + - Return updated state for saving back to session + """ + + @property + def name(self) -> str: + return "s2s_session" + + def _initialize_fresh_state(self, total_frames: int, device: str) -> Dict[str, Any]: + """Initialize fresh state for a new session.""" + from transformers import DynamicCache + + use_cache = "Nemotron" not in self._model.stt_model.cfg.pretrained_llm + + state = { + "frame_idx": 0, + "gen_text": torch.full( + (1, total_frames), self._model.stt_model.text_pad_id, device=device, dtype=torch.long + ), + "gen_asr_text": torch.full( + (1, total_frames), self._model.stt_model.text_pad_id, device=device, dtype=torch.long + ), + "audio_buffer": torch.zeros( + 1, self.inc_config.buffer_size_frames * FRAME_SIZE_SAMPLES, dtype=torch.float32, device=device + ), + "buffer_fill_level": 0, + "llm_cache": DynamicCache() if use_cache else None, + "input_embeds_history": [] if not use_cache else None, + } + return state + + def _restore_state_from_session( + self, session_state: SessionState, total_frames: int, device: str + ) -> Dict[str, Any]: + """Restore state from session, extending buffers if needed.""" + frame_idx = session_state.frame_idx + + # We need to extend gen_text and gen_asr_text to accommodate new frames + 
new_total_frames = frame_idx + total_frames + + # Restore or create gen_text + if session_state.gen_text is not None: + old_gen_text = session_state.gen_text.to(device) + if old_gen_text.shape[1] < new_total_frames: + gen_text = torch.full( + (1, new_total_frames), self._model.stt_model.text_pad_id, device=device, dtype=torch.long + ) + gen_text[:, : old_gen_text.shape[1]] = old_gen_text + else: + gen_text = old_gen_text + else: + gen_text = torch.full( + (1, new_total_frames), self._model.stt_model.text_pad_id, device=device, dtype=torch.long + ) + + # Restore or create gen_asr_text + if session_state.gen_asr_text is not None: + old_gen_asr_text = session_state.gen_asr_text.to(device) + if old_gen_asr_text.shape[1] < new_total_frames: + gen_asr_text = torch.full( + (1, new_total_frames), self._model.stt_model.text_pad_id, device=device, dtype=torch.long + ) + gen_asr_text[:, : old_gen_asr_text.shape[1]] = old_gen_asr_text + else: + gen_asr_text = old_gen_asr_text + else: + gen_asr_text = torch.full( + (1, new_total_frames), self._model.stt_model.text_pad_id, device=device, dtype=torch.long + ) + + # Restore audio buffer + buffer_size_samples = self.inc_config.buffer_size_frames * FRAME_SIZE_SAMPLES + if session_state.audio_buffer is not None: + audio_buffer = session_state.audio_buffer.to(device) + else: + audio_buffer = torch.zeros(1, buffer_size_samples, dtype=torch.float32, device=device) + + state = { + "frame_idx": frame_idx, + "gen_text": gen_text, + "gen_asr_text": gen_asr_text, + "audio_buffer": audio_buffer, + "buffer_fill_level": session_state.buffer_fill_level, + "llm_cache": session_state.llm_cache, + "input_embeds_history": session_state.input_embeds_history, + } + + return state + + def _save_state_to_session(self, session_state: SessionState, state: Dict[str, Any]): + """Save current state back to session.""" + session_state.frame_idx = state["frame_idx"] + session_state.gen_text = state["gen_text"].detach().cpu() + session_state.gen_asr_text = 
state["gen_asr_text"].detach().cpu() + session_state.audio_buffer = state["audio_buffer"].detach().cpu() + session_state.buffer_fill_level = state["buffer_fill_level"] + session_state.llm_cache = state["llm_cache"] + session_state.input_embeds_history = state.get("input_embeds_history") + + def _generate_dual_channel_audio_for_turn( + self, + input_audio_path: str, + output_audio_bytes: Optional[bytes], + ) -> tuple[Optional[bytes], int]: + """ + Generate 2-channel audio (user=ch0, agent=ch1) for a single turn. + + Returns: + Tuple of (audio_bytes, sample_rate) or (None, 0) if failed. + """ + import soundfile as sf + + if not output_audio_bytes: + return None, 0 + + output_sr = TTS_SAMPLE_RATE + + # Load user audio + try: + user_audio, user_sr = sf.read(input_audio_path) + if user_sr != output_sr: + user_audio = scipy.signal.resample(user_audio, int(len(user_audio) * output_sr / user_sr)) + if len(user_audio.shape) > 1: + user_audio = user_audio[:, 0] + except Exception as e: + print(f"[S2SSession] Error reading user audio: {e}") + return None, 0 + + # Load agent audio + try: + agent_audio, agent_sr = sf.read(io.BytesIO(output_audio_bytes)) + if agent_sr != output_sr: + agent_audio = scipy.signal.resample(agent_audio, int(len(agent_audio) * output_sr / agent_sr)) + if len(agent_audio.shape) > 1: + agent_audio = agent_audio[:, 0] + except Exception as e: + print(f"[S2SSession] Error reading agent audio: {e}") + return None, 0 + + # Create 2-channel audio (zero-padded to max length) + max_len = max(len(user_audio), len(agent_audio)) + stereo = np.zeros((max_len, 2), dtype=np.float32) + stereo[: len(user_audio), 0] = user_audio + stereo[: len(agent_audio), 1] = agent_audio + + # Normalize + max_val = np.abs(stereo).max() + if max_val > 0: + stereo = stereo / max_val * 0.95 + + # Encode to bytes + wav_buffer = io.BytesIO() + sf.write(wav_buffer, stereo, output_sr, format="WAV") + print( + f"[S2SSession] Generated dual-channel audio: user={len(user_audio)} samples, 
agent={len(agent_audio)} samples" + ) + return wav_buffer.getvalue(), output_sr + + def _get_session_artifacts_dir(self, session_id: str) -> Optional[str]: + """Get or create the artifacts directory for a session.""" + if not self.inc_config.save_session_artifacts: + return None + + base_dir = self.inc_config.session_artifacts_dir + if base_dir is None: + # Default to /tmp/s2s_sessions + base_dir = "/tmp/s2s_sessions" + + session_dir = os.path.join(base_dir, session_id) + os.makedirs(session_dir, exist_ok=True) + return session_dir + + def _save_session_artifacts( + self, + session_id: str, + turn_idx: int, + input_audio_path: str, + request_info: Dict[str, Any], + output_text: str, + output_audio_bytes: Optional[bytes], + debug_info: Dict[str, Any], + generation_time_ms: float, + ): + """Save session artifacts (input/output) to disk.""" + session_dir = self._get_session_artifacts_dir(session_id) + if session_dir is None: + return None + + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + turn_prefix = f"turn{turn_idx:02d}_{timestamp}" + + # Save input audio + input_audio_dest = os.path.join(session_dir, f"{turn_prefix}_input.wav") + shutil.copy2(input_audio_path, input_audio_dest) + + # Save output audio + output_audio_dest = None + if output_audio_bytes: + output_audio_dest = os.path.join(session_dir, f"{turn_prefix}_output.wav") + with open(output_audio_dest, "wb") as f: + f.write(output_audio_bytes) + + # Build input JSON (with path instead of base64) + input_json = { + "session_id": session_id, + "turn_idx": turn_idx, + "timestamp": timestamp, + "request": { + **request_info, + "audio_path": input_audio_dest, + }, + } + + # Build output JSON + output_json = { + "session_id": session_id, + "turn_idx": turn_idx, + "timestamp": timestamp, + "text": output_text, + "audio_path": output_audio_dest, + "debug_info": debug_info, + "generation_time_ms": generation_time_ms, + } + + # Save JSON files + input_json_path = os.path.join(session_dir, 
f"{turn_prefix}_input.json") + output_json_path = os.path.join(session_dir, f"{turn_prefix}_output.json") + + with open(input_json_path, "w") as f: + json.dump(input_json, f, indent=2) + + with open(output_json_path, "w") as f: + json.dump(output_json, f, indent=2) + + print(f"[S2SSession] Saved artifacts to {session_dir}") + + return { + "session_dir": session_dir, + "input_audio_path": input_audio_dest, + "output_audio_path": output_audio_dest, + "input_json_path": input_json_path, + "output_json_path": output_json_path, + } + + @torch.no_grad() + def inference_with_session( + self, + audio_path: str, + session_state: Optional[SessionState] = None, + num_frames_per_inference: Optional[int] = None, + ) -> Dict[str, Any]: + """ + Perform inference with session state support. + + Args: + audio_path: Path to input audio file + session_state: Optional existing session state to restore from + num_frames_per_inference: Frames to process per step + + Returns: + Dict with 'text', 'asr_text', 'audio' outputs and 'session_state' + """ + import librosa + from nemo.collections.speechlm2.models.duplex_s2s_model import tokens_to_str + + if num_frames_per_inference is None: + num_frames_per_inference = self.inc_config.num_frames_per_inference + + device = self.config.device + buffer_size_frames = self.inc_config.buffer_size_frames + buffer_size_samples = buffer_size_frames * FRAME_SIZE_SAMPLES + + # Load audio for this turn + audio_signal, sr = librosa.load(audio_path, sr=SAMPLE_RATE) + + # Add silence padding for response + if self.inc_config.silence_padding_sec > 0: + silence_samples = int(self.inc_config.silence_padding_sec * SAMPLE_RATE) + audio_signal = np.concatenate([audio_signal, np.zeros(silence_samples)]) + + total_samples = len(audio_signal) + + # Calculate frames for this turn's audio + total_frames_maybe = int(np.ceil(total_samples / FRAME_SIZE_SAMPLES)) + num_inference_steps = total_frames_maybe // num_frames_per_inference + if total_frames_maybe % 
num_frames_per_inference != 0: + num_inference_steps += 1 + turn_frames = num_inference_steps * num_frames_per_inference + + # Pad audio to align with frames + padded_samples = num_inference_steps * num_frames_per_inference * FRAME_SIZE_SAMPLES + if padded_samples > total_samples: + audio_signal = np.pad(audio_signal, (0, padded_samples - total_samples)) + + audio_tensor = torch.tensor(audio_signal, dtype=torch.float32, device=device).unsqueeze(0) + + # Initialize or restore state + # Check if session has existing state (frame_idx > 0 indicates prior turns) + has_existing_state = session_state is not None and session_state.frame_idx > 0 + + if has_existing_state: + print(f"[S2SSession] Restoring state from session, frame_idx={session_state.frame_idx}") + state = self._restore_state_from_session(session_state, turn_frames, device) + else: + print("[S2SSession] Initializing fresh state") + if session_state is None: + session_state = SessionState(session_id="temp") + # For fresh state, we need to estimate total frames + max_frames = 50000 # Large buffer + state = self._initialize_fresh_state(max_frames, device) + + # Check cache support (Mamba models use input_embeds_history, others use DynamicCache) + use_cache = state["llm_cache"] is not None + if not use_cache: + input_embeds_history = state.get("input_embeds_history") or [] + print(f"[S2SSession] Using input_embeds_history (Mamba mode), history length: {len(input_embeds_history)}") + else: + input_embeds_history = [] + print("[S2SSession] Using DynamicCache mode") + + # Get starting frame index + start_frame_idx = state["frame_idx"] + gen_text = state["gen_text"] + gen_asr_text = state["gen_asr_text"] + audio_buffer = state["audio_buffer"] + buffer_fill_level = state["buffer_fill_level"] + llm_cache = state["llm_cache"] + + # Initialize TTS state for this turn (fresh each turn) + decode_audio = self.inc_config.decode_audio and hasattr(self._model, "tts_model") + code = None + past_key_values = None + subword_mask 
= None + audio_toks_buffer = None + + if decode_audio: + audio_toks_buffer = ( + self._model.tts_model.codec_silence_tokens.view(1, 1, -1) + .expand(-1, self.inc_config.codec_token_history_size, -1) + .to(device) + ) + if self.first_tts_past_key_values_input is not None: + past_key_values = self._clone_cache(self.first_tts_past_key_values_input) + code = self.first_tts_code_input.detach().clone() + # Create subword_mask with same size as gen_text to avoid index errors + subword_mask = torch.ones(1, gen_text.shape[1], device=device, dtype=torch.bool) + + audio_segments = [] + + # Response detection config + response_end_detection_mode = self.inc_config.response_end_detection_mode + audio_energy_threshold = self.inc_config.audio_energy_threshold + audio_energy_window_sec = self.inc_config.audio_energy_window_sec + max_response_frames = int(self.inc_config.max_response_duration_sec / FRAME_SIZE_SEC) + audio_energy_window_samples = int(audio_energy_window_sec * TTS_SAMPLE_RATE) + + # Per-frame alignment tracking (use same format as incremental backend) + output_frame_alignment = self.inc_config.output_frame_alignment + frame_alignment = self._init_frame_alignment() if output_frame_alignment else None + pad_id = self._model.stt_model.text_pad_id + + # Phase 1: Process audio frames + local_frame_idx = 0 + while local_frame_idx < turn_frames: + global_frame_idx = start_frame_idx + local_frame_idx + + slice_start = local_frame_idx * FRAME_SIZE_SAMPLES + slice_n_samples = num_frames_per_inference * FRAME_SIZE_SAMPLES + new_audio = audio_tensor[:, slice_start : slice_start + slice_n_samples] + + audio_buffer, buffer_fill_level, current_buffer = self._update_audio_buffer( + audio_buffer, buffer_fill_level, new_audio, buffer_size_samples + ) + + result = self.infer_one_step( + audio_input=current_buffer, + num_frames_per_inference=num_frames_per_inference, + frame_idx=global_frame_idx, + gen_text=gen_text, + audio_toks_buffer=audio_toks_buffer if decode_audio else None, + 
input_embeds_history=input_embeds_history if not use_cache else [], + dynamic_cache=llm_cache if use_cache else None, + embedding_position=-1, + past_key_values=past_key_values if decode_audio else None, + code=code if decode_audio else None, + subword_mask=subword_mask if decode_audio else None, + gen_asr_text=gen_asr_text, + ) + + if not use_cache: + input_embeds_history = result["input_embeds_history"] + llm_cache = result["dynamic_cache"] + + if decode_audio: + audio_toks_buffer = result["audio_toks_buffer"] + if result["decoded_audio_new"] is not None: + audio_segments.append(result["decoded_audio_new"]) + past_key_values = result["past_key_values"] + code = result["code"] + + # Collect frame alignment info (same format as incremental backend) + if frame_alignment is not None: + self._append_frame_alignment( + frame_alignment=frame_alignment, + frame_idx=global_frame_idx, + phase="user_turn", + gen_text=gen_text, + gen_asr_text=gen_asr_text, + pad_id=pad_id, + is_tts_stop=False, + ) + + local_frame_idx += num_frames_per_inference + + # Phase 2: Feed silence until response completes (energy-based detection) + silence_audio = torch.zeros( + 1, num_frames_per_inference * FRAME_SIZE_SAMPLES, dtype=torch.float32, device=device + ) + + audio_energy_response_started = False + consecutive_low_energy_samples = 0 + response_frames = 0 + recent_audio_buffer = [] + pad_id = self._model.stt_model.text_pad_id + response_started = False + consecutive_pad_count = 0 + stop_reason = "max_duration" # Default, will be updated if stopped earlier + + print(f"[S2SSession] Waiting for response (mode={response_end_detection_mode})...") + + while response_frames < max_response_frames: + global_frame_idx = start_frame_idx + local_frame_idx + + audio_buffer, buffer_fill_level, current_buffer = self._update_audio_buffer( + audio_buffer, buffer_fill_level, silence_audio, buffer_size_samples + ) + + # Ensure gen_text/gen_asr_text have enough space + if global_frame_idx >= gen_text.shape[1]: 
+ new_size = global_frame_idx + 1000 + new_gen_text = torch.full((1, new_size), pad_id, device=device, dtype=torch.long) + new_gen_text[:, : gen_text.shape[1]] = gen_text + gen_text = new_gen_text + + new_gen_asr_text = torch.full((1, new_size), pad_id, device=device, dtype=torch.long) + new_gen_asr_text[:, : gen_asr_text.shape[1]] = gen_asr_text + gen_asr_text = new_gen_asr_text + + if subword_mask is not None: + new_subword_mask = torch.ones(1, new_size, device=device, dtype=torch.bool) + new_subword_mask[:, : subword_mask.shape[1]] = subword_mask + subword_mask = new_subword_mask + + result = self.infer_one_step( + audio_input=current_buffer, + num_frames_per_inference=num_frames_per_inference, + frame_idx=global_frame_idx, + gen_text=gen_text, + audio_toks_buffer=audio_toks_buffer if decode_audio else None, + input_embeds_history=input_embeds_history if not use_cache else [], + dynamic_cache=llm_cache if use_cache else None, + embedding_position=-1, + past_key_values=past_key_values if decode_audio else None, + code=code if decode_audio else None, + subword_mask=subword_mask if decode_audio else None, + gen_asr_text=gen_asr_text, + ) + + if not use_cache: + input_embeds_history = result["input_embeds_history"] + llm_cache = result["dynamic_cache"] + + if decode_audio: + audio_toks_buffer = result["audio_toks_buffer"] + if result["decoded_audio_new"] is not None: + audio_segments.append(result["decoded_audio_new"]) + recent_audio_buffer.append(result["decoded_audio_new"]) + past_key_values = result["past_key_values"] + code = result["code"] + + # Audio energy detection + if decode_audio and recent_audio_buffer: + recent_audio_cat = torch.cat(recent_audio_buffer, dim=-1) + if recent_audio_cat.shape[-1] > audio_energy_window_samples: + recent_audio_cat = recent_audio_cat[..., -audio_energy_window_samples:] + recent_audio_buffer = [recent_audio_cat] + + rms = torch.sqrt(torch.mean(recent_audio_cat**2)).item() + + if rms > audio_energy_threshold: + 
audio_energy_response_started = True + consecutive_low_energy_samples = 0 + else: + if audio_energy_response_started: + consecutive_low_energy_samples += ( + result["decoded_audio_new"].shape[-1] if result.get("decoded_audio_new") is not None else 0 + ) + + # Text EOS detection + current_token = gen_text[0, global_frame_idx].item() + if current_token != pad_id: + response_started = True + consecutive_pad_count = 0 + else: + if response_started: + consecutive_pad_count += 1 + + # Check stopping condition + should_stop = False + is_tts_stop = False + if response_end_detection_mode == "audio_energy": + if ( + decode_audio + and audio_energy_response_started + and consecutive_low_energy_samples >= audio_energy_window_samples + ): + should_stop = True + is_tts_stop = True + stop_reason = "audio_energy" + print("[S2SSession] Response completed (audio energy)") + elif response_end_detection_mode == "eos": + if response_started and consecutive_pad_count >= self.inc_config.eos_detection_window: + should_stop = True + is_tts_stop = True + stop_reason = "eos" + print("[S2SSession] Response completed (EOS)") + + # Collect frame alignment info for response phase (same format as incremental backend) + if frame_alignment is not None: + self._append_frame_alignment( + frame_alignment=frame_alignment, + frame_idx=global_frame_idx, + phase="agent_response", + gen_text=gen_text, + gen_asr_text=gen_asr_text, + pad_id=pad_id, + is_tts_stop=is_tts_stop, + ) + + local_frame_idx += num_frames_per_inference + response_frames += num_frames_per_inference + + if should_stop: + break + + if response_frames >= max_response_frames: + stop_reason = "max_duration" + print("[S2SSession] Response hit max duration") + + # Update frame index for next turn + final_frame_idx = start_frame_idx + local_frame_idx + + # Prepare outputs + total_frames = final_frame_idx + + # For current turn text output: decode only the new frames from this turn + current_turn_frames = final_frame_idx - start_frame_idx + 
        # Slice out only this turn's frames for the user-facing text outputs.
        gen_text_current_turn = gen_text[:, start_frame_idx:final_frame_idx]
        gen_asr_text_current_turn = gen_asr_text[:, start_frame_idx:final_frame_idx]
        current_turn_lengths = torch.tensor([current_turn_frames], dtype=torch.long, device=device)

        # Decode token tensors to strings (turn-taking-aware decoding).
        text_output = tokens_to_str(
            gen_text_current_turn,
            current_turn_lengths,
            tokenizer=self._tokenizer,
            pad_id=self._model.stt_model.text_pad_id,
            eval_text_turn_taking=True,
        )
        asr_text_output = tokens_to_str(
            gen_asr_text_current_turn,
            current_turn_lengths,
            tokenizer=self._tokenizer,
            pad_id=self._model.stt_model.text_pad_id,
            eval_text_turn_taking=True,
        )

        # Keep full trimmed tensors for session state (needed for next turn)
        gen_text_trimmed = gen_text[:, :total_frames]
        lengths = torch.tensor([total_frames], dtype=torch.long, device=device)

        # Concatenate all decoded TTS audio chunks produced during this turn.
        output_audio = None
        if audio_segments:
            output_audio = torch.cat(audio_segments, dim=-1)

        # Save state back to session so the next turn can resume from here.
        state["frame_idx"] = final_frame_idx
        state["gen_text"] = gen_text
        state["gen_asr_text"] = gen_asr_text
        state["audio_buffer"] = audio_buffer
        state["buffer_fill_level"] = buffer_fill_level
        state["llm_cache"] = llm_cache
        state["input_embeds_history"] = input_embeds_history

        self._save_state_to_session(session_state, state)

        debug_info = {
            "start_frame_idx": start_frame_idx,
            "final_frame_idx": final_frame_idx,
            "turn_frames": turn_frames,
            "response_frames": response_frames,
            "stop_reason": stop_reason,
            "audio_energy_response_started": audio_energy_response_started,
        }

        # Add frame alignment if enabled
        if frame_alignment:
            debug_info["frame_alignment"] = frame_alignment

        return {
            "text": text_output,
            "asr_text": asr_text_output,
            "audio": output_audio,
            "tokens_text": gen_text_trimmed,
            "tokens_len": lengths,
            "session_state": session_state,
            "debug_info": debug_info,
        }

    def generate_with_session(self, request: GenerationRequest, session_state: Optional[SessionState] = None) -> tuple:
        """
        Generate with session support.

        Wraps `inference_with_session` with audio I/O handling, per-turn
        bookkeeping (turn_count / TurnData), dual-channel response audio,
        and artifact saving. Temp files are always cleaned up in `finally`.

        Args:
            request: Generation request
            session_state: Optional session state to restore

        Returns:
            Tuple of (GenerationResult, updated SessionState)
        """
        if not self._is_loaded:
            return (
                GenerationResult(error="Model not loaded", request_id=request.request_id),
                session_state,
            )

        start_time = time.time()
        temp_files = []
        saved_input_audio_path = None

        try:
            # Get audio path; raw bytes are spilled to a temp WAV first.
            audio_path = request.audio_path
            if request.audio_bytes:
                temp_file = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
                temp_file.write(request.audio_bytes)
                temp_file.close()
                temp_files.append(temp_file.name)
                audio_path = temp_file.name

            if not audio_path:
                return (
                    GenerationResult(error="Audio input required", request_id=request.request_id),
                    session_state,
                )

            saved_input_audio_path = audio_path

            # Determine turn index from session state
            turn_idx = 0
            if session_state is not None and session_state.frame_idx > 0:
                # Estimate turn index from frame_idx (rough approximation)
                # Better: track turn count in session state
                turn_idx = getattr(session_state, "turn_count", 0)

            # Run inference with session
            output = self.inference_with_session(
                audio_path=audio_path,
                session_state=session_state,
                num_frames_per_inference=self.inc_config.num_frames_per_inference,
            )

            # Encode agent audio to bytes (single-channel for storage)
            agent_audio_bytes = None
            if output["audio"] is not None:
                audio_np = output["audio"].float().cpu().numpy().squeeze()
                # Peak-normalize to 0.95 to avoid clipping in the WAV.
                max_val = np.abs(audio_np).max()
                if max_val > 0:
                    audio_np = audio_np / max_val * 0.95

                wav_buffer = io.BytesIO()
                import soundfile as sf

                sf.write(wav_buffer, audio_np, self.target_sample_rate, format="WAV")
                agent_audio_bytes = wav_buffer.getvalue()

            elapsed_ms = (time.time() - start_time) * 1000

            # Generate dual-channel audio (user=ch0, agent=ch1) for response
            response_audio_bytes = agent_audio_bytes
            response_sample_rate = self.target_sample_rate
            if saved_input_audio_path and agent_audio_bytes:
                dual_audio_bytes, dual_sr = self._generate_dual_channel_audio_for_turn(
                    saved_input_audio_path, agent_audio_bytes
                )
                if dual_audio_bytes:
                    response_audio_bytes = dual_audio_bytes
                    response_sample_rate = dual_sr

            updated_session = output["session_state"]
            session_id = updated_session.session_id if updated_session else "unknown"

            # Update turn count in session (attribute created lazily on first turn)
            if updated_session is not None:
                if not hasattr(updated_session, "turn_count"):
                    updated_session.turn_count = 0
                updated_session.turn_count += 1
                turn_idx = updated_session.turn_count - 1

            # Save session artifacts
            debug_info = output.get("debug_info", {})
            output_text = output["text"][0] if output["text"] else ""

            # Read input audio bytes for session storage
            input_audio_bytes = None
            user_duration_sec = 0.0
            if saved_input_audio_path and os.path.exists(saved_input_audio_path):
                with open(saved_input_audio_path, "rb") as f:
                    input_audio_bytes = f.read()
                # Duration derived from frame counts in debug_info, not the file header.
                user_duration_sec = debug_info.get("turn_frames", 0) * FRAME_SIZE_SEC

            # Calculate agent audio duration
            agent_duration_sec = 0.0
            if agent_audio_bytes:
                agent_duration_sec = debug_info.get("response_frames", 0) * FRAME_SIZE_SEC

            # Store turn data in session (use single-channel agent audio for storage)
            if updated_session is not None:
                if not hasattr(updated_session, "turns") or updated_session.turns is None:
                    updated_session.turns = []
                turn_data = TurnData(
                    turn_idx=turn_idx,
                    user_audio_bytes=input_audio_bytes,
                    agent_audio_bytes=agent_audio_bytes,
                    agent_text=output_text,
                    user_duration_sec=user_duration_sec,
                    agent_duration_sec=agent_duration_sec,
                )
                updated_session.turns.append(turn_data)

            artifacts_info = self._save_session_artifacts(
                session_id=session_id,
                turn_idx=turn_idx,
                input_audio_path=saved_input_audio_path,
                request_info={
                    "request_id": request.request_id,
                    "text": request.text,
                    "user_prompt": request.user_prompt,
                    "max_new_tokens": request.max_new_tokens,
                    "temperature": request.temperature,
                },
                output_text=output_text,
                output_audio_bytes=agent_audio_bytes,
                debug_info=debug_info,
                generation_time_ms=elapsed_ms,
            )

            # Add artifacts info to debug_info
            if artifacts_info:
                debug_info["artifacts"] = artifacts_info

            # Add total_frames to match incremental backend debug format
            debug_info["total_frames"] = debug_info.get("final_frame_idx", 0)

            # Add per-turn text responses to debug_info
            if updated_session is not None and updated_session.turns:
                debug_info["turn_texts"] = [t.agent_text for t in updated_session.turns]

            # Add ASR text (user speech transcription) to debug_info
            asr_text_output = output.get("asr_text", [""])[0] if output.get("asr_text") else ""
            debug_info["asr_text"] = asr_text_output

            result = GenerationResult(
                text=output_text,
                audio_bytes=response_audio_bytes,
                audio_sample_rate=response_sample_rate,
                request_id=request.request_id,
                generation_time_ms=elapsed_ms,
                debug_info=debug_info,
            )

            return (result, updated_session)

        except Exception as e:
            import traceback

            traceback.print_exc()
            return (
                GenerationResult(error=str(e), request_id=request.request_id),
                session_state,
            )

        finally:
            # Always remove temp WAVs created from request.audio_bytes.
            for temp_path in temp_files:
                if os.path.exists(temp_path):
                    os.unlink(temp_path)

    def generate(self, requests: List[GenerationRequest]) -> List[GenerationResult]:
        """
        Generate without session support (falls back to parent).

        For session support, use generate_with_session() instead.
        """
        return super().generate(requests)

    def generate_session_audio(
        self,
        session_state: SessionState,
        pause_between_turns_sec: float = 0.5,
    ) -> Optional[str]:
        """
        Generate a 2-channel WAV file for the entire session.

        Channel 0: User (speaker) audio
        Channel 1: Agent audio

        Both channels include pauses to align with the conversation flow.

        Args:
            session_state: The session state with turn data
            pause_between_turns_sec: Pause duration between turns (seconds)

        Returns:
            Path to the generated session audio file, or None if no turns
        """
        if not session_state.turns:
            print("[S2SSession] No turns to generate session audio")
            return None

        import soundfile as sf

        session_dir = self._get_session_artifacts_dir(session_state.session_id)
        if session_dir is None:
            print("[S2SSession] Session artifacts disabled, skipping session audio")
            return None

        os.makedirs(session_dir, exist_ok=True)

        # Target sample rate for output
        output_sr = self.target_sample_rate
        pause_samples = int(pause_between_turns_sec * output_sr)

        # Collect all audio segments with timing
        user_segments = []  # List of (start_sample, audio_array)
        agent_segments = []  # List of (start_sample, audio_array)

        current_sample = 0
        for turn in session_state.turns:
            # Process user audio
            if turn.user_audio_bytes:
                try:
                    user_audio, user_sr = sf.read(io.BytesIO(turn.user_audio_bytes))
                    # Resample if needed (scipy.signal.resample is FFT-based)
                    if user_sr != output_sr:
                        import scipy.signal

                        num_samples = int(len(user_audio) * output_sr / user_sr)
                        user_audio = scipy.signal.resample(user_audio, num_samples)
                    # Ensure mono
                    if len(user_audio.shape) > 1:
                        user_audio = user_audio[:, 0]
                    user_segments.append((current_sample, user_audio))
                    current_sample += len(user_audio)
                except Exception as e:
                    print(f"[S2SSession] Error reading user audio: {e}")

            # Add pause after user speaks
            current_sample += pause_samples

            # Process agent audio
            if turn.agent_audio_bytes:
                try:
                    agent_audio, agent_sr = sf.read(io.BytesIO(turn.agent_audio_bytes))
                    # Resample if needed (scipy.signal.resample is FFT-based)
                    if agent_sr != output_sr:
                        import scipy.signal

                        num_samples = int(len(agent_audio) * output_sr / agent_sr)
                        agent_audio = scipy.signal.resample(agent_audio, num_samples)
                    # Ensure mono
                    if len(agent_audio.shape) > 1:
                        agent_audio = agent_audio[:, 0]
                    agent_segments.append((current_sample, agent_audio))
                    current_sample += len(agent_audio)
                except Exception as e:
                    print(f"[S2SSession] Error reading agent audio: {e}")

            # Add pause after agent speaks
            current_sample += pause_samples

        if not user_segments and not agent_segments:
            print("[S2SSession] No audio segments found")
            return None

        # Create 2-channel array
        total_samples = current_sample
        stereo_audio = np.zeros((total_samples, 2), dtype=np.float32)

        # Fill user channel (channel 0)
        for start_sample, audio in user_segments:
            end_sample = min(start_sample + len(audio), total_samples)
            stereo_audio[start_sample:end_sample, 0] = audio[: end_sample - start_sample]

        # Fill agent channel (channel 1)
        for start_sample, audio in agent_segments:
            end_sample = min(start_sample + len(audio), total_samples)
            stereo_audio[start_sample:end_sample, 1] = audio[: end_sample - start_sample]

        # Normalize (joint peak normalization preserves relative channel levels)
        max_val = np.abs(stereo_audio).max()
        if max_val > 0:
            stereo_audio = stereo_audio / max_val * 0.95

        # Save to file
        output_path = os.path.join(session_dir, "session_audio.wav")
        sf.write(output_path, stereo_audio, output_sr)

        duration_sec = total_samples / output_sr
        print(
            f"[S2SSession] Generated session audio: {output_path} "
            f"({duration_sec:.2f}s, {len(session_state.turns)} turns)"
        )

        return output_path

    def on_session_close(self, session_state: SessionState) -> Dict[str, Any]:
        """
        Called when a session is closed/deleted.

        Generates the final session audio and returns summary info.

        Args:
            session_state: The session being closed

        Returns:
            Dict with session summary info
        """
        result = {
            "session_id": session_state.session_id,
            "turn_count": len(session_state.turns) if session_state.turns else 0,
            "turn_texts": [t.agent_text for t in session_state.turns] if session_state.turns else [],
        }

        # Generate session audio
        audio_path = self.generate_session_audio(session_state)
        if audio_path:
            result["session_audio_path"] = audio_path

        return result

    def warmup(self):
        """
        Run a warmup inference to pre-compile Triton kernels.

        This prevents race conditions when multiple requests arrive simultaneously
        before kernels are compiled.
        """
        import tempfile

        import soundfile as sf

        print("[S2SSession] Running warmup inference...")

        # Create a short silence audio for warmup (0.5 seconds)
        warmup_duration_sec = 0.5
        warmup_samples = int(warmup_duration_sec * SAMPLE_RATE)
        warmup_audio = np.zeros(warmup_samples, dtype=np.float32)

        # Write to temp file
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
            sf.write(f.name, warmup_audio, SAMPLE_RATE)
            warmup_path = f.name

        try:
            # Run inference with minimal settings
            from ..session_manager import SessionState

            session_state = SessionState(session_id="warmup")
            _ = self.inference_with_session(
                audio_path=warmup_path,
                session_state=session_state,
                num_frames_per_inference=self.inc_config.num_frames_per_inference,
            )
            print("[S2SSession] Warmup complete")
        finally:
            if os.path.exists(warmup_path):
                os.unlink(warmup_path)


# ---------------------------------------------------------------------------
# new file: recipes/multimodal/server/backends/s2s_voicechat_infer_backend.py
# ---------------------------------------------------------------------------
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
NemotronVoiceChat offline backend that mirrors `examples/speechlm2/nemotron_voicechat_infer.py`.

This backend:
- Loads a NeMo OmegaConf YAML (`--config_path`)
- Applies script-style overrides (checkpoint paths, boosts, speaker ref, extra decoding)
- Resolves config and instantiates `NemotronVoiceChat(OmegaConf.to_container(..., resolve=True))`
- Runs `offline_inference` per request (supports batch/padded inference)

Default behavior is text-only (decode_audio=False), but audio output can be enabled.
Artifacts (input.wav / output.json / output.wav) can be written under nemo-skills `output_dir/`.
"""

import io
import json
import os
import random
import re
import tempfile
import time
from dataclasses import dataclass
from typing import Any, Dict, List, Optional, Set, Tuple

import numpy as np
import soundfile as sf
import torch
from omegaconf import OmegaConf

from .base import BackendConfig, GenerationRequest, GenerationResult, InferenceBackend, Modality


@dataclass
class S2SVoiceChatInferConfig(BackendConfig):
    """Configuration for the s2s_voicechat backend (extends BackendConfig)."""

    # NeMo config + code injection
    config_path: Optional[str] = None  # path to NeMo OmegaConf YAML (required at load time)
    code_path: Optional[str] = None  # optional directory prepended to sys.path

    # Inference knobs (match infer_nano9b_s2s.sh overrides)
    extra_decoding_seconds: float = 0.0
    speaker_reference: Optional[str] = None
    tts_ckpt_path: Optional[str] = None  # .ckpt/.nemo file OR exported model directory
    inference_pad_boost: float = 0.0
    inference_bos_boost: float = 0.0
    inference_eos_boost: float = 0.0

    # Output behavior
    decode_audio: bool = False  # default text-only; can be enabled
    output_dir: Optional[str] = None
    save_artifacts: bool = False
    # Merge user (input) + model (pred) into two-channel WAV like NeMo ResultsLogger (same timeline for FDB)
    merge_user_channel: bool = False

    # Prompt handling
    ignore_system_prompt: bool = False

    # Audio preprocessing defaults (will be overridden from YAML if present)
    source_sample_rate: int = 16000
    target_sample_rate: int = 22050

    @classmethod
    def from_dict(cls, d: Dict[str, Any]) -> "S2SVoiceChatInferConfig":
        """Build a config from a flat dict; unknown keys land in extra_config."""
        known_fields = {
            "model_path",
            "device",
            "dtype",
            "max_new_tokens",
            "temperature",
            "top_p",
            "top_k",
            "config_path",
            "code_path",
            "extra_decoding_seconds",
            "speaker_reference",
            "tts_ckpt_path",
            "inference_pad_boost",
            "inference_bos_boost",
            "inference_eos_boost",
            "decode_audio",
            "output_dir",
            "save_artifacts",
            "merge_user_channel",
            "ignore_system_prompt",
            "source_sample_rate",
            "target_sample_rate",
        }
        known = {k: v for k, v in d.items() if k in known_fields}
        extra = {k: v for k, v in d.items() if k not in known_fields}
        return cls(**known, extra_config=extra)


class S2SVoiceChatInferBackend(InferenceBackend):
    @property
    def name(self) -> str:
        return "s2s_voicechat"

    @property
    def supported_modalities(self) -> Set[Modality]:
        # AUDIO_OUT is optional (decode_audio flag), but supported.
        return {Modality.TEXT, Modality.AUDIO_IN, Modality.AUDIO_OUT}

    def __init__(self, config: BackendConfig):
        # Accept either the specialized config or a generic BackendConfig,
        # which is promoted via from_dict (extra_config carries unknown keys).
        if isinstance(config, S2SVoiceChatInferConfig):
            self.vc_config = config
        else:
            self.vc_config = S2SVoiceChatInferConfig.from_dict(
                {
                    **{
                        k: getattr(config, k)
                        for k in ["model_path", "device", "dtype", "max_new_tokens", "temperature", "top_p", "top_k"]
                    },
                    **config.extra_config,
                }
            )

        super().__init__(self.vc_config)
        self._tokenizer = None

    def _add_code_path(self) -> None:
        """Prepend code_path to sys.path so project-local NeMo code is importable."""
        import sys

        code_path = self.vc_config.code_path
        if code_path and code_path not in sys.path:
            sys.path.insert(0, code_path)
            print(f"[S2SVoiceChat] Added {code_path} to PYTHONPATH")

    def _load_cfg(self) -> Any:
        """Load the NeMo OmegaConf YAML; raises RuntimeError if missing."""
        config_path = self.vc_config.config_path
        if not config_path:
            raise RuntimeError("s2s_voicechat requires --config_path pointing to a NeMo YAML (OmegaConf)")
        if not os.path.exists(config_path):
            raise RuntimeError(f"Config path does not exist: {config_path}")
        return OmegaConf.load(config_path)

    def _apply_overrides(self, cfg: Any) -> Any:
        # Use the --model path as pretrained_s2s_model (like infer_nano9b_s2s.sh does).
+ if self.config.model_path: + OmegaConf.update(cfg, "model.stt.model.pretrained_s2s_model", self.config.model_path, force_add=True) + + # TTS override semantics (match Kevin's inference recipe): + # - `pretrained_model`: checkpoint file (.ckpt/.nemo) + # - `pretrained_tts_model`: exported model directory (expects config.json inside) + if self.vc_config.tts_ckpt_path: + tts_path = self.vc_config.tts_ckpt_path + if os.path.isdir(tts_path): + OmegaConf.update(cfg, "model.speech_generation.model.pretrained_tts_model", tts_path, force_add=True) + # Avoid accidentally also loading weights from an old checkpoint path. + OmegaConf.update(cfg, "model.speech_generation.model.pretrained_model", None, force_add=True) + else: + OmegaConf.update(cfg, "model.speech_generation.model.pretrained_model", tts_path, force_add=True) + # Ensure we don't trigger directory-based loading by mistake. + OmegaConf.update(cfg, "model.speech_generation.model.pretrained_tts_model", None, force_add=True) + + if self.vc_config.speaker_reference: + OmegaConf.update(cfg, "model.inference_speaker_reference", self.vc_config.speaker_reference, force_add=True) + + if self.vc_config.extra_decoding_seconds: + OmegaConf.update(cfg, "model.extra_decoding_seconds", float(self.vc_config.extra_decoding_seconds), force_add=True) + + # Script defaults / common inference overrides + OmegaConf.update(cfg, "model.use_asr_timestamps", True, force_add=True) + OmegaConf.update(cfg, "model.stt.model.eval_text_turn_taking", True, force_add=True) + + # Boosts + if self.vc_config.inference_pad_boost: + OmegaConf.update(cfg, "model.stt.model.inference_pad_boost", float(self.vc_config.inference_pad_boost), force_add=True) + if self.vc_config.inference_bos_boost: + OmegaConf.update(cfg, "model.stt.model.inference_bos_boost", float(self.vc_config.inference_bos_boost), force_add=True) + if self.vc_config.inference_eos_boost: + OmegaConf.update(cfg, "model.stt.model.inference_eos_boost", 
float(self.vc_config.inference_eos_boost), force_add=True) + + # Pull sample rates for preprocessing (data.* is what nemotron_voicechat_infer.py uses) + try: + sr_in = OmegaConf.select(cfg, "data.source_sample_rate") + sr_out = OmegaConf.select(cfg, "data.target_sample_rate") + if sr_in is not None: + self.vc_config.source_sample_rate = int(sr_in) + if sr_out is not None: + self.vc_config.target_sample_rate = int(sr_out) + except Exception: + pass + + return cfg + + def load_model(self) -> None: + print(f"[S2SVoiceChat] Loading NemotronVoiceChat. model={self.config.model_path}") + self._add_code_path() + + # Match script's inference-safe numerical defaults + torch.set_float32_matmul_precision("medium") + torch.backends.cudnn.allow_tf32 = True + + try: + cfg = self._load_cfg() + cfg = self._apply_overrides(cfg) + OmegaConf.resolve(cfg) + model_config = OmegaConf.to_container(cfg, resolve=True) + + from nemo.collections.speechlm2.models.nemotron_voicechat import NemotronVoiceChat + + self._model = NemotronVoiceChat(model_config).eval() + + dtype = getattr(torch, self.config.dtype, torch.bfloat16) + try: + self._model = self._model.to(dtype) + except Exception as e: + print(f"[S2SVoiceChat] Warning: dtype conversion to {dtype} failed: {e}") + + self._model = self._model.to(self.config.device) + self._tokenizer = getattr(getattr(self._model, "stt_model", None), "tokenizer", None) + self._is_loaded = True + print("[S2SVoiceChat] Model loaded successfully") + print(f" device={self.config.device} dtype={self.config.dtype}") + print(f" source_sr={self.vc_config.source_sample_rate} target_sr={self.vc_config.target_sample_rate}") + print(f" decode_audio={self.vc_config.decode_audio} extra_decoding_seconds={self.vc_config.extra_decoding_seconds}") + except Exception as e: + import traceback + + traceback.print_exc() + raise RuntimeError(f"Failed to load s2s_voicechat backend: {e}") + + def _clean_special_tokens(self, text: str) -> str: + if not text: + return text + text = 
re.sub(r"<\\$[\\d.]+\\$>", "", text) + text = re.sub(r"<\\|[\\d.]+\\|>", "", text) + text = re.sub(r"\\s+", " ", text).strip() + return text + + def _tokenize_system_prompts( + self, system_prompts: List[Optional[str]], batch_size: int + ) -> Tuple[Optional[torch.Tensor], Optional[torch.Tensor]]: + if self.vc_config.ignore_system_prompt or not any(p for p in system_prompts): + return None, None + if self._tokenizer is None: + return None, None + + tokenized: List[List[int]] = [] + for prompt in system_prompts: + if not prompt: + tokenized.append([]) + continue + if hasattr(self._tokenizer, "text_to_ids"): + tokens = self._tokenizer.text_to_ids(prompt) + elif hasattr(self._tokenizer, "encode"): + tokens = self._tokenizer.encode(prompt) + else: + tokens = [] + tokenized.append(tokens) + + max_len = max((len(t) for t in tokenized), default=0) + if max_len == 0: + return None, None + + pad_id = 0 + if hasattr(self._tokenizer, "pad_id"): + pad_id = int(self._tokenizer.pad_id) + elif hasattr(self._tokenizer, "pad_token_id"): + pad_id = int(self._tokenizer.pad_token_id) + + prompt_tokens = torch.full((batch_size, max_len), pad_id, dtype=torch.long) + prompt_lens = torch.zeros(batch_size, dtype=torch.long) + for i, tokens in enumerate(tokenized): + if tokens: + prompt_tokens[i, : len(tokens)] = torch.tensor(tokens, dtype=torch.long) + prompt_lens[i] = len(tokens) + + return prompt_tokens.to(self.config.device), prompt_lens.to(self.config.device) + + def _load_and_preprocess_audio(self, request: GenerationRequest, temp_files: List[str]) -> Tuple[np.ndarray, str]: + if not request.audio_bytes and not request.audio_path: + raise ValueError("Audio input is required for s2s_voicechat backend") + + audio_path = request.audio_path + if request.audio_bytes: + tmp = tempfile.NamedTemporaryFile(suffix=".wav", delete=False) + tmp.write(request.audio_bytes) + tmp.close() + audio_path = tmp.name + temp_files.append(audio_path) + + audio, sr = sf.read(audio_path) + if audio.ndim > 1: + 
            # Downmix multi-channel input to mono.
            audio = audio.mean(axis=1)

        target_sr = int(self.vc_config.source_sample_rate)
        if sr != target_sr:
            # Prefer librosa; fall back to torchaudio when librosa is absent.
            try:
                import librosa

                audio = librosa.resample(audio, orig_sr=sr, target_sr=target_sr)
            except ImportError:
                import torchaudio

                audio_tensor = torch.from_numpy(audio).float().unsqueeze(0)
                audio_tensor = torchaudio.functional.resample(audio_tensor, sr, target_sr)
                audio = audio_tensor.squeeze(0).numpy()

        return audio, audio_path

    def _merge_user_model_audio(
        self,
        user_audio: np.ndarray,
        user_sr: int,
        pred_audio: np.ndarray,
        pred_sr: int,
    ) -> np.ndarray:
        """Merge user (input) and model (pred) into a two-channel WAV on one timeline.
        Mirrors NeMo ResultsLogger.merge_and_save_audio: channel 0 = user, channel 1 = pred,
        same length (padded), so FDB ASR (--stereo) gets a single timeline."""
        user = np.asarray(user_audio, dtype=np.float64).squeeze()
        pred = np.asarray(pred_audio, dtype=np.float64).squeeze()
        # Downmix any remaining multi-channel input to mono.
        if user.ndim > 1:
            user = user.mean(axis=1)
        if pred.ndim > 1:
            pred = pred.mean(axis=1)
        # Resample user to pred_sr
        if user_sr != pred_sr:
            try:
                import librosa

                user = librosa.resample(user, orig_sr=user_sr, target_sr=pred_sr)
            except ImportError:
                import torchaudio

                ut = torch.from_numpy(user).float().unsqueeze(0)
                ut = torchaudio.functional.resample(ut, user_sr, pred_sr)
                user = ut.squeeze(0).numpy()
        # Zero-pad the shorter channel so both share one timeline.
        T1, T2 = user.shape[0], pred.shape[0]
        max_len = max(T1, T2)
        user_pad = np.pad(user, (0, max_len - T1), mode="constant", constant_values=0)
        pred_pad = np.pad(pred, (0, max_len - T2), mode="constant", constant_values=0)
        # (samples, 2): ch0 = user, ch1 = pred (like NeMo .T after cat)
        merged = np.stack([user_pad, pred_pad], axis=1).astype(np.float32)
        return merged

    def _artifact_root(self) -> Optional[str]:
        """Return artifacts root dir (output_dir/artifacts/<job_id>) or None when disabled."""
        if not (self.vc_config.save_artifacts and self.vc_config.output_dir):
            return None
        job_id = os.environ.get("SLURM_JOB_ID") or os.environ.get("JOB_ID") or "local"
        return os.path.join(self.vc_config.output_dir, "artifacts", job_id)

    def _save_artifacts(
        self,
        request_id: str,
        input_audio_path: str,
        output_text: str,
        output_audio_bytes: Optional[bytes],
        debug_info: Dict[str, Any],
        elapsed_ms: float,
    ) -> Optional[Dict[str, str]]:
        """Best-effort save of input.wav / output.json / output.wav under the artifact root.

        Each file is saved independently; failures are logged, never raised.
        Returns a dict of saved paths, or None when nothing was saved / disabled.
        """
        root = self._artifact_root()
        if root is None:
            return None
        req_dir = os.path.join(root, request_id)
        os.makedirs(req_dir, exist_ok=True)

        out: Dict[str, str] = {}
        try:
            with open(input_audio_path, "rb") as fin, open(os.path.join(req_dir, "input.wav"), "wb") as fout:
                fout.write(fin.read())
            out["input_wav"] = os.path.join(req_dir, "input.wav")
        except Exception as e:
            print(f"[S2SVoiceChat] Warning: failed saving input.wav: {e}")

        try:
            meta = {
                "request_id": request_id,
                "text": output_text,
                "generation_time_ms": elapsed_ms,
                "debug_info": debug_info,
            }
            meta_path = os.path.join(req_dir, "output.json")
            with open(meta_path, "w") as f:
                json.dump(meta, f, indent=2)
            out["output_json"] = meta_path
        except Exception as e:
            print(f"[S2SVoiceChat] Warning: failed saving output.json: {e}")

        if output_audio_bytes:
            wav_path = os.path.join(req_dir, "output.wav")
            try:
                with open(wav_path, "wb") as f:
                    f.write(output_audio_bytes)
                out["output_wav"] = wav_path
            except Exception as e:
                print(f"[S2SVoiceChat] Warning: failed saving output.wav: {e}")

        return out or None

    def generate(self, requests: List[GenerationRequest]) -> List[GenerationResult]:
        # NOTE(review): this method continues beyond the visible chunk; the
        # visible head is reproduced unchanged so the remainder splices cleanly.
        if not self._is_loaded:
            return [GenerationResult(error="Model not loaded", request_id=r.request_id) for r in requests]
        if not requests:
            return []

        start_time = time.time()
        temp_files: List[str] = []
        results: List[Optional[GenerationResult]] = [None] * len(requests)

        valid_indices: List[int] = []
        audio_list: List[np.ndarray] = []
        system_prompts: List[Optional[str]] = []
        input_audio_paths: Dict[int, str] = {}

        try:
            for i, req in enumerate(requests):
                try:
                    audio, audio_path =
self._load_and_preprocess_audio(req, temp_files) + audio_list.append(audio) + valid_indices.append(i) + system_prompts.append(req.system_prompt) + input_audio_paths[i] = audio_path + except Exception as e: + results[i] = GenerationResult(error=str(e), request_id=req.request_id) + + if not audio_list: + return [r if r is not None else GenerationResult(error="No valid requests") for r in results] + + max_len = max(a.shape[0] for a in audio_list) + batch_size = len(audio_list) + batch = torch.zeros(batch_size, max_len, dtype=torch.float32) + lengths = torch.zeros(batch_size, dtype=torch.long) + for bi, audio in enumerate(audio_list): + batch[bi, : len(audio)] = torch.from_numpy(audio).float() + lengths[bi] = len(audio) + + batch = batch.to(self.config.device) + lengths = lengths.to(self.config.device) + + prompt_tokens, prompt_lens = self._tokenize_system_prompts(system_prompts, batch_size) + + first_seed = next((r.seed for r in requests if r.seed is not None), None) + if first_seed is not None: + random.seed(first_seed) + np.random.seed(first_seed) + torch.manual_seed(first_seed) + if torch.cuda.is_available(): + torch.cuda.manual_seed_all(first_seed) + + input_pad_len = int(float(self.vc_config.extra_decoding_seconds) * int(self.vc_config.source_sample_rate)) + + with torch.no_grad(): + outputs = self._model.offline_inference( + input_signal=batch, + input_signal_lens=lengths, + prompt_tokens=prompt_tokens, + prompt_token_lens=prompt_lens, + input_pad_len=input_pad_len, + decode_audio=bool(self.vc_config.decode_audio), + ) + + elapsed_ms = (time.time() - start_time) * 1000.0 + + texts = outputs.get("text") or [] + tokens_len = outputs.get("tokens_len") + asr_hyps = outputs.get("asr_hyps") + audio_out = outputs.get("audio") + audio_len = outputs.get("audio_len") + + for bi, req_i in enumerate(valid_indices): + req = requests[req_i] + request_id = req.request_id or f"req_{bi}" + + # Text + text = "" + if isinstance(texts, list) and bi < len(texts): + text = texts[bi] 
or "" + elif isinstance(texts, str): + text = texts + text = self._clean_special_tokens(text) + + # Token count + num_tokens = 0 + if tokens_len is not None: + try: + num_tokens = int(tokens_len[bi].item()) + except Exception: + num_tokens = 0 + + # Audio bytes (optional) + out_audio_bytes = None + out_sr = int(self.vc_config.target_sample_rate) + if self.vc_config.decode_audio and audio_out is not None: + try: + wav = audio_out[bi] + if hasattr(wav, "detach"): + wav = wav.detach().float().cpu().numpy() + wav = np.asarray(wav).squeeze() + # Trim to per-sample input duration to remove batch-padding artifacts. + # When multiple samples are batched, shorter ones get padded to the longest, + # and the model generates audio for all frames including padding. Trimming to the + # input duration (converted to target SR) matches NeMo ResultsLogger behavior and + # ensures the output WAV only contains audio corresponding to the actual input. + source_sr = int(self.vc_config.source_sample_rate) + per_sample_input_len = int(lengths[bi].item()) + input_pad_len + per_sample_pred_len = int(per_sample_input_len / source_sr * out_sr) + if per_sample_pred_len > 0 and per_sample_pred_len < len(wav): + wav = wav[:per_sample_pred_len] + elif audio_len is not None: + try: + n = int(audio_len[bi].item()) + wav = wav[:n] + except Exception: + pass + max_val = float(np.max(np.abs(wav))) if wav.size else 0.0 + if max_val > 0: + wav = wav / max_val * 0.95 + if self.vc_config.merge_user_channel: + user_audio = audio_list[bi] + merged = self._merge_user_model_audio( + user_audio, + int(self.vc_config.source_sample_rate), + wav, + out_sr, + ) + buf = io.BytesIO() + sf.write(buf, merged, out_sr, format="WAV") + out_audio_bytes = buf.getvalue() + else: + buf = io.BytesIO() + sf.write(buf, wav, out_sr, format="WAV") + out_audio_bytes = buf.getvalue() + except Exception as e: + print(f"[S2SVoiceChat] Warning: failed encoding audio for {request_id}: {e}") + + per_req_ms = elapsed_ms / max(1, 
len(requests)) + debug_info: Dict[str, Any] = { + "tokens_len": num_tokens, + "decode_audio": bool(self.vc_config.decode_audio), + "extra_decoding_seconds": float(self.vc_config.extra_decoding_seconds), + "source_sample_rate": int(self.vc_config.source_sample_rate), + "target_sample_rate": int(self.vc_config.target_sample_rate), + } + + # User ASR (if model returns it). We attach the per-request hypothesis to debug_info. + # Expected shape: list[str] aligned to batch, or a single str. + user_asr = None + if asr_hyps is not None: + if isinstance(asr_hyps, list): + if bi < len(asr_hyps): + user_asr = asr_hyps[bi] + elif isinstance(asr_hyps, str): + user_asr = asr_hyps + if user_asr: + debug_info["asr_hyp"] = user_asr + + artifacts = self._save_artifacts( + request_id=request_id, + input_audio_path=input_audio_paths.get(req_i, ""), + output_text=text, + output_audio_bytes=out_audio_bytes, + debug_info=debug_info, + elapsed_ms=per_req_ms, + ) + if artifacts: + debug_info["artifacts"] = artifacts + + results[req_i] = GenerationResult( + text=text, + audio_bytes=out_audio_bytes, + audio_sample_rate=out_sr, + request_id=req.request_id, + num_tokens_generated=num_tokens, + generation_time_ms=per_req_ms, + debug_info=debug_info, + ) + + return [r if r is not None else GenerationResult(error="Unknown error") for r in results] + + finally: + for p in temp_files: + try: + if os.path.exists(p): + os.unlink(p) + except Exception: + pass + + def validate_request(self, request: GenerationRequest) -> Optional[str]: + if not request.audio_bytes and not request.audio_path: + return "Audio input is required for s2s_voicechat backend" + return None + diff --git a/recipes/multimodal/server/session_manager.py b/recipes/multimodal/server/session_manager.py new file mode 100644 index 0000000000..113735b8ba --- /dev/null +++ b/recipes/multimodal/server/session_manager.py @@ -0,0 +1,249 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Session manager for S2S session backend. + +Manages session state (LLM KV cache, frame index, etc.) across HTTP requests +to enable multi-turn conversations. +""" + +import threading +import time +import uuid +from dataclasses import dataclass, field +from typing import Any, Dict, List, Optional + +import torch + + +@dataclass +class TurnData: + """Data for a single turn in a conversation.""" + + turn_idx: int + user_audio_bytes: Optional[bytes] = None # Input audio from user + agent_audio_bytes: Optional[bytes] = None # Output audio from agent + agent_text: str = "" # Text response for this turn + user_duration_sec: float = 0.0 # Duration of user audio + agent_duration_sec: float = 0.0 # Duration of agent audio + + +@dataclass +class SessionState: + """State that persists between turns in a session.""" + + session_id: str + + # LLM state + llm_cache: Any = None # DynamicCache (for non-Mamba models) + input_embeds_history: Any = None # List of embeddings (for Mamba models) + frame_idx: int = 0 + + # Token history (for turn-taking logic) + gen_text: Optional[torch.Tensor] = None + gen_asr_text: Optional[torch.Tensor] = None + + # Audio buffer state + audio_buffer: Optional[torch.Tensor] = None + buffer_fill_level: int = 0 + + # Turn tracking + turn_count: int = 0 + + # Per-turn data for session audio generation + turns: List[TurnData] = field(default_factory=list) + + # Timestamps + created_at: float = 
field(default_factory=time.time) + last_accessed: float = field(default_factory=time.time) + + def touch(self): + """Update last_accessed timestamp.""" + self.last_accessed = time.time() + + +class SessionManager: + """ + Manages session state for S2S multi-turn conversations. + + Thread-safe implementation with TTL-based cleanup. + """ + + def __init__(self, ttl_seconds: float = 300.0, max_sessions: int = 100): + """ + Initialize SessionManager. + + Args: + ttl_seconds: Time-to-live for sessions in seconds (default: 5 minutes) + max_sessions: Maximum number of concurrent sessions + """ + self.ttl_seconds = ttl_seconds + self.max_sessions = max_sessions + self.sessions: Dict[str, SessionState] = {} + self._lock = threading.RLock() + + def create_session(self, session_id: Optional[str] = None) -> SessionState: + """ + Create a new session. + + Args: + session_id: Optional session ID. If None, generates a UUID. + + Returns: + New SessionState object + """ + with self._lock: + if session_id is None: + session_id = str(uuid.uuid4()) + + # Clean up expired sessions first + self._cleanup_expired_locked() + + # Evict oldest if at capacity + if len(self.sessions) >= self.max_sessions: + self._evict_oldest_locked() + + state = SessionState(session_id=session_id) + self.sessions[session_id] = state + print(f"[SessionManager] Created session: {session_id}") + return state + + def get_session(self, session_id: str) -> Optional[SessionState]: + """ + Get existing session by ID. 
+ + Args: + session_id: Session ID to look up + + Returns: + SessionState if found and not expired, None otherwise + """ + with self._lock: + state = self.sessions.get(session_id) + if state is None: + return None + + # Check if expired + if time.time() - state.last_accessed > self.ttl_seconds: + print(f"[SessionManager] Session expired: {session_id}") + del self.sessions[session_id] + return None + + state.touch() + return state + + def get_or_create_session(self, session_id: Optional[str] = None) -> SessionState: + """ + Get existing session or create new one. + + Args: + session_id: Session ID. If None, creates new session. + + Returns: + SessionState (existing or new) + """ + if session_id: + state = self.get_session(session_id) + if state is not None: + return state + + return self.create_session(session_id) + + def save_session(self, session_id: str, state: SessionState): + """ + Save/update session state. + + Args: + session_id: Session ID + state: SessionState to save + """ + with self._lock: + state.touch() + self.sessions[session_id] = state + + def delete_session(self, session_id: str) -> bool: + """ + Delete a session. + + Args: + session_id: Session ID to delete + + Returns: + True if session was deleted, False if not found + """ + with self._lock: + if session_id in self.sessions: + del self.sessions[session_id] + print(f"[SessionManager] Deleted session: {session_id}") + return True + return False + + def get_session_info(self, session_id: str) -> Optional[Dict[str, Any]]: + """ + Get session info without full state. 
+ + Args: + session_id: Session ID + + Returns: + Dict with session metadata or None + """ + with self._lock: + state = self.sessions.get(session_id) + if state is None: + return None + + return { + "session_id": state.session_id, + "frame_idx": state.frame_idx, + "turn_count": state.turn_count, + "created_at": state.created_at, + "last_accessed": state.last_accessed, + "has_llm_cache": state.llm_cache is not None, + "has_input_embeds_history": state.input_embeds_history is not None + and len(state.input_embeds_history) > 0, + } + + def list_sessions(self) -> list: + """List all active session IDs.""" + with self._lock: + return list(self.sessions.keys()) + + def cleanup_expired(self): + """Clean up expired sessions (called periodically).""" + with self._lock: + self._cleanup_expired_locked() + + def _cleanup_expired_locked(self): + """Clean up expired sessions (must hold lock).""" + now = time.time() + expired = [sid for sid, state in self.sessions.items() if now - state.last_accessed > self.ttl_seconds] + for sid in expired: + print(f"[SessionManager] Cleaning up expired session: {sid}") + del self.sessions[sid] + + def _evict_oldest_locked(self): + """Evict oldest session to make room (must hold lock).""" + if not self.sessions: + return + + oldest_id = min(self.sessions.keys(), key=lambda sid: self.sessions[sid].last_accessed) + print(f"[SessionManager] Evicting oldest session: {oldest_id}") + del self.sessions[oldest_id] + + def __len__(self) -> int: + """Return number of active sessions.""" + with self._lock: + return len(self.sessions) diff --git a/recipes/multimodal/server/unified_server.py b/recipes/multimodal/server/unified_server.py new file mode 100644 index 0000000000..8180690518 --- /dev/null +++ b/recipes/multimodal/server/unified_server.py @@ -0,0 +1,829 @@ +#!/usr/bin/env python3 +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Unified NeMo Inference Server with OpenAI-compatible API. + +Supports multiple NeMo model backends: +- SALM: Speech-Augmented Language Model +- TTS: Text-to-Speech (MagpieTTS) +- S2S: Speech-to-Speech (Duplex) + +Exposes only /v1/chat/completions endpoint for OpenAI compatibility. + +Usage: + python unified_server.py --backend s2s --model /path/to/model +""" + +import asyncio +import base64 +import hashlib +import json +import os +import re +import time +from dataclasses import dataclass +from datetime import datetime +from typing import Any, Dict, List, Optional + +import uvicorn +from fastapi import FastAPI, HTTPException +from fastapi.responses import JSONResponse + +from .backends import BackendConfig, GenerationRequest, GenerationResult, get_backend +from .session_manager import SessionManager + +# Configuration from environment +HOST = os.getenv("UNIFIED_SERVER_HOST", "0.0.0.0") +PORT = int(os.getenv("UNIFIED_SERVER_PORT", "8000")) +BACKEND_TYPE = os.getenv("UNIFIED_SERVER_BACKEND", "salm") +MODEL_PATH = os.getenv("UNIFIED_SERVER_MODEL_PATH", "") +CODEC_MODEL_PATH = os.getenv("UNIFIED_SERVER_CODEC_MODEL_PATH", "") + +# Batching configuration +# Note: S2S backends process requests sequentially anyway, so batch_size>1 just adds delay +# Use batch_timeout=0 for immediate processing without waiting +BATCH_SIZE = int(os.getenv("UNIFIED_SERVER_BATCH_SIZE", "1")) +BATCH_TIMEOUT = 
float(os.getenv("UNIFIED_SERVER_BATCH_TIMEOUT", "0")) + +# Generation defaults +MAX_NEW_TOKENS = int(os.getenv("UNIFIED_SERVER_MAX_NEW_TOKENS", "512")) +TEMPERATURE = float(os.getenv("UNIFIED_SERVER_TEMPERATURE", "1.0")) +TOP_P = float(os.getenv("UNIFIED_SERVER_TOP_P", "1.0")) + +# Debug +DEBUG = os.getenv("DEBUG", "").lower() in ("true", "1", "yes", "on") +INCLUDE_DEBUG_INFO = os.getenv("INCLUDE_DEBUG_INFO", "true").lower() in ("true", "1", "yes", "on") + + +@dataclass +class PendingRequest: + """Container for a pending batched request.""" + + request: GenerationRequest + future: asyncio.Future + timestamp: float + + +class RequestBatcher: + """Manages request batching with configurable delay.""" + + def __init__(self, backend, batch_size: int, batch_timeout: float): + self.backend = backend + self.batch_size = batch_size + self.batch_timeout = batch_timeout + self.pending_requests: List[PendingRequest] = [] + self.lock = asyncio.Lock() + self.timeout_task: Optional[asyncio.Task] = None + self.processing = False + + # Stats + self.total_requests = 0 + self.total_batches = 0 + + async def add_request(self, request: GenerationRequest) -> GenerationResult: + """Add a request and wait for result.""" + future = asyncio.Future() + pending = PendingRequest(request=request, future=future, timestamp=time.time()) + + async with self.lock: + self.pending_requests.append(pending) + + # Check if we should process immediately + if len(self.pending_requests) >= self.batch_size: + if DEBUG: + print(f"[Batcher] Batch full ({self.batch_size}), processing immediately") + asyncio.create_task(self._process_batch()) + elif self.batch_timeout == 0: + # No delay mode + asyncio.create_task(self._process_batch()) + elif self.timeout_task is None or self.timeout_task.done(): + # Schedule timeout + self.timeout_task = asyncio.create_task(self._timeout_handler()) + + return await future + + async def _timeout_handler(self): + """Handle batch timeout.""" + await 
asyncio.sleep(self.batch_timeout) + async with self.lock: + if self.pending_requests and not self.processing: + if DEBUG: + print(f"[Batcher] Timeout, processing {len(self.pending_requests)} requests") + asyncio.create_task(self._process_batch()) + + async def _process_batch(self): + """Process pending requests as a batch.""" + async with self.lock: + if not self.pending_requests or self.processing: + return + + self.processing = True + batch = self.pending_requests[: self.batch_size] + self.pending_requests = self.pending_requests[self.batch_size :] + + try: + # Extract requests + requests = [p.request for p in batch] + + if DEBUG: + print(f"[Batcher] Processing batch of {len(requests)} requests") + + # Run inference in thread pool to not block event loop + loop = asyncio.get_event_loop() + results = await loop.run_in_executor(None, self.backend.generate, requests) + + # Complete futures + for pending, result in zip(batch, results): + if not pending.future.done(): + pending.future.set_result(result) + + # Update stats + self.total_requests += len(batch) + self.total_batches += 1 + + except Exception as e: + # Set exception for all pending requests + for pending in batch: + if not pending.future.done(): + pending.future.set_exception(e) + finally: + async with self.lock: + self.processing = False + # Process more if pending + if self.pending_requests: + if self.batch_timeout == 0 or len(self.pending_requests) >= self.batch_size: + asyncio.create_task(self._process_batch()) + elif self.timeout_task is None or self.timeout_task.done(): + self.timeout_task = asyncio.create_task(self._timeout_handler()) + + +# Global state +backend_instance = None +request_batcher = None +session_manager = None +server_config = {} + + +def compute_session_hash(messages: List[Dict[str, Any]]) -> str: + """Compute deterministic hash from messages for session identification. + + Used for automatic session management without client-provided session_id. 
+ Hash is based on message roles and content (text only, not audio data). + """ + hash_parts = [] + for msg in messages: + role = msg.get("role", "") + content = msg.get("content", "") + # For list content, extract text parts only + if isinstance(content, list): + text_parts = [ + item.get("text", "") for item in content if isinstance(item, dict) and item.get("type") == "text" + ] + content = " ".join(text_parts) + hash_parts.append(f"{role}:{content}") + + combined = "|".join(hash_parts) + return hashlib.md5(combined.encode()).hexdigest()[:16] + + +def extract_audio_from_messages(messages: List[Dict[str, Any]]) -> List[bytes]: + """Extract all audio bytes from OpenAI-format messages. + + Supports two formats: + 1. audio_url: {"type": "audio_url", "audio_url": {"url": "data:audio/wav;base64,..."}} + 2. input_audio: {"type": "input_audio", "input_audio": {"data": "...", "format": "wav"}} + + Returns a list of audio bytes (one per audio found), preserving message order. + """ + audio_list = [] + for message in messages: + content = message.get("content") + if isinstance(content, list): + for item in content: + if isinstance(item, dict): + # Format 1: audio_url (data URL style) + if item.get("type") == "audio_url": + audio_url = item.get("audio_url", {}) + url = audio_url.get("url", "") + match = re.match(r"data:audio/\w+;base64,(.+)", url) + if match: + audio_list.append(base64.b64decode(match.group(1))) + + # Format 2: input_audio (OpenAI native style) + elif item.get("type") == "input_audio": + input_audio = item.get("input_audio", {}) + data = input_audio.get("data", "") + if data: + audio_list.append(base64.b64decode(data)) + return audio_list + + +def extract_text_from_messages(messages: List[Dict[str, Any]]) -> str: + """Extract text content from OpenAI-format messages.""" + texts = [] + for message in messages: + content = message.get("content") + if isinstance(content, str): + if content: + texts.append(content) + elif isinstance(content, list): + for item in 
content: + if isinstance(item, dict) and item.get("type") == "text": + text = item.get("text", "") + if text: + texts.append(text) + elif isinstance(item, str): + texts.append(item) + return " ".join(texts) + + +def extract_system_prompt(messages: List[Dict[str, Any]]) -> Optional[str]: + """Extract system prompt from messages.""" + for message in messages: + if message.get("role") == "system": + content = message.get("content") + if isinstance(content, str): + return content + elif isinstance(content, list): + texts = [ + item.get("text", "") for item in content if isinstance(item, dict) and item.get("type") == "text" + ] + return " ".join(texts) if texts else None + return None + + +def create_app( + backend_type: str = BACKEND_TYPE, + model_path: str = MODEL_PATH, + codec_model_path: str = CODEC_MODEL_PATH, + batch_size: int = BATCH_SIZE, + batch_timeout: float = BATCH_TIMEOUT, + device: str = "cuda", + dtype: str = "bfloat16", + extra_config: Dict[str, Any] = None, +) -> FastAPI: + """Create and configure the FastAPI app.""" + global backend_instance, request_batcher, session_manager, server_config + + # Extract server-level config from extra_config + ignore_system_prompt = extra_config.pop("ignore_system_prompt", False) if extra_config else False + session_ttl = extra_config.pop("session_ttl", 300.0) if extra_config else 300.0 + max_sessions = extra_config.pop("max_sessions", 100) if extra_config else 100 + + app = FastAPI( + title="Unified NeMo Inference Server", + description=f"OpenAI-compatible API for NeMo model inference ({backend_type} backend)", + version="1.0.0", + ) + + # Store config + server_config = { + "backend_type": backend_type, + "model_path": model_path, + "codec_model_path": codec_model_path, + "batch_size": batch_size, + "batch_timeout": batch_timeout, + "device": device, + "dtype": dtype, + "ignore_system_prompt": ignore_system_prompt, + "session_ttl": session_ttl, + "max_sessions": max_sessions, + } + + @app.on_event("startup") + async 
def startup(): + global backend_instance, request_batcher, session_manager + + # Build backend config + config_dict = { + "model_path": model_path, + "device": device, + "dtype": dtype, + "max_new_tokens": MAX_NEW_TOKENS, + "temperature": TEMPERATURE, + "top_p": TOP_P, + } + + # Add backend-specific config + if codec_model_path: + config_dict["codec_model_path"] = codec_model_path + + if extra_config: + config_dict.update(extra_config) + + config = BackendConfig.from_dict(config_dict) + + # Get and instantiate backend + print(f"[Server] Initializing {backend_type} backend...") + BackendClass = get_backend(backend_type) + backend_instance = BackendClass(config) + + # Load model + backend_instance.load_model() + + # Create batcher + request_batcher = RequestBatcher(backend_instance, batch_size, batch_timeout) + + # Initialize session manager for session-aware backends + if backend_type == "s2s_session": + session_manager = SessionManager(ttl_seconds=session_ttl, max_sessions=max_sessions) + print(f"[Server] Session manager initialized (TTL: {session_ttl}s, max: {max_sessions})") + + # Warmup inference to pre-compile Triton kernels (avoids race conditions on first requests) + print("[Server] Running warmup inference to compile Triton kernels...") + try: + backend_instance.warmup() + print("[Server] Warmup complete - Triton kernels compiled") + except Exception as e: + print(f"[Server] Warmup failed (will compile on first request): {e}") + + print("[Server] Ready!") + print(f" Backend: {backend_type}") + print(f" Model: {model_path}") + print(f" Batch size: {batch_size}") + print(f" Batch timeout: {batch_timeout}s") + if ignore_system_prompt: + print(" System prompts: IGNORED") + + @app.get("/") + async def root(): + """Root endpoint with server info.""" + endpoints = ["/v1/chat/completions", "/health"] + if backend_type == "s2s_session": + endpoints.extend(["/v1/sessions", "/v1/sessions/{session_id}"]) + return { + "service": "Unified NeMo Inference Server", + 
"version": "1.0.0", + "backend": server_config.get("backend_type"), + "model": server_config.get("model_path"), + "endpoints": endpoints, + } + + # Session management endpoints (only for s2s_session backend) + @app.get("/v1/sessions") + async def list_sessions(): + """List all active sessions.""" + if session_manager is None: + raise HTTPException(status_code=404, detail="Session management not enabled for this backend") + return { + "sessions": session_manager.list_sessions(), + "count": len(session_manager), + "ttl_seconds": session_manager.ttl_seconds, + } + + @app.get("/v1/sessions/{session_id}") + async def get_session(session_id: str): + """Get session info.""" + if session_manager is None: + raise HTTPException(status_code=404, detail="Session management not enabled for this backend") + info = session_manager.get_session_info(session_id) + if info is None: + raise HTTPException(status_code=404, detail=f"Session not found: {session_id}") + return info + + @app.delete("/v1/sessions/{session_id}") + async def delete_session(session_id: str): + """Delete a session and generate final session audio.""" + if session_manager is None: + raise HTTPException(status_code=404, detail="Session management not enabled for this backend") + + # Get session state before deleting + session_state = session_manager.get_session(session_id) + if session_state is None: + raise HTTPException(status_code=404, detail=f"Session not found: {session_id}") + + # Call on_session_close to generate session audio + close_result = {} + if backend_instance is not None and hasattr(backend_instance, "on_session_close"): + try: + close_result = backend_instance.on_session_close(session_state) + except Exception as e: + print(f"[Server] Error in on_session_close: {e}") + import traceback + + traceback.print_exc() + + # Now delete the session + session_manager.delete_session(session_id) + + return {"success": True, "session_id": session_id, **close_result} + + @app.get("/health") + async def 
health(): + """Health check endpoint.""" + if backend_instance is None: + return JSONResponse(status_code=503, content={"status": "not_ready", "error": "Backend not initialized"}) + + health_info = backend_instance.health_check() + health_info["status"] = "healthy" if backend_instance.is_loaded else "not_ready" + health_info["timestamp"] = datetime.now().isoformat() + + return health_info + + @app.get("/v1/models") + async def list_models(): + """OpenAI-compatible models endpoint.""" + model_id = server_config.get("model_path", "unknown") if server_config else "unknown" + return { + "object": "list", + "data": [ + { + "id": model_id, + "object": "model", + "created": int(time.time()), + "owned_by": "nvidia", + } + ], + } + + @app.post("/v1/chat/completions") + async def chat_completions(request: Dict[str, Any]): + """OpenAI-compatible chat completions endpoint with audio support. + + Accepts messages in OpenAI format with audio_url for audio content: + { + "messages": [ + {"role": "system", "content": "You are a helpful assistant."}, + {"role": "user", "content": [ + {"type": "text", "text": "..."}, + {"type": "audio_url", "audio_url": {"url": "data:audio/wav;base64,..."}} + ]} + ], + "max_tokens": 512, + "temperature": 1.0, + "extra_body": {"session_id": "optional-session-id"} + } + """ + if backend_instance is None or not backend_instance.is_loaded: + raise HTTPException(status_code=503, detail="Model not loaded") + + try: + messages = request.get("messages", []) + if not messages: + raise HTTPException(status_code=400, detail="No messages provided") + + # Extract session_id from extra_body (for s2s_session backend) + extra_body = request.get("extra_body", {}) + session_id = extra_body.get("session_id") if isinstance(extra_body, dict) else None + + # Extract components from messages + audio_bytes_list = extract_audio_from_messages(messages) + text = extract_text_from_messages(messages) + system_prompt = extract_system_prompt(messages) + + # Honor 
ignore_system_prompt setting + if server_config.get("ignore_system_prompt", False): + system_prompt = None + + # Get generation parameters + max_tokens = request.get("max_tokens", MAX_NEW_TOKENS) + temperature = request.get("temperature", TEMPERATURE) + top_p = request.get("top_p", TOP_P) + seed = request.get("seed") + + # Create generation request + # For s2s_session: use last audio only (current turn) - history is in KV cache + # For other backends: use audio_bytes_list for multi-turn support + if backend_type == "s2s_session": + # Session backend only needs current turn's audio (last in list) + current_audio = audio_bytes_list[-1] if audio_bytes_list else None + gen_request = GenerationRequest( + text=text if text else None, + system_prompt=system_prompt, + audio_bytes=current_audio, + max_new_tokens=max_tokens, + temperature=temperature, + top_p=top_p, + seed=seed, + request_id=hashlib.md5(f"{time.time()}".encode()).hexdigest()[:8], + ) + else: + gen_request = GenerationRequest( + text=text if text else None, + system_prompt=system_prompt, + audio_bytes=audio_bytes_list[0] if len(audio_bytes_list) == 1 else None, + audio_bytes_list=audio_bytes_list if len(audio_bytes_list) > 1 else None, + max_new_tokens=max_tokens, + temperature=temperature, + top_p=top_p, + seed=seed, + request_id=hashlib.md5(f"{time.time()}".encode()).hexdigest()[:8], + ) + + # Validate request + error = backend_instance.validate_request(gen_request) + if error: + raise HTTPException(status_code=400, detail=error) + + # Handle s2s_session backend with session support + if backend_type == "s2s_session" and session_manager is not None: + # Compute session IDs from message history (OpenAI API compatible) + # restore_session_id: hash of messages before last (assistant, user) pair + # save_session_id: hash of all messages (full conversation) + non_system_messages = [m for m in messages if m.get("role") != "system"] + + if len(non_system_messages) <= 1: + # First turn: no previous session to 
restore + restore_session_id = None + else: + # Multi-turn: restore from previous conversation state + # messages[:-2] excludes last assistant + last user + prefix_messages = messages[:-2] if len(messages) >= 2 else [] + restore_session_id = compute_session_hash(prefix_messages) if prefix_messages else None + + # Session to save after this turn (full conversation) + save_session_id = compute_session_hash(messages) + + # Use client-provided session_id if available, otherwise use computed one + effective_restore_id = session_id if session_id else restore_session_id + + # Get or create session + session_state = session_manager.get_or_create_session(effective_restore_id) + + # Run inference with session in thread pool + loop = asyncio.get_event_loop() + result, updated_session = await loop.run_in_executor( + None, + backend_instance.generate_with_session, + gen_request, + session_state, + ) + + # Save updated session state under the new hash (full conversation) + if updated_session is not None: + session_manager.save_session(save_session_id, updated_session) + session_id = save_session_id + else: + # Process through batcher (non-session path) + result = await request_batcher.add_request(gen_request) + session_id = None + + if not result.is_success(): + raise HTTPException(status_code=500, detail=result.error) + + # Build OpenAI-compatible response + response_id = f"chatcmpl-{hashlib.md5(str(time.time()).encode()).hexdigest()[:8]}" + + # Build message content + message_content = result.text or "" + + # Save outputs to files before sending response (in case client times out) + import json as json_lib + import os + from datetime import datetime + + save_dir = os.environ.get( + "AUDIO_SAVE_DIR", "/lustre/fsw/portfolios/llmservice/users/vmendelev/tmp/voicebench_test" + ) + os.makedirs(save_dir, exist_ok=True) + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + base_filename = f"response_{timestamp}_{response_id}" + + saved_audio_path = None + saved_json_path = None + + 
# Save JSON with text and debug info + try: + saved_json_path = os.path.join(save_dir, f"{base_filename}.json") + json_output = { + "response_id": response_id, + "timestamp": timestamp, + "text": message_content, + "asr_text": result.asr_text, + "debug_info": result.debug_info, + "generation_time_ms": result.generation_time_ms, + "num_tokens_generated": result.num_tokens_generated, + } + with open(saved_json_path, "w") as f: + json_lib.dump(json_output, f, indent=2) + print(f"[Server] JSON saved to: {saved_json_path}") + except Exception as e: + print(f"[Server] Warning: Failed to save JSON: {e}") + + # Include audio output if available (base64 encoded) + audio_output = None + if result.audio_bytes: + # Save audio file + try: + saved_audio_path = os.path.join(save_dir, f"{base_filename}.wav") + with open(saved_audio_path, "wb") as f: + f.write(result.audio_bytes) + print(f"[Server] Audio saved to: {saved_audio_path} ({len(result.audio_bytes)} bytes)") + except Exception as e: + print(f"[Server] Warning: Failed to save audio: {e}") + + audio_output = { + "data": base64.b64encode(result.audio_bytes).decode("utf-8"), + "format": result.audio_format or "wav", + "sample_rate": result.audio_sample_rate, + "expires_at": int(time.time()) + 3600, # 1 hour expiry + "transcript": result.text or "", # Text transcript of the audio + } + + # Embed debug_info in content as JSON (OpenAI-compatible) + final_content = message_content + if result.debug_info and INCLUDE_DEBUG_INFO: + final_content = f"{message_content}\n{json.dumps(result.debug_info)}" + + response = { + "id": response_id, + "object": "chat.completion", + "created": int(time.time()), + "model": server_config.get("model_path"), + "choices": [ + { + "index": 0, + "message": { + "role": "assistant", + "content": final_content, + }, + "finish_reason": "stop", + } + ], + "usage": { + "prompt_tokens": -1, + "completion_tokens": result.num_tokens_generated or -1, + "total_tokens": -1, + }, + } + + # Add audio to response if 
available + if audio_output: + response["choices"][0]["message"]["audio"] = audio_output + + # Add ASR text (user speech transcription) if available + if result.asr_text is not None: + response["choices"][0]["message"]["asr_text"] = result.asr_text + + # Add debug info at top level too (for non-litellm clients) + if result.debug_info and INCLUDE_DEBUG_INFO: + response["debug_info"] = result.debug_info + + # Add saved file paths if available + if saved_audio_path: + response["saved_audio_path"] = saved_audio_path + if saved_json_path: + response["saved_json_path"] = saved_json_path + + # Add session_id for session-aware backends + if session_id: + response["session_id"] = session_id + + return response + + except HTTPException: + raise + except Exception as e: + import traceback + + traceback.print_exc() + raise HTTPException(status_code=500, detail=str(e)) + + return app + + +def main(): + """Run the server from command line.""" + import argparse + + parser = argparse.ArgumentParser(description="Unified NeMo Inference Server") + parser.add_argument( + "--backend", + default=BACKEND_TYPE, + choices=["salm", "tts", "s2s", "s2s_incremental", "s2s_session"], + help="Backend type to use", + ) + parser.add_argument("--model", default=MODEL_PATH, help="Path to model") + parser.add_argument("--codec_model", default=CODEC_MODEL_PATH, help="Path to codec model (for TTS/S2S)") + parser.add_argument("--host", default=HOST, help="Server host") + parser.add_argument("--port", type=int, default=PORT, help="Server port") + parser.add_argument("--batch_size", type=int, default=BATCH_SIZE, help="Batch size") + parser.add_argument( + "--batch_timeout", type=float, default=BATCH_TIMEOUT, help="Batch timeout in seconds (0 for no delay)" + ) + parser.add_argument("--device", default="cuda", help="Device to use") + parser.add_argument("--dtype", default="bfloat16", help="Model dtype") + parser.add_argument("--debug", action="store_true", help="Enable debug mode") + + # Backend-specific 
arguments + parser.add_argument("--prompt_format", default=None, help="Prompt format (SALM)") + parser.add_argument("--phoneme_input_type", default="predicted", help="Phoneme input type (TTS)") + parser.add_argument("--decoder_only_model", action="store_true", help="Use decoder-only model (TTS)") + parser.add_argument( + "--ignore_system_prompt", + action="store_true", + help="Ignore system prompts from requests (for models that don't support them)", + ) + parser.add_argument( + "--silence_padding_sec", + type=float, + default=5.0, + help="Seconds of silence to append after audio (S2S backend)", + ) + + # S2S Incremental backend arguments + parser.add_argument( + "--config_path", + type=str, + default=None, + help="Path to YAML config file (s2s_incremental backend)", + ) + parser.add_argument( + "--llm_checkpoint_path", + type=str, + default=None, + help="Path to LLM checkpoint (s2s_incremental backend)", + ) + parser.add_argument( + "--tts_checkpoint_path", + type=str, + default=None, + help="Path to TTS checkpoint (s2s_incremental backend)", + ) + parser.add_argument( + "--speaker_reference", + type=str, + default=None, + help="Path to speaker reference audio for TTS (s2s_incremental backend)", + ) + parser.add_argument( + "--num_frames_per_inference", + type=int, + default=1, + help="Frames per inference step (s2s_incremental backend)", + ) + parser.add_argument( + "--decode_audio", + action="store_true", + default=True, + help="Enable audio output via TTS (s2s_incremental backend)", + ) + parser.add_argument( + "--no_decode_audio", + action="store_true", + help="Disable audio output (s2s_incremental backend)", + ) + + args = parser.parse_args() + + if args.debug: + global DEBUG + DEBUG = True + + # Build extra config from backend-specific args + extra_config = {} + if args.prompt_format: + extra_config["prompt_format"] = args.prompt_format + if args.phoneme_input_type: + extra_config["phoneme_input_type"] = args.phoneme_input_type + if args.decoder_only_model: 
+ extra_config["decoder_only_model"] = True + if args.silence_padding_sec != 5.0: # Only add if different from default + extra_config["silence_padding_sec"] = args.silence_padding_sec + extra_config["ignore_system_prompt"] = args.ignore_system_prompt + + # S2S Incremental backend config + if args.config_path: + extra_config["config_path"] = args.config_path + if args.llm_checkpoint_path: + extra_config["llm_checkpoint_path"] = args.llm_checkpoint_path + if args.tts_checkpoint_path: + extra_config["tts_checkpoint_path"] = args.tts_checkpoint_path + if args.speaker_reference: + extra_config["speaker_reference"] = args.speaker_reference + if args.num_frames_per_inference != 1: + extra_config["num_frames_per_inference"] = args.num_frames_per_inference + if args.no_decode_audio: + extra_config["decode_audio"] = False + elif args.decode_audio: + extra_config["decode_audio"] = True + + app = create_app( + backend_type=args.backend, + model_path=args.model, + codec_model_path=args.codec_model, + batch_size=args.batch_size, + batch_timeout=args.batch_timeout, + device=args.device, + dtype=args.dtype, + extra_config=extra_config if extra_config else None, + ) + + uvicorn.run(app, host=args.host, port=args.port, log_level="info") + + +if __name__ == "__main__": + main() diff --git a/tests/gpu-tests/test_eval.py b/tests/gpu-tests/test_eval.py index 47060a1368..ae7a6a4b7e 100644 --- a/tests/gpu-tests/test_eval.py +++ b/tests/gpu-tests/test_eval.py @@ -346,3 +346,101 @@ def test_megatron_eval(): # TODO: something is broken in megatron inference here as this should be 50! 
assert metrics["symbolic_correct"] >= 40 assert metrics["num_entries"] == 5 + + +@pytest.mark.gpu +def test_prepare_and_eval_all_datasets(): + model_path = require_env_var("NEMO_SKILLS_TEST_HF_MODEL") + model_type = require_env_var("NEMO_SKILLS_TEST_MODEL_TYPE") + + config_dir = Path(__file__).absolute().parent + datasets_dir = Path(__file__).absolute().parents[2] / "nemo_skills" / "dataset" + # not testing datasets that don't support max_samples, require explicit parameters or are very heavy to prepare + excluded_datasets = { + "__pycache__", + "ruler", + "bigcodebench", + "livecodebench", + "livebench_coding", + "livecodebench-pro", + "livecodebench-cpp", + "ioi24", + "ioi25", + "bfcl_v3", + "bfcl_v4", + "swe-bench", + "aai", + "human-eval", + "human-eval-infilling", + "mbpp", + "mmau-pro", + } + + dataset_names = sorted( + dataset.name + for dataset in datasets_dir.iterdir() + if dataset.is_dir() and (dataset / "prepare.py").exists() and dataset.name not in excluded_datasets + ) + + assert dataset_names, "No datasets found to prepare and evaluate" + + judge_datasets = [] + for dataset in dataset_names: + dataset_module = import_module(f"nemo_skills.dataset.{dataset}") + # Check if JUDGE_PIPELINE_ARGS exists (even if empty dict, which is falsy) + if hasattr(dataset_module, "JUDGE_PIPELINE_ARGS"): + judge_datasets.append(dataset) + + non_judge_datasets = [dataset for dataset in dataset_names if dataset not in judge_datasets] + + data_dir = Path(f"/tmp/nemo-skills-tests/{model_type}/data") + docker_rm([str(data_dir)]) + + prepare_data( + ctx=wrap_arguments(" ".join(dataset_names)), + cluster="test-local", + config_dir=str(config_dir), + data_dir=str(data_dir), + expname=f"prepare-all-datasets-{model_type}", + ) + + eval_kwargs = dict( + cluster="test-local", + config_dir=str(config_dir), + data_dir=str(data_dir), + model=model_path, + server_type="sglang", + server_gpus=1, + server_nodes=1, + auto_summarize_results=False, + ) + + common_ctx = "++max_samples=2 
++inference.tokens_to_generate=100 ++server.enable_soft_fail=True " + + output_dir = f"/tmp/nemo-skills-tests/{model_type}/all-datasets-eval" + docker_rm([output_dir]) + eval( + ctx=wrap_arguments(common_ctx), + output_dir=output_dir, + benchmarks=",".join(non_judge_datasets), + expname=f"eval-all-datasets-{model_type}", + **eval_kwargs, + ) + + run_cmd( + ctx=wrap_arguments(f"python -m nemo_skills.pipeline.summarize_results {output_dir}"), + cluster="test-local", + config_dir=str(config_dir), + ) + + eval_results_dir = Path(output_dir) / "eval-results" + metrics_path = eval_results_dir / "metrics.json" + assert metrics_path.exists(), "Missing aggregated metrics file" + with metrics_path.open() as f: + metrics = json.load(f) + + for dataset in non_judge_datasets: + assert dataset in metrics, f"Missing metrics for {dataset}" + + # TODO: add same for judge_datasets after generate supports num_jobs + # (otherwise it starts judge every time and takes forever) diff --git a/tests/gpu-tests/test_vllm_audio.py b/tests/gpu-tests/test_vllm_audio.py new file mode 100644 index 0000000000..8183adaa80 --- /dev/null +++ b/tests/gpu-tests/test_vllm_audio.py @@ -0,0 +1,84 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import json +import shutil +import subprocess +import tempfile +from pathlib import Path + +import pytest +from utils import require_env_var + + +@pytest.mark.gpu +def test_vllm_audio_generation(): + """Integration test: Generate with vLLM server using audio input.""" + model_path = require_env_var("NEMO_SKILLS_TEST_HF_MODEL") + model_type = require_env_var("NEMO_SKILLS_TEST_MODEL_TYPE") + + output_dir = f"/tmp/nemo-skills-tests/{model_type}/vllm-audio-generation" + # Clean up output directory + if Path(output_dir).exists(): + shutil.rmtree(output_dir) + + # Create test input file with audio + with tempfile.NamedTemporaryFile(mode='w', suffix='.jsonl', delete=False) as f: + test_data = [ + { + "problem": "Transcribe this audio", + "audio": {"path": "/nemo_run/code/tests/slurm-tests/asr_nim/wavs/t2_16.wav"}, + }, + { + "problem": "What is in this audio?", + "audio": {"path": "/nemo_run/code/tests/slurm-tests/asr_nim/wavs/t3_16.wav"}, + }, + ] + for item in test_data: + f.write(json.dumps(item) + '\n') + input_file = f.name + + try: + cmd = ( + f"ns generate " + f" --cluster test-local --config_dir {Path(__file__).absolute().parent} " + f" --model {model_path} " + f" --output_dir {output_dir} " + f" --server_type vllm " + f" --server_gpus 1 " + f" --server_nodes 1 " + f" --server_args '--enforce-eager' " + f" --input_file={input_file} " + f" ++prompt_config=openai " + f" ++skip_filled=False " + ) + subprocess.run(cmd, shell=True, check=True) + + # Verify output exists and has audio-related generation + with open(f"{output_dir}/output.jsonl") as fin: + lines = fin.readlines() + + assert len(lines) == 2, "Should have 2 output lines" + + for line in lines: + data = json.loads(line) + assert "generation" in data, "Should have generation field" + assert len(data["generation"]) > 0, "Generation should not be empty" + # If model supports audio, generation should contain something + print(f"Generated: {data['generation']}") + + finally: + # Cleanup temp file + 
Path(input_file).unlink(missing_ok=True) + diff --git a/tests/test_vllm_audio.py b/tests/test_vllm_audio.py new file mode 100644 index 0000000000..56bee85aa2 --- /dev/null +++ b/tests/test_vllm_audio.py @@ -0,0 +1,156 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import base64 +import os +import tempfile +from unittest.mock import AsyncMock, patch + +import pytest + +from nemo_skills.inference.model.vllm import VLLMModel, audio_file_to_base64 + + +# ----------------------- +# Unit tests - no server required +# ----------------------- + +def test_audio_file_to_base64(): + """Test basic audio file encoding to base64.""" + with tempfile.NamedTemporaryFile(mode='wb', suffix='.wav', delete=False) as f: + test_content = b'RIFF' + b'\x00' * 100 + f.write(test_content) + temp_path = f.name + + try: + result = audio_file_to_base64(temp_path) + assert isinstance(result, str) + assert len(result) > 0 + decoded = base64.b64decode(result) + assert decoded == test_content + finally: + os.unlink(temp_path) + + +@pytest.fixture +def vllm_model(tmp_path): + """Create a VLLMModel instance for testing.""" + audio_dir = tmp_path / "audio" + audio_dir.mkdir() + model = VLLMModel(model="test-model", data_dir=str(tmp_path), base_url="http://localhost:5000") + return model + + +def test_content_text_to_list_with_audio(vllm_model, tmp_path): + """Test converting string content with audio to list format.""" + audio_path = 
tmp_path / "audio" / "test.wav" + audio_path.parent.mkdir(exist_ok=True) + with open(audio_path, 'wb') as f: + f.write(b'RIFF' + b'\x00' * 100) + + message = {"role": "user", "content": "Describe this audio", "audio": {"path": "audio/test.wav"}} + + result = vllm_model.content_text_to_list(message) + + assert isinstance(result["content"], list) + assert len(result["content"]) == 2 + assert result["content"][0]["type"] == "text" + assert result["content"][1]["type"] == "audio_url" + assert result["content"][1]["audio_url"]["url"].startswith("data:audio/wav;base64,") + + +def test_content_text_to_list_with_multiple_audios(vllm_model, tmp_path): + """Test handling message with multiple audio files.""" + audio_dir = tmp_path / "audio" + audio_dir.mkdir(exist_ok=True) + + for i in range(2): + with open(audio_dir / f"test_{i}.wav", 'wb') as f: + f.write(b'RIFF' + b'\x00' * 100) + + message = { + "role": "user", + "content": "Compare these", + "audios": [{"path": "audio/test_0.wav"}, {"path": "audio/test_1.wav"}], + } + + result = vllm_model.content_text_to_list(message) + + assert isinstance(result["content"], list) + assert len(result["content"]) == 3 + assert result["content"][0]["type"] == "text" + assert result["content"][1]["type"] == "audio_url" + assert result["content"][2]["type"] == "audio_url" + + +# ----------------------- +# Request building tests with audio +# ----------------------- + +def test_build_chat_request_with_audio(tmp_path, vllm_model): + """Test that chat request params are correctly built with audio content.""" + # Create audio file + audio_path = tmp_path / "audio" / "test.wav" + audio_path.parent.mkdir(exist_ok=True) + with open(audio_path, 'wb') as f: + f.write(b'RIFF' + b'\x00' * 100) + + messages = [{"role": "user", "content": "Test audio", "audio": {"path": "audio/test.wav"}}] + + # Build request params - this doesn't make any network calls + params = vllm_model._build_chat_request_params(messages=messages, stream=False, 
tokens_to_generate=10) + + # Validate request structure + assert "messages" in params + assert len(params["messages"]) == 1 + content_items = params["messages"][0]["content"] + assert isinstance(content_items, list) + assert len(content_items) == 2 + assert content_items[0]["type"] == "text" + assert content_items[1]["type"] == "audio_url" + + # Verify base64 encoding is valid + audio_url = content_items[1]["audio_url"]["url"] + assert audio_url.startswith("data:audio/wav;base64,") + audio_b64 = audio_url.split(",", 1)[1] + decoded = base64.b64decode(audio_b64) + assert decoded.startswith(b'RIFF') + + +@pytest.mark.asyncio +async def test_generate_with_audio_mocked_response(tmp_path, vllm_model): + """Test generate_async with audio by mocking the response (no real server call).""" + # Create audio file + audio_path = tmp_path / "audio" / "test.wav" + audio_path.parent.mkdir(exist_ok=True) + with open(audio_path, 'wb') as f: + f.write(b'RIFF' + b'\x00' * 100) + + messages = [{"role": "user", "content": "Describe this audio", "audio": {"path": "audio/test.wav"}}] + + # Mock the entire generate_async method - no actual API call made + mock_response = {"generation": "This audio contains speech", "num_generated_tokens": 5} + + with patch.object(vllm_model, "generate_async", new_callable=AsyncMock) as mock_generate: + mock_generate.return_value = mock_response + + # Call the mocked method + response = await vllm_model.generate_async(prompt=messages, tokens_to_generate=50, temperature=0.0) + + # Verify the mock was called correctly + assert response["generation"] == "This audio contains speech" + assert response["num_generated_tokens"] == 5 + mock_generate.assert_awaited_once() + +