From 99ef50a38c99243823badbd01b0e7c2b602d5928 Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Thu, 13 Nov 2025 12:18:31 -0800 Subject: [PATCH 01/26] Added audio requests to vLLM models --- .../dataset/mmau-pro/closed_form/__init__.py | 1 + .../dataset/mmau-pro/open_ended/__init__.py | 2 +- nemo_skills/dataset/mmau-pro/prepare.py | 23 ++- .../evaluation/metrics/mmau_pro_metrics.py | 118 +++++++++++-- nemo_skills/inference/generate.py | 17 ++ nemo_skills/inference/model/vllm.py | 35 +++- nemo_skills/prompt/config/judge/mmau-pro.yaml | 30 ++++ nemo_skills/prompt/config/judge/speechlm.yaml | 28 ---- tests/gpu-tests/test_eval.py | 98 +++++++++++ tests/gpu-tests/test_vllm_audio.py | 84 ++++++++++ tests/test_vllm_audio.py | 156 ++++++++++++++++++ 11 files changed, 543 insertions(+), 49 deletions(-) create mode 100644 nemo_skills/prompt/config/judge/mmau-pro.yaml delete mode 100644 nemo_skills/prompt/config/judge/speechlm.yaml create mode 100644 tests/gpu-tests/test_vllm_audio.py create mode 100644 tests/test_vllm_audio.py diff --git a/nemo_skills/dataset/mmau-pro/closed_form/__init__.py b/nemo_skills/dataset/mmau-pro/closed_form/__init__.py index 4e3b424d84..4390c1d887 100644 --- a/nemo_skills/dataset/mmau-pro/closed_form/__init__.py +++ b/nemo_skills/dataset/mmau-pro/closed_form/__init__.py @@ -16,6 +16,7 @@ METRICS_TYPE = "mmau_pro_closed_form" SCORE_MODULE = "nemo_skills.evaluation.metrics.mmau_pro_metrics" GENERATION_ARGS = "++prompt_format=openai" +EVAL_ARGS = "++eval_type=mmau-pro" # NVEmbed judge configuration for closed-form evaluation JUDGE_PIPELINE_ARGS = { diff --git a/nemo_skills/dataset/mmau-pro/open_ended/__init__.py b/nemo_skills/dataset/mmau-pro/open_ended/__init__.py index 22773d6fed..c5f09272d2 100644 --- a/nemo_skills/dataset/mmau-pro/open_ended/__init__.py +++ b/nemo_skills/dataset/mmau-pro/open_ended/__init__.py @@ -23,4 +23,4 @@ "server_type": "openai", "server_address": "https://integrate.api.nvidia.com/v1", } -JUDGE_ARGS = "++prompt_config=judge/speechlm ++generation_key=judgement" +JUDGE_ARGS = "++prompt_config=judge/mmau-pro ++generation_key=judgement" diff --git a/nemo_skills/dataset/mmau-pro/prepare.py b/nemo_skills/dataset/mmau-pro/prepare.py index a6f04d621b..0ea66ec2b7 100644 --- a/nemo_skills/dataset/mmau-pro/prepare.py +++ b/nemo_skills/dataset/mmau-pro/prepare.py @@ -75,8 +75,8 @@ def format_entry(entry, with_audio=False): if category == "open": content = entry["question"] elif choices and len(choices) > 1: - options_text = "\n".join(f"{chr(65 + i)}. {choice}" for i, choice in enumerate(choices)) - content = f"{entry['question']}\n\n{options_text}" + options_text = "\n".join(f"{chr(65 + i)}) {choice}" for i, choice in enumerate(choices)) + content = f"{entry['question']}\n\n{options_text}\n\nRespond with the complete text of the correct option, not just the letter." 
else: content = entry["question"] @@ -84,13 +84,18 @@ def format_entry(entry, with_audio=False): if entry.get("audio_path"): audio_path = entry["audio_path"] - - if isinstance(audio_path, list) and audio_path: - user_message["audios"] = [{"path": path, "duration": 10.0} for path in audio_path] - elif isinstance(audio_path, str): - user_message["audio"] = {"path": audio_path, "duration": 10.0} - - formatted_entry["messages"] = [user_message] + # Prepend /dataset/mmau-pro/ to make paths absolute for cluster + if len(audio_path) == 1: + user_message["audio"] = {"path": f"/dataset/mmau-pro/{audio_path[0]}"} + else: + user_message["audios"] = [{"path": f"/dataset/mmau-pro/{path}"} for path in audio_path] + + # Don't use /no_think for open-ended questions to allow reasoning + system_content = "You are a helpful assistant." + if category != "open": + system_content += " /no_think" + + formatted_entry["messages"] = [{"role": "system", "content": system_content}, user_message] return formatted_entry diff --git a/nemo_skills/evaluation/metrics/mmau_pro_metrics.py b/nemo_skills/evaluation/metrics/mmau_pro_metrics.py index f079049cc1..000dbcf13f 100644 --- a/nemo_skills/evaluation/metrics/mmau_pro_metrics.py +++ b/nemo_skills/evaluation/metrics/mmau_pro_metrics.py @@ -13,14 +13,52 @@ # limitations under the License. import logging +import re + +import numpy as np from nemo_skills.evaluation.metrics.base import BaseMetrics, as_int, as_percentage -from nemo_skills.evaluation.metrics.utils import is_correct_judgement from nemo_skills.utils import get_logger_name LOG = logging.getLogger(get_logger_name(__file__)) +def extract_multicriteria_scores(judgement_text: str) -> dict[str, float]: + """Extract multi-criteria scores (1-5 scale) from LLM judge evaluation. + + Expected format: + CORRECTNESS: [score] - [justification] + RELEVANCE: [score] - [justification] + COMPLETENESS: [score] - [justification] + CLARITY: [score] - [justification] + OVERALL: [score] - [overall assessment] + + Returns: + Dictionary with keys: correctness, relevance, completeness, clarity, overall + Defaults to 3.0 if score not found. 
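+
+    Example (illustrative; criteria missing from the judge output fall back to 3.0):
+        "CORRECTNESS: 4 - Accurate.\nOVERALL: 4 - Good." ->
+        {"correctness": 4.0, "relevance": 3.0, "completeness": 3.0, "clarity": 3.0, "overall": 4.0}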
+ """ + scores = {} + + patterns = { + "correctness": r"CORRECTNESS:\s*(\d+(?:\.\d+)?)", + "relevance": r"RELEVANCE:\s*(\d+(?:\.\d+)?)", + "completeness": r"COMPLETENESS:\s*(\d+(?:\.\d+)?)", + "clarity": r"CLARITY:\s*(\d+(?:\.\d+)?)", + "overall": r"OVERALL:\s*(\d+(?:\.\d+)?)", + } + + for criterion, pattern in patterns.items(): + match = re.search(pattern, judgement_text, re.IGNORECASE) + scores[criterion] = float(match.group(1)) if match else 3.0 + + # Fallback: compute overall if missing or still 3.0 + if "overall" not in scores or scores["overall"] == 3.0: + criteria_scores = [scores.get(k, 3.0) for k in ["correctness", "relevance", "completeness", "clarity"]] + scores["overall"] = sum(criteria_scores) / len(criteria_scores) + + return scores + + class MMAUProMetrics(BaseMetrics): """Metrics class for MMAU-Pro benchmark (all subgroups).""" @@ -28,16 +66,24 @@ def __init__(self, compute_no_answer: bool = True, max_k: int = 1): super().__init__(compute_no_answer=compute_no_answer) self.max_k = max_k + # Track multi-criteria scores for open-ended questions (1-5 scale) + self.multicriteria_scores = { + "correctness": [], + "relevance": [], + "completeness": [], + "clarity": [], + "overall": [], + } + def _get_score_dict(self, prediction: dict) -> dict[str, bool | int | float]: """Extract correctness scores from prediction.""" score_dict = {} - # Open-ended: extract from judge result + # Open-ended: use LLM judge correctness score >= 3 as correct if "judgement" in prediction: - judge_result = is_correct_judgement(prediction["judgement"]) - score_dict["judge_correct"] = judge_result - score_dict["correct"] = judge_result - # Closed-form and instruction following: use is_correct + multicriteria = extract_multicriteria_scores(prediction["judgement"]) + score_dict["correct"] = multicriteria.get("correctness", 3.0) >= 3.0 + # Closed-form / instruction-following: use binary correctness elif "is_correct" in prediction: score_dict["correct"] = prediction["is_correct"] else: @@ -58,24 +104,61 @@ def get_incorrect_sample(self, prediction: dict) -> dict: def update(self, predictions): """Update metrics with new predictions.""" super().update(predictions) + predicted_answers = [pred.get("generation", None).strip() or None for pred in predictions] self._compute_pass_at_k(predictions=predictions, predicted_answers=predicted_answers) self._compute_majority_at_k(predictions=predictions, predicted_answers=predicted_answers) + # Collect multi-criteria scores for open-ended questions + for pred in predictions: + if "judgement" in pred: + multicriteria = extract_multicriteria_scores(pred["judgement"]) + for criterion in self.multicriteria_scores: + self.multicriteria_scores[criterion].append(multicriteria.get(criterion, 3.0)) + def get_metrics(self): """Get computed metrics.""" metrics_dict = super().get_metrics() + for agg_mode, agg_metrics in metrics_dict.items(): - # Ensure avg_tokens is always present for MMAU-Pro + # Ensure avg_tokens is present if "avg_tokens" not in agg_metrics: agg_metrics["avg_tokens"] = 0 if "no_answer" in agg_metrics: agg_metrics["no_answer"] = agg_metrics["no_answer"] / 2.0 - # Set success_rate from correct or judge_correct - if "judge_correct" in agg_metrics: - agg_metrics["success_rate"] = agg_metrics["judge_correct"] + + # Add multi-criteria averages for open-ended (convert 1-5 scale to percentage) + if self.multicriteria_scores["overall"]: + for criterion in self.multicriteria_scores: + scores = self.multicriteria_scores[criterion] + if scores: + # Convert 1-5 scale to 0-100 
percentage scale + avg_score = np.mean(scores) + std_score = np.std(scores) + agg_metrics[f"avg_{criterion}"] = (avg_score / 5.0) * 100 + agg_metrics[f"std_{criterion}"] = (std_score / 5.0) * 100 + + # Set correct and success_rate to avg_correctness for open-ended + agg_metrics["correct"] = agg_metrics["avg_correctness"] + agg_metrics["success_rate"] = agg_metrics["avg_correctness"] + + # Calculate good/poor response rates based on overall >= 4 or <= 2 + overall_scores = self.multicriteria_scores["overall"] + good_responses = sum(1 for score in overall_scores if score >= 4.0) + poor_responses = sum(1 for score in overall_scores if score <= 2.0) + + agg_metrics["good_response_rate"] = (good_responses / len(overall_scores)) * 100 + agg_metrics["poor_response_rate"] = (poor_responses / len(overall_scores)) * 100 + + # For closed-form / instruction-following: use binary correctness elif "correct" in agg_metrics: agg_metrics["success_rate"] = agg_metrics["correct"] + + # Round all numeric values to 2 decimal places + for key, value in agg_metrics.items(): + if isinstance(value, float) and not isinstance(value, bool): + agg_metrics[key] = round(value, 2) + return metrics_dict def metrics_to_print(self): @@ -87,5 +170,20 @@ def metrics_to_print(self): } if self.compute_no_answer: base_metrics["no_answer"] = as_percentage + + # Add multi-criteria metrics for open-ended questions (now in percentage format) + if self.multicriteria_scores["overall"]: + base_metrics.update( + { + "avg_overall": as_percentage, + "avg_correctness": as_percentage, + "avg_relevance": as_percentage, + "avg_completeness": as_percentage, + "avg_clarity": as_percentage, + "good_response_rate": as_percentage, + "poor_response_rate": as_percentage, + } + ) + base_metrics["num_entries"] = as_int return base_metrics diff --git a/nemo_skills/inference/generate.py b/nemo_skills/inference/generate.py index 136375db46..87151a66d6 100644 --- a/nemo_skills/inference/generate.py +++ b/nemo_skills/inference/generate.py @@ -399,6 +399,10 @@ def setup_prompt(self): def setup_llm(self): self.sandbox = get_sandbox(**self.cfg.sandbox) if self.cfg.sandbox is not None else None + self.data_dir = None + if "data_dir" in self.cfg.eval_config and not isinstance(self.cfg.eval_config.get("data_dir"), type(None)): + self.data_dir = self.cfg.eval_config["data_dir"] + if self.cfg.code_execution: llm = get_code_execution_model(**self.cfg.server, tokenizer=self.tokenizer, sandbox=self.sandbox) elif self.cfg.tool_modules is not None: @@ -545,6 +549,16 @@ def dump_outputs(self, outputs, data_points, fout): for output in outputs: fout.write(json.dumps(output) + "\n") + def drop_binary_data(self, output): + """Remove binary data (like base64 audio) from messages to keep output files smaller.""" + for message in output["messages"]: + # Skip if content is not a list (e.g., string content in system messages) + if not isinstance(message.get("content"), list): + continue + + # Filter out audio_url items from list-style content + message["content"] = [content for content in message["content"] if content.get("type") != "audio_url"] + async def postprocess_single_output(self, output, original_data_point): # to make it easier to follow up with other generations and limit accidental errors, we are adding # all of the original data to the output file alongside the new generations @@ -560,6 +574,9 @@ async def postprocess_single_output(self, output, original_data_point): for key in output: original_data_point.pop(key, None) output.update(original_data_point) + + 
self.drop_binary_data(output) + if self.cfg.parse_reasoning: parse_reasoning( output, diff --git a/nemo_skills/inference/model/vllm.py b/nemo_skills/inference/model/vllm.py index e9a2146520..cff46cf0e6 100644 --- a/nemo_skills/inference/model/vllm.py +++ b/nemo_skills/inference/model/vllm.py @@ -12,7 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. +import base64 import logging +import os import requests @@ -24,8 +26,16 @@ LOG = logging.getLogger(get_logger_name(__file__)) +def audio_file_to_base64(audio_file_path: str): + """Encodes an audio file into a base64 string.""" + with open(audio_file_path, "rb") as audio_file: + audio_content = audio_file.read() + return base64.b64encode(audio_content).decode("utf-8") + + class VLLMModel(BaseModel): - def __init__(self, **kwargs): + def __init__(self, data_dir: str = "", **kwargs): + self.data_dir = data_dir super().__init__(**kwargs) def _get_tokenizer_endpoint(self): @@ -99,6 +109,28 @@ def _build_completion_request_params( "extra_body": self._build_request_body(top_k, min_p, repetition_penalty, extra_body=extra_body), } + def content_text_to_list(self, message): + if "audio" in message or "audios" in message: + content = message["content"] + if isinstance(content, str): + message["content"] = [{"type": "text", "text": content}] + elif isinstance(content, list): + message["content"] = content + else: + raise TypeError(str(content)) + + if "audio" in message: + audio = message["audio"] + base64_audio = audio_file_to_base64(os.path.join(self.data_dir, audio["path"])) + audio_message = {"type": "audio_url", "audio_url": {"url": f"data:audio/wav;base64,{base64_audio}"}} + message["content"].append(audio_message) + elif "audios" in message: + for audio in message["audios"]: + base64_audio = audio_file_to_base64(os.path.join(self.data_dir, audio["path"])) + audio_message = {"type": "audio_url", "audio_url": {"url": f"data:audio/wav;base64,{base64_audio}"}} + message["content"].append(audio_message) + return message + def _build_chat_request_params( self, messages: list[dict], @@ -117,6 +149,7 @@ def _build_chat_request_params( tools: list[dict] | None = None, extra_body: dict = None, ) -> dict: + messages = [self.content_text_to_list(message) for message in messages] request = { "messages": messages, "max_tokens": tokens_to_generate, diff --git a/nemo_skills/prompt/config/judge/mmau-pro.yaml b/nemo_skills/prompt/config/judge/mmau-pro.yaml new file mode 100644 index 0000000000..5339e4ab0d --- /dev/null +++ b/nemo_skills/prompt/config/judge/mmau-pro.yaml @@ -0,0 +1,30 @@ +# Judge prompt configuration for Speech/Audio Language Model evaluation +# Used for evaluating open-ended responses in MMAU-Pro benchmark +# Uses multi-criteria scoring on 1-5 scale + +user: |- + You are an expert evaluator for audio and speech-related questions. Please evaluate the quality of a model's response to a question. + + Question: {question} + + Reference Answer: {expected_answer} + + Model Response: {generation} + + Please evaluate the model response on the following criteria and provide scores from 1-5 (where 5 is best): + + 1. **Correctness**: How factually accurate is the response compared to the reference? + 2. **Relevance**: How well does the response address the specific question asked? + 3. **Completeness**: Does the response cover all important aspects mentioned in the reference? + 4. **Clarity**: How clear and well-structured is the response? 
+ + For each criterion, provide: + - A score from 1-5 + - A brief justification (1-2 sentences) + + Format your response as: + CORRECTNESS: [score] - [justification] + RELEVANCE: [score] - [justification] + COMPLETENESS: [score] - [justification] + CLARITY: [score] - [justification] + OVERALL: [average score] - [overall assessment] diff --git a/nemo_skills/prompt/config/judge/speechlm.yaml b/nemo_skills/prompt/config/judge/speechlm.yaml deleted file mode 100644 index 4862558145..0000000000 --- a/nemo_skills/prompt/config/judge/speechlm.yaml +++ /dev/null @@ -1,28 +0,0 @@ -# Judge prompt configuration for Speech/Audio Language Model evaluation -# Used for evaluating open-ended responses in MMAU-Pro benchmark -# Follows nemo-skills standard Yes/No judgement pattern - -user: |- - You are an expert evaluator for audio and speech-related questions. Please evaluate whether the model's response correctly answers the question. - - Question: {question} - - Reference Answer: {expected_answer} - - Model Response: {generation} - - Your task is to determine if the model's response is correct based on the reference answer. Consider: - - 1. **Factual Accuracy**: Is the information in the response factually correct? - 2. **Relevance**: Does the response address the specific question asked? - 3. **Completeness**: Does the response cover the key points from the reference answer? - - Please first explain your reasoning in 2-3 sentences, then provide your final judgement. - - Your final judgement must be either "Yes" or "No": - - "Yes" if the model response is correct and adequately answers the question - - "No" if the model response is incorrect, irrelevant, or inadequate - - Format your response as: - Reasoning: [Your explanation] - Judgement: [Yes or No] diff --git a/tests/gpu-tests/test_eval.py b/tests/gpu-tests/test_eval.py index 47060a1368..ae7a6a4b7e 100644 --- a/tests/gpu-tests/test_eval.py +++ b/tests/gpu-tests/test_eval.py @@ -346,3 +346,101 @@ def test_megatron_eval(): # TODO: something is broken in megatron inference here as this should be 50! 
assert metrics["symbolic_correct"] >= 40 assert metrics["num_entries"] == 5 + + +@pytest.mark.gpu +def test_prepare_and_eval_all_datasets(): + model_path = require_env_var("NEMO_SKILLS_TEST_HF_MODEL") + model_type = require_env_var("NEMO_SKILLS_TEST_MODEL_TYPE") + + config_dir = Path(__file__).absolute().parent + datasets_dir = Path(__file__).absolute().parents[2] / "nemo_skills" / "dataset" + # not testing datasets that don't support max_samples, require explicit parameters or are very heavy to prepare + excluded_datasets = { + "__pycache__", + "ruler", + "bigcodebench", + "livecodebench", + "livebench_coding", + "livecodebench-pro", + "livecodebench-cpp", + "ioi24", + "ioi25", + "bfcl_v3", + "bfcl_v4", + "swe-bench", + "aai", + "human-eval", + "human-eval-infilling", + "mbpp", + "mmau-pro", + } + + dataset_names = sorted( + dataset.name + for dataset in datasets_dir.iterdir() + if dataset.is_dir() and (dataset / "prepare.py").exists() and dataset.name not in excluded_datasets + ) + + assert dataset_names, "No datasets found to prepare and evaluate" + + judge_datasets = [] + for dataset in dataset_names: + dataset_module = import_module(f"nemo_skills.dataset.{dataset}") + # Check if JUDGE_PIPELINE_ARGS exists (even if empty dict, which is falsy) + if hasattr(dataset_module, "JUDGE_PIPELINE_ARGS"): + judge_datasets.append(dataset) + + non_judge_datasets = [dataset for dataset in dataset_names if dataset not in judge_datasets] + + data_dir = Path(f"/tmp/nemo-skills-tests/{model_type}/data") + docker_rm([str(data_dir)]) + + prepare_data( + ctx=wrap_arguments(" ".join(dataset_names)), + cluster="test-local", + config_dir=str(config_dir), + data_dir=str(data_dir), + expname=f"prepare-all-datasets-{model_type}", + ) + + eval_kwargs = dict( + cluster="test-local", + config_dir=str(config_dir), + data_dir=str(data_dir), + model=model_path, + server_type="sglang", + server_gpus=1, + server_nodes=1, + auto_summarize_results=False, + ) + + common_ctx = "++max_samples=2 ++inference.tokens_to_generate=100 ++server.enable_soft_fail=True " + + output_dir = f"/tmp/nemo-skills-tests/{model_type}/all-datasets-eval" + docker_rm([output_dir]) + eval( + ctx=wrap_arguments(common_ctx), + output_dir=output_dir, + benchmarks=",".join(non_judge_datasets), + expname=f"eval-all-datasets-{model_type}", + **eval_kwargs, + ) + + run_cmd( + ctx=wrap_arguments(f"python -m nemo_skills.pipeline.summarize_results {output_dir}"), + cluster="test-local", + config_dir=str(config_dir), + ) + + eval_results_dir = Path(output_dir) / "eval-results" + metrics_path = eval_results_dir / "metrics.json" + assert metrics_path.exists(), "Missing aggregated metrics file" + with metrics_path.open() as f: + metrics = json.load(f) + + for dataset in non_judge_datasets: + assert dataset in metrics, f"Missing metrics for {dataset}" + + # TODO: add same for judge_datasets after generate supports num_jobs + # (otherwise it starts judge every time and takes forever) diff --git a/tests/gpu-tests/test_vllm_audio.py b/tests/gpu-tests/test_vllm_audio.py new file mode 100644 index 0000000000..8183adaa80 --- /dev/null +++ b/tests/gpu-tests/test_vllm_audio.py @@ -0,0 +1,84 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import json +import shutil +import subprocess +import tempfile +from pathlib import Path + +import pytest +from utils import require_env_var + + +@pytest.mark.gpu +def test_vllm_audio_generation(): + """Integration test: Generate with vLLM server using audio input.""" + model_path = require_env_var("NEMO_SKILLS_TEST_HF_MODEL") + model_type = require_env_var("NEMO_SKILLS_TEST_MODEL_TYPE") + + output_dir = f"/tmp/nemo-skills-tests/{model_type}/vllm-audio-generation" + # Clean up output directory + if Path(output_dir).exists(): + shutil.rmtree(output_dir) + + # Create test input file with audio + with tempfile.NamedTemporaryFile(mode='w', suffix='.jsonl', delete=False) as f: + test_data = [ + { + "problem": "Transcribe this audio", + "audio": {"path": "/nemo_run/code/tests/slurm-tests/asr_nim/wavs/t2_16.wav"}, + }, + { + "problem": "What is in this audio?", + "audio": {"path": "/nemo_run/code/tests/slurm-tests/asr_nim/wavs/t3_16.wav"}, + }, + ] + for item in test_data: + f.write(json.dumps(item) + '\n') + input_file = f.name + + try: + cmd = ( + f"ns generate " + f" --cluster test-local --config_dir {Path(__file__).absolute().parent} " + f" --model {model_path} " + f" --output_dir {output_dir} " + f" --server_type vllm " + f" --server_gpus 1 " + f" --server_nodes 1 " + f" --server_args '--enforce-eager' " + f" --input_file={input_file} " + f" ++prompt_config=openai " + f" ++skip_filled=False " + ) + subprocess.run(cmd, shell=True, check=True) + + # Verify output exists and has audio-related generation + with open(f"{output_dir}/output.jsonl") as fin: + lines = fin.readlines() + + assert len(lines) == 2, "Should have 2 output lines" + + for line in lines: + data = json.loads(line) + assert "generation" in data, "Should have generation field" + assert len(data["generation"]) > 0, "Generation should not be empty" + # If model supports audio, generation should contain something + print(f"Generated: {data['generation']}") + + finally: + # Cleanup temp file + Path(input_file).unlink(missing_ok=True) + diff --git a/tests/test_vllm_audio.py b/tests/test_vllm_audio.py new file mode 100644 index 0000000000..56bee85aa2 --- /dev/null +++ b/tests/test_vllm_audio.py @@ -0,0 +1,156 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
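+
+"""Unit tests for audio support in VLLMModel (base64 encoding and chat request
+construction); no GPU or running vLLM server is required."""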
+ +import base64 +import os +import tempfile +from unittest.mock import AsyncMock, patch + +import pytest + +from nemo_skills.inference.model.vllm import VLLMModel, audio_file_to_base64 + + +# ----------------------- +# Unit tests - no server required +# ----------------------- + +def test_audio_file_to_base64(): + """Test basic audio file encoding to base64.""" + with tempfile.NamedTemporaryFile(mode='wb', suffix='.wav', delete=False) as f: + test_content = b'RIFF' + b'\x00' * 100 + f.write(test_content) + temp_path = f.name + + try: + result = audio_file_to_base64(temp_path) + assert isinstance(result, str) + assert len(result) > 0 + decoded = base64.b64decode(result) + assert decoded == test_content + finally: + os.unlink(temp_path) + + +@pytest.fixture +def vllm_model(tmp_path): + """Create a VLLMModel instance for testing.""" + audio_dir = tmp_path / "audio" + audio_dir.mkdir() + model = VLLMModel(model="test-model", data_dir=str(tmp_path), base_url="http://localhost:5000") + return model + + +def test_content_text_to_list_with_audio(vllm_model, tmp_path): + """Test converting string content with audio to list format.""" + audio_path = tmp_path / "audio" / "test.wav" + audio_path.parent.mkdir(exist_ok=True) + with open(audio_path, 'wb') as f: + f.write(b'RIFF' + b'\x00' * 100) + + message = {"role": "user", "content": "Describe this audio", "audio": {"path": "audio/test.wav"}} + + result = vllm_model.content_text_to_list(message) + + assert isinstance(result["content"], list) + assert len(result["content"]) == 2 + assert result["content"][0]["type"] == "text" + assert result["content"][1]["type"] == "audio_url" + assert result["content"][1]["audio_url"]["url"].startswith("data:audio/wav;base64,") + + +def test_content_text_to_list_with_multiple_audios(vllm_model, tmp_path): + """Test handling message with multiple audio files.""" + audio_dir = tmp_path / "audio" + audio_dir.mkdir(exist_ok=True) + + for i in range(2): + with open(audio_dir / f"test_{i}.wav", 'wb') as f: + f.write(b'RIFF' + b'\x00' * 100) + + message = { + "role": "user", + "content": "Compare these", + "audios": [{"path": "audio/test_0.wav"}, {"path": "audio/test_1.wav"}], + } + + result = vllm_model.content_text_to_list(message) + + assert isinstance(result["content"], list) + assert len(result["content"]) == 3 + assert result["content"][0]["type"] == "text" + assert result["content"][1]["type"] == "audio_url" + assert result["content"][2]["type"] == "audio_url" + + +# ----------------------- +# Request building tests with audio +# ----------------------- + +def test_build_chat_request_with_audio(tmp_path, vllm_model): + """Test that chat request params are correctly built with audio content.""" + # Create audio file + audio_path = tmp_path / "audio" / "test.wav" + audio_path.parent.mkdir(exist_ok=True) + with open(audio_path, 'wb') as f: + f.write(b'RIFF' + b'\x00' * 100) + + messages = [{"role": "user", "content": "Test audio", "audio": {"path": "audio/test.wav"}}] + + # Build request params - this doesn't make any network calls + params = vllm_model._build_chat_request_params(messages=messages, stream=False, tokens_to_generate=10) + + # Validate request structure + assert "messages" in params + assert len(params["messages"]) == 1 + content_items = params["messages"][0]["content"] + assert isinstance(content_items, list) + assert len(content_items) == 2 + assert content_items[0]["type"] == "text" + assert content_items[1]["type"] == "audio_url" + + # Verify base64 encoding is valid + audio_url = 
content_items[1]["audio_url"]["url"] + assert audio_url.startswith("data:audio/wav;base64,") + audio_b64 = audio_url.split(",", 1)[1] + decoded = base64.b64decode(audio_b64) + assert decoded.startswith(b'RIFF') + + +@pytest.mark.asyncio +async def test_generate_with_audio_mocked_response(tmp_path, vllm_model): + """Test generate_async with audio by mocking the response (no real server call).""" + # Create audio file + audio_path = tmp_path / "audio" / "test.wav" + audio_path.parent.mkdir(exist_ok=True) + with open(audio_path, 'wb') as f: + f.write(b'RIFF' + b'\x00' * 100) + + messages = [{"role": "user", "content": "Describe this audio", "audio": {"path": "audio/test.wav"}}] + + # Mock the entire generate_async method - no actual API call made + mock_response = {"generation": "This audio contains speech", "num_generated_tokens": 5} + + with patch.object(vllm_model, "generate_async", new_callable=AsyncMock) as mock_generate: + mock_generate.return_value = mock_response + + # Call the mocked method + response = await vllm_model.generate_async(prompt=messages, tokens_to_generate=50, temperature=0.0) + + # Verify the mock was called correctly + assert response["generation"] == "This audio contains speech" + assert response["num_generated_tokens"] == 5 + mock_generate.assert_awaited_once() + + From 8297aed7e8f88eea0ca178f74213cacad14d933c Mon Sep 17 00:00:00 2001 From: Valentin Mendelev Date: Thu, 18 Dec 2025 02:12:14 -0800 Subject: [PATCH 02/26] Intorduced vLLM_multimodal model to save multimodal outputs Signed-off-by: Valentin Mendelev --- nemo_skills/inference/generate.py | 15 ++- nemo_skills/inference/model/__init__.py | 2 + nemo_skills/inference/model/base.py | 5 + .../inference/model/vllm_multimodal.py | 110 ++++++++++++++++++ 4 files changed, 130 insertions(+), 2 deletions(-) create mode 100644 nemo_skills/inference/model/vllm_multimodal.py diff --git a/nemo_skills/inference/generate.py b/nemo_skills/inference/generate.py index 87151a66d6..a98127d834 100644 --- a/nemo_skills/inference/generate.py +++ b/nemo_skills/inference/generate.py @@ -403,8 +403,15 @@ def setup_llm(self): if "data_dir" in self.cfg.eval_config and not isinstance(self.cfg.eval_config.get("data_dir"), type(None)): self.data_dir = self.cfg.eval_config["data_dir"] + output_dir = str(Path(self.cfg.output_file).parent) if self.cfg.code_execution: - llm = get_code_execution_model(**self.cfg.server, tokenizer=self.tokenizer, sandbox=self.sandbox) + llm = get_code_execution_model( + **self.cfg.server, + tokenizer=self.tokenizer, + sandbox=self.sandbox, + data_dir=self.data_dir or "", + output_dir=output_dir, + ) elif self.cfg.tool_modules is not None: llm = get_tool_calling_model( **self.cfg.server, @@ -413,9 +420,13 @@ def setup_llm(self): schema_overrides=self.cfg.schema_overrides, tokenizer=self.tokenizer, additional_config={"sandbox": self.cfg.sandbox}, + data_dir=self.data_dir or "", + output_dir=output_dir, ) else: - llm = get_model(**self.cfg.server, tokenizer=self.tokenizer) + llm = get_model( + **self.cfg.server, tokenizer=self.tokenizer, data_dir=self.data_dir or "", output_dir=output_dir + ) if self.cfg.parallel_thinking.mode is not None: # We don't want to override these key variables which overlap with self.cfg diff --git a/nemo_skills/inference/model/__init__.py b/nemo_skills/inference/model/__init__.py index 164d92fcc8..595d8fd3ee 100644 --- a/nemo_skills/inference/model/__init__.py +++ b/nemo_skills/inference/model/__init__.py @@ -39,6 +39,7 @@ # Utilities from .vllm import VLLMModel +from .vllm_multimodal 
import VLLMMultimodalModel # Model implementations @@ -51,6 +52,7 @@ "azureopenai": AzureOpenAIModel, "gemini": GeminiModel, "vllm": VLLMModel, + "vllm_multimodal": VLLMMultimodalModel, "sglang": SGLangModel, "tts_nim": TTSNIMModel, "asr_nim": ASRNIMModel, diff --git a/nemo_skills/inference/model/base.py b/nemo_skills/inference/model/base.py index 9318bfb475..117096b4c7 100644 --- a/nemo_skills/inference/model/base.py +++ b/nemo_skills/inference/model/base.py @@ -75,9 +75,14 @@ def __init__( enable_soft_fail: bool = False, context_limit_retry_strategy: str | None = None, num_special_tokens_budget: int = 100, + # Directory paths for data and output + data_dir: str = "", + output_dir: str | None = None, ): self._tunnel = None self.model_name_or_path = model + self.data_dir = data_dir + self.output_dir = output_dir self.server_host = host self.server_port = port self.ssh_server = ssh_server diff --git a/nemo_skills/inference/model/vllm_multimodal.py b/nemo_skills/inference/model/vllm_multimodal.py new file mode 100644 index 0000000000..0569c9efd9 --- /dev/null +++ b/nemo_skills/inference/model/vllm_multimodal.py @@ -0,0 +1,110 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import base64 +import json +import logging +import os +import re + +from nemo_skills.utils import get_logger_name + +from .vllm import VLLMModel + +LOG = logging.getLogger(get_logger_name(__file__)) + +# Pattern to extract debug_info from content +DEBUG_INFO_PATTERN = re.compile(r"\n?(.*?)", re.DOTALL) + + +class VLLMMultimodalModel(VLLMModel): + """VLLMModel with support for saving audio responses to disk. + + When the server returns audio in the response, this model will: + 1. Save the audio bytes to a file in output_dir/audio/ + 2. 
Replace the base64 data with the file path in the result + """ + + def __init__(self, **kwargs): + super().__init__(**kwargs) + self.output_audio_dir = None + if self.output_dir: + self.output_audio_dir = os.path.join(self.output_dir, "audio") + os.makedirs(self.output_audio_dir, exist_ok=True) + LOG.info(f"Audio responses will be saved to: {self.output_audio_dir}") + + def _parse_chat_completion_response(self, response, include_response: bool = False, **kwargs) -> dict: + """Parse chat completion response and save any audio to disk.""" + result = super()._parse_chat_completion_response(response, include_response=include_response, **kwargs) + + # Extract debug_info from content (embedded as JSON in tags) + if "generation" in result and result["generation"]: + match = DEBUG_INFO_PATTERN.search(result["generation"]) + if match: + try: + result["debug_info"] = json.loads(match.group(1)) + # Strip debug_info from generation + result["generation"] = DEBUG_INFO_PATTERN.sub("", result["generation"]) + except json.JSONDecodeError: + LOG.warning("Failed to parse debug_info JSON from content") + + choice = response.choices[0] + if hasattr(choice.message, "audio") and choice.message.audio: + audio_result = self._process_audio_response(choice.message.audio, response.id) + result["audio"] = audio_result + + # Strip audio data from serialized_output to avoid duplication + if "serialized_output" in result: + for item in result["serialized_output"]: + if isinstance(item, dict) and "audio" in item: + # Keep only metadata, remove base64 data + if isinstance(item["audio"], dict) and "data" in item["audio"]: + del item["audio"]["data"] + # Also strip debug_info from serialized content + if isinstance(item, dict) and "content" in item and item["content"]: + item["content"] = DEBUG_INFO_PATTERN.sub("", item["content"]) + + return result + + def _process_audio_response(self, audio_data, response_id: str) -> dict: + """Process audio data: save to file and return metadata with path.""" + audio_info = { + "format": getattr(audio_data, "format", "wav"), + "sample_rate": getattr(audio_data, "sample_rate", 22050), + "transcript": getattr(audio_data, "transcript", None), + } + + audio_base64 = getattr(audio_data, "data", None) + if not audio_base64: + return audio_info + + if self.output_audio_dir: + try: + audio_bytes = base64.b64decode(audio_base64) + filename = f"{response_id}.wav" + filepath = os.path.join(self.output_audio_dir, filename) + + with open(filepath, "wb") as f: + f.write(audio_bytes) + + audio_info["path"] = filepath + audio_info["size_bytes"] = len(audio_bytes) + LOG.info(f"Saved audio: {filepath} ({len(audio_bytes)} bytes)") + except Exception as e: + LOG.warning(f"Failed to save audio: {e}") + audio_info["data"] = audio_base64 + else: + audio_info["data"] = audio_base64 + + return audio_info From 25752670474372ac9a0b7530c9e719e196649b43 Mon Sep 17 00:00:00 2001 From: Valentin Mendelev Date: Thu, 18 Dec 2025 06:37:10 -0800 Subject: [PATCH 03/26] generation.py to respect separate server type for the client Signed-off-by: Valentin Mendelev --- nemo_skills/pipeline/utils/generation.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/nemo_skills/pipeline/utils/generation.py b/nemo_skills/pipeline/utils/generation.py index cd576053c1..4b45ed8a39 100644 --- a/nemo_skills/pipeline/utils/generation.py +++ b/nemo_skills/pipeline/utils/generation.py @@ -446,6 +446,9 @@ def configure_client( - server_address: Address of the server. 
- extra_arguments: Updated extra arguments for the command. """ + # Check if user already specified server.server_type in extra_arguments + user_specified_server_type = "++server.server_type=" in extra_arguments + if server_gpus: # we need to host the model server_port = get_free_port(strategy="random") if get_random_port else 5000 assert server_gpus is not None, "Need to specify server_gpus if hosting the model" @@ -462,14 +465,17 @@ def configure_client( } if server_container: server_config["container"] = server_container + # Only add server_type if user didn't specify it (allows vllm_multimodal override) + server_type_arg = "" if user_specified_server_type else f"++server.server_type={server_type} " extra_arguments = ( - f"{extra_arguments} ++server.server_type={server_type} ++server.host=127.0.0.1 " + f"{extra_arguments} {server_type_arg}++server.host=127.0.0.1 " f"++server.port={server_port} ++server.model={model} " ) else: # model is hosted elsewhere server_config = None + # Only add server_type if user didn't specify it + server_type_arg = "" if user_specified_server_type else f"++server.server_type={server_type} " extra_arguments = ( - f"{extra_arguments} ++server.server_type={server_type} " - f"++server.base_url={server_address} ++server.model={model} " + f"{extra_arguments} {server_type_arg}++server.base_url={server_address} ++server.model={model} " ) return server_config, server_address, extra_arguments From b8d95f0195d8a7f26ab724c8af07009d64052b09 Mon Sep 17 00:00:00 2001 From: Valentin Mendelev Date: Sat, 20 Dec 2025 05:39:26 -0800 Subject: [PATCH 04/26] Unified server to work with NeMo models not supported by vLLM Signed-off-by: Valentin Mendelev --- nemo_skills/inference/server/serve_unified.py | 397 ++++++++++ recipes/multimodal/server/__init__.py | 38 + .../multimodal/server/backends/__init__.py | 81 ++ recipes/multimodal/server/backends/base.py | 251 ++++++ recipes/multimodal/server/session_manager.py | 249 ++++++ recipes/multimodal/server/unified_server.py | 745 ++++++++++++++++++ 6 files changed, 1761 insertions(+) create mode 100644 nemo_skills/inference/server/serve_unified.py create mode 100644 recipes/multimodal/server/__init__.py create mode 100644 recipes/multimodal/server/backends/__init__.py create mode 100644 recipes/multimodal/server/backends/base.py create mode 100644 recipes/multimodal/server/session_manager.py create mode 100644 recipes/multimodal/server/unified_server.py diff --git a/nemo_skills/inference/server/serve_unified.py b/nemo_skills/inference/server/serve_unified.py new file mode 100644 index 0000000000..748b7cf044 --- /dev/null +++ b/nemo_skills/inference/server/serve_unified.py @@ -0,0 +1,397 @@ +#!/usr/bin/env python3 +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +CLI wrapper for the Unified NeMo Inference Server. + +This module provides a command-line interface compatible with nemo-skills +server deployment patterns. 
It translates standard vllm-style CLI arguments +to the unified server configuration. + +Usage via NeMo-Skills: + + # SALM backend (speech-augmented language model) + ns eval \\ + --server_type vllm \\ + --server_gpus 1 \\ + --model /path/to/model \\ + --server_entrypoint "-m nemo_skills.inference.server.serve_unified" \\ + --server_args "--backend salm" + + # TTS backend (text-to-speech) + ns eval \\ + --server_type vllm \\ + --server_gpus 1 \\ + --model /path/to/tts_model \\ + --server_entrypoint "-m nemo_skills.inference.server.serve_unified" \\ + --server_args "--backend tts --codec_model /path/to/codec" + + # S2S backend (speech-to-speech) + ns eval \\ + --server_type vllm \\ + --server_gpus 1 \\ + --model /path/to/s2s_model \\ + --server_entrypoint "-m nemo_skills.inference.server.serve_unified" \\ + --server_args "--backend s2s" + +Environment Variables: + UNIFIED_SERVER_HOST: Server host (default: 0.0.0.0) + UNIFIED_SERVER_PORT: Server port (default: 8000) + UNIFIED_SERVER_BACKEND: Backend type (default: salm) + UNIFIED_SERVER_MODEL_PATH: Path to model + UNIFIED_SERVER_CODEC_MODEL_PATH: Path to codec model + UNIFIED_SERVER_BATCH_SIZE: Batch size (default: 8) + UNIFIED_SERVER_BATCH_TIMEOUT: Batch timeout (default: 0.1) + DEBUG: Enable debug mode +""" + +import argparse +import inspect +import os +import shutil +import sys +from typing import Optional + + +def setup_pythonpath(code_path: Optional[str] = None): + """Set up PYTHONPATH for NeMo and the unified server. + + Args: + code_path: Single path or colon-separated paths to add to PYTHONPATH + """ + paths_to_add = [] + + # Add explicit code path(s) if provided (supports colon-separated paths) + if code_path: + for path in code_path.split(":"): + if path and path not in paths_to_add: + paths_to_add.append(path) + + # Add recipes path for unified server imports + # Look for the recipes directory relative to this file + this_dir = os.path.dirname(os.path.abspath(__file__)) + + # Try to find ns_eval root (go up from nemo_skills/inference/server/) + ns_eval_root = os.path.dirname(os.path.dirname(os.path.dirname(this_dir))) + if os.path.exists(os.path.join(ns_eval_root, "recipes")): + paths_to_add.append(ns_eval_root) + + # Also check /nemo_run/code pattern used in containers + if os.path.exists("/nemo_run/code"): + paths_to_add.append("/nemo_run/code") + + # Update PYTHONPATH + current_path = os.environ.get("PYTHONPATH", "") + for path in paths_to_add: + if path not in current_path.split(":"): + current_path = f"{path}:{current_path}" if current_path else path + + os.environ["PYTHONPATH"] = current_path + + # Also add to sys.path for immediate imports + for path in paths_to_add: + if path not in sys.path: + sys.path.insert(0, path) + + +def apply_safetensors_patch(hack_path: Optional[str]): + """Apply safetensors patch if provided (for some NeMo models).""" + if not hack_path or not os.path.exists(hack_path): + return + + try: + import safetensors.torch as st_torch + + dest_path = inspect.getfile(st_torch) + os.makedirs(os.path.dirname(dest_path), exist_ok=True) + shutil.copyfile(hack_path, dest_path) + print(f"[serve_unified] Applied safetensors patch: {hack_path} -> {dest_path}") + except Exception as e: + print(f"[serve_unified] Warning: Failed to apply safetensors patch: {e}") + + +def main(): + parser = argparse.ArgumentParser( + description="Unified NeMo Inference Server CLI wrapper", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=__doc__, + ) + + # Standard vllm-style arguments (for nemo-skills compatibility) + 
parser.add_argument("--model", required=True, help="Path to the model") + parser.add_argument("--num_gpus", type=int, default=1, help="Number of GPUs to use") + parser.add_argument("--port", type=int, default=8000, help="Server port") + + # Backend selection + parser.add_argument( + "--backend", + default="salm", + choices=["salm", "tts", "s2s", "s2s_incremental", "s2s_session"], + help="Backend type: salm (speech-augmented LM), tts (text-to-speech), s2s (speech-to-speech offline), s2s_incremental (frame-by-frame processing), s2s_session (session-aware multi-turn)", + ) + + # Backend-specific model paths + parser.add_argument("--codec_model", default=None, help="Path to codec model (required for TTS, optional for S2S)") + + # Server configuration + parser.add_argument("--host", default="0.0.0.0", help="Server host") + parser.add_argument("--batch_size", type=int, default=8, help="Maximum batch size") + parser.add_argument( + "--batch_timeout", type=float, default=0.1, help="Batch timeout in seconds (0 for no batching delay)" + ) + + # Generation defaults + parser.add_argument("--max_new_tokens", type=int, default=512, help="Max tokens to generate") + parser.add_argument("--temperature", type=float, default=1.0, help="Generation temperature") + parser.add_argument("--top_p", type=float, default=1.0, help="Top-p sampling") + + # Model configuration + parser.add_argument("--device", default="cuda", help="Device to use") + parser.add_argument("--dtype", default="bfloat16", help="Model dtype") + + # Backend-specific options + parser.add_argument("--prompt_format", default=None, help="Prompt format (SALM backend)") + parser.add_argument( + "--phoneme_input_type", default="predicted", help="Phoneme input type: predicted or gt (TTS backend)" + ) + parser.add_argument( + "--decoder_only_model", action="store_true", help="Use decoder-only model architecture (TTS backend)" + ) + parser.add_argument("--use_local_transformer", action="store_true", help="Use local transformer (TTS backend)") + parser.add_argument("--top_k", type=int, default=None, help="Top-k sampling (TTS backend)") + + # Environment setup + parser.add_argument("--code_path", default=None, help="Path to NeMo source code to add to PYTHONPATH") + parser.add_argument("--hack_path", default=None, help="Path to safetensors/torch.py patch file") + + # S2S backend options + parser.add_argument( + "--ignore_system_prompt", + action="store_true", + help="Ignore system prompts from requests (for models that don't support them)", + ) + parser.add_argument( + "--silence_padding_sec", + type=float, + default=5.0, + help="Seconds of silence to append after audio (S2S backends)", + ) + + # S2S Incremental backend options + parser.add_argument( + "--config_path", + default=None, + help="Path to YAML config file (s2s_incremental backend)", + ) + parser.add_argument( + "--llm_checkpoint_path", + default=None, + help="Path to LLM checkpoint (s2s_incremental backend)", + ) + parser.add_argument( + "--tts_checkpoint_path", + default=None, + help="Path to TTS checkpoint (s2s_incremental backend)", + ) + parser.add_argument( + "--speaker_reference", + default=None, + help="Path to speaker reference audio for TTS (s2s_incremental backend)", + ) + parser.add_argument( + "--num_frames_per_inference", + type=int, + default=1, + help="Frames per inference step (s2s_incremental backend)", + ) + parser.add_argument( + "--no_decode_audio", + action="store_true", + help="Disable audio output (s2s_incremental backend)", + ) + + # Session management options 
(s2s_session backend) + parser.add_argument( + "--session_ttl", + type=float, + default=300.0, + help="Session time-to-live in seconds (s2s_session backend)", + ) + parser.add_argument( + "--max_sessions", + type=int, + default=100, + help="Maximum number of concurrent sessions (s2s_session backend)", + ) + parser.add_argument( + "--session_artifacts_dir", + type=str, + default=None, + help="Directory to save session artifacts (input/output audio, JSON). Default: /tmp/s2s_sessions", + ) + parser.add_argument( + "--no_save_session_artifacts", + action="store_true", + help="Disable saving session artifacts to disk", + ) + parser.add_argument( + "--output_frame_alignment", + action="store_true", + help="Include per-frame alignment data in debug output (user/agent/ASR per frame)", + ) + + # Debug + parser.add_argument("--debug", action="store_true", help="Enable debug mode") + + # Parse known args, allowing extra args to be passed through + args, extra_args = parser.parse_known_args() + + # Setup environment + setup_pythonpath(args.code_path) + apply_safetensors_patch(args.hack_path) + + # Set environment variables + os.environ["UNIFIED_SERVER_HOST"] = args.host + os.environ["UNIFIED_SERVER_PORT"] = str(args.port) + os.environ["UNIFIED_SERVER_BACKEND"] = args.backend + os.environ["UNIFIED_SERVER_MODEL_PATH"] = args.model + os.environ["UNIFIED_SERVER_BATCH_SIZE"] = str(args.batch_size) + os.environ["UNIFIED_SERVER_BATCH_TIMEOUT"] = str(args.batch_timeout) + os.environ["UNIFIED_SERVER_MAX_NEW_TOKENS"] = str(args.max_new_tokens) + os.environ["UNIFIED_SERVER_TEMPERATURE"] = str(args.temperature) + os.environ["UNIFIED_SERVER_TOP_P"] = str(args.top_p) + + if args.codec_model: + os.environ["UNIFIED_SERVER_CODEC_MODEL_PATH"] = args.codec_model + + if args.debug: + os.environ["DEBUG"] = "1" + + # Set CUDA devices + os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(str(i) for i in range(args.num_gpus)) + + # Build extra config for backend-specific options + extra_config = {} + + if args.prompt_format: + extra_config["prompt_format"] = args.prompt_format + + if args.backend == "tts": + extra_config["decoder_only_model"] = args.decoder_only_model + extra_config["phoneme_input_type"] = args.phoneme_input_type + extra_config["use_local_transformer"] = args.use_local_transformer + if args.top_k: + extra_config["top_k"] = args.top_k + + # S2S backend options + if args.backend in ("s2s", "s2s_incremental", "s2s_session"): + extra_config["ignore_system_prompt"] = args.ignore_system_prompt + if args.silence_padding_sec != 5.0: + extra_config["silence_padding_sec"] = args.silence_padding_sec + + # S2S Incremental/Session backend options (shared config) + if args.backend in ("s2s_incremental", "s2s_session"): + if args.config_path: + extra_config["config_path"] = args.config_path + if args.llm_checkpoint_path: + extra_config["llm_checkpoint_path"] = args.llm_checkpoint_path + if args.tts_checkpoint_path: + extra_config["tts_checkpoint_path"] = args.tts_checkpoint_path + if args.speaker_reference: + extra_config["speaker_reference"] = args.speaker_reference + if args.num_frames_per_inference != 1: + extra_config["num_frames_per_inference"] = args.num_frames_per_inference + if args.no_decode_audio: + extra_config["decode_audio"] = False + # Artifacts and alignment (available for both backends) + if args.session_artifacts_dir: + extra_config["session_artifacts_dir"] = args.session_artifacts_dir + extra_config["save_session_artifacts"] = not args.no_save_session_artifacts + extra_config["output_frame_alignment"] = 
args.output_frame_alignment + + # S2S Session backend options + if args.backend == "s2s_session": + extra_config["session_ttl"] = args.session_ttl + extra_config["max_sessions"] = args.max_sessions + + # Print configuration + print("=" * 60) + print("[serve_unified] Starting Unified NeMo Inference Server") + print("=" * 60) + print(f" Backend: {args.backend}") + print(f" Model: {args.model}") + if args.codec_model: + print(f" Codec Model: {args.codec_model}") + print(f" Port: {args.port}") + print(f" GPUs: {args.num_gpus}") + print(f" Batch Size: {args.batch_size}") + print(f" Batch Timeout: {args.batch_timeout}s") + print(f" Device: {args.device}") + print(f" Dtype: {args.dtype}") + if args.backend in ("s2s_incremental", "s2s_session"): + if args.config_path: + print(f" Config Path: {args.config_path}") + if args.llm_checkpoint_path: + print(f" LLM Checkpoint: {args.llm_checkpoint_path}") + if args.speaker_reference: + print(f" Speaker Reference: {args.speaker_reference}") + print(f" Frames per Inference: {args.num_frames_per_inference}") + print(f" Decode Audio: {not args.no_decode_audio}") + print(f" Save Artifacts: {not args.no_save_session_artifacts}") + if args.session_artifacts_dir: + print(f" Artifacts Dir: {args.session_artifacts_dir}") + else: + print(" Artifacts Dir: /tmp/s2s_sessions (default)") + print(f" Output Frame Alignment: {args.output_frame_alignment}") + if args.backend == "s2s_session": + print(f" Session TTL: {args.session_ttl}s") + print(f" Max Sessions: {args.max_sessions}") + if extra_config: + print(f" Extra Config: {extra_config}") + print("=" * 60) + + # Import and run the unified server + try: + import uvicorn + + from recipes.multimodal.server.unified_server import create_app + + app = create_app( + backend_type=args.backend, + model_path=args.model, + codec_model_path=args.codec_model or "", + batch_size=args.batch_size, + batch_timeout=args.batch_timeout, + device=args.device, + dtype=args.dtype, + extra_config=extra_config if extra_config else None, + ) + + uvicorn.run(app, host=args.host, port=args.port, log_level="info") + + except ImportError as e: + print(f"[serve_unified] Error: Failed to import unified server: {e}") + print("[serve_unified] Make sure the recipes.multimodal.server package is in PYTHONPATH") + sys.exit(1) + except Exception as e: + print(f"[serve_unified] Error: {e}") + import traceback + + traceback.print_exc() + sys.exit(1) + + +if __name__ == "__main__": + main() diff --git a/recipes/multimodal/server/__init__.py b/recipes/multimodal/server/__init__.py new file mode 100644 index 0000000000..89a349346e --- /dev/null +++ b/recipes/multimodal/server/__init__.py @@ -0,0 +1,38 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Unified NeMo Inference Server package. 
+ +Provides a pluggable FastAPI server that supports multiple NeMo model backends: +- SALM: Speech-Augmented Language Model (text output from text/audio input) +- TTS: Text-to-Speech (audio output from text input) +- S2S: Speech-to-Speech (text+audio output from audio input) +""" + +from .backends import ( + BackendConfig, + GenerationRequest, + GenerationResult, + InferenceBackend, + get_backend, +) + +__all__ = [ + "InferenceBackend", + "GenerationRequest", + "GenerationResult", + "BackendConfig", + "get_backend", +] diff --git a/recipes/multimodal/server/backends/__init__.py b/recipes/multimodal/server/backends/__init__.py new file mode 100644 index 0000000000..861b330f00 --- /dev/null +++ b/recipes/multimodal/server/backends/__init__.py @@ -0,0 +1,81 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Backend implementations for the Unified NeMo Inference Server. + +Available backends: +- salm: Speech-Augmented Language Model (text output from text/audio input) +- tts: Text-to-Speech using MagpieTTS (audio output from text input) +- s2s: Speech-to-Speech using DuplexS2S offline (text output from audio input) +- s2s_incremental: Speech-to-Speech using NemotronVoiceChat incremental (text+audio from audio) +- s2s_session: Speech-to-Speech with session support for multi-turn conversations +""" + +from .base import BackendConfig, GenerationRequest, GenerationResult, InferenceBackend, Modality + +__all__ = [ + "InferenceBackend", + "GenerationRequest", + "GenerationResult", + "BackendConfig", + "Modality", + "get_backend", + "list_backends", +] + +# Registry of available backends +BACKEND_REGISTRY = { + "salm": ("salm_backend", "SALMBackend"), + "tts": ("tts_backend", "TTSBackend"), + "s2s": ("s2s_backend", "S2SBackend"), + "s2s_incremental": ("s2s_incremental_backend", "S2SIncrementalBackend"), + "s2s_session": ("s2s_session_backend", "S2SSessionBackend"), +} + + +def list_backends() -> list: + """Return list of available backend names.""" + return list(BACKEND_REGISTRY.keys()) + + +def get_backend(backend_name: str) -> type: + """ + Get backend class by name with lazy loading. + + Args: + backend_name: One of 'salm', 'tts', 's2s' + + Returns: + Backend class (not instance) + + Raises: + ValueError: If backend name is unknown + ImportError: If backend dependencies are not available + """ + if backend_name not in BACKEND_REGISTRY: + available = ", ".join(BACKEND_REGISTRY.keys()) + raise ValueError(f"Unknown backend: '{backend_name}'. Available backends: {available}") + + module_name, class_name = BACKEND_REGISTRY[backend_name] + + import importlib + + try: + module = importlib.import_module(f".{module_name}", package=__name__) + return getattr(module, class_name) + except ImportError as e: + raise ImportError( + f"Failed to import backend '{backend_name}'. Make sure required dependencies are installed. 
Error: {e}" + ) from e diff --git a/recipes/multimodal/server/backends/base.py b/recipes/multimodal/server/backends/base.py new file mode 100644 index 0000000000..e1d62c9765 --- /dev/null +++ b/recipes/multimodal/server/backends/base.py @@ -0,0 +1,251 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Abstract base class for inference backends. + +All model backends (SALM, TTS, S2S, etc.) must implement this interface +to be usable with the unified inference server. +""" + +from abc import ABC, abstractmethod +from dataclasses import dataclass, field +from enum import Enum +from typing import Any, Dict, List, Optional, Set + + +class Modality(str, Enum): + """Supported input/output modalities.""" + + TEXT = "text" + AUDIO_IN = "audio_in" + AUDIO_OUT = "audio_out" + + +@dataclass +class BackendConfig: + """Base configuration for all backends.""" + + model_path: str + device: str = "cuda" + dtype: str = "bfloat16" + + # Generation defaults + max_new_tokens: int = 512 + temperature: float = 1.0 + top_p: float = 1.0 + top_k: Optional[int] = None + + # Additional model-specific configs passed through + extra_config: Dict[str, Any] = field(default_factory=dict) + + @classmethod + def from_dict(cls, d: Dict[str, Any]) -> "BackendConfig": + """Create config from dictionary, extracting known fields.""" + known_fields = {f.name for f in cls.__dataclass_fields__.values()} + known = {k: v for k, v in d.items() if k in known_fields and k != "extra_config"} + extra = {k: v for k, v in d.items() if k not in known_fields} + return cls(**known, extra_config=extra) + + +@dataclass +class GenerationRequest: + """ + A single generation request. + + Supports text and/or audio inputs depending on the backend's capabilities. + """ + + # Text inputs + text: Optional[str] = None + system_prompt: Optional[str] = None + user_prompt: Optional[str] = None + + # Audio input (raw bytes or file path) + audio_bytes: Optional[bytes] = None + audio_path: Optional[str] = None + sample_rate: int = 16000 + + # Multi-turn audio inputs (list of audio bytes or paths) + audio_bytes_list: Optional[List[bytes]] = None + audio_paths: Optional[List[str]] = None + + # Generation parameters (override backend defaults) + max_new_tokens: Optional[int] = None + temperature: Optional[float] = None + top_p: Optional[float] = None + top_k: Optional[int] = None + seed: Optional[int] = None + + # Additional parameters + extra_params: Dict[str, Any] = field(default_factory=dict) + + # Request tracking + request_id: Optional[str] = None + + +@dataclass +class GenerationResult: + """ + Result from a generation request. + + Contains text output and optionally audio output, plus metadata. 
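+
+    Backends record failures in the error field instead of raising, so every
+    request still yields a result object; is_success() simply checks that no
+    error was set.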
+ """ + + # Text output + text: str = "" + + # Audio output (raw bytes, can be encoded to base64 for JSON) + audio_bytes: Optional[bytes] = None + audio_sample_rate: int = 16000 + audio_format: str = "wav" + + # Metadata + request_id: Optional[str] = None + num_tokens_generated: int = 0 + generation_time_ms: float = 0.0 + + # Debug info (optional, backend-specific) + debug_info: Optional[Dict[str, Any]] = None + + # Error handling + error: Optional[str] = None + + def is_success(self) -> bool: + return self.error is None + + +class InferenceBackend(ABC): + """ + Abstract base class for inference backends. + + Implementations must provide: + - load_model(): Initialize the model from config + - generate(): Run inference on a batch of requests + - supported_modalities: What input/output types are supported + + The unified server uses this interface to handle any backend uniformly. + """ + + def __init__(self, config: BackendConfig): + """ + Initialize the backend with configuration. + + Args: + config: Backend configuration including model path and generation defaults + """ + self.config = config + self._model = None + self._is_loaded = False + + @property + @abstractmethod + def name(self) -> str: + """Return the backend name (e.g., 'salm', 'tts', 's2s').""" + pass + + @property + @abstractmethod + def supported_modalities(self) -> Set[Modality]: + """ + Return the set of supported modalities. + + Examples: + - SALM: {TEXT, AUDIO_IN} - text output from text/audio input + - TTS: {TEXT, AUDIO_OUT} - audio output from text input + - S2S: {TEXT, AUDIO_IN, AUDIO_OUT} - audio+text output from audio input + """ + pass + + @abstractmethod + def load_model(self) -> None: + """ + Load and initialize the model. + + Should set self._model and self._is_loaded = True on success. + Called once during server startup. + + Raises: + RuntimeError: If model loading fails + """ + pass + + @abstractmethod + def generate(self, requests: List[GenerationRequest]) -> List[GenerationResult]: + """ + Run inference on a batch of requests. + + Args: + requests: List of generation requests to process + + Returns: + List of generation results, one per request (same order) + + Note: + - Implementations should handle batching internally + - Each result should have request_id matching the input + - On error, set result.error instead of raising + """ + pass + + @property + def is_loaded(self) -> bool: + """Check if the model is loaded and ready.""" + return self._is_loaded + + def health_check(self) -> Dict[str, Any]: + """ + Return health status information. + + Override to add backend-specific health info. + """ + return { + "backend": self.name, + "model_loaded": self._is_loaded, + "model_path": self.config.model_path, + "device": self.config.device, + "modalities": [m.value for m in self.supported_modalities], + } + + def get_generation_params(self, request: GenerationRequest) -> Dict[str, Any]: + """ + Get effective generation parameters, merging request with config defaults. + """ + return { + "max_new_tokens": request.max_new_tokens or self.config.max_new_tokens, + "temperature": request.temperature or self.config.temperature, + "top_p": request.top_p or self.config.top_p, + "top_k": request.top_k or self.config.top_k, + } + + def validate_request(self, request: GenerationRequest) -> Optional[str]: + """ + Validate a request against supported modalities. 
+ + Returns: + Error message if invalid, None if valid + """ + modalities = self.supported_modalities + + has_text_input = request.text is not None + has_audio_input = request.audio_bytes is not None or request.audio_path is not None + + # Check input modalities + if has_audio_input and Modality.AUDIO_IN not in modalities: + return f"Backend '{self.name}' does not support audio input" + + if not has_text_input and not has_audio_input: + return "Request must have either text or audio input" + + return None diff --git a/recipes/multimodal/server/session_manager.py b/recipes/multimodal/server/session_manager.py new file mode 100644 index 0000000000..113735b8ba --- /dev/null +++ b/recipes/multimodal/server/session_manager.py @@ -0,0 +1,249 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Session manager for S2S session backend. + +Manages session state (LLM KV cache, frame index, etc.) across HTTP requests +to enable multi-turn conversations. +""" + +import threading +import time +import uuid +from dataclasses import dataclass, field +from typing import Any, Dict, List, Optional + +import torch + + +@dataclass +class TurnData: + """Data for a single turn in a conversation.""" + + turn_idx: int + user_audio_bytes: Optional[bytes] = None # Input audio from user + agent_audio_bytes: Optional[bytes] = None # Output audio from agent + agent_text: str = "" # Text response for this turn + user_duration_sec: float = 0.0 # Duration of user audio + agent_duration_sec: float = 0.0 # Duration of agent audio + + +@dataclass +class SessionState: + """State that persists between turns in a session.""" + + session_id: str + + # LLM state + llm_cache: Any = None # DynamicCache (for non-Mamba models) + input_embeds_history: Any = None # List of embeddings (for Mamba models) + frame_idx: int = 0 + + # Token history (for turn-taking logic) + gen_text: Optional[torch.Tensor] = None + gen_asr_text: Optional[torch.Tensor] = None + + # Audio buffer state + audio_buffer: Optional[torch.Tensor] = None + buffer_fill_level: int = 0 + + # Turn tracking + turn_count: int = 0 + + # Per-turn data for session audio generation + turns: List[TurnData] = field(default_factory=list) + + # Timestamps + created_at: float = field(default_factory=time.time) + last_accessed: float = field(default_factory=time.time) + + def touch(self): + """Update last_accessed timestamp.""" + self.last_accessed = time.time() + + +class SessionManager: + """ + Manages session state for S2S multi-turn conversations. + + Thread-safe implementation with TTL-based cleanup. + """ + + def __init__(self, ttl_seconds: float = 300.0, max_sessions: int = 100): + """ + Initialize SessionManager. 
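+
+        Typical flow (a sketch of how the unified server drives the manager;
+        session_id may be None for a fresh conversation):
+
+            manager = SessionManager(ttl_seconds=300.0, max_sessions=100)
+            state = manager.get_or_create_session(session_id)
+            # ... run generation against `state` ...
+            manager.save_session(state.session_id, state)
+            manager.delete_session(state.session_id)  # once the conversation ends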
+ + Args: + ttl_seconds: Time-to-live for sessions in seconds (default: 5 minutes) + max_sessions: Maximum number of concurrent sessions + """ + self.ttl_seconds = ttl_seconds + self.max_sessions = max_sessions + self.sessions: Dict[str, SessionState] = {} + self._lock = threading.RLock() + + def create_session(self, session_id: Optional[str] = None) -> SessionState: + """ + Create a new session. + + Args: + session_id: Optional session ID. If None, generates a UUID. + + Returns: + New SessionState object + """ + with self._lock: + if session_id is None: + session_id = str(uuid.uuid4()) + + # Clean up expired sessions first + self._cleanup_expired_locked() + + # Evict oldest if at capacity + if len(self.sessions) >= self.max_sessions: + self._evict_oldest_locked() + + state = SessionState(session_id=session_id) + self.sessions[session_id] = state + print(f"[SessionManager] Created session: {session_id}") + return state + + def get_session(self, session_id: str) -> Optional[SessionState]: + """ + Get existing session by ID. + + Args: + session_id: Session ID to look up + + Returns: + SessionState if found and not expired, None otherwise + """ + with self._lock: + state = self.sessions.get(session_id) + if state is None: + return None + + # Check if expired + if time.time() - state.last_accessed > self.ttl_seconds: + print(f"[SessionManager] Session expired: {session_id}") + del self.sessions[session_id] + return None + + state.touch() + return state + + def get_or_create_session(self, session_id: Optional[str] = None) -> SessionState: + """ + Get existing session or create new one. + + Args: + session_id: Session ID. If None, creates new session. + + Returns: + SessionState (existing or new) + """ + if session_id: + state = self.get_session(session_id) + if state is not None: + return state + + return self.create_session(session_id) + + def save_session(self, session_id: str, state: SessionState): + """ + Save/update session state. + + Args: + session_id: Session ID + state: SessionState to save + """ + with self._lock: + state.touch() + self.sessions[session_id] = state + + def delete_session(self, session_id: str) -> bool: + """ + Delete a session. + + Args: + session_id: Session ID to delete + + Returns: + True if session was deleted, False if not found + """ + with self._lock: + if session_id in self.sessions: + del self.sessions[session_id] + print(f"[SessionManager] Deleted session: {session_id}") + return True + return False + + def get_session_info(self, session_id: str) -> Optional[Dict[str, Any]]: + """ + Get session info without full state. 
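+
+        Returns only JSON-serializable metadata (no tensors such as the LLM
+        cache), so it is safe to expose via the /v1/sessions/{session_id}
+        endpoint.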
+ + Args: + session_id: Session ID + + Returns: + Dict with session metadata or None + """ + with self._lock: + state = self.sessions.get(session_id) + if state is None: + return None + + return { + "session_id": state.session_id, + "frame_idx": state.frame_idx, + "turn_count": state.turn_count, + "created_at": state.created_at, + "last_accessed": state.last_accessed, + "has_llm_cache": state.llm_cache is not None, + "has_input_embeds_history": state.input_embeds_history is not None + and len(state.input_embeds_history) > 0, + } + + def list_sessions(self) -> list: + """List all active session IDs.""" + with self._lock: + return list(self.sessions.keys()) + + def cleanup_expired(self): + """Clean up expired sessions (called periodically).""" + with self._lock: + self._cleanup_expired_locked() + + def _cleanup_expired_locked(self): + """Clean up expired sessions (must hold lock).""" + now = time.time() + expired = [sid for sid, state in self.sessions.items() if now - state.last_accessed > self.ttl_seconds] + for sid in expired: + print(f"[SessionManager] Cleaning up expired session: {sid}") + del self.sessions[sid] + + def _evict_oldest_locked(self): + """Evict oldest session to make room (must hold lock).""" + if not self.sessions: + return + + oldest_id = min(self.sessions.keys(), key=lambda sid: self.sessions[sid].last_accessed) + print(f"[SessionManager] Evicting oldest session: {oldest_id}") + del self.sessions[oldest_id] + + def __len__(self) -> int: + """Return number of active sessions.""" + with self._lock: + return len(self.sessions) diff --git a/recipes/multimodal/server/unified_server.py b/recipes/multimodal/server/unified_server.py new file mode 100644 index 0000000000..2cf989656c --- /dev/null +++ b/recipes/multimodal/server/unified_server.py @@ -0,0 +1,745 @@ +#!/usr/bin/env python3 +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Unified NeMo Inference Server with OpenAI-compatible API. + +Supports multiple NeMo model backends: +- SALM: Speech-Augmented Language Model +- TTS: Text-to-Speech (MagpieTTS) +- S2S: Speech-to-Speech (Duplex) + +Exposes only /v1/chat/completions endpoint for OpenAI compatibility. 
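+(In addition, /health, /v1/models and, for the s2s_session backend, the
+/v1/sessions endpoints are served for monitoring and session management.)
+
+Minimal client sketch (the host/port and audio file name are assumptions;
+the payload shape matches what chat_completions parses):
+
+    import base64, requests
+
+    audio_b64 = base64.b64encode(open("question.wav", "rb").read()).decode()
+    resp = requests.post("http://localhost:8000/v1/chat/completions", json={
+        "messages": [{"role": "user", "content": [
+            {"type": "text", "text": "Answer the question in the audio."},
+            {"type": "audio_url", "audio_url": {"url": f"data:audio/wav;base64,{audio_b64}"}},
+        ]}],
+        "max_tokens": 256,
+    })
+    print(resp.json()["choices"][0]["message"]["content"])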
+ +Usage: + python unified_server.py --backend s2s --model /path/to/model +""" + +import asyncio +import base64 +import hashlib +import json +import os +import re +import time +from dataclasses import dataclass +from datetime import datetime +from typing import Any, Dict, List, Optional + +import uvicorn +from fastapi import FastAPI, HTTPException +from fastapi.responses import JSONResponse + +from .backends import BackendConfig, GenerationRequest, GenerationResult, get_backend +from .session_manager import SessionManager + +# Configuration from environment +HOST = os.getenv("UNIFIED_SERVER_HOST", "0.0.0.0") +PORT = int(os.getenv("UNIFIED_SERVER_PORT", "8000")) +BACKEND_TYPE = os.getenv("UNIFIED_SERVER_BACKEND", "salm") +MODEL_PATH = os.getenv("UNIFIED_SERVER_MODEL_PATH", "") +CODEC_MODEL_PATH = os.getenv("UNIFIED_SERVER_CODEC_MODEL_PATH", "") + +# Batching configuration +BATCH_SIZE = int(os.getenv("UNIFIED_SERVER_BATCH_SIZE", "8")) +BATCH_TIMEOUT = float(os.getenv("UNIFIED_SERVER_BATCH_TIMEOUT", "0.1")) + +# Generation defaults +MAX_NEW_TOKENS = int(os.getenv("UNIFIED_SERVER_MAX_NEW_TOKENS", "512")) +TEMPERATURE = float(os.getenv("UNIFIED_SERVER_TEMPERATURE", "1.0")) +TOP_P = float(os.getenv("UNIFIED_SERVER_TOP_P", "1.0")) + +# Debug +DEBUG = os.getenv("DEBUG", "").lower() in ("true", "1", "yes", "on") + + +@dataclass +class PendingRequest: + """Container for a pending batched request.""" + + request: GenerationRequest + future: asyncio.Future + timestamp: float + + +class RequestBatcher: + """Manages request batching with configurable delay.""" + + def __init__(self, backend, batch_size: int, batch_timeout: float): + self.backend = backend + self.batch_size = batch_size + self.batch_timeout = batch_timeout + self.pending_requests: List[PendingRequest] = [] + self.lock = asyncio.Lock() + self.timeout_task: Optional[asyncio.Task] = None + self.processing = False + + # Stats + self.total_requests = 0 + self.total_batches = 0 + + async def add_request(self, request: GenerationRequest) -> GenerationResult: + """Add a request and wait for result.""" + future = asyncio.Future() + pending = PendingRequest(request=request, future=future, timestamp=time.time()) + + async with self.lock: + self.pending_requests.append(pending) + + # Check if we should process immediately + if len(self.pending_requests) >= self.batch_size: + if DEBUG: + print(f"[Batcher] Batch full ({self.batch_size}), processing immediately") + asyncio.create_task(self._process_batch()) + elif self.batch_timeout == 0: + # No delay mode + asyncio.create_task(self._process_batch()) + elif self.timeout_task is None or self.timeout_task.done(): + # Schedule timeout + self.timeout_task = asyncio.create_task(self._timeout_handler()) + + return await future + + async def _timeout_handler(self): + """Handle batch timeout.""" + await asyncio.sleep(self.batch_timeout) + async with self.lock: + if self.pending_requests and not self.processing: + if DEBUG: + print(f"[Batcher] Timeout, processing {len(self.pending_requests)} requests") + asyncio.create_task(self._process_batch()) + + async def _process_batch(self): + """Process pending requests as a batch.""" + async with self.lock: + if not self.pending_requests or self.processing: + return + + self.processing = True + batch = self.pending_requests[: self.batch_size] + self.pending_requests = self.pending_requests[self.batch_size :] + + try: + # Extract requests + requests = [p.request for p in batch] + + if DEBUG: + print(f"[Batcher] Processing batch of {len(requests)} requests") + + # Run 
inference in thread pool to not block event loop + loop = asyncio.get_event_loop() + results = await loop.run_in_executor(None, self.backend.generate, requests) + + # Complete futures + for pending, result in zip(batch, results): + if not pending.future.done(): + pending.future.set_result(result) + + # Update stats + self.total_requests += len(batch) + self.total_batches += 1 + + except Exception as e: + # Set exception for all pending requests + for pending in batch: + if not pending.future.done(): + pending.future.set_exception(e) + finally: + async with self.lock: + self.processing = False + # Process more if pending + if self.pending_requests: + if self.batch_timeout == 0 or len(self.pending_requests) >= self.batch_size: + asyncio.create_task(self._process_batch()) + elif self.timeout_task is None or self.timeout_task.done(): + self.timeout_task = asyncio.create_task(self._timeout_handler()) + + +# Global state +backend_instance = None +request_batcher = None +session_manager = None +server_config = {} + + +def extract_audio_from_messages(messages: List[Dict[str, Any]]) -> List[bytes]: + """Extract all audio bytes from OpenAI-format messages. + + Looks for audio_url in message content with format: + {"type": "audio_url", "audio_url": {"url": "data:audio/wav;base64,..."}} + + Returns a list of audio bytes (one per audio_url found), preserving message order. + """ + audio_list = [] + for message in messages: + content = message.get("content") + if isinstance(content, list): + for item in content: + if isinstance(item, dict) and item.get("type") == "audio_url": + audio_url = item.get("audio_url", {}) + url = audio_url.get("url", "") + # Parse data URL: data:audio/wav;base64, + match = re.match(r"data:audio/\w+;base64,(.+)", url) + if match: + audio_list.append(base64.b64decode(match.group(1))) + return audio_list + + +def extract_text_from_messages(messages: List[Dict[str, Any]]) -> str: + """Extract text content from OpenAI-format messages.""" + texts = [] + for message in messages: + content = message.get("content") + if isinstance(content, str): + if content: + texts.append(content) + elif isinstance(content, list): + for item in content: + if isinstance(item, dict) and item.get("type") == "text": + text = item.get("text", "") + if text: + texts.append(text) + elif isinstance(item, str): + texts.append(item) + return " ".join(texts) + + +def extract_system_prompt(messages: List[Dict[str, Any]]) -> Optional[str]: + """Extract system prompt from messages.""" + for message in messages: + if message.get("role") == "system": + content = message.get("content") + if isinstance(content, str): + return content + elif isinstance(content, list): + texts = [ + item.get("text", "") for item in content if isinstance(item, dict) and item.get("type") == "text" + ] + return " ".join(texts) if texts else None + return None + + +def create_app( + backend_type: str = BACKEND_TYPE, + model_path: str = MODEL_PATH, + codec_model_path: str = CODEC_MODEL_PATH, + batch_size: int = BATCH_SIZE, + batch_timeout: float = BATCH_TIMEOUT, + device: str = "cuda", + dtype: str = "bfloat16", + extra_config: Dict[str, Any] = None, +) -> FastAPI: + """Create and configure the FastAPI app.""" + global backend_instance, request_batcher, session_manager, server_config + + # Extract server-level config from extra_config + ignore_system_prompt = extra_config.pop("ignore_system_prompt", False) if extra_config else False + session_ttl = extra_config.pop("session_ttl", 300.0) if extra_config else 300.0 + max_sessions = 
extra_config.pop("max_sessions", 100) if extra_config else 100 + + app = FastAPI( + title="Unified NeMo Inference Server", + description=f"OpenAI-compatible API for NeMo model inference ({backend_type} backend)", + version="1.0.0", + ) + + # Store config + server_config = { + "backend_type": backend_type, + "model_path": model_path, + "codec_model_path": codec_model_path, + "batch_size": batch_size, + "batch_timeout": batch_timeout, + "device": device, + "dtype": dtype, + "ignore_system_prompt": ignore_system_prompt, + "session_ttl": session_ttl, + "max_sessions": max_sessions, + } + + @app.on_event("startup") + async def startup(): + global backend_instance, request_batcher, session_manager + + # Build backend config + config_dict = { + "model_path": model_path, + "device": device, + "dtype": dtype, + "max_new_tokens": MAX_NEW_TOKENS, + "temperature": TEMPERATURE, + "top_p": TOP_P, + } + + # Add backend-specific config + if codec_model_path: + config_dict["codec_model_path"] = codec_model_path + + if extra_config: + config_dict.update(extra_config) + + config = BackendConfig.from_dict(config_dict) + + # Get and instantiate backend + print(f"[Server] Initializing {backend_type} backend...") + BackendClass = get_backend(backend_type) + backend_instance = BackendClass(config) + + # Load model + backend_instance.load_model() + + # Create batcher + request_batcher = RequestBatcher(backend_instance, batch_size, batch_timeout) + + # Initialize session manager for session-aware backends + if backend_type == "s2s_session": + session_manager = SessionManager(ttl_seconds=session_ttl, max_sessions=max_sessions) + print(f"[Server] Session manager initialized (TTL: {session_ttl}s, max: {max_sessions})") + + print("[Server] Ready!") + print(f" Backend: {backend_type}") + print(f" Model: {model_path}") + print(f" Batch size: {batch_size}") + print(f" Batch timeout: {batch_timeout}s") + if ignore_system_prompt: + print(" System prompts: IGNORED") + + @app.get("/") + async def root(): + """Root endpoint with server info.""" + endpoints = ["/v1/chat/completions", "/health"] + if backend_type == "s2s_session": + endpoints.extend(["/v1/sessions", "/v1/sessions/{session_id}"]) + return { + "service": "Unified NeMo Inference Server", + "version": "1.0.0", + "backend": server_config.get("backend_type"), + "model": server_config.get("model_path"), + "endpoints": endpoints, + } + + # Session management endpoints (only for s2s_session backend) + @app.get("/v1/sessions") + async def list_sessions(): + """List all active sessions.""" + if session_manager is None: + raise HTTPException(status_code=404, detail="Session management not enabled for this backend") + return { + "sessions": session_manager.list_sessions(), + "count": len(session_manager), + "ttl_seconds": session_manager.ttl_seconds, + } + + @app.get("/v1/sessions/{session_id}") + async def get_session(session_id: str): + """Get session info.""" + if session_manager is None: + raise HTTPException(status_code=404, detail="Session management not enabled for this backend") + info = session_manager.get_session_info(session_id) + if info is None: + raise HTTPException(status_code=404, detail=f"Session not found: {session_id}") + return info + + @app.delete("/v1/sessions/{session_id}") + async def delete_session(session_id: str): + """Delete a session and generate final session audio.""" + if session_manager is None: + raise HTTPException(status_code=404, detail="Session management not enabled for this backend") + + # Get session state before deleting + 
session_state = session_manager.get_session(session_id) + if session_state is None: + raise HTTPException(status_code=404, detail=f"Session not found: {session_id}") + + # Call on_session_close to generate session audio + close_result = {} + if backend_instance is not None and hasattr(backend_instance, "on_session_close"): + try: + close_result = backend_instance.on_session_close(session_state) + except Exception as e: + print(f"[Server] Error in on_session_close: {e}") + import traceback + + traceback.print_exc() + + # Now delete the session + session_manager.delete_session(session_id) + + return {"success": True, "session_id": session_id, **close_result} + + @app.get("/health") + async def health(): + """Health check endpoint.""" + if backend_instance is None: + return JSONResponse(status_code=503, content={"status": "not_ready", "error": "Backend not initialized"}) + + health_info = backend_instance.health_check() + health_info["status"] = "healthy" if backend_instance.is_loaded else "not_ready" + health_info["timestamp"] = datetime.now().isoformat() + + return health_info + + @app.get("/v1/models") + async def list_models(): + """OpenAI-compatible models endpoint.""" + model_id = server_config.get("model_path", "unknown") if server_config else "unknown" + return { + "object": "list", + "data": [ + { + "id": model_id, + "object": "model", + "created": int(time.time()), + "owned_by": "nvidia", + } + ], + } + + @app.post("/v1/chat/completions") + async def chat_completions(request: Dict[str, Any]): + """OpenAI-compatible chat completions endpoint with audio support. + + Accepts messages in OpenAI format with audio_url for audio content: + { + "messages": [ + {"role": "system", "content": "You are a helpful assistant."}, + {"role": "user", "content": [ + {"type": "text", "text": "..."}, + {"type": "audio_url", "audio_url": {"url": "data:audio/wav;base64,..."}} + ]} + ], + "max_tokens": 512, + "temperature": 1.0, + "extra_body": {"session_id": "optional-session-id"} + } + """ + if backend_instance is None or not backend_instance.is_loaded: + raise HTTPException(status_code=503, detail="Model not loaded") + + try: + messages = request.get("messages", []) + if not messages: + raise HTTPException(status_code=400, detail="No messages provided") + + # Extract session_id from extra_body (for s2s_session backend) + extra_body = request.get("extra_body", {}) + session_id = extra_body.get("session_id") if isinstance(extra_body, dict) else None + + # Extract components from messages + audio_bytes_list = extract_audio_from_messages(messages) + text = extract_text_from_messages(messages) + system_prompt = extract_system_prompt(messages) + + # Honor ignore_system_prompt setting + if server_config.get("ignore_system_prompt", False): + system_prompt = None + + # Get generation parameters + max_tokens = request.get("max_tokens", MAX_NEW_TOKENS) + temperature = request.get("temperature", TEMPERATURE) + top_p = request.get("top_p", TOP_P) + seed = request.get("seed") + + # Create generation request + # Use audio_bytes_list for multi-turn, or single audio_bytes for backwards compat + gen_request = GenerationRequest( + text=text if text else None, + system_prompt=system_prompt, + audio_bytes=audio_bytes_list[0] if len(audio_bytes_list) == 1 else None, + audio_bytes_list=audio_bytes_list if len(audio_bytes_list) > 1 else None, + max_new_tokens=max_tokens, + temperature=temperature, + top_p=top_p, + seed=seed, + request_id=hashlib.md5(f"{time.time()}".encode()).hexdigest()[:8], + ) + + # Validate request + error 
= backend_instance.validate_request(gen_request) + if error: + raise HTTPException(status_code=400, detail=error) + + # Handle s2s_session backend with session support + if backend_type == "s2s_session" and session_manager is not None: + # Get or create session + session_state = session_manager.get_or_create_session(session_id) + session_id = session_state.session_id + + # Run inference with session in thread pool + loop = asyncio.get_event_loop() + result, updated_session = await loop.run_in_executor( + None, + backend_instance.generate_with_session, + gen_request, + session_state, + ) + + # Save updated session state + if updated_session is not None: + session_manager.save_session(session_id, updated_session) + else: + # Process through batcher (non-session path) + result = await request_batcher.add_request(gen_request) + session_id = None + + if not result.is_success(): + raise HTTPException(status_code=500, detail=result.error) + + # Build OpenAI-compatible response + response_id = f"chatcmpl-{hashlib.md5(str(time.time()).encode()).hexdigest()[:8]}" + + # Build message content + message_content = result.text or "" + + # Save outputs to files before sending response (in case client times out) + import json as json_lib + import os + from datetime import datetime + + save_dir = os.environ.get( + "AUDIO_SAVE_DIR", "/lustre/fsw/portfolios/llmservice/users/vmendelev/tmp/voicebench_test" + ) + os.makedirs(save_dir, exist_ok=True) + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + base_filename = f"response_{timestamp}_{response_id}" + + saved_audio_path = None + saved_json_path = None + + # Save JSON with text and debug info + try: + saved_json_path = os.path.join(save_dir, f"{base_filename}.json") + json_output = { + "response_id": response_id, + "timestamp": timestamp, + "text": message_content, + "debug_info": result.debug_info, + "generation_time_ms": result.generation_time_ms, + "num_tokens_generated": result.num_tokens_generated, + } + with open(saved_json_path, "w") as f: + json_lib.dump(json_output, f, indent=2) + print(f"[Server] JSON saved to: {saved_json_path}") + except Exception as e: + print(f"[Server] Warning: Failed to save JSON: {e}") + + # Include audio output if available (base64 encoded) + audio_output = None + if result.audio_bytes: + # Save audio file + try: + saved_audio_path = os.path.join(save_dir, f"{base_filename}.wav") + with open(saved_audio_path, "wb") as f: + f.write(result.audio_bytes) + print(f"[Server] Audio saved to: {saved_audio_path} ({len(result.audio_bytes)} bytes)") + except Exception as e: + print(f"[Server] Warning: Failed to save audio: {e}") + + audio_output = { + "data": base64.b64encode(result.audio_bytes).decode("utf-8"), + "format": result.audio_format or "wav", + "sample_rate": result.audio_sample_rate, + "expires_at": int(time.time()) + 3600, # 1 hour expiry + "transcript": result.text or "", # Text transcript of the audio + } + + # Embed debug_info in content as JSON (OpenAI-compatible) + final_content = message_content + if result.debug_info: + final_content = f"{message_content}\n{json.dumps(result.debug_info)}" + + response = { + "id": response_id, + "object": "chat.completion", + "created": int(time.time()), + "model": server_config.get("model_path"), + "choices": [ + { + "index": 0, + "message": { + "role": "assistant", + "content": final_content, + }, + "finish_reason": "stop", + } + ], + "usage": { + "prompt_tokens": -1, + "completion_tokens": result.num_tokens_generated or -1, + "total_tokens": -1, + }, + } + + # Add audio to 
response if available + if audio_output: + response["choices"][0]["message"]["audio"] = audio_output + + # Add debug info at top level too (for non-litellm clients) + if result.debug_info: + response["debug_info"] = result.debug_info + + # Add saved file paths if available + if saved_audio_path: + response["saved_audio_path"] = saved_audio_path + if saved_json_path: + response["saved_json_path"] = saved_json_path + + # Add session_id for session-aware backends + if session_id: + response["session_id"] = session_id + + return response + + except HTTPException: + raise + except Exception as e: + import traceback + + traceback.print_exc() + raise HTTPException(status_code=500, detail=str(e)) + + return app + + +def main(): + """Run the server from command line.""" + import argparse + + parser = argparse.ArgumentParser(description="Unified NeMo Inference Server") + parser.add_argument( + "--backend", + default=BACKEND_TYPE, + choices=["salm", "tts", "s2s", "s2s_incremental", "s2s_session"], + help="Backend type to use", + ) + parser.add_argument("--model", default=MODEL_PATH, help="Path to model") + parser.add_argument("--codec_model", default=CODEC_MODEL_PATH, help="Path to codec model (for TTS/S2S)") + parser.add_argument("--host", default=HOST, help="Server host") + parser.add_argument("--port", type=int, default=PORT, help="Server port") + parser.add_argument("--batch_size", type=int, default=BATCH_SIZE, help="Batch size") + parser.add_argument( + "--batch_timeout", type=float, default=BATCH_TIMEOUT, help="Batch timeout in seconds (0 for no delay)" + ) + parser.add_argument("--device", default="cuda", help="Device to use") + parser.add_argument("--dtype", default="bfloat16", help="Model dtype") + parser.add_argument("--debug", action="store_true", help="Enable debug mode") + + # Backend-specific arguments + parser.add_argument("--prompt_format", default=None, help="Prompt format (SALM)") + parser.add_argument("--phoneme_input_type", default="predicted", help="Phoneme input type (TTS)") + parser.add_argument("--decoder_only_model", action="store_true", help="Use decoder-only model (TTS)") + parser.add_argument( + "--ignore_system_prompt", + action="store_true", + help="Ignore system prompts from requests (for models that don't support them)", + ) + parser.add_argument( + "--silence_padding_sec", + type=float, + default=5.0, + help="Seconds of silence to append after audio (S2S backend)", + ) + + # S2S Incremental backend arguments + parser.add_argument( + "--config_path", + type=str, + default=None, + help="Path to YAML config file (s2s_incremental backend)", + ) + parser.add_argument( + "--llm_checkpoint_path", + type=str, + default=None, + help="Path to LLM checkpoint (s2s_incremental backend)", + ) + parser.add_argument( + "--tts_checkpoint_path", + type=str, + default=None, + help="Path to TTS checkpoint (s2s_incremental backend)", + ) + parser.add_argument( + "--speaker_reference", + type=str, + default=None, + help="Path to speaker reference audio for TTS (s2s_incremental backend)", + ) + parser.add_argument( + "--num_frames_per_inference", + type=int, + default=1, + help="Frames per inference step (s2s_incremental backend)", + ) + parser.add_argument( + "--decode_audio", + action="store_true", + default=True, + help="Enable audio output via TTS (s2s_incremental backend)", + ) + parser.add_argument( + "--no_decode_audio", + action="store_true", + help="Disable audio output (s2s_incremental backend)", + ) + + args = parser.parse_args() + + if args.debug: + global DEBUG + DEBUG = True + 
+ # Build extra config from backend-specific args + extra_config = {} + if args.prompt_format: + extra_config["prompt_format"] = args.prompt_format + if args.phoneme_input_type: + extra_config["phoneme_input_type"] = args.phoneme_input_type + if args.decoder_only_model: + extra_config["decoder_only_model"] = True + if args.silence_padding_sec != 5.0: # Only add if different from default + extra_config["silence_padding_sec"] = args.silence_padding_sec + extra_config["ignore_system_prompt"] = args.ignore_system_prompt + + # S2S Incremental backend config + if args.config_path: + extra_config["config_path"] = args.config_path + if args.llm_checkpoint_path: + extra_config["llm_checkpoint_path"] = args.llm_checkpoint_path + if args.tts_checkpoint_path: + extra_config["tts_checkpoint_path"] = args.tts_checkpoint_path + if args.speaker_reference: + extra_config["speaker_reference"] = args.speaker_reference + if args.num_frames_per_inference != 1: + extra_config["num_frames_per_inference"] = args.num_frames_per_inference + if args.no_decode_audio: + extra_config["decode_audio"] = False + + app = create_app( + backend_type=args.backend, + model_path=args.model, + codec_model_path=args.codec_model, + batch_size=args.batch_size, + batch_timeout=args.batch_timeout, + device=args.device, + dtype=args.dtype, + extra_config=extra_config if extra_config else None, + ) + + uvicorn.run(app, host=args.host, port=args.port, log_level="info") + + +if __name__ == "__main__": + main() From 66667b0a7e8de7b68a74ece8afe9f655e7f1e820 Mon Sep 17 00:00:00 2001 From: Valentin Mendelev Date: Fri, 26 Dec 2025 13:42:29 -0800 Subject: [PATCH 05/26] Magpie TTS backend --- nemo_skills/inference/server/serve_unified.py | 23 +- .../multimodal/server/backends/__init__.py | 4 +- .../server/backends/magpie_tts_backend.py | 253 ++++++++++++++++++ recipes/multimodal/server/unified_server.py | 71 ++--- 4 files changed, 308 insertions(+), 43 deletions(-) create mode 100644 recipes/multimodal/server/backends/magpie_tts_backend.py diff --git a/nemo_skills/inference/server/serve_unified.py b/nemo_skills/inference/server/serve_unified.py index 748b7cf044..1b02e652e6 100644 --- a/nemo_skills/inference/server/serve_unified.py +++ b/nemo_skills/inference/server/serve_unified.py @@ -30,13 +30,13 @@ --server_entrypoint "-m nemo_skills.inference.server.serve_unified" \\ --server_args "--backend salm" - # TTS backend (text-to-speech) + # MagpieTTS backend (text-to-speech with RTF metrics) ns eval \\ --server_type vllm \\ --server_gpus 1 \\ --model /path/to/tts_model \\ --server_entrypoint "-m nemo_skills.inference.server.serve_unified" \\ - --server_args "--backend tts --codec_model /path/to/codec" + --server_args "--backend magpie_tts --codec_model /path/to/codec" # S2S backend (speech-to-speech) ns eval \\ @@ -138,8 +138,8 @@ def main(): parser.add_argument( "--backend", default="salm", - choices=["salm", "tts", "s2s", "s2s_incremental", "s2s_session"], - help="Backend type: salm (speech-augmented LM), tts (text-to-speech), s2s (speech-to-speech offline), s2s_incremental (frame-by-frame processing), s2s_session (session-aware multi-turn)", + choices=["salm", "magpie_tts", "s2s", "s2s_incremental", "s2s_session"], + help="Backend type: salm (speech-augmented LM), magpie_tts (MagpieTTS with RTF metrics), s2s (speech-to-speech offline), s2s_incremental (frame-by-frame processing), s2s_session (session-aware multi-turn)", ) # Backend-specific model paths @@ -170,7 +170,9 @@ def main(): "--decoder_only_model", action="store_true", help="Use 
decoder-only model architecture (TTS backend)" ) parser.add_argument("--use_local_transformer", action="store_true", help="Use local transformer (TTS backend)") - parser.add_argument("--top_k", type=int, default=None, help="Top-k sampling (TTS backend)") + parser.add_argument("--top_k", type=int, default=80, help="Top-k sampling (TTS backend)") + parser.add_argument("--use_cfg", action="store_true", help="Enable classifier-free guidance (TTS backend)") + parser.add_argument("--cfg_scale", type=float, default=2.5, help="CFG scale factor (TTS backend)") # Environment setup parser.add_argument("--code_path", default=None, help="Path to NeMo source code to add to PYTHONPATH") @@ -288,12 +290,13 @@ def main(): if args.prompt_format: extra_config["prompt_format"] = args.prompt_format - if args.backend == "tts": + if args.backend == "magpie_tts": extra_config["decoder_only_model"] = args.decoder_only_model extra_config["phoneme_input_type"] = args.phoneme_input_type extra_config["use_local_transformer"] = args.use_local_transformer - if args.top_k: - extra_config["top_k"] = args.top_k + extra_config["top_k"] = args.top_k + extra_config["use_cfg"] = args.use_cfg + extra_config["cfg_scale"] = args.cfg_scale # S2S backend options if args.backend in ("s2s", "s2s_incremental", "s2s_session"): @@ -340,6 +343,10 @@ def main(): print(f" Batch Timeout: {args.batch_timeout}s") print(f" Device: {args.device}") print(f" Dtype: {args.dtype}") + if args.backend == "magpie_tts": + print(f" Top-k: {args.top_k}") + print(f" CFG: {args.use_cfg} (scale: {args.cfg_scale})") + print(f" Local Transformer: {args.use_local_transformer}") if args.backend in ("s2s_incremental", "s2s_session"): if args.config_path: print(f" Config Path: {args.config_path}") diff --git a/recipes/multimodal/server/backends/__init__.py b/recipes/multimodal/server/backends/__init__.py index 861b330f00..fe3c4c1abd 100644 --- a/recipes/multimodal/server/backends/__init__.py +++ b/recipes/multimodal/server/backends/__init__.py @@ -17,7 +17,7 @@ Available backends: - salm: Speech-Augmented Language Model (text output from text/audio input) -- tts: Text-to-Speech using MagpieTTS (audio output from text input) +- magpie_tts: MagpieTTS using MagpieInferenceRunner with RTF metrics (audio output from text input) - s2s: Speech-to-Speech using DuplexS2S offline (text output from audio input) - s2s_incremental: Speech-to-Speech using NemotronVoiceChat incremental (text+audio from audio) - s2s_session: Speech-to-Speech with session support for multi-turn conversations @@ -38,7 +38,7 @@ # Registry of available backends BACKEND_REGISTRY = { "salm": ("salm_backend", "SALMBackend"), - "tts": ("tts_backend", "TTSBackend"), + "magpie_tts": ("magpie_tts_backend", "MagpieTTSBackend"), "s2s": ("s2s_backend", "S2SBackend"), "s2s_incremental": ("s2s_incremental_backend", "S2SIncrementalBackend"), "s2s_session": ("s2s_session_backend", "S2SSessionBackend"), diff --git a/recipes/multimodal/server/backends/magpie_tts_backend.py b/recipes/multimodal/server/backends/magpie_tts_backend.py new file mode 100644 index 0000000000..61b4fd32d6 --- /dev/null +++ b/recipes/multimodal/server/backends/magpie_tts_backend.py @@ -0,0 +1,253 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 + +"""MagpieTTS backend using MagpieInferenceRunner with RTF metrics.""" + +import io +import json +import os +import shutil +import tempfile +import time +from dataclasses import dataclass +from typing import Any, Dict, List, Optional, Set + +import soundfile as sf + +from .base import BackendConfig, GenerationRequest, GenerationResult, InferenceBackend, Modality + + +@dataclass +class MagpieTTSConfig(BackendConfig): + codec_model_path: Optional[str] = None + top_k: int = 80 + temperature: float = 0.6 + use_cfg: bool = True + cfg_scale: float = 2.5 + max_decoder_steps: int = 440 + use_local_transformer: bool = False + output_sample_rate: int = 22050 + + @classmethod + def from_dict(cls, d: Dict[str, Any]) -> "MagpieTTSConfig": + known = { + "model_path", + "device", + "dtype", + "max_new_tokens", + "temperature", + "top_p", + "top_k", + "codec_model_path", + "use_cfg", + "cfg_scale", + "max_decoder_steps", + "use_local_transformer", + "output_sample_rate", + } + return cls( + **{k: v for k, v in d.items() if k in known}, extra_config={k: v for k, v in d.items() if k not in known} + ) + + +class MagpieTTSBackend(InferenceBackend): + """MagpieTTS backend. Input: JSON with 'text' and 'context_audio_filepath'.""" + + @property + def name(self) -> str: + return "magpie_tts" + + @property + def supported_modalities(self) -> Set[Modality]: + return {Modality.TEXT, Modality.AUDIO_OUT} + + def __init__(self, config: BackendConfig): + self.tts_config = ( + config + if isinstance(config, MagpieTTSConfig) + else MagpieTTSConfig.from_dict( + { + **{ + k: getattr(config, k) + for k in ["model_path", "device", "dtype", "max_new_tokens", "temperature", "top_p", "top_k"] + if hasattr(config, k) + }, + **config.extra_config, + } + ) + ) + super().__init__(self.tts_config) + self._model = self._runner = self._temp_dir = self._checkpoint_name = None + + def load_model(self) -> None: + from nemo.collections.tts.modules.magpietts_inference.inference import InferenceConfig, MagpieInferenceRunner + from nemo.collections.tts.modules.magpietts_inference.utils import ModelLoadConfig, load_magpie_model + + if not self.tts_config.codec_model_path: + raise ValueError("codec_model_path required") + + model_path = self.config.model_path + cfg = ModelLoadConfig(nemo_file=model_path, codecmodel_path=self.tts_config.codec_model_path) + self._model, self._checkpoint_name = load_magpie_model(cfg, device=self.config.device) + + self._runner = MagpieInferenceRunner( + self._model, + InferenceConfig( + temperature=self.tts_config.temperature, + topk=self.tts_config.top_k, + max_decoder_steps=self.tts_config.max_decoder_steps, + use_cfg=self.tts_config.use_cfg, + cfg_scale=self.tts_config.cfg_scale, + use_local_transformer=self.tts_config.use_local_transformer, + batch_size=32, + ), + ) + + self._temp_dir = tempfile.mkdtemp(prefix="magpie_tts_") + self.tts_config.output_sample_rate = self._model.sample_rate + self._is_loaded = True + print( + f"[MagpieTTSBackend] Loaded: {self._checkpoint_name}, sr={self._model.sample_rate}, cfg={self.tts_config.use_cfg}" + ) + + def _extract_json(self, text: str) -> dict: + """Extract JSON object from text, skipping non-JSON parts.""" + if not text: + return {"text": ""} + # Find first { and try to parse from there + idx = text.find("{") + if idx >= 0: + try: + return json.loads(text[idx:]) + except json.JSONDecodeError: + pass + return {"text": text} + + def generate(self, requests: List[GenerationRequest]) 
-> List[GenerationResult]: + if not self._is_loaded: + return [GenerationResult(error="Model not loaded", request_id=r.request_id) for r in requests] + if not requests: + return [] + + start_time = time.time() + batch_dir = os.path.join(self._temp_dir, f"batch_{int(time.time() * 1000)}") + output_dir = os.path.join(batch_dir, "output") + os.makedirs(output_dir, exist_ok=True) + + try: + # Parse requests, extracting JSON from text (skips non-JSON prefixes) + parsed = [self._extract_json(r.text) for r in requests] + + # Create audio_dir with symlinks to all context audio files (they may be in different dirs) + audio_dir = os.path.join(batch_dir, "audio") + os.makedirs(audio_dir, exist_ok=True) + + manifest_path = os.path.join(batch_dir, "manifest.json") + with open(manifest_path, "w") as f: + for i, p in enumerate(parsed): + ctx = p.get("context_audio_filepath", "") + if ctx and os.path.exists(ctx): + # Create unique symlink name to avoid collisions + link_name = f"ctx_{i}_{os.path.basename(ctx)}" + link_path = os.path.join(audio_dir, link_name) + if not os.path.exists(link_path): + os.symlink(ctx, link_path) + else: + link_name = f"d{i}.wav" + f.write( + json.dumps( + { + "text": p.get("text", ""), + "audio_filepath": link_name, + "context_audio_filepath": link_name, + "duration": p.get("duration", 5.0), + "context_audio_duration": p.get("context_audio_duration", 5.0), + } + ) + + "\n" + ) + + config_path = os.path.join(batch_dir, "config.json") + with open(config_path, "w") as f: + json.dump({"batch": {"manifest_path": manifest_path, "audio_dir": audio_dir}}, f) + + # Run inference + from nemo.collections.tts.modules.magpietts_inference.evaluate_generated_audio import load_evalset_config + + dataset = self._runner.create_dataset(load_evalset_config(config_path)) + rtf_list, _ = self._runner.run_inference_on_dataset( + dataset, output_dir, save_cross_attention_maps=False, save_context_audio=False + ) + + gen_time = time.time() - start_time + batch_metrics = { + "total_time_sec": gen_time, + "num_samples": len(requests), + **self._runner.compute_mean_rtf_metrics(rtf_list), + } + + # Build results + results = [] + for i, req in enumerate(requests): + path = os.path.join(output_dir, f"predicted_audio_{i}.wav") + if os.path.exists(path): + audio, sr = sf.read(path) + buf = io.BytesIO() + sf.write(buf, audio, sr, format="WAV") + buf.seek(0) + dur = len(audio) / sr + results.append( + GenerationResult( + text=parsed[i].get("text", ""), + audio_bytes=buf.read(), + audio_sample_rate=self.tts_config.output_sample_rate, + audio_format="wav", + request_id=req.request_id, + generation_time_ms=gen_time * 1000 / len(requests), + debug_info={ + "checkpoint": self._checkpoint_name, + "audio_duration_sec": dur, + "rtf": gen_time / len(requests) / dur if dur else 0, + "config": { + "temp": self.tts_config.temperature, + "top_k": self.tts_config.top_k, + "cfg": self.tts_config.use_cfg, + "cfg_scale": self.tts_config.cfg_scale, + }, + "batch_metrics": batch_metrics, + }, + ) + ) + else: + results.append(GenerationResult(error=f"Audio not found: {path}", request_id=req.request_id)) + return results + except Exception as e: + import traceback + + traceback.print_exc() + return [GenerationResult(error=str(e), request_id=r.request_id) for r in requests] + finally: + shutil.rmtree(batch_dir, ignore_errors=True) + + def validate_request(self, request: GenerationRequest) -> Optional[str]: + return "Text required" if not request.text else None + + def health_check(self) -> Dict[str, Any]: + h = super().health_check() 
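+        # Augment the generic health payload with TTS-specific details once the model is loaded.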
+ if self._is_loaded: + h.update( + { + "checkpoint": self._checkpoint_name, + "codec": self.tts_config.codec_model_path, + "cfg": self.tts_config.use_cfg, + "cfg_scale": self.tts_config.cfg_scale, + "sample_rate": self.tts_config.output_sample_rate, + } + ) + return h + + def __del__(self): + if getattr(self, "_temp_dir", None) and os.path.exists(self._temp_dir): + shutil.rmtree(self._temp_dir, ignore_errors=True) diff --git a/recipes/multimodal/server/unified_server.py b/recipes/multimodal/server/unified_server.py index 2cf989656c..096a78cee8 100644 --- a/recipes/multimodal/server/unified_server.py +++ b/recipes/multimodal/server/unified_server.py @@ -508,45 +508,50 @@ async def chat_completions(request: Dict[str, Any]): import os from datetime import datetime - save_dir = os.environ.get( - "AUDIO_SAVE_DIR", "/lustre/fsw/portfolios/llmservice/users/vmendelev/tmp/voicebench_test" - ) - os.makedirs(save_dir, exist_ok=True) - timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") - base_filename = f"response_{timestamp}_{response_id}" - + save_dir = os.environ.get("AUDIO_SAVE_DIR", "") + if save_dir: + try: + os.makedirs(save_dir, exist_ok=True) + except PermissionError: + save_dir = "" # Fall through to skip saving saved_audio_path = None saved_json_path = None - # Save JSON with text and debug info - try: - saved_json_path = os.path.join(save_dir, f"{base_filename}.json") - json_output = { - "response_id": response_id, - "timestamp": timestamp, - "text": message_content, - "debug_info": result.debug_info, - "generation_time_ms": result.generation_time_ms, - "num_tokens_generated": result.num_tokens_generated, - } - with open(saved_json_path, "w") as f: - json_lib.dump(json_output, f, indent=2) - print(f"[Server] JSON saved to: {saved_json_path}") - except Exception as e: - print(f"[Server] Warning: Failed to save JSON: {e}") + # Save outputs if AUDIO_SAVE_DIR is set and writable + if save_dir: + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + base_filename = f"response_{timestamp}_{response_id}" - # Include audio output if available (base64 encoded) - audio_output = None - if result.audio_bytes: - # Save audio file + # Save JSON with text and debug info try: - saved_audio_path = os.path.join(save_dir, f"{base_filename}.wav") - with open(saved_audio_path, "wb") as f: - f.write(result.audio_bytes) - print(f"[Server] Audio saved to: {saved_audio_path} ({len(result.audio_bytes)} bytes)") + saved_json_path = os.path.join(save_dir, f"{base_filename}.json") + json_output = { + "response_id": response_id, + "timestamp": timestamp, + "text": message_content, + "debug_info": result.debug_info, + "generation_time_ms": result.generation_time_ms, + "num_tokens_generated": result.num_tokens_generated, + } + with open(saved_json_path, "w") as f: + json_lib.dump(json_output, f, indent=2) + print(f"[Server] JSON saved to: {saved_json_path}") except Exception as e: - print(f"[Server] Warning: Failed to save audio: {e}") + print(f"[Server] Warning: Failed to save JSON: {e}") + + # Save audio file if available + if result.audio_bytes: + try: + saved_audio_path = os.path.join(save_dir, f"{base_filename}.wav") + with open(saved_audio_path, "wb") as f: + f.write(result.audio_bytes) + print(f"[Server] Audio saved to: {saved_audio_path} ({len(result.audio_bytes)} bytes)") + except Exception as e: + print(f"[Server] Warning: Failed to save audio: {e}") + # Include audio output if available (base64 encoded) + audio_output = None + if result.audio_bytes: audio_output = { "data": 
base64.b64encode(result.audio_bytes).decode("utf-8"), "format": result.audio_format or "wav", @@ -621,7 +626,7 @@ def main(): parser.add_argument( "--backend", default=BACKEND_TYPE, - choices=["salm", "tts", "s2s", "s2s_incremental", "s2s_session"], + choices=["salm", "magpie_tts", "s2s", "s2s_incremental", "s2s_session"], help="Backend type to use", ) parser.add_argument("--model", default=MODEL_PATH, help="Path to model") From d916e10617c0a7c2c9f0686cd9bca1231c0fc12a Mon Sep 17 00:00:00 2001 From: Valentin Mendelev Date: Sat, 27 Dec 2025 09:07:04 -0800 Subject: [PATCH 06/26] nv_tts eval scripts --- .../dataset/nv_tts/scripts/__init__.py | 15 ++ nemo_skills/dataset/nv_tts/scripts/score.py | 168 ++++++++++++++++++ 2 files changed, 183 insertions(+) create mode 100644 nemo_skills/dataset/nv_tts/scripts/__init__.py create mode 100644 nemo_skills/dataset/nv_tts/scripts/score.py diff --git a/nemo_skills/dataset/nv_tts/scripts/__init__.py b/nemo_skills/dataset/nv_tts/scripts/__init__.py new file mode 100644 index 0000000000..9b5c777b89 --- /dev/null +++ b/nemo_skills/dataset/nv_tts/scripts/__init__.py @@ -0,0 +1,15 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""NV TTS evaluation scripts.""" diff --git a/nemo_skills/dataset/nv_tts/scripts/score.py b/nemo_skills/dataset/nv_tts/scripts/score.py new file mode 100644 index 0000000000..3d98adee42 --- /dev/null +++ b/nemo_skills/dataset/nv_tts/scripts/score.py @@ -0,0 +1,168 @@ +#!/usr/bin/env python3 +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Scoring and aggregation functions for TTS evaluation.""" + +import argparse +import json +import os +import tempfile + +from nemo.collections.tts.modules.magpietts_inference.evaluate_generated_audio import evaluate + + +def run_scoring( + results_dir: str, + sv_model: str = "titanet", + asr_model_name: str = "nvidia/parakeet-tdt-1.1b", + language: str = "en", + with_utmosv2: bool = False, +) -> None: + """Run NeMo scoring on all benchmarks in results_dir.""" + benchmarks_dir = os.path.join(results_dir, "eval-results") + if not os.path.exists(benchmarks_dir): + benchmarks_dir = results_dir + + scoring_cfg = { + "sv_model": sv_model, + "asr_model_name": asr_model_name, + "language": language, + "with_utmosv2": with_utmosv2, + } + + for benchmark in os.listdir(benchmarks_dir): + benchmark_dir = os.path.join(benchmarks_dir, benchmark) + if not os.path.isdir(benchmark_dir): + continue + + output_jsonl = os.path.join(benchmark_dir, "output.jsonl") + if not os.path.exists(output_jsonl): + continue + + print(f"\nScoring: {benchmark}") + metrics = score_benchmark(output_jsonl, scoring_cfg) + + # Save metrics.json + metrics_path = os.path.join(benchmark_dir, "metrics.json") + with open(metrics_path, "w") as f: + json.dump(metrics, f, indent=2) + print(f"Saved: {metrics_path}") + print(f" CER: {metrics.get('cer_cumulative', 'N/A'):.4f}") + print(f" WER: {metrics.get('wer_cumulative', 'N/A'):.4f}") + if "utmosv2_avg" in metrics: + print(f" UTMOSv2: {metrics.get('utmosv2_avg', 'N/A'):.4f}") + + +def score_benchmark(output_jsonl: str, scoring_cfg: dict) -> dict: + """Score a single benchmark.""" + # Parse output.jsonl + entries = [] + records = [] + with open(output_jsonl) as f: + for line in f: + if not line.strip(): + continue + record = json.loads(line) + records.append(record) + + # Extract manifest from user message + manifest_entry = None + for msg in record.get("messages", []): + if msg.get("role") == "user": + content = msg.get("content", "") + manifest_entry = json.loads(content) if isinstance(content, str) else content + break + + audio_path = record.get("audio", {}).get("path") + if audio_path and manifest_entry: + entries.append((manifest_entry, audio_path)) + + if not entries: + return {} + + # Create temp dir with manifest and symlinks + with tempfile.TemporaryDirectory(prefix="tts_scoring_") as tmp_dir: + manifest_path = os.path.join(tmp_dir, "manifest.json") + gen_audio_dir = os.path.join(tmp_dir, "generated") + os.makedirs(gen_audio_dir) + + with open(manifest_path, "w") as f: + for i, (manifest_entry, audio_path) in enumerate(entries): + f.write(json.dumps(manifest_entry) + "\n") + dst = os.path.join(gen_audio_dir, f"predicted_audio_{i}.wav") + if os.path.exists(audio_path): + os.symlink(audio_path, dst) + + avg_metrics, filewise_metrics = evaluate( + manifest_path=manifest_path, + audio_dir=None, + generated_audio_dir=gen_audio_dir, + language=scoring_cfg.get("language", "en"), + sv_model_type=scoring_cfg.get("sv_model", "titanet"), + asr_model_name=scoring_cfg.get("asr_model_name", "nvidia/parakeet-tdt-1.1b"), + with_utmosv2=scoring_cfg.get("with_utmosv2", False), + ) + + # Save output_with_metrics.jsonl + output_with_metrics_path = output_jsonl.replace("output.jsonl", "output_with_metrics.jsonl") + with open(output_with_metrics_path, "w") as f: + for i, record in enumerate(records): + if i < len(filewise_metrics): + record["metrics"] = filewise_metrics[i] + f.write(json.dumps(record) + "\n") + print(f"Saved: {output_with_metrics_path}") + + return avg_metrics + + +def 
run_aggregation(results_dir: str) -> None: + """Print summary of all metrics.""" + benchmarks_dir = os.path.join(results_dir, "eval-results") + if not os.path.exists(benchmarks_dir): + benchmarks_dir = results_dir + + print("\nAggregated Results:") + for benchmark in sorted(os.listdir(benchmarks_dir)): + metrics_path = os.path.join(benchmarks_dir, benchmark, "metrics.json") + if os.path.exists(metrics_path): + with open(metrics_path) as f: + metrics = json.load(f) + print(f" {benchmark}:") + print(f" CER: {metrics.get('cer_cumulative', 'N/A'):.4f}") + print(f" WER: {metrics.get('wer_cumulative', 'N/A'):.4f}") + if "utmosv2_avg" in metrics: + print(f" UTMOSv2: {metrics.get('utmosv2_avg', 'N/A'):.4f}") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="TTS Scoring") + parser.add_argument("--results_dir", required=True) + parser.add_argument("--sv_model", default="titanet") + parser.add_argument("--asr_model_name", default="nvidia/parakeet-tdt-1.1b") + parser.add_argument("--language", default="en") + parser.add_argument("--with_utmosv2", action="store_true") + parser.add_argument("--aggregation_only", action="store_true") + args = parser.parse_args() + + if args.aggregation_only: + run_aggregation(args.results_dir) + else: + run_scoring( + args.results_dir, + sv_model=args.sv_model, + asr_model_name=args.asr_model_name, + language=args.language, + with_utmosv2=args.with_utmosv2, + ) From 14523f7a1ee86e83db7a5cb19eabf8a69604ff16 Mon Sep 17 00:00:00 2001 From: Valentin Mendelev Date: Sun, 28 Dec 2025 00:35:13 -0800 Subject: [PATCH 07/26] Checkpoint + hparams input instead of nemo --- nemo_skills/inference/server/serve_unified.py | 27 ++++++++++++++++ .../server/backends/magpie_tts_backend.py | 31 +++++++++++++++++-- 2 files changed, 56 insertions(+), 2 deletions(-) diff --git a/nemo_skills/inference/server/serve_unified.py b/nemo_skills/inference/server/serve_unified.py index 1b02e652e6..c6aa1764de 100644 --- a/nemo_skills/inference/server/serve_unified.py +++ b/nemo_skills/inference/server/serve_unified.py @@ -174,6 +174,15 @@ def main(): parser.add_argument("--use_cfg", action="store_true", help="Enable classifier-free guidance (TTS backend)") parser.add_argument("--cfg_scale", type=float, default=2.5, help="CFG scale factor (TTS backend)") + # Checkpoint loading options (for magpie_tts backend - alternative to --model .nemo) + parser.add_argument("--hparams_file", default=None, help="Path to hparams.yaml (use with --checkpoint_file)") + parser.add_argument("--checkpoint_file", default=None, help="Path to .ckpt checkpoint (use with --hparams_file)") + parser.add_argument( + "--legacy_codebooks", action="store_true", help="Use legacy codebook indices for old checkpoints" + ) + parser.add_argument("--legacy_text_conditioning", action="store_true", help="Use legacy text conditioning") + parser.add_argument("--hparams_from_wandb", action="store_true", help="hparams file was exported from wandb") + # Environment setup parser.add_argument("--code_path", default=None, help="Path to NeMo source code to add to PYTHONPATH") parser.add_argument("--hack_path", default=None, help="Path to safetensors/torch.py patch file") @@ -297,6 +306,17 @@ def main(): extra_config["top_k"] = args.top_k extra_config["use_cfg"] = args.use_cfg extra_config["cfg_scale"] = args.cfg_scale + # Checkpoint loading options + if args.hparams_file: + extra_config["hparams_file"] = args.hparams_file + if args.checkpoint_file: + extra_config["checkpoint_file"] = args.checkpoint_file + if 
args.legacy_codebooks: + extra_config["legacy_codebooks"] = True + if args.legacy_text_conditioning: + extra_config["legacy_text_conditioning"] = True + if args.hparams_from_wandb: + extra_config["hparams_from_wandb"] = True # S2S backend options if args.backend in ("s2s", "s2s_incremental", "s2s_session"): @@ -347,6 +367,13 @@ def main(): print(f" Top-k: {args.top_k}") print(f" CFG: {args.use_cfg} (scale: {args.cfg_scale})") print(f" Local Transformer: {args.use_local_transformer}") + if args.hparams_file and args.checkpoint_file: + print(f" Hparams: {args.hparams_file}") + print(f" Checkpoint: {args.checkpoint_file}") + if args.legacy_codebooks: + print(" Legacy Codebooks: True") + if args.legacy_text_conditioning: + print(" Legacy Text Conditioning: True") if args.backend in ("s2s_incremental", "s2s_session"): if args.config_path: print(f" Config Path: {args.config_path}") diff --git a/recipes/multimodal/server/backends/magpie_tts_backend.py b/recipes/multimodal/server/backends/magpie_tts_backend.py index 61b4fd32d6..e11187f71a 100644 --- a/recipes/multimodal/server/backends/magpie_tts_backend.py +++ b/recipes/multimodal/server/backends/magpie_tts_backend.py @@ -29,6 +29,12 @@ class MagpieTTSConfig(BackendConfig): max_decoder_steps: int = 440 use_local_transformer: bool = False output_sample_rate: int = 22050 + # Checkpoint loading options (alternative to model_path .nemo file) + hparams_file: Optional[str] = None + checkpoint_file: Optional[str] = None + legacy_codebooks: bool = False + legacy_text_conditioning: bool = False + hparams_from_wandb: bool = False @classmethod def from_dict(cls, d: Dict[str, Any]) -> "MagpieTTSConfig": @@ -46,6 +52,11 @@ def from_dict(cls, d: Dict[str, Any]) -> "MagpieTTSConfig": "max_decoder_steps", "use_local_transformer", "output_sample_rate", + "hparams_file", + "checkpoint_file", + "legacy_codebooks", + "legacy_text_conditioning", + "hparams_from_wandb", } return cls( **{k: v for k, v in d.items() if k in known}, extra_config={k: v for k, v in d.items() if k not in known} @@ -88,8 +99,24 @@ def load_model(self) -> None: if not self.tts_config.codec_model_path: raise ValueError("codec_model_path required") - model_path = self.config.model_path - cfg = ModelLoadConfig(nemo_file=model_path, codecmodel_path=self.tts_config.codec_model_path) + # Support both checkpoint mode (hparams + ckpt) and nemo mode + has_ckpt_mode = self.tts_config.hparams_file and self.tts_config.checkpoint_file + if has_ckpt_mode: + cfg = ModelLoadConfig( + hparams_file=self.tts_config.hparams_file, + checkpoint_file=self.tts_config.checkpoint_file, + codecmodel_path=self.tts_config.codec_model_path, + legacy_codebooks=self.tts_config.legacy_codebooks, + legacy_text_conditioning=self.tts_config.legacy_text_conditioning, + hparams_from_wandb=self.tts_config.hparams_from_wandb, + ) + else: + cfg = ModelLoadConfig( + nemo_file=self.config.model_path, + codecmodel_path=self.tts_config.codec_model_path, + legacy_codebooks=self.tts_config.legacy_codebooks, + legacy_text_conditioning=self.tts_config.legacy_text_conditioning, + ) self._model, self._checkpoint_name = load_magpie_model(cfg, device=self.config.device) self._runner = MagpieInferenceRunner( From 4aa3a2deb7804b9398ae0517e62597232194566a Mon Sep 17 00:00:00 2001 From: Valentin Mendelev Date: Sun, 28 Dec 2025 05:30:59 -0800 Subject: [PATCH 08/26] Per benchmark scoring jobs --- nemo_skills/dataset/nv_tts/scripts/score.py | 22 +++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git 
a/nemo_skills/dataset/nv_tts/scripts/score.py b/nemo_skills/dataset/nv_tts/scripts/score.py index 3d98adee42..13e6fc6210 100644 --- a/nemo_skills/dataset/nv_tts/scripts/score.py +++ b/nemo_skills/dataset/nv_tts/scripts/score.py @@ -29,8 +29,13 @@ def run_scoring( asr_model_name: str = "nvidia/parakeet-tdt-1.1b", language: str = "en", with_utmosv2: bool = False, + benchmark: str = None, ) -> None: - """Run NeMo scoring on all benchmarks in results_dir.""" + """Run NeMo scoring on benchmarks in results_dir. + + Args: + benchmark: If provided, score only this benchmark. Otherwise score all. + """ benchmarks_dir = os.path.join(results_dir, "eval-results") if not os.path.exists(benchmarks_dir): benchmarks_dir = results_dir @@ -42,16 +47,23 @@ def run_scoring( "with_utmosv2": with_utmosv2, } - for benchmark in os.listdir(benchmarks_dir): - benchmark_dir = os.path.join(benchmarks_dir, benchmark) + # Determine which benchmarks to score + if benchmark: + benchmarks_to_score = [benchmark] + else: + benchmarks_to_score = os.listdir(benchmarks_dir) + + for bench in benchmarks_to_score: + benchmark_dir = os.path.join(benchmarks_dir, bench) if not os.path.isdir(benchmark_dir): continue output_jsonl = os.path.join(benchmark_dir, "output.jsonl") if not os.path.exists(output_jsonl): + print(f"Skipping {bench}: output.jsonl not found") continue - print(f"\nScoring: {benchmark}") + print(f"\nScoring: {bench}") metrics = score_benchmark(output_jsonl, scoring_cfg) # Save metrics.json @@ -154,6 +166,7 @@ def run_aggregation(results_dir: str) -> None: parser.add_argument("--language", default="en") parser.add_argument("--with_utmosv2", action="store_true") parser.add_argument("--aggregation_only", action="store_true") + parser.add_argument("--benchmark", default=None, help="Score only this benchmark (e.g. nv_tts.libritts_seen)") args = parser.parse_args() if args.aggregation_only: @@ -165,4 +178,5 @@ def run_aggregation(results_dir: str) -> None: asr_model_name=args.asr_model_name, language=args.language, with_utmosv2=args.with_utmosv2, + benchmark=args.benchmark, ) From 5372f7e011d0d1aaeb26264916fe91c7506db407 Mon Sep 17 00:00:00 2001 From: Valentin Mendelev Date: Sun, 28 Dec 2025 06:31:13 -0800 Subject: [PATCH 09/26] nv_tts benchmarks and scripts to run them --- nemo_skills/dataset/nv_tts/__init__.py | 24 ++ nemo_skills/dataset/nv_tts/prepare.py | 235 ++++++++++++++++++ .../nv_tts/scripts/config/default.yaml | 31 +++ .../scripts/config/grpo_small_step1100.yaml | 39 +++ .../dataset/nv_tts/scripts/run_tts_eval.py | 184 ++++++++++++++ nemo_skills/dataset/nv_tts/scripts/score.py | 6 +- 6 files changed, 518 insertions(+), 1 deletion(-) create mode 100644 nemo_skills/dataset/nv_tts/__init__.py create mode 100644 nemo_skills/dataset/nv_tts/prepare.py create mode 100644 nemo_skills/dataset/nv_tts/scripts/config/default.yaml create mode 100644 nemo_skills/dataset/nv_tts/scripts/config/grpo_small_step1100.yaml create mode 100644 nemo_skills/dataset/nv_tts/scripts/run_tts_eval.py diff --git a/nemo_skills/dataset/nv_tts/__init__.py b/nemo_skills/dataset/nv_tts/__init__.py new file mode 100644 index 0000000000..b5f0290153 --- /dev/null +++ b/nemo_skills/dataset/nv_tts/__init__.py @@ -0,0 +1,24 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# NV TTS evaluation dataset - for testing TTS models + +DATASET_GROUP = "tts" +IS_BENCHMARK_GROUP = True + +# BENCHMARKS will be populated dynamically based on the config file +# Example: {"nv_tts.libritts_seen": {}, "nv_tts.riva_hard_digits": {}} +BENCHMARKS = {} + +GENERATION_ARGS = "++prompt_format=openai" diff --git a/nemo_skills/dataset/nv_tts/prepare.py b/nemo_skills/dataset/nv_tts/prepare.py new file mode 100644 index 0000000000..6fa68d09b9 --- /dev/null +++ b/nemo_skills/dataset/nv_tts/prepare.py @@ -0,0 +1,235 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Prepare NV TTS evaluation datasets. + +Reads a config JSON file (local or remote) containing subtest definitions, +fetches manifest JSONL files, and generates nemo-skills test.jsonl files +with manifest content embedded as JSON in user message content. + +Usage: + python prepare.py --config login-eos.nvidia.com:/path/to/evalset_config.json + python prepare.py --config /local/path/to/evalset_config.json +""" + +import argparse +import json +import os +import subprocess +import tempfile +from pathlib import Path + +SYSTEM_MESSAGE = "You are a helpful assistant." + +# Template for subtest __init__.py files +INIT_TEMPLATE = """# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# NV TTS subtest: {subtest_name} + +GENERATION_ARGS = "++prompt_format=openai" +""" + + +def is_remote_path(path: str) -> bool: + """Check if path is a remote path (host:/path format).""" + return ":" in path and not path.startswith("/") and not path.startswith(".") + + +def fetch_remote_file(remote_path: str, local_path: str) -> None: + """Fetch a file from a remote host using scp.""" + result = subprocess.run( + ["scp", remote_path, local_path], + capture_output=True, + text=True, + ) + if result.returncode != 0: + raise RuntimeError(f"Failed to fetch {remote_path}: {result.stderr}") + + +def read_file_content(path: str) -> str: + """Read file content, handling both local and remote paths.""" + if is_remote_path(path): + with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".tmp") as tmp: + tmp_path = tmp.name + try: + fetch_remote_file(path, tmp_path) + with open(tmp_path, "r", encoding="utf-8") as f: + return f.read() + finally: + if os.path.exists(tmp_path): + os.unlink(tmp_path) + else: + with open(path, "r", encoding="utf-8") as f: + return f.read() + + +def get_remote_host(path: str) -> str: + """Extract host from a remote path.""" + if is_remote_path(path): + return path.split(":")[0] + return "" + + +def make_remote_path(host: str, path: str) -> str: + """Create a remote path from host and path.""" + if host: + return f"{host}:{path}" + return path + + +def format_manifest_entry(entry: dict, audio_dir: str) -> dict: + """Format a manifest entry into nemo-skills format. + + Args: + entry: Manifest entry with fields like text, context_audio_filepath, etc. + audio_dir: Base directory for audio files to make paths absolute. + + Returns: + Formatted entry with messages containing the manifest as JSON string. + """ + # Make audio paths absolute by combining with audio_dir + entry_with_absolute_paths = entry.copy() + + if "context_audio_filepath" in entry_with_absolute_paths and audio_dir: + entry_with_absolute_paths["context_audio_filepath"] = os.path.join( + audio_dir, entry_with_absolute_paths["context_audio_filepath"] + ) + + if "audio_filepath" in entry_with_absolute_paths and audio_dir: + entry_with_absolute_paths["audio_filepath"] = os.path.join( + audio_dir, entry_with_absolute_paths["audio_filepath"] + ) + + # Create the nemo-skills format entry + content = json.dumps(entry_with_absolute_paths) + + return { + "problem": "", + "messages": [ + {"role": "system", "content": SYSTEM_MESSAGE}, + {"role": "user", "content": content}, + ], + } + + +def create_subtest_init(subtest_dir: Path, subtest_name: str) -> None: + """Create __init__.py for a subtest directory.""" + content = INIT_TEMPLATE.format(subtest_name=subtest_name) + with open(subtest_dir / "__init__.py", "w", encoding="utf-8") as f: + f.write(content) + + +def process_subtest( + subtest_name: str, + config: dict, + output_dir: Path, + remote_host: str, +) -> int: + """Process a single subtest and generate test.jsonl. + + Args: + subtest_name: Name of the subtest (e.g., "libritts_seen"). + config: Subtest config with manifest_path, audio_dir, feature_dir. + output_dir: Base output directory for the dataset. + remote_host: Remote host for fetching files (empty for local). + + Returns: + Number of entries processed. 
+ """ + subtest_dir = output_dir / subtest_name + subtest_dir.mkdir(parents=True, exist_ok=True) + + manifest_path = config["manifest_path"] + audio_dir = config.get("audio_dir", "") + + # Fetch manifest file + if remote_host: + manifest_remote = make_remote_path(remote_host, manifest_path) + print(f"Fetching manifest from {manifest_remote}...") + manifest_content = read_file_content(manifest_remote) + else: + print(f"Reading manifest from {manifest_path}...") + manifest_content = read_file_content(manifest_path) + + # Process manifest entries + output_file = subtest_dir / "test.jsonl" + count = 0 + + with open(output_file, "w", encoding="utf-8") as fout: + for line in manifest_content.strip().split("\n"): + if not line.strip(): + continue + + try: + entry = json.loads(line) + except json.JSONDecodeError as e: + print(f" Warning: Skipping invalid JSON line: {e}") + continue + + formatted = format_manifest_entry(entry, audio_dir) + fout.write(json.dumps(formatted) + "\n") + count += 1 + + # Create __init__.py + create_subtest_init(subtest_dir, subtest_name) + + print(f" Wrote {count} entries to {output_file}") + return count + + +def main(): + parser = argparse.ArgumentParser(description="Prepare NV TTS evaluation datasets") + parser.add_argument( + "--config", + required=True, + help="Path to config JSON file (local or remote: host:/path/to/config.json)", + ) + args = parser.parse_args() + + output_dir = Path(__file__).parent + + # Determine if config is remote and extract host + config_path = args.config + remote_host = get_remote_host(config_path) + + # Read config file + print(f"Reading config from {config_path}...") + config_content = read_file_content(config_path) + config = json.loads(config_content) + + print(f"Found {len(config)} subtests: {list(config.keys())}") + + total_entries = 0 + for subtest_name, subtest_config in config.items(): + print(f"\nProcessing {subtest_name}...") + count = process_subtest(subtest_name, subtest_config, output_dir, remote_host) + total_entries += count + + print(f"\nDone! 
Processed {total_entries} total entries across {len(config)} subtests.") + + +if __name__ == "__main__": + main() diff --git a/nemo_skills/dataset/nv_tts/scripts/config/default.yaml b/nemo_skills/dataset/nv_tts/scripts/config/default.yaml new file mode 100644 index 0000000000..4539b46402 --- /dev/null +++ b/nemo_skills/dataset/nv_tts/scripts/config/default.yaml @@ -0,0 +1,31 @@ +# TTS Pipeline Configuration + +# Cluster and execution settings (shared across all stages) +cluster: eos +container: /lustre/fsw/llmservice_nemo_speechlm/users/vmendelev/containters/nemo-25.11.sqsh +partition: batch +mount_paths: /lustre:/lustre +output_dir: /lustre/fsw/llmservice_nemo_speechlm/users/vmendelev/tmp/nv_tts_eval_full_a3 + +# NeMo code path +nemo_code_path: /lustre/fsw/llmservice_nemo_speechlm/users/vmendelev/experimenta/tts_eval/NeMo + +# Generation settings (ns eval arguments) +generation: + benchmarks: nv_tts.libritts_seen,nv_tts.libritts_test_clean,nv_tts.riva_hard_digits,nv_tts.riva_hard_letters,nv_tts.riva_hard_money,nv_tts.riva_hard_short,nv_tts.vctk + model: nvidia/magpie_tts_multilingual_357m + server_type: generic + server_gpus: 1 + server_entrypoint: python -m nemo_skills.inference.server.serve_unified + server_args: --backend magpie_tts --codec_model nvidia/nemo-nano-codec-22khz-1.89kbps-21.5fps --batch_size 32 --batch_timeout 0.1 --use_cfg --cfg_scale 2.5 + data_dir: /lustre/fsw/llmservice_nemo_speechlm/users/vmendelev/tmp/data_dir + num_chunks: 2 + extra_args: ++server.server_type=vllm_multimodal + +# Scoring settings +scoring: + sv_model: titanet + asr_model_name: nvidia/parakeet-tdt-1.1b + language: en + with_utmosv2: true + gpus: 1 diff --git a/nemo_skills/dataset/nv_tts/scripts/config/grpo_small_step1100.yaml b/nemo_skills/dataset/nv_tts/scripts/config/grpo_small_step1100.yaml new file mode 100644 index 0000000000..3499e16924 --- /dev/null +++ b/nemo_skills/dataset/nv_tts/scripts/config/grpo_small_step1100.yaml @@ -0,0 +1,39 @@ +# TTS Pipeline Configuration - GRPO Small Step 1100 + +# Cluster and execution settings (shared across all stages) +cluster: eos +container: /lustre/fsw/llmservice_nemo_speechlm/users/vmendelev/containters/nemo-25.11.sqsh +partition: batch +mount_paths: /lustre:/lustre +output_dir: /lustre/fsw/llmservice_nemo_speechlm/users/vmendelev/tmp/nv_tts_eval_grpo_small_step1100_full + +# NeMo code path +nemo_code_path: /lustre/fsw/llmservice_nemo_speechlm/users/vmendelev/experimenta/tts_eval/NeMo + +# Generation settings (ns eval arguments) +generation: + benchmarks: nv_tts.libritts_seen,nv_tts.libritts_test_clean,nv_tts.riva_hard_digits,nv_tts.riva_hard_letters,nv_tts.riva_hard_money,nv_tts.riva_hard_short,nv_tts.vctk + model: grpo_small_step1100 # name for logging, actual model loaded via hparams/checkpoint in server_args + server_type: generic + server_gpus: 1 + server_entrypoint: "cd /lustre/fsw/llmservice_nemo_speechlm/users/vmendelev/experimenta/tts_eval/NeMo && python -m nemo_skills.inference.server.serve_unified" + server_args: >- + --backend magpie_tts + --codec_model nvidia/nemo-nano-codec-22khz-1.89kbps-21.5fps + --batch_size 32 + --batch_timeout 0.1 + --use_cfg + --cfg_scale 2.5 + --hparams_file /lustre/fsw/llmservice_nemo_speechlm/users/vmendelev/experimenta/tts_eval/small_hparams.yaml + --checkpoint_file /lustre/fsw/llmservice_nemo_speechlm/users/agorodetskii/checkpoints/N2512_English_TTSArena/grpo_small_from_ft_step1100.ckpt + data_dir: /lustre/fsw/llmservice_nemo_speechlm/users/vmendelev/tmp/data_dir + num_chunks: 2 + extra_args: 
++server.server_type=vllm_multimodal + +# Scoring settings +scoring: + sv_model: titanet + asr_model_name: nvidia/parakeet-tdt-1.1b + language: en + with_utmosv2: true + gpus: 1 diff --git a/nemo_skills/dataset/nv_tts/scripts/run_tts_eval.py b/nemo_skills/dataset/nv_tts/scripts/run_tts_eval.py new file mode 100644 index 0000000000..73e3b2b234 --- /dev/null +++ b/nemo_skills/dataset/nv_tts/scripts/run_tts_eval.py @@ -0,0 +1,184 @@ +#!/usr/bin/env python3 +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +TTS Pipeline: Generation -> Scoring (-> Aggregation) + +Usage: + python run_tts_eval.py --config config.yaml + python run_tts_eval.py --config config.yaml --stage scoring + python run_tts_eval.py --config config.yaml --stage aggregation +""" + +import argparse +import os + +import yaml + +from nemo_skills.pipeline.eval import eval as ns_eval +from nemo_skills.pipeline.run_cmd import run_cmd as ns_run_cmd + + +class MockContext: + """Mock typer.Context for programmatic calls.""" + + def __init__(self, extra_args=None): + self.args = extra_args or [] + + +def load_config(config_path: str) -> dict: + with open(config_path) as f: + return yaml.safe_load(f) + + +def run_generation(cfg: dict, expname: str): + """Run generation stage using ns eval, returns experiment object.""" + gen = cfg["generation"] + + # Add nemo_code_path to server_args + server_args = gen["server_args"] + if cfg.get("nemo_code_path"): + server_args += f" --code_path {cfg['nemo_code_path']}" + + # Parse extra_args for the context + extra_args = gen.get("extra_args", "").split() if gen.get("extra_args") else [] + ctx = MockContext(extra_args) + + # Call eval programmatically + return ns_eval( + ctx=ctx, + cluster=cfg["cluster"], + output_dir=cfg["output_dir"], + benchmarks=gen["benchmarks"], + model=gen["model"], + server_type=gen["server_type"], + server_gpus=gen["server_gpus"], + server_container=cfg["container"], + mount_paths=cfg["mount_paths"], + server_entrypoint=gen["server_entrypoint"], + server_args=server_args, + data_dir=gen["data_dir"], + num_chunks=gen["num_chunks"], + partition=cfg["partition"], + expname=expname, + auto_summarize_results=False, + ) + + +def main(): + parser = argparse.ArgumentParser(description="TTS Pipeline") + parser.add_argument("--config", required=True) + parser.add_argument( + "--stage", + choices=["all", "generation", "scoring", "aggregation"], + default="all", + help="Stage to run. 
'all' runs generation+scoring (no aggregation)", + ) + parser.add_argument("--expname", default="tts_eval", help="Base experiment name for job tracking") + args = parser.parse_args() + + cfg = load_config(args.config) + scoring = cfg.get("scoring", {}) + hf_token = os.environ.get("HF_TOKEN", "") + nemo_path = cfg["nemo_code_path"] + output_dir = cfg["output_dir"] + + gen_exp_name = None + + # Stage 1: Generation + if args.stage in ("all", "generation"): + print("\n" + "=" * 60) + print("Stage 1: GENERATION") + print("=" * 60) + gen_exp = run_generation(cfg, args.expname) + # Extract experiment name/id for dependency tracking + gen_exp_name = args.expname # The expname we passed to ns_eval + print(f"Generation submitted: {gen_exp}") + + # Stage 2: Scoring (one job per benchmark, depends on generation) + if args.stage in ("all", "scoring"): + print("\n" + "=" * 60) + print("Stage 2: SCORING") + print("=" * 60) + + # Parse benchmarks list + benchmarks = cfg["generation"]["benchmarks"].split(",") + + install_cmd = None + if scoring.get("with_utmosv2"): + install_cmd = "pip install git+https://github.com/sarulab-speech/UTMOSv2.git@v1.2.1" + + # When running both stages, scoring depends on generation experiment (by name) + run_after = [gen_exp_name] if args.stage == "all" and gen_exp_name else None + + for benchmark in benchmarks: + benchmark = benchmark.strip() + # Benchmark dir in eval-results keeps dot notation (nv_tts.libritts_seen) + benchmark_dir = benchmark + + scoring_cmd = ( + f"HF_TOKEN={hf_token} " + f"PYTHONPATH={nemo_path}:$PYTHONPATH " + f"python -m nemo_skills.dataset.nv_tts.scripts.score " + f"--results_dir {output_dir} " + f"--benchmark {benchmark_dir} " + f"--sv_model {scoring.get('sv_model', 'titanet')} " + f"--asr_model_name {scoring.get('asr_model_name', 'nvidia/parakeet-tdt-1.1b')} " + f"--language {scoring.get('language', 'en')}" + ) + if scoring.get("with_utmosv2"): + scoring_cmd += " --with_utmosv2" + + # Short name for job (e.g. 
libritts_seen from nv_tts.libritts_seen) + short_name = benchmark.split(".")[-1] + print(f" Submitting scoring job for: {benchmark}") + + ns_run_cmd( + ctx=MockContext(), + cluster=cfg["cluster"], + container=cfg["container"], + partition=cfg["partition"], + num_gpus=scoring.get("gpus", 1), + mount_paths=cfg["mount_paths"], + command=scoring_cmd, + installation_command=install_cmd, + run_after=run_after, + expname=f"{args.expname}_score_{short_name}", + log_dir=f"{output_dir}/eval-logs", + ) + + # Stage 3: Aggregation (only if explicitly requested) + if args.stage == "aggregation": + print("\n" + "=" * 60) + print("Stage 3: AGGREGATION") + print("=" * 60) + agg_cmd = f"python -m nemo_skills.dataset.nv_tts.scripts.score --results_dir {output_dir} --aggregation_only" + ns_run_cmd( + ctx=MockContext(), + cluster=cfg["cluster"], + container=cfg["container"], + partition=cfg["partition"], + num_gpus=0, + mount_paths=cfg["mount_paths"], + command=agg_cmd, + expname=f"{args.expname}_agg", + log_dir=f"{output_dir}/eval-logs", + ) + + print("\nDone!") + + +if __name__ == "__main__": + main() diff --git a/nemo_skills/dataset/nv_tts/scripts/score.py b/nemo_skills/dataset/nv_tts/scripts/score.py index 13e6fc6210..215d332405 100644 --- a/nemo_skills/dataset/nv_tts/scripts/score.py +++ b/nemo_skills/dataset/nv_tts/scripts/score.py @@ -63,11 +63,15 @@ def run_scoring( print(f"Skipping {bench}: output.jsonl not found") continue + metrics_path = os.path.join(benchmark_dir, "metrics.json") + if os.path.exists(metrics_path): + print(f"Skipping {bench}: metrics.json already exists") + continue + print(f"\nScoring: {bench}") metrics = score_benchmark(output_jsonl, scoring_cfg) # Save metrics.json - metrics_path = os.path.join(benchmark_dir, "metrics.json") with open(metrics_path, "w") as f: json.dump(metrics, f, indent=2) print(f"Saved: {metrics_path}") From db37cff0cb9d07da7ffdebd85921abcd8a045912 Mon Sep 17 00:00:00 2001 From: Valentin Mendelev Date: Sun, 28 Dec 2025 09:46:38 -0800 Subject: [PATCH 10/26] EOS FIX 8 chunks per node --- .../nv_tts/scripts/config/default.yaml | 3 +- .../scripts/config/grpo_small_step1100.yaml | 5 +- .../dataset/nv_tts/scripts/run_tts_eval.py | 1 + nemo_skills/inference/server/serve_unified.py | 5 +- nemo_skills/pipeline/eval.py | 13 +++++ nemo_skills/pipeline/utils/eval.py | 39 +++++++++++++-- nemo_skills/pipeline/utils/exp.py | 8 ++-- nemo_skills/pipeline/utils/generation.py | 47 +++++++++++++++---- nemo_skills/pipeline/utils/server.py | 33 +++++++++---- 9 files changed, 124 insertions(+), 30 deletions(-) diff --git a/nemo_skills/dataset/nv_tts/scripts/config/default.yaml b/nemo_skills/dataset/nv_tts/scripts/config/default.yaml index 4539b46402..3bbc3b7703 100644 --- a/nemo_skills/dataset/nv_tts/scripts/config/default.yaml +++ b/nemo_skills/dataset/nv_tts/scripts/config/default.yaml @@ -19,7 +19,8 @@ generation: server_entrypoint: python -m nemo_skills.inference.server.serve_unified server_args: --backend magpie_tts --codec_model nvidia/nemo-nano-codec-22khz-1.89kbps-21.5fps --batch_size 32 --batch_timeout 0.1 --use_cfg --cfg_scale 2.5 data_dir: /lustre/fsw/llmservice_nemo_speechlm/users/vmendelev/tmp/data_dir - num_chunks: 2 + num_chunks: 8 + gpus_per_node: 8 # set to 1 for single-GPU mode, or 8 for multi-instance mode (num_chunks must be multiple of gpus_per_node) extra_args: ++server.server_type=vllm_multimodal # Scoring settings diff --git a/nemo_skills/dataset/nv_tts/scripts/config/grpo_small_step1100.yaml 
b/nemo_skills/dataset/nv_tts/scripts/config/grpo_small_step1100.yaml index 3499e16924..ad8cf100e7 100644 --- a/nemo_skills/dataset/nv_tts/scripts/config/grpo_small_step1100.yaml +++ b/nemo_skills/dataset/nv_tts/scripts/config/grpo_small_step1100.yaml @@ -5,7 +5,7 @@ cluster: eos container: /lustre/fsw/llmservice_nemo_speechlm/users/vmendelev/containters/nemo-25.11.sqsh partition: batch mount_paths: /lustre:/lustre -output_dir: /lustre/fsw/llmservice_nemo_speechlm/users/vmendelev/tmp/nv_tts_eval_grpo_small_step1100_full +output_dir: /lustre/fsw/llmservice_nemo_speechlm/users/vmendelev/tmp/nv_tts_eval_grpo_small_step1100_full_8c_a3 # NeMo code path nemo_code_path: /lustre/fsw/llmservice_nemo_speechlm/users/vmendelev/experimenta/tts_eval/NeMo @@ -27,7 +27,8 @@ generation: --hparams_file /lustre/fsw/llmservice_nemo_speechlm/users/vmendelev/experimenta/tts_eval/small_hparams.yaml --checkpoint_file /lustre/fsw/llmservice_nemo_speechlm/users/agorodetskii/checkpoints/N2512_English_TTSArena/grpo_small_from_ft_step1100.ckpt data_dir: /lustre/fsw/llmservice_nemo_speechlm/users/vmendelev/tmp/data_dir - num_chunks: 2 + num_chunks: 8 + gpus_per_node: 8 extra_args: ++server.server_type=vllm_multimodal # Scoring settings diff --git a/nemo_skills/dataset/nv_tts/scripts/run_tts_eval.py b/nemo_skills/dataset/nv_tts/scripts/run_tts_eval.py index 73e3b2b234..c4548d3305 100644 --- a/nemo_skills/dataset/nv_tts/scripts/run_tts_eval.py +++ b/nemo_skills/dataset/nv_tts/scripts/run_tts_eval.py @@ -71,6 +71,7 @@ def run_generation(cfg: dict, expname: str): server_args=server_args, data_dir=gen["data_dir"], num_chunks=gen["num_chunks"], + gpus_per_node=gen.get("gpus_per_node", 1), partition=cfg["partition"], expname=expname, auto_summarize_results=False, diff --git a/nemo_skills/inference/server/serve_unified.py b/nemo_skills/inference/server/serve_unified.py index c6aa1764de..47feb62f2a 100644 --- a/nemo_skills/inference/server/serve_unified.py +++ b/nemo_skills/inference/server/serve_unified.py @@ -290,8 +290,9 @@ def main(): if args.debug: os.environ["DEBUG"] = "1" - # Set CUDA devices - os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(str(i) for i in range(args.num_gpus)) + # Set CUDA devices (only if not already set by the environment, e.g., SLURM) + if "CUDA_VISIBLE_DEVICES" not in os.environ: + os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(str(i) for i in range(args.num_gpus)) # Build extra config for backend-specific options extra_config = {} diff --git a/nemo_skills/pipeline/eval.py b/nemo_skills/pipeline/eval.py index df273cfc19..4d493e61c5 100644 --- a/nemo_skills/pipeline/eval.py +++ b/nemo_skills/pipeline/eval.py @@ -288,6 +288,12 @@ def eval( None, help="Number of chunks to split the dataset into. If None, will not chunk the dataset.", ), + gpus_per_node: int = typer.Option( + 1, + help="Number of GPUs per node for multi-instance mode. " + "When > 1, launches multiple server instances (one per GPU) within a single job. " + "Requires num_chunks to be a multiple of gpus_per_node.", + ), chunk_ids: str = typer.Option( None, help="List of explicit chunk ids to run. Separate with , or .. to specify range. 
" @@ -493,6 +499,7 @@ def eval( eval_requires_judge=eval_requires_judge, generation_type=generation_type, generation_module=generation_module, + gpus_per_node=gpus_per_node, ) sbatch_kwargs = parse_kwargs(sbatch_kwargs, exclusive=exclusive, qos=qos, time_min=time_min) @@ -517,9 +524,14 @@ def eval( job_server_address, job_server_command, job_sandbox_env_overrides, + job_gpus_per_node, ) = job_args prev_tasks = _task_dependencies + # Add gpus_per_node to server config for multi-instance mode + if job_server_config and job_gpus_per_node > 1: + job_server_config["gpus_per_node"] = job_gpus_per_node + for _ in range(dependent_jobs + 1): has_tasks = True new_task = pipeline_utils.add_task( @@ -529,6 +541,7 @@ def eval( log_dir=log_dir, container=cluster_config["containers"]["nemo-skills"], cluster_config=cluster_config, + num_tasks=job_gpus_per_node, partition=partition, server_config=job_server_config, with_sandbox=job_needs_sandbox or with_sandbox, diff --git a/nemo_skills/pipeline/utils/eval.py b/nemo_skills/pipeline/utils/eval.py index 736d1c7cb6..7674c94f81 100644 --- a/nemo_skills/pipeline/utils/eval.py +++ b/nemo_skills/pipeline/utils/eval.py @@ -267,11 +267,20 @@ def prepare_eval_commands( eval_requires_judge, generation_type=None, generation_module=None, + gpus_per_node: int = 1, ): # TODO: there is a bit too much code duplication here and logic is quite dense, should try to refactor # TODO: should we allow setting num chunks per benchmark when not using groups? Maybe benchmark:rs_num:num_chunks? + # Validate gpus_per_node for multi-instance mode + if gpus_per_node > 1: + if num_chunks is None: + raise ValueError("gpus_per_node > 1 requires num_chunks to be specified") + if num_chunks % gpus_per_node != 0: + raise ValueError(f"num_chunks ({num_chunks}) must be a multiple of gpus_per_node ({gpus_per_node})") + LOG.info(f"Multi-instance mode: {gpus_per_node} GPUs per node, {num_chunks // gpus_per_node} jobs") + if generation_type is not None: if generation_module is not None: raise ValueError("Cannot specify both generation_module and generation_type. 
") @@ -354,7 +363,12 @@ def prepare_eval_commands( rerun_done=rerun_done, ) for seed_idx, (seed, benchmark_chunk_ids) in enumerate(benchmark_args.remaining_jobs.items()): - total_evals += len(benchmark_chunk_ids) + # Multi-instance mode: count unique base chunks (each base chunk = 1 job) + if gpus_per_node > 1: + base_chunks = set((cid // gpus_per_node) * gpus_per_node for cid in benchmark_chunk_ids) + total_evals += len(base_chunks) + else: + total_evals += len(benchmark_chunk_ids) if num_jobs < 0: # if num_jobs is -1, we run all benchmarks in parallel @@ -376,6 +390,7 @@ def prepare_eval_commands( **server_parameters, extra_arguments=extra_arguments, get_random_port=get_random_port, + gpus_per_node=gpus_per_node, ) cur_eval = 0 @@ -398,7 +413,18 @@ def prepare_eval_commands( random_seed=seed, chunk_id=None, ) - for chunk_id in benchmark_chunk_ids: + # Multi-instance mode: compute which base chunks need to run + # If ANY chunk in a batch is incomplete, we run the entire batch (base_chunk) + if gpus_per_node > 1: + base_chunks_to_run = set() + for cid in benchmark_chunk_ids: + base_chunk = (cid // gpus_per_node) * gpus_per_node + base_chunks_to_run.add(base_chunk) + chunks_to_process = sorted(base_chunks_to_run) + else: + chunks_to_process = benchmark_chunk_ids + + for chunk_id in chunks_to_process: job_benchmarks.add(benchmark) effective_generation_module = generation_module or benchmark_args.generation_module @@ -430,12 +456,17 @@ def prepare_eval_commands( f"{job_extra_arguments} " ) + # Multi-instance mode: use shell expression for chunk_id + effective_chunk_id = chunk_id + if gpus_per_node > 1: + effective_chunk_id = f"$(({chunk_id} + $SLURM_LOCALID))" + cmd = pipeline_utils.get_generation_cmd( input_file=benchmark_args.input_file, output_dir=benchmark_output_dir, extra_arguments=full_extra_arguments, random_seed=seed, - chunk_id=chunk_id, + chunk_id=effective_chunk_id, num_chunks=benchmark_args.num_chunks, script=generation_module or benchmark_args.generation_module, # only logging for the first seed @@ -478,12 +509,14 @@ def prepare_eval_commands( # a check above guarantees that this is the same for all tasks in a job generation_task.get_server_command_fn(), job_sandbox_env_overrides, + gpus_per_node, # client num_tasks for multi-instance mode ) ) job_server_config, job_server_address, job_extra_arguments = pipeline_utils.configure_client( **server_parameters, extra_arguments=extra_arguments, get_random_port=get_random_port, + gpus_per_node=gpus_per_node, ) for job_benchmark in job_benchmarks: benchmarks_dict[job_benchmark].job_ids.append(cur_job_idx) diff --git a/nemo_skills/pipeline/utils/exp.py b/nemo_skills/pipeline/utils/exp.py index 5e8642f0a2..3eed966b33 100644 --- a/nemo_skills/pipeline/utils/exp.py +++ b/nemo_skills/pipeline/utils/exp.py @@ -126,7 +126,7 @@ def stdout(self) -> Path: @property def srun_stdout(self) -> Path: - return Path(self.folder) / f"{self.srun_prefix}%j_srun.log" + return Path(self.folder) / f"{self.srun_prefix}%j_%t_srun.log" @property def stderr(self) -> Path: @@ -134,7 +134,7 @@ def stderr(self) -> Path: @property def srun_stderr(self) -> Path: - return Path(self.folder) / f"{self.srun_prefix}%j_srun.log" + return Path(self.folder) / f"{self.srun_prefix}%j_%t_srun.log" @property def ls_term(self) -> str: @@ -143,7 +143,7 @@ def ls_term(self) -> str: The command used to list the files is ls -1 {ls_term} 2> /dev/null """ assert self.folder - return os.path.join(self.folder, "*%j_srun.log") + return os.path.join(self.folder, "*%j_*_srun.log") 
@dataclass(kw_only=True) @@ -312,7 +312,7 @@ def get_executor( srun_args = [ "--no-container-mount-home", "--mpi=pmix", - "--wait=10", + "--wait=240", # wait up to 4 minutes for slower tasks to complete (important for multi-instance mode) # we need to be explicit about this in srun as commands might need to run in parallel f"--ntasks-per-node={tasks_per_node}", f"--nodes={num_nodes}", diff --git a/nemo_skills/pipeline/utils/generation.py b/nemo_skills/pipeline/utils/generation.py index 4b45ed8a39..337b8e87e5 100644 --- a/nemo_skills/pipeline/utils/generation.py +++ b/nemo_skills/pipeline/utils/generation.py @@ -341,8 +341,15 @@ def get_generation_cmd( cmd += "++wait_for_sandbox=true " if chunk_id is not None: - cmd += f" ++num_chunks={num_chunks} ++chunk_id={chunk_id} " - output_file = get_chunked_rs_filename(output_dir, random_seed=random_seed, chunk_id=chunk_id) + # Check if chunk_id is a shell expression (e.g., "$((0 + $SLURM_LOCALID))") + is_shell_expr = isinstance(chunk_id, str) and "$" in str(chunk_id) + + if is_shell_expr: + # For shell expressions, use double quotes so shell expands the expression + cmd += f' ++num_chunks={num_chunks} "++chunk_id={chunk_id}" ' + else: + cmd += f" ++num_chunks={num_chunks} ++chunk_id={chunk_id} " + donefiles = [] # we are always waiting for all chunks in num_chunks, no matter chunk_ids in # the current run (as we don't want to merge partial jobs) @@ -351,10 +358,23 @@ def get_generation_cmd( donefile = f"{filename}.done" donefiles.append(donefile) - if job_end_cmd: - job_end_cmd += f" && touch {donefiles[chunk_id]} " + if is_shell_expr: + # For shell expression, compute the donefile path at runtime + # Get the base pattern with _chunk_0 and replace with shell expression + base_donefile = donefiles[0] # e.g., /path/output_chunk_0.jsonl.done + # Replace "_chunk_0.jsonl" with "_chunk_$((expr)).jsonl" where expr is expanded by shell + # Extract the expression part (e.g., "0 + $SLURM_LOCALID" from "$((0 + $SLURM_LOCALID))") + donefile_pattern = base_donefile.replace("_chunk_0.jsonl", f"_chunk_{chunk_id}.jsonl") + if job_end_cmd: + job_end_cmd += f' && touch "{donefile_pattern}" ' + else: + job_end_cmd = f'touch "{donefile_pattern}" ' else: - job_end_cmd = f"touch {donefiles[chunk_id]} " + output_file = get_chunked_rs_filename(output_dir, random_seed=random_seed, chunk_id=chunk_id) + if job_end_cmd: + job_end_cmd += f" && touch {donefiles[chunk_id]} " + else: + job_end_cmd = f"touch {donefiles[chunk_id]} " # getting file name as if there is no chunking since that's where we want to merge merged_output_file = get_chunked_rs_filename(output_dir=output_dir, random_seed=random_seed) @@ -424,6 +444,7 @@ def configure_client( get_random_port: bool, extra_arguments: str, server_container: str | None = None, + gpus_per_node: int = 1, ): """ Utility function to configure a client for the model inference server. @@ -439,6 +460,7 @@ def configure_client( get_random_port: Whether to get a random port for the server. extra_arguments: Extra arguments to pass to the command. server_container: Container to use for the server. + gpus_per_node: Number of GPUs per node for multi-instance mode. 
Returns: A tuple containing: @@ -467,10 +489,17 @@ def configure_client( server_config["container"] = server_container # Only add server_type if user didn't specify it (allows vllm_multimodal override) server_type_arg = "" if user_specified_server_type else f"++server.server_type={server_type} " - extra_arguments = ( - f"{extra_arguments} {server_type_arg}++server.host=127.0.0.1 " - f"++server.port={server_port} ++server.model={model} " - ) + if gpus_per_node > 1: + # Multi-instance mode: port is computed at runtime based on SLURM_LOCALID + extra_arguments = ( + f"{extra_arguments} {server_type_arg}++server.host=127.0.0.1 " + f'"++server.port=$(({server_port} + $SLURM_LOCALID))" ++server.model={model} ' + ) + else: + extra_arguments = ( + f"{extra_arguments} {server_type_arg}++server.host=127.0.0.1 " + f"++server.port={server_port} ++server.model={model} " + ) else: # model is hosted elsewhere server_config = None # Only add server_type if user didn't specify it diff --git a/nemo_skills/pipeline/utils/server.py b/nemo_skills/pipeline/utils/server.py index e9258e4e7b..631dfe7610 100644 --- a/nemo_skills/pipeline/utils/server.py +++ b/nemo_skills/pipeline/utils/server.py @@ -120,6 +120,7 @@ def get_server_command( server_port: int, server_args: str = "", server_entrypoint: str | None = None, + gpus_per_node: int = 1, ): num_tasks = num_gpus @@ -209,15 +210,29 @@ def get_server_command( elif server_type == "generic": if not server_entrypoint: raise ValueError("For 'generic' server type, 'server_entrypoint' must be specified.") - server_start_cmd = ( - f"{server_entrypoint} " - f" --model {model_path} " - f" --num_gpus {num_gpus} " - f" --num_nodes {num_nodes} " - f" --port {server_port} " - f" {server_args} " - ) - num_tasks = 1 + if gpus_per_node > 1: + # Multi-instance mode: each SLURM task gets its own GPU and port + server_start_cmd = ( + f"echo 'SLURM_LOCALID='$SLURM_LOCALID' SLURM_PROCID='$SLURM_PROCID && " + f"export CUDA_VISIBLE_DEVICES=${{SLURM_LOCALID:-0}} && " + f"{server_entrypoint} " + f" --model {model_path} " + f" --num_gpus 1 " + f" --num_nodes 1 " + f" --port $(({server_port} + ${{SLURM_LOCALID:-0}})) " + f" {server_args} " + ) + num_tasks = gpus_per_node + else: + server_start_cmd = ( + f"{server_entrypoint} " + f" --model {model_path} " + f" --num_gpus {num_gpus} " + f" --num_nodes {num_nodes} " + f" --port {server_port} " + f" {server_args} " + ) + num_tasks = 1 else: raise ValueError(f"Server type '{server_type}' not supported for model inference.") From a11456e19d01775b73061af1bcd9335221ad1fa5 Mon Sep 17 00:00:00 2001 From: Valentin Mendelev Date: Sun, 28 Dec 2025 11:55:45 -0800 Subject: [PATCH 11/26] Documentation and comparison script --- nemo_skills/dataset/nv_tts/TTS_eval.md | 123 +++++ .../scripts/compare_tts_eval_results.py | 425 ++++++++++++++++++ .../nv_tts/scripts/tts_comparison_report.md | 115 +++++ 3 files changed, 663 insertions(+) create mode 100644 nemo_skills/dataset/nv_tts/TTS_eval.md create mode 100644 nemo_skills/dataset/nv_tts/scripts/compare_tts_eval_results.py create mode 100644 nemo_skills/dataset/nv_tts/scripts/tts_comparison_report.md diff --git a/nemo_skills/dataset/nv_tts/TTS_eval.md b/nemo_skills/dataset/nv_tts/TTS_eval.md new file mode 100644 index 0000000000..73864defd7 --- /dev/null +++ b/nemo_skills/dataset/nv_tts/TTS_eval.md @@ -0,0 +1,123 @@ +# TTS Evaluation Based on NeMo-Skills + +This is an adaptation of `examples/tts/magpietts_inference.py` into NeMo-Skills. 
The generation and scoring are separated into 2 stages and can be effectively parallelized. The same code as in `magpietts_inference.py` is used for both stages.
+
+The test sets are also borrowed from the current evaluation setup.
+
+## Getting Started
+
+### 1. Clone and Setup
+
+```bash
+# Clone this branch
+git clone
+cd ns_eval
+
+# Create a virtual environment and install nemo-skills
+python -m venv .venv
+source .venv/bin/activate
+pip install -e .
+```
+
+### 2. Cluster Configuration
+
+Decide which cluster you want to work on and set up the corresponding cluster configuration.
+
+- An example configuration for EOS is provided in `cluster_configs/eos_example.yaml`
+- You can get more configurations from the [NeMo-Skills cluster configs](https://github.com/NVIDIA/NeMo-Skills/tree/main/cluster_configs)
+- Update the username in the configuration file
+
+Note that NeMo-Skills' standard unit of resource allocation is 1 GPU. The EOS cluster is special because it only allows assigning full nodes (not, e.g., 2 GPUs out of 8), so I had to write a fix that is not clean and probably won't be merged as is. You can remove the "EOS FIX 8 chunks per node" commit if running on other clusters; this may entail small changes in the config.
+
+### 3. Prepare Test Data
+
+You can either prepare a new test set or reuse an existing data directory.
+
+**To prepare a new test set:**
+
+```bash
+cd /home/vmendelev/workspace/expressiveness/src/ns_eval && source .venv/bin/activate && \
+python nemo_skills/dataset/nv_tts/prepare.py \
+    --config :/eval_config_full_fixed.json
+```
+
+This will prepare `test.jsonl` for each benchmark with pointers to the files on the cluster.
+
+**To reuse an existing data directory (EOS):**
+
+```
+/lustre/fsw/llmservice_nemo_speechlm/users/vmendelev/tmp/data_dir
+```
+
+### 4. Configuration Files
+
+Review the config file and ensure all required artifacts are in the specified locations:
+
+| Config | Description |
+|--------|-------------|
+| `nemo_skills/dataset/nv_tts/scripts/config/default.yaml` | For `.nemo` model input |
+| `nemo_skills/dataset/nv_tts/scripts/config/grpo_small_step1100.yaml` | For checkpoint + hparams input |
+
+### 5. Environment Setup
+
+Make sure `HF_TOKEN` is present in the environment:
+
+```bash
+export HF_TOKEN=
+# or source from your .env file
+. ~/.env && export HF_TOKEN=$HF_READ_ONLY
+```
+
+## Running Evaluation
+
+### Full Evaluation (Generation + Scoring)
+
+```bash
+cd /home/vmendelev/workspace/expressiveness/src/ns_eval && source .venv/bin/activate && \
+NEMO_SKILLS_DISABLE_UNCOMMITTED_CHANGES_CHECK=1 \
+python -m nemo_skills.dataset.nv_tts.scripts.run_tts_eval \
+    --config nemo_skills/dataset/nv_tts/scripts/config/default.yaml \
+    --stage all \
+    --expname default_eval
+```
+
+### Stage Options
+
+| Stage | Description |
+|-------|-------------|
+| `all` | Run both generation and scoring |
+| `generation` | Run only TTS generation |
+| `scoring` | Run only scoring (requires completed generation) |
+| `aggregation` | Print summary of all metrics |
+
+## Comparing Results
+
+To produce a comparison report between different evaluation runs, pass each run's `eval-results` folder as `path:name` (or `host:path:name` for remote folders over SSH):
+
+```bash
+cd /home/vmendelev/workspace/expressiveness/src/ns_eval && source .venv/bin/activate && \
+python nemo_skills/dataset/nv_tts/scripts/compare_tts_eval_results.py \
+    --eval_folders "<baseline_eval_results>:Baseline" "<new_eval_results>:Candidate" \
+    --output tts_comparison_report.md
+```
+
+See [example report](scripts/tts_comparison_report.md) for sample output.
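+
+Under the hood, the report is built from each benchmark's `metrics.json`. As a minimal illustration of the data the comparison consumes (a sketch, not part of the repository; the paths are placeholders and only a few of the keys written by `score.py` are shown):
+
+```python
+import json
+
+def load_metrics(path):
+    with open(path) as f:
+        return json.load(f)
+
+# Hypothetical paths to the same benchmark scored in two different runs.
+baseline = load_metrics("baseline/eval-results/nv_tts.vctk/metrics.json")
+candidate = load_metrics("candidate/eval-results/nv_tts.vctk/metrics.json")
+
+for key in ("wer_cumulative", "cer_cumulative", "utmosv2_avg"):
+    if key in baseline and key in candidate:
+        delta = candidate[key] - baseline[key]
+        print(f"{key}: {baseline[key]:.4f} -> {candidate[key]:.4f} (delta {delta:+.4f})")
+```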
+ +## Output Structure + +Results are saved to `output_dir/eval-results/` with the following structure: + +``` +output_dir/ +├── eval-results/ +│ ├── nv_tts.libritts_seen/ +│ │ ├── output.jsonl # Generated audio paths + metadata +│ │ ├── output_with_metrics.jsonl # With per-file metrics +│ │ ├── metrics.json # Aggregate metrics (CER, WER, UTMOSv2) +│ │ └── audio/ # Generated audio files +│ ├── nv_tts.vctk/ +│ │ └── ... +│ └── ... +└── eval-logs/ # Job logs +``` diff --git a/nemo_skills/dataset/nv_tts/scripts/compare_tts_eval_results.py b/nemo_skills/dataset/nv_tts/scripts/compare_tts_eval_results.py new file mode 100644 index 0000000000..b8780d8f1f --- /dev/null +++ b/nemo_skills/dataset/nv_tts/scripts/compare_tts_eval_results.py @@ -0,0 +1,425 @@ +#!/usr/bin/env python3 +""" +Compare multiple TTS evaluation results and generate a Markdown report. + +Usage: + python compare_tts_eval_results.py \ + --eval_folders "/path/to/eval1:Model A" "/path/to/eval2:Model B" \ + --output report.md + +Supports remote folders via SSH: + python compare_tts_eval_results.py \ + --eval_folders "host1:/path/to/eval1:Model A" "host2:/path/to/eval2:Model B" \ + --output report.md + +Each eval folder should contain subdirectories for different test sets, +each with a metrics.json file. +""" + +import argparse +import json +import os +import subprocess +from typing import Optional + + +def run_remote_cmd(host: str, cmd: str, timeout: int = 30) -> Optional[str]: + """Run a command on a remote host via SSH.""" + try: + result = subprocess.run(["ssh", host, cmd], capture_output=True, text=True, timeout=timeout) + if result.returncode != 0: + return None + return result.stdout.strip() + except (subprocess.TimeoutExpired, Exception): + return None + + +def list_test_sets_local(eval_folder: str) -> list[str]: + """List test set subdirectories in a local eval folder.""" + if not os.path.isdir(eval_folder): + return [] + return [d for d in os.listdir(eval_folder) if os.path.isdir(os.path.join(eval_folder, d))] + + +def list_test_sets_remote(host: str, eval_folder: str) -> list[str]: + """List test set subdirectories in a remote eval folder via SSH.""" + output = run_remote_cmd(host, f"ls -1 {eval_folder}") + if output is None: + return [] + return [d.strip() for d in output.split("\n") if d.strip()] + + +def load_metrics_local(metrics_path: str) -> Optional[dict]: + """Load metrics from a local JSON file.""" + if not os.path.exists(metrics_path): + return None + with open(metrics_path, "r") as f: + return json.load(f) + + +def load_metrics_remote(host: str, metrics_path: str) -> Optional[dict]: + """Load metrics from a remote JSON file via SSH.""" + output = run_remote_cmd(host, f"cat {metrics_path}") + if output is None: + return None + try: + return json.loads(output) + except json.JSONDecodeError: + return None + + +def load_metrics(path: str, host: Optional[str] = None) -> Optional[dict]: + """Load metrics from a local or remote JSON file.""" + if host: + return load_metrics_remote(host, path) + return load_metrics_local(path) + + +def list_test_sets(eval_folder: str, host: Optional[str] = None) -> list[str]: + """List test set subdirectories.""" + if host: + return list_test_sets_remote(host, eval_folder) + return list_test_sets_local(eval_folder) + + +def format_value(value, format_spec: str = ".4f", na_str: str = "N/A") -> str: + """Format a value for display.""" + if value is None: + return na_str + try: + return f"{value:{format_spec}}" + except (ValueError, TypeError): + return str(value) + + +def 
format_percent(value, na_str: str = "N/A") -> str: + """Format a value as percentage.""" + if value is None: + return na_str + try: + return f"{value * 100:.2f}%" + except (ValueError, TypeError): + return str(value) + + +def parse_folder_spec(spec: str) -> tuple[Optional[str], str, str]: + """ + Parse folder specification in format: + - 'path:name' (local) + - 'path' (local, auto-name) + - 'host:path:name' (remote) + - 'host:path' (remote, auto-name) + """ + parts = spec.split(":") + if len(parts) == 1: + path = parts[0] + name = os.path.basename(path.rstrip("/")) + return None, path, name + if len(parts) == 2: + if parts[0].startswith("/") or parts[0].startswith("."): + return None, parts[0], parts[1] + else: + host, path = parts[0], parts[1] + name = os.path.basename(path.rstrip("/")) + return host, path, name + if len(parts) == 3: + return parts[0], parts[1], parts[2] + if len(parts) > 3: + host = parts[0] + name = parts[-1] + path = ":".join(parts[1:-1]) + return host, path, name + return None, spec, os.path.basename(spec.rstrip("/")) + + +# TTS metrics: (key, display_name, format_func, higher_is_better) +TTS_METRICS = [ + ("wer_cumulative", "WER (cumulative)", format_percent, False), + ("cer_cumulative", "CER (cumulative)", format_percent, False), + ("wer_filewise_avg", "WER (filewise avg)", format_percent, False), + ("cer_filewise_avg", "CER (filewise avg)", format_percent, False), + ("utmosv2_avg", "UTMOS v2", lambda v: format_value(v, ".3f"), True), + ("ssim_pred_gt_avg", "SSIM (pred vs GT)", lambda v: format_value(v, ".4f"), True), + ("ssim_pred_context_avg", "SSIM (pred vs context)", lambda v: format_value(v, ".4f"), True), + ("total_gen_audio_seconds", "Total audio (sec)", lambda v: format_value(v, ".1f"), None), +] + + +def generate_test_set_table( + test_set: str, + models: list[tuple[str, dict]], +) -> list[str]: + """Generate a comparison table for a single test set.""" + lines = [] + lines.append(f"### {test_set}\n") + + # Table header + header = "| Metric | " + " | ".join(name for name, _ in models) + " |" + separator = "|" + "|".join(["---"] * (len(models) + 1)) + "|" + lines.append(header) + lines.append(separator) + + for metric_key, display_name, fmt_func, higher_better in TTS_METRICS: + values = [] + raw_values = [] + for _, m in models: + val = m.get(metric_key) + raw_values.append(val) + values.append(fmt_func(val)) + + # Highlight best value + if higher_better is not None: + valid_vals = [(i, v) for i, v in enumerate(raw_values) if v is not None] + if len(valid_vals) >= 2: + if higher_better: + best_idx = max(valid_vals, key=lambda x: x[1])[0] + else: + best_idx = min(valid_vals, key=lambda x: x[1])[0] + values[best_idx] = f"**{values[best_idx]}**" + + row = f"| {display_name} | " + " | ".join(values) + " |" + lines.append(row) + + lines.append("") + return lines + + +def compute_summary_metrics( + all_test_sets: list[str], + model_metrics: dict[str, dict[str, dict]], +) -> dict[str, dict[str, float]]: + """Compute average metrics across all test sets for each model.""" + summary = {} + for model_name, test_data in model_metrics.items(): + totals = {} + counts = {} + for test_set in all_test_sets: + if test_set not in test_data: + continue + m = test_data[test_set] + for key, _, _, _ in TTS_METRICS: + if key in m and m[key] is not None: + totals[key] = totals.get(key, 0) + m[key] + counts[key] = counts.get(key, 0) + 1 + summary[model_name] = {k: totals[k] / counts[k] for k in totals if counts.get(k, 0) > 0} + return summary + + +def determine_best_model(models: 
list[tuple[str, dict]]) -> tuple[str, str]: + """Determine the best model based on summary metrics.""" + if len(models) < 2: + return "", "Need at least 2 models to compare." + + scores = {name: 0.0 for name, _ in models} + comparisons = [] + + # Weight metrics + weights = { + "wer_cumulative": 2.0, + "cer_cumulative": 1.5, + "utmosv2_avg": 2.0, + "ssim_pred_gt_avg": 1.0, + } + + for metric_key, metric_name, _, higher_better in TTS_METRICS: + if higher_better is None: + continue + weight = weights.get(metric_key, 1.0) + valid = [(name, m.get(metric_key)) for name, m in models if m.get(metric_key) is not None] + if len(valid) < 2: + continue + + if higher_better: + best_val = max(v for _, v in valid) + worst_val = min(v for _, v in valid) + else: + best_val = min(v for _, v in valid) + worst_val = max(v for _, v in valid) + + if best_val == worst_val: + continue + + best_name = [n for n, v in valid if v == best_val][0] + for name, val in valid: + if higher_better: + normalized = (val - worst_val) / (best_val - worst_val) + else: + normalized = (worst_val - val) / (worst_val - best_val) + scores[name] += normalized * weight + comparisons.append(f"{best_name} leads in {metric_name}") + + if not any(scores.values()): + return "", "Unable to determine best model: insufficient metrics." + + sorted_scores = sorted(scores.items(), key=lambda x: x[1], reverse=True) + best_name = sorted_scores[0][0] + return best_name, ( + f"**{best_name}** performs best overall. " + f"Key advantages: {'; '.join(comparisons[:4]) if comparisons else 'balanced performance'}." + ) + + +def generate_report( + model_names: list[str], + all_test_sets: list[str], + model_metrics: dict[str, dict[str, dict]], + output_path: str, +): + """Generate the full Markdown comparison report.""" + lines = [] + lines.append("# TTS Evaluation Comparison Report\n") + lines.append(f"Comparing {len(model_names)} model(s): " + ", ".join(model_names) + "\n") + + # Summary section + lines.append("## Summary (Averaged Across Test Sets)\n") + summary = compute_summary_metrics(all_test_sets, model_metrics) + summary_models = [(name, summary.get(name, {})) for name in model_names] + + header = "| Metric | " + " | ".join(model_names) + " |" + separator = "|" + "|".join(["---"] * (len(model_names) + 1)) + "|" + lines.append(header) + lines.append(separator) + + for metric_key, display_name, fmt_func, higher_better in TTS_METRICS: + if metric_key == "total_gen_audio_seconds": + continue # Skip total audio in summary + values = [] + raw_values = [] + for name in model_names: + val = summary.get(name, {}).get(metric_key) + raw_values.append(val) + values.append(fmt_func(val)) + + if higher_better is not None: + valid_vals = [(i, v) for i, v in enumerate(raw_values) if v is not None] + if len(valid_vals) >= 2: + if higher_better: + best_idx = max(valid_vals, key=lambda x: x[1])[0] + else: + best_idx = min(valid_vals, key=lambda x: x[1])[0] + values[best_idx] = f"**{values[best_idx]}**" + + row = f"| {display_name} | " + " | ".join(values) + " |" + lines.append(row) + + lines.append("") + + # Best model analysis + best_name, explanation = determine_best_model(summary_models) + lines.append("### Analysis\n") + lines.append(explanation) + lines.append("") + + # Per-test-set sections + lines.append("## Per-Test-Set Results\n") + + for test_set in sorted(all_test_sets): + test_models = [] + for name in model_names: + m = model_metrics.get(name, {}).get(test_set) + if m is not None: + test_models.append((name, m)) + + if not test_models: + continue + + 
lines.extend(generate_test_set_table(test_set, test_models)) + + # Legend + lines.append("---") + lines.append("*Lower WER/CER is better, higher UTMOS/SSIM is better. **bold** = best value.*") + + report = "\n".join(lines) + + with open(output_path, "w") as f: + f.write(report) + + print(f"Report saved to: {output_path}") + print("\n" + "=" * 60) + print(report) + + +def main(): + parser = argparse.ArgumentParser( + description="Compare TTS evaluation results from multiple folders", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +Examples: + # Remote folders (via SSH): + python compare_tts_eval_results.py \\ + --eval_folders "login-eos:/path/to/eval1/eval-results:Model A" \\ + "login-eos:/path/to/eval2/eval-results:Model B" \\ + --output comparison_report.md + + # Local folders: + python compare_tts_eval_results.py \\ + --eval_folders "/path/to/eval1:Model A" "/path/to/eval2:Model B" \\ + --output comparison_report.md + +Each eval folder should contain subdirectories for different test sets +(e.g., nv_tts.vctk, nv_tts.libritts_test_clean), each with a metrics.json. + """, + ) + parser.add_argument( + "--eval_folders", + nargs="+", + required=True, + help="Evaluation folders: 'path:name', 'host:path:name', or 'host:path'", + ) + script_dir = os.path.dirname(os.path.abspath(__file__)) + default_output = os.path.join(script_dir, "tts_comparison_report.md") + parser.add_argument("--output", type=str, default=default_output, help="Output Markdown file path") + + args = parser.parse_args() + + # Parse folder specs + folder_specs = [] + for spec in args.eval_folders: + host, path, name = parse_folder_spec(spec) + folder_specs.append((host, path, name)) + + # Discover all test sets across all models + all_test_sets = set() + model_test_sets = {} + for host, path, name in folder_specs: + test_sets = list_test_sets(path, host) + model_test_sets[name] = (host, path, test_sets) + all_test_sets.update(test_sets) + loc = f"{host}:{path}" if host else path + print(f"Found {len(test_sets)} test sets for {name} at {loc}") + + if not all_test_sets: + print("Error: No test sets found.") + return 1 + + # Load metrics for each model and test set + model_metrics: dict[str, dict[str, dict]] = {} + model_names = [] + + for host, path, name in folder_specs: + model_names.append(name) + model_metrics[name] = {} + test_sets = model_test_sets[name][2] + + for test_set in test_sets: + metrics_path = f"{path}/{test_set}/metrics.json" + metrics = load_metrics(metrics_path, host) + if metrics is not None: + model_metrics[name][test_set] = metrics + print(f" Loaded {test_set} for {name}") + else: + print(f" Skipping {test_set} for {name} (no metrics.json)") + + # Filter to test sets that have metrics for at least one model + valid_test_sets = [ts for ts in all_test_sets if any(ts in model_metrics.get(n, {}) for n in model_names)] + + if not valid_test_sets: + print("Error: No valid metrics found for any test set.") + return 1 + + generate_report(model_names, valid_test_sets, model_metrics, args.output) + return 0 + + +if __name__ == "__main__": + exit(main()) diff --git a/nemo_skills/dataset/nv_tts/scripts/tts_comparison_report.md b/nemo_skills/dataset/nv_tts/scripts/tts_comparison_report.md new file mode 100644 index 0000000000..361673d849 --- /dev/null +++ b/nemo_skills/dataset/nv_tts/scripts/tts_comparison_report.md @@ -0,0 +1,115 @@ +# TTS Evaluation Comparison Report + +Comparing 2 model(s): GRPO Step 1100, A3 Baseline + +## Summary (Averaged Across Test Sets) + +| Metric | GRPO Step 1100 | A3 
Baseline | +|---|---|---| +| WER (cumulative) | 4.56% | **3.65%** | +| CER (cumulative) | 2.75% | **2.24%** | +| WER (filewise avg) | 4.49% | **3.54%** | +| CER (filewise avg) | 3.14% | **2.28%** | +| UTMOS v2 | **3.124** | 3.005 | +| SSIM (pred vs GT) | **0.6599** | 0.0255 | +| SSIM (pred vs context) | **0.6709** | 0.0259 | + +### Analysis + +**A3 Baseline** performs best overall. Key advantages: A3 Baseline leads in WER (cumulative); A3 Baseline leads in CER (cumulative); A3 Baseline leads in WER (filewise avg); A3 Baseline leads in CER (filewise avg). + +## Per-Test-Set Results + +### nv_tts.libritts_seen + +| Metric | GRPO Step 1100 | +|---|---| +| WER (cumulative) | 2.76% | +| CER (cumulative) | 1.92% | +| WER (filewise avg) | 2.69% | +| CER (filewise avg) | 1.83% | +| UTMOS v2 | 3.322 | +| SSIM (pred vs GT) | 0.8022 | +| SSIM (pred vs context) | 0.8022 | +| Total audio (sec) | 1314.7 | + +### nv_tts.libritts_test_clean + +| Metric | GRPO Step 1100 | A3 Baseline | +|---|---|---| +| WER (cumulative) | 1.37% | **1.27%** | +| CER (cumulative) | 0.46% | **0.41%** | +| WER (filewise avg) | 1.47% | **1.30%** | +| CER (filewise avg) | 0.51% | **0.42%** | +| UTMOS v2 | **3.279** | 3.153 | +| SSIM (pred vs GT) | **0.8378** | -0.0138 | +| SSIM (pred vs context) | **0.8378** | -0.0138 | +| Total audio (sec) | 12454.1 | 15257.6 | + +### nv_tts.riva_hard_digits + +| Metric | GRPO Step 1100 | A3 Baseline | +|---|---|---| +| WER (cumulative) | 3.52% | **2.13%** | +| CER (cumulative) | 2.58% | **1.41%** | +| WER (filewise avg) | 3.27% | **1.96%** | +| CER (filewise avg) | 2.43% | **1.32%** | +| UTMOS v2 | **3.119** | 3.035 | +| SSIM (pred vs GT) | **0.6976** | 0.0460 | +| SSIM (pred vs context) | **0.6976** | 0.0460 | +| Total audio (sec) | 2462.9 | 3022.4 | + +### nv_tts.riva_hard_letters + +| Metric | GRPO Step 1100 | A3 Baseline | +|---|---|---| +| WER (cumulative) | **5.34%** | 6.17% | +| CER (cumulative) | **2.92%** | 4.51% | +| WER (filewise avg) | **5.00%** | 5.82% | +| CER (filewise avg) | **2.82%** | 4.26% | +| UTMOS v2 | 2.988 | **2.991** | +| SSIM (pred vs GT) | **0.6505** | 0.0432 | +| SSIM (pred vs context) | **0.6505** | 0.0432 | +| Total audio (sec) | 1984.2 | 2432.8 | + +### nv_tts.riva_hard_money + +| Metric | GRPO Step 1100 | A3 Baseline | +|---|---|---| +| WER (cumulative) | 2.92% | **0.92%** | +| CER (cumulative) | 2.00% | **0.55%** | +| WER (filewise avg) | 2.86% | **0.86%** | +| CER (filewise avg) | 1.96% | **0.49%** | +| UTMOS v2 | **3.191** | 3.076 | +| SSIM (pred vs GT) | **0.7075** | 0.0428 | +| SSIM (pred vs context) | **0.7075** | 0.0428 | +| Total audio (sec) | 2635.0 | 3149.5 | + +### nv_tts.riva_hard_short + +| Metric | GRPO Step 1100 | A3 Baseline | +|---|---|---| +| WER (cumulative) | 15.66% | **9.84%** | +| CER (cumulative) | 9.24% | **6.11%** | +| WER (filewise avg) | 15.66% | **9.84%** | +| CER (filewise avg) | 12.32% | **6.76%** | +| UTMOS v2 | 2.525 | **2.544** | +| SSIM (pred vs GT) | **0.3004** | 0.0373 | +| SSIM (pred vs context) | **0.3004** | 0.0373 | +| Total audio (sec) | 312.4 | 573.5 | + +### nv_tts.vctk + +| Metric | GRPO Step 1100 | A3 Baseline | +|---|---|---| +| WER (cumulative) | **0.36%** | 1.55% | +| CER (cumulative) | **0.09%** | 0.46% | +| WER (filewise avg) | **0.47%** | 1.43% | +| CER (filewise avg) | **0.10%** | 0.45% | +| UTMOS v2 | **3.441** | 3.229 | +| SSIM (pred vs GT) | **0.6236** | -0.0028 | +| SSIM (pred vs context) | **0.7002** | -0.0004 | +| Total audio (sec) | 310.6 | 334.6 | + +--- +*Lower WER/CER is better, higher UTMOS/SSIM is 
better. **bold** = best value.* From e80448eb9dfcf92c85f730adc3cb6048347ff71d Mon Sep 17 00:00:00 2001 From: Valentin Mendelev Date: Sun, 28 Dec 2025 12:08:20 -0800 Subject: [PATCH 12/26] eos config example --- cluster_configs/eos_example.yaml | 48 ++++++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) create mode 100644 cluster_configs/eos_example.yaml diff --git a/cluster_configs/eos_example.yaml b/cluster_configs/eos_example.yaml new file mode 100644 index 0000000000..124f85eb0d --- /dev/null +++ b/cluster_configs/eos_example.yaml @@ -0,0 +1,48 @@ +executor: slurm + +ssh_tunnel: + host: login-eos.nvidia.com + # ------------------------------- Fill this up! ------------------------------- + user: your_username + job_dir: /lustre/fsw/llmservice_nemo_speechlm/users/your_username/code/nemo-run + identity: "" + # ----------------------------------------------------------------------------- + +# if you're running directly from cluster, you only need to define job_dir and shouldn't use ssh_tunnel +# job_dir: + +account: llmservice_nemo_speechlm +partition: batch +job_name_prefix: "" + +disable_gpus_per_node: True + +containers: + trtllm: /lustre/share/llmservice_nemo_reasoning/images/nemo-skills-trtllm-latest.sqsh + vllm: /lustre/share/llmservice_nemo_reasoning/images/nemo-skills-vllm-latest.sqsh + sglang: /lustre/share/llmservice_nemo_reasoning/images/nemo-skills-sglang-latest.sqsh + nemo-rl: /lustre/share/llmservice_nemo_reasoning/images/nemo-skills-nemo-rl-latest.sqsh + megatron: /lustre/share/llmservice_nemo_reasoning/images/nemo-skills-megatron-latest.sqsh + sandbox: /lustre/share/llmservice_nemo_reasoning/images/nemo-skills-sandbox-latest.sqsh + nemo-skills: /lustre/share/llmservice_nemo_reasoning/images/nemo-skills-latest.sqsh + verl: /lustre/share/llmservice_nemo_reasoning/images/nemo-skills-verl-latest.sqsh + +mounts: + # - /lustre/fsw/llmservice_nemo_reasoning/hf_models:/hf_models + # - /lustre/fsw/llmservice_nemo_reasoning/images/swe-bench:/swe-bench-images + - /lustre/fsw/llmservice_nemo_speechlm:/lustre/fsw/llmservice_nemo_speechlm + + # you also need to mount your own workspace folder (or any other folder you need) + # - /lustre/fsw/llmservice_nemo_reasoning/users/igitman/:/workspace + +env_vars: + # ------------------------------- Fill this up! 
------------------------------- + - HF_HOME=/lustre/fsw/llmservice_nemo_speechlm/users/your_username/hfcache + # ----------------------------------------------------------------------------- + +timeouts: + batch: 04:00:00 + interactive: 02:00:00 + +mail_type: FAIL +mail_user: # From 547c9120623a7f6c83a183c1c9a810d5e55d2d2f Mon Sep 17 00:00:00 2001 From: Valentin Mendelev Date: Fri, 9 Jan 2026 08:31:02 -0800 Subject: [PATCH 13/26] EAR TTS backend --- .../dataset/nv_tts/scripts/run_tts_eval.py | 10 +- nemo_skills/inference/server/serve_unified.py | 24 +- .../multimodal/server/backends/__init__.py | 6 +- .../server/backends/ear_tts_backend.py | 530 ++++++++++++++++++ 4 files changed, 562 insertions(+), 8 deletions(-) create mode 100644 recipes/multimodal/server/backends/ear_tts_backend.py diff --git a/nemo_skills/dataset/nv_tts/scripts/run_tts_eval.py b/nemo_skills/dataset/nv_tts/scripts/run_tts_eval.py index c4548d3305..2e51e9c2de 100644 --- a/nemo_skills/dataset/nv_tts/scripts/run_tts_eval.py +++ b/nemo_skills/dataset/nv_tts/scripts/run_tts_eval.py @@ -47,10 +47,10 @@ def run_generation(cfg: dict, expname: str): """Run generation stage using ns eval, returns experiment object.""" gen = cfg["generation"] - # Add nemo_code_path to server_args + # Add generation_code_path to server_args server_args = gen["server_args"] - if cfg.get("nemo_code_path"): - server_args += f" --code_path {cfg['nemo_code_path']}" + if cfg.get("generation_code_path"): + server_args += f" --code_path {cfg['generation_code_path']}" # Parse extra_args for the context extra_args = gen.get("extra_args", "").split() if gen.get("extra_args") else [] @@ -93,7 +93,7 @@ def main(): cfg = load_config(args.config) scoring = cfg.get("scoring", {}) hf_token = os.environ.get("HF_TOKEN", "") - nemo_path = cfg["nemo_code_path"] + scoring_code_path = cfg.get("scoring_code_path", "") output_dir = cfg["output_dir"] gen_exp_name = None @@ -131,7 +131,7 @@ def main(): scoring_cmd = ( f"HF_TOKEN={hf_token} " - f"PYTHONPATH={nemo_path}:$PYTHONPATH " + f"PYTHONPATH={scoring_code_path}:$PYTHONPATH " f"python -m nemo_skills.dataset.nv_tts.scripts.score " f"--results_dir {output_dir} " f"--benchmark {benchmark_dir} " diff --git a/nemo_skills/inference/server/serve_unified.py b/nemo_skills/inference/server/serve_unified.py index 47feb62f2a..76a4b5524a 100644 --- a/nemo_skills/inference/server/serve_unified.py +++ b/nemo_skills/inference/server/serve_unified.py @@ -138,8 +138,8 @@ def main(): parser.add_argument( "--backend", default="salm", - choices=["salm", "magpie_tts", "s2s", "s2s_incremental", "s2s_session"], - help="Backend type: salm (speech-augmented LM), magpie_tts (MagpieTTS with RTF metrics), s2s (speech-to-speech offline), s2s_incremental (frame-by-frame processing), s2s_session (session-aware multi-turn)", + choices=["salm", "magpie_tts", "ear_tts", "ear_tts_batch", "s2s", "s2s_incremental", "s2s_session"], + help="Backend type: salm (speech-augmented LM), magpie_tts (MagpieTTS with RTF metrics), ear_tts (EAR TTS streaming decode), ear_tts_batch (EAR TTS batch decode), s2s (speech-to-speech offline), s2s_incremental (frame-by-frame processing), s2s_session (session-aware multi-turn)", ) # Backend-specific model paths @@ -273,6 +273,10 @@ def main(): setup_pythonpath(args.code_path) apply_safetensors_patch(args.hack_path) + # Store code_path for backends that may need to add paths late + if args.code_path: + os.environ["UNIFIED_SERVER_CODE_PATH"] = args.code_path + # Set environment variables os.environ["UNIFIED_SERVER_HOST"] = 
args.host os.environ["UNIFIED_SERVER_PORT"] = str(args.port) @@ -325,6 +329,15 @@ def main(): if args.silence_padding_sec != 5.0: extra_config["silence_padding_sec"] = args.silence_padding_sec + # EAR TTS backend options + if args.backend in ("ear_tts", "ear_tts_batch"): + if args.config_path: + extra_config["config_path"] = args.config_path + if args.tts_checkpoint_path: + extra_config["tts_checkpoint_path"] = args.tts_checkpoint_path + if args.speaker_reference: + extra_config["speaker_reference"] = args.speaker_reference + # S2S Incremental/Session backend options (shared config) if args.backend in ("s2s_incremental", "s2s_session"): if args.config_path: @@ -375,6 +388,13 @@ def main(): print(" Legacy Codebooks: True") if args.legacy_text_conditioning: print(" Legacy Text Conditioning: True") + if args.backend in ("ear_tts", "ear_tts_batch"): + if args.config_path: + print(f" Config Path: {args.config_path}") + if args.speaker_reference: + print(f" Speaker Reference: {args.speaker_reference}") + if args.tts_checkpoint_path: + print(f" TTS Checkpoint: {args.tts_checkpoint_path}") if args.backend in ("s2s_incremental", "s2s_session"): if args.config_path: print(f" Config Path: {args.config_path}") diff --git a/recipes/multimodal/server/backends/__init__.py b/recipes/multimodal/server/backends/__init__.py index fe3c4c1abd..f31032bfea 100644 --- a/recipes/multimodal/server/backends/__init__.py +++ b/recipes/multimodal/server/backends/__init__.py @@ -18,6 +18,8 @@ Available backends: - salm: Speech-Augmented Language Model (text output from text/audio input) - magpie_tts: MagpieTTS using MagpieInferenceRunner with RTF metrics (audio output from text input) +- ear_tts: EAR TTS using NemotronVoiceChat TTS model (audio output from text input, streaming decode) +- ear_tts_batch: EAR TTS optimized version (audio output from text input, batch decode at end) - s2s: Speech-to-Speech using DuplexS2S offline (text output from audio input) - s2s_incremental: Speech-to-Speech using NemotronVoiceChat incremental (text+audio from audio) - s2s_session: Speech-to-Speech with session support for multi-turn conversations @@ -39,8 +41,10 @@ BACKEND_REGISTRY = { "salm": ("salm_backend", "SALMBackend"), "magpie_tts": ("magpie_tts_backend", "MagpieTTSBackend"), + "ear_tts": ("ear_tts_backend", "EarTTSBackend"), + "ear_tts_batch": ("ear_tts_backend", "EarTTSBatchBackend"), "s2s": ("s2s_backend", "S2SBackend"), - "s2s_incremental": ("s2s_incremental_backend", "S2SIncrementalBackend"), + "s2s_incremental": ("s2s_incremental_backend_c", "S2SIncrementalBackend"), "s2s_session": ("s2s_session_backend", "S2SSessionBackend"), } diff --git a/recipes/multimodal/server/backends/ear_tts_backend.py b/recipes/multimodal/server/backends/ear_tts_backend.py new file mode 100644 index 0000000000..7b9f4daf3f --- /dev/null +++ b/recipes/multimodal/server/backends/ear_tts_backend.py @@ -0,0 +1,530 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 + +"""EAR TTS backend using NemotronVoiceChat's TTS model for text-to-speech synthesis.""" + +import io +import json +import os +import time +from dataclasses import dataclass +from typing import Any, Dict, List, Optional + +import numpy as np +import soundfile as sf +import torch +import torchaudio +from omegaconf import DictConfig, OmegaConf + +from .base import BackendConfig, GenerationRequest, GenerationResult +from .magpie_tts_backend import MagpieTTSBackend, MagpieTTSConfig + +# TTS constants +TTS_SAMPLE_RATE = 22050 +FRAME_SIZE_SEC = 0.08 # 80ms per frame +DEFAULT_CODEC_TOKEN_HISTORY_SIZE = 60 +SILENCE_THRESHOLD = 0.1 # Max magnitude threshold for silence detection +SILENCE_DURATION_SEC = 2.0 # Stop if last N seconds are silent + + +@dataclass +class EarTTSConfig(MagpieTTSConfig): + """Configuration for EAR TTS backend - extends MagpieTTSConfig.""" + + # EAR TTS specific paths + tts_checkpoint_path: Optional[str] = None # Path to TTS checkpoint (safetensors) + speaker_reference: Optional[str] = None # Speaker reference audio for voice cloning + config_path: Optional[str] = None # Optional YAML config path + + # TTS parameters + codec_token_history_size: int = DEFAULT_CODEC_TOKEN_HISTORY_SIZE + guidance_enabled: bool = True + + @classmethod + def from_dict(cls, d: Dict[str, Any]) -> "EarTTSConfig": + known_fields = { + "model_path", + "device", + "dtype", + "max_new_tokens", + "temperature", + "top_p", + "top_k", + "codec_model_path", + "use_cfg", + "cfg_scale", + "max_decoder_steps", + "use_local_transformer", + "output_sample_rate", + "hparams_file", + "checkpoint_file", + "legacy_codebooks", + "legacy_text_conditioning", + "hparams_from_wandb", + # EAR TTS specific + "tts_checkpoint_path", + "speaker_reference", + "config_path", + "codec_token_history_size", + "guidance_enabled", + } + known = {k: v for k, v in d.items() if k in known_fields} + extra = {k: v for k, v in d.items() if k not in known_fields} + return cls(**known, extra_config=extra) + + +class EarTTSBackend(MagpieTTSBackend): + """ + EAR TTS backend using NemotronVoiceChat's TTS model. + + Inherits from MagpieTTSBackend and overrides load_model() and generate() + to use the EAR TTS model instead of MagpieTTS. 
+ """ + + @property + def name(self) -> str: + return "ear_tts" + + def __init__(self, config: BackendConfig): + # Convert to EarTTSConfig + if isinstance(config, EarTTSConfig): + self.ear_config = config + else: + self.ear_config = EarTTSConfig.from_dict( + { + **{ + k: getattr(config, k) + for k in ["model_path", "device", "dtype", "max_new_tokens", "temperature", "top_p", "top_k"] + if hasattr(config, k) + }, + **config.extra_config, + } + ) + + # Call grandparent __init__ to skip MagpieTTSBackend's init + from .base import InferenceBackend + + InferenceBackend.__init__(self, self.ear_config) + self.tts_config = self.ear_config + + self._model = None + self._model_cfg = None + self._tokenizer = None + + # TTS state + self.first_context_subword_id = None + self.generation_config = None + self.first_tts_code_input = None + self.first_tts_past_key_values_input = None + self.target_sample_rate = TTS_SAMPLE_RATE + self.target_fps = None + + def _clone_cache(self, cache): + """Deep clone cache structures.""" + if cache is None: + return None + if isinstance(cache, torch.Tensor): + return cache.detach().clone() + if isinstance(cache, (list, tuple)): + return type(cache)(self._clone_cache(x) for x in cache) + if isinstance(cache, dict): + return {k: self._clone_cache(v) for k, v in cache.items()} + if hasattr(cache, "__dict__"): + import copy + + return copy.deepcopy(cache) + return cache + + def load_model(self) -> None: + """Load the EAR TTS model from NemotronVoiceChat.""" + import sys + + from safetensors.torch import load_file + + print(f"[EarTTS] Loading model from {self.config.model_path}...") + + # Clear cached nemo modules FIRST to force reimport from our paths + nemo_modules = [k for k in sys.modules.keys() if k.startswith("nemo")] + for mod in nemo_modules: + del sys.modules[mod] + print(f"[EarTTS] Cleared {len(nemo_modules)} cached nemo modules") + + # Ensure code path is FIRST in sys.path (for speechlm2 module) + code_path = os.environ.get("UNIFIED_SERVER_CODE_PATH", "") + print(f"[EarTTS] UNIFIED_SERVER_CODE_PATH = '{code_path}'") + if code_path: + paths = [p for p in code_path.split(":") if p] + # Remove existing entries and re-add at front + for path in paths: + while path in sys.path: + sys.path.remove(path) + for path in reversed(paths): + sys.path.insert(0, path) + print(f"[EarTTS] Added to sys.path: {path}") + else: + print("[EarTTS] WARNING: No code path found in env!") + + # Debug: show current path + print(f"[EarTTS] sys.path (first 5): {sys.path[:5]}") + + try: + from nemo.collections.speechlm2.models.nemotron_voicechat import NemotronVoiceChat + from nemo.collections.speechlm2.parts.pretrained import set_model_dict_for_partial_init + except ImportError as e: + raise RuntimeError(f"Failed to import NemotronVoiceChat. 
Error: {e}") + + torch.backends.cudnn.allow_tf32 = True + torch.backends.cuda.matmul.allow_tf32 = True + torch.set_float32_matmul_precision("high") + + # Load config + tts_path = self.ear_config.tts_checkpoint_path or self.config.model_path + config_file = os.path.join(tts_path, "config.json") + print(f"[EarTTS] Loading config: {config_file}") + + with open(config_file, "r") as f: + cfg = DictConfig(json.load(f)) + + # Set speaker reference + speaker_ref = self.ear_config.speaker_reference + if not speaker_ref and self.ear_config.config_path: + yaml_cfg = OmegaConf.load(self.ear_config.config_path) + speaker_ref = yaml_cfg.get("model", {}).get("inference_speaker_reference") + if speaker_ref: + if "model" not in cfg: + cfg.model = {} + cfg.model.inference_speaker_reference = speaker_ref + + self._model_cfg = cfg + + # Disable pretrained model loading + if hasattr(cfg.model, "speech_generation") and hasattr(cfg.model.speech_generation, "model"): + cfg.model.speech_generation.model.pretrained_model = None + if hasattr(cfg.model, "stt") and hasattr(cfg.model.stt, "model"): + cfg.model.stt.model.pretrained_s2s_model = None + + # Initialize and load model + print("[EarTTS] Initializing model structure...") + self._model = NemotronVoiceChat(OmegaConf.to_container(cfg, resolve=True)) + + safetensors_path = os.path.join(tts_path, "model.safetensors") + if os.path.exists(safetensors_path): + print(f"[EarTTS] Loading TTS weights from: {tts_path}") + state_dict = load_file(safetensors_path) + tts_only = {k: v for k, v in state_dict.items() if k.startswith("tts_model.")} + print(f"[EarTTS] Loading {len(tts_only)} TTS parameters") + tts_only = set_model_dict_for_partial_init(tts_only, self._model.state_dict()) + self._model.load_state_dict(tts_only, strict=False) + else: + raise ValueError(f"TTS weights not found at {safetensors_path}") + + self._model.to(self.config.device) + self._model.eval() + self._tokenizer = self._model.stt_model.tokenizer + + if hasattr(self._model, "tts_model"): + self.target_fps = self._model.tts_model.target_fps + self.target_sample_rate = self._model.tts_model.target_sample_rate + print(f"[EarTTS] TTS: fps={self.target_fps}, sample_rate={self.target_sample_rate}") + self._prepare_tts_initial_state() + + self._is_loaded = True + print("[EarTTS] Model loaded successfully") + + def _prepare_tts_initial_state(self): + """Prepare TTS warmup state with speaker reference.""" + from nemo.collections.audio.parts.utils.resampling import resample + from nemo.collections.speechlm2.parts.precision import fp32_precision + + if not hasattr(self._model, "tts_model"): + return + + speaker_ref = self._model_cfg.model.get("inference_speaker_reference") if self._model_cfg else None + if not speaker_ref: + speaker_ref = self.ear_config.speaker_reference + if not speaker_ref: + print("[EarTTS] Warning: No speaker reference") + return + + print(f"[EarTTS] Preparing TTS with speaker: {speaker_ref}") + + with fp32_precision(): + speaker_audio, speaker_sr = torchaudio.load(speaker_ref) + speaker_audio = resample(speaker_audio, speaker_sr, self._model.tts_model.target_sample_rate) + + speaker_audio = speaker_audio.to(self.config.device) + speaker_audio_lens = torch.tensor([speaker_audio.size(1)], device=self.config.device).long() + + self._model.tts_model.set_init_inputs(speaker_audio=speaker_audio, speaker_audio_lens=speaker_audio_lens) + init_inputs = self._model.tts_model.get_init_inputs(B=1) + self.generation_config = self._model.tts_model._get_generation_config( + 
guidance_enabled=self.ear_config.guidance_enabled + ) + init_inputs.update({"use_cache": True, "past_key_values": None, "guidance_enabled": True}) + + with torch.no_grad(): + outputs = self._model.tts_model.tts_model(**init_inputs) + code = init_inputs["code"][:, -1:] + + self.first_context_subword_id = init_inputs["subword_ids"][:, -1].unsqueeze(-1) + self.first_tts_code_input = code.detach().clone() + self.first_tts_past_key_values_input = self._clone_cache(outputs.past_key_values) + print("[EarTTS] TTS warmup state prepared") + + @torch.no_grad() + def _synthesize_text(self, text: str) -> Optional[np.ndarray]: + """Synthesize audio from text using EAR TTS. + + Generates audio frames until the last 2 seconds have max magnitude below threshold, + indicating the model has finished speaking. + """ + from nemo.collections.speechlm2.parts.precision import fp32_precision + + if not text or not self.generation_config: + return None + + device = self.config.device + token_ids = self._tokenizer.text_to_ids(text) + if not token_ids: + return None + + num_tokens = len(token_ids) + token_tensor = torch.tensor(token_ids, dtype=torch.long, device=device).unsqueeze(0) + + # Max frames: generous upper bound (10x tokens should be plenty) + max_frames = num_tokens * 10 + + # Samples needed to check for silence (2 seconds) + sample_rate = self.target_sample_rate or TTS_SAMPLE_RATE + samples_for_silence_check = int(sample_rate * SILENCE_DURATION_SEC) + + # Initialize TTS state + past_key_values = self._clone_cache(self.first_tts_past_key_values_input) + code = self.first_tts_code_input.detach().clone() + codec_history_size = self.ear_config.codec_token_history_size + audio_toks_buffer = ( + self._model.tts_model.codec_silence_tokens.view(1, 1, -1).expand(-1, codec_history_size, -1).to(device) + ) + + audio_segments = [] + samples_per_frame = int(float(sample_rate) * FRAME_SIZE_SEC) + total_samples = 0 + + for frame_idx in range(max_frames): + # Cycle through tokens, repeating the last token after we've used all + token_idx = min(frame_idx, num_tokens - 1) + current_subword_id = token_tensor[:, token_idx].unsqueeze(-1) + + if frame_idx == 0: + prev_subword_id = self.first_context_subword_id + else: + prev_token_idx = min(frame_idx - 1, num_tokens - 1) + prev_subword_id = token_tensor[:, prev_token_idx].unsqueeze(-1) + + code, past_key_values = self._model.tts_model.infer_codes_one_step( + current_subword_id=current_subword_id, + prev_subword_id=prev_subword_id, + current_subword_mask=torch.ones(1, 1, device=device, dtype=torch.bool), + prev_audio_tokens=code, + past_key_values=past_key_values, + guidance_enabled=self.ear_config.guidance_enabled, + generation_config=self.generation_config, + ignore_eos_flag_stop=True, + ) + + audio_toks_buffer = torch.cat([audio_toks_buffer[:, 1:], code], dim=1) + + with fp32_precision(): + decoded_audio, _ = self._model.tts_model.audio_codec.decode( + audio_toks_buffer, torch.tensor([codec_history_size], dtype=torch.long, device=device) + ) + frame_audio = decoded_audio[:, :, -samples_per_frame:] + audio_segments.append(frame_audio) + total_samples += samples_per_frame + + # Check for silence after we have enough samples + if total_samples >= samples_for_silence_check: + # Get last 2 seconds of audio + recent_audio = torch.cat(audio_segments, dim=-1)[:, :, -samples_for_silence_check:] + max_magnitude = recent_audio.abs().max().item() + + if max_magnitude < SILENCE_THRESHOLD: + # Silence detected - stop generating + break + + if audio_segments: + audio_tensor = 
torch.cat(audio_segments, dim=-1) + + # Trim trailing silence + audio_np = audio_tensor.float().cpu().numpy().squeeze() + + # Find where audio becomes silent (from the end) + window_size = int(sample_rate * 0.1) # 100ms window + for trim_point in range(len(audio_np) - window_size, 0, -window_size): + window_max = np.abs(audio_np[trim_point : trim_point + window_size]).max() + if window_max >= SILENCE_THRESHOLD: + # Found non-silent audio, trim after this point + small buffer + audio_np = audio_np[: trim_point + window_size * 2] + break + + max_val = np.abs(audio_np).max() + if max_val > 0: + audio_np = audio_np / max_val * 0.95 + return audio_np + return None + + def generate(self, requests: List[GenerationRequest]) -> List[GenerationResult]: + """Generate audio from text requests.""" + if not self._is_loaded: + return [GenerationResult(error="Model not loaded", request_id=r.request_id) for r in requests] + if not requests: + return [] + + results = [] + for req in requests: + start_time = time.time() + try: + parsed = self._extract_json(req.text) + text = parsed.get("text", "") + + if not text: + results.append(GenerationResult(error="No text provided", request_id=req.request_id)) + continue + + audio_np = self._synthesize_text(text) + if audio_np is None: + results.append(GenerationResult(error="Failed to synthesize audio", request_id=req.request_id)) + continue + + wav_buffer = io.BytesIO() + sf.write(wav_buffer, audio_np, self.target_sample_rate, format="WAV") + elapsed_ms = (time.time() - start_time) * 1000 + audio_duration = len(audio_np) / self.target_sample_rate + + results.append( + GenerationResult( + text=text, + audio_bytes=wav_buffer.getvalue(), + audio_sample_rate=self.target_sample_rate, + audio_format="wav", + request_id=req.request_id, + generation_time_ms=elapsed_ms, + debug_info={ + "audio_duration_sec": audio_duration, + "rtf": elapsed_ms / 1000 / audio_duration if audio_duration > 0 else 0, + }, + ) + ) + except Exception as e: + import traceback + + traceback.print_exc() + results.append(GenerationResult(error=str(e), request_id=req.request_id)) + return results + + def health_check(self) -> Dict[str, Any]: + """Return health status.""" + h = super().health_check() + if self._is_loaded: + h.update( + { + "sample_rate": self.target_sample_rate, + "fps": self.target_fps, + "tts_enabled": self.generation_config is not None, + "speaker_reference": self.ear_config.speaker_reference, + } + ) + return h + + +class EarTTSBatchBackend(EarTTSBackend): + """ + Optimized EAR TTS backend that decodes audio only once at the end. + + This version generates all codes first (token-by-token), then decodes + the entire sequence in one batch operation - significantly faster than + decoding after every token. + """ + + @property + def name(self) -> str: + return "ear_tts_batch" + + @torch.no_grad() + def _synthesize_text(self, text: str) -> Optional[np.ndarray]: + """Synthesize audio from text - optimized batch decoding version. + + Generates 10x tokens worth of frames, decodes all at once, then trims trailing silence. 
+ """ + from nemo.collections.speechlm2.parts.precision import fp32_precision + + if not text or not self.generation_config: + return None + + device = self.config.device + token_ids = self._tokenizer.text_to_ids(text) + if not token_ids: + return None + + num_tokens = len(token_ids) + max_frames = num_tokens * 10 # Generate more frames than tokens + token_tensor = torch.tensor(token_ids, dtype=torch.long, device=device).unsqueeze(0) + + # Initialize TTS state + past_key_values = self._clone_cache(self.first_tts_past_key_values_input) + code = self.first_tts_code_input.detach().clone() + + # Generate ALL codes first (no decoding in the loop) + all_codes = [] + + for frame_idx in range(max_frames): + # Cycle through tokens, repeating the last token after we've used all + token_idx = min(frame_idx, num_tokens - 1) + current_subword_id = token_tensor[:, token_idx].unsqueeze(-1) + + if frame_idx == 0: + prev_subword_id = self.first_context_subword_id + else: + prev_token_idx = min(frame_idx - 1, num_tokens - 1) + prev_subword_id = token_tensor[:, prev_token_idx].unsqueeze(-1) + + code, past_key_values = self._model.tts_model.infer_codes_one_step( + current_subword_id=current_subword_id, + prev_subword_id=prev_subword_id, + current_subword_mask=torch.ones(1, 1, device=device, dtype=torch.bool), + prev_audio_tokens=code, + past_key_values=past_key_values, + guidance_enabled=self.ear_config.guidance_enabled, + generation_config=self.generation_config, + ignore_eos_flag_stop=True, + ) + all_codes.append(code) + + # Decode ALL codes at once at the end + if all_codes: + all_codes_tensor = torch.cat(all_codes, dim=1) # [B, max_frames, codebook_dim] + len_codes = torch.tensor([max_frames], dtype=torch.long, device=device) + + with fp32_precision(): + decoded_audio, _ = self._model.tts_model.audio_codec.decode(all_codes_tensor, len_codes) + + audio_np = decoded_audio.float().cpu().numpy().squeeze() + + # Trim trailing silence + sample_rate = self.target_sample_rate or TTS_SAMPLE_RATE + window_size = int(sample_rate * 0.1) # 100ms window + for trim_point in range(len(audio_np) - window_size, 0, -window_size): + window_max = np.abs(audio_np[trim_point : trim_point + window_size]).max() + if window_max >= SILENCE_THRESHOLD: + # Found non-silent audio, trim after this point + small buffer + audio_np = audio_np[: trim_point + window_size * 2] + break + + max_val = np.abs(audio_np).max() + if max_val > 0: + audio_np = audio_np / max_val * 0.95 + return audio_np + + return None From d318068c90a8ecac5e8b1a04b966a0591eeab618 Mon Sep 17 00:00:00 2001 From: Valentin Mendelev Date: Fri, 9 Jan 2026 14:38:46 -0800 Subject: [PATCH 14/26] EAR TTS config --- .../scripts/config/ear_tts_hard_digits.yaml | 42 +++++++++++++++++++ 1 file changed, 42 insertions(+) create mode 100644 nemo_skills/dataset/nv_tts/scripts/config/ear_tts_hard_digits.yaml diff --git a/nemo_skills/dataset/nv_tts/scripts/config/ear_tts_hard_digits.yaml b/nemo_skills/dataset/nv_tts/scripts/config/ear_tts_hard_digits.yaml new file mode 100644 index 0000000000..e8e5eff9e0 --- /dev/null +++ b/nemo_skills/dataset/nv_tts/scripts/config/ear_tts_hard_digits.yaml @@ -0,0 +1,42 @@ +# TTS Pipeline Configuration - EAR TTS with Hard Digits + +# Cluster and execution settings +cluster: eos +partition: batch +mount_paths: /lustre:/lustre +output_dir: /lustre/fsw/llmservice_nemo_speechlm/users/vmendelev/tmp/nv_tts_eval_ear_tts_hard_digits_a2 + +# === COPY THESE FROM OCI TO EOS === +# Container (copy from: 
/lustre/fsw/portfolios/convai/users/ecasanova/docker_images/nemo_duplex_november_eartts.sqsh) +container: /lustre/fsw/llmservice_nemo_speechlm/users/vmendelev/containters/nemo_duplex_november_eartts.sqsh + +# NeMo code paths (separate for generation and scoring) +generation_code_path: /lustre/fsw/llmservice_nemo_speechlm/users/vmendelev/code/NeMo-release_not_rebased +scoring_code_path: /lustre/fsw/llmservice_nemo_speechlm/users/vmendelev/experimenta/tts_eval/NeMo + +# Generation settings +generation: + benchmarks: nv_tts.riva_hard_digits + # Model checkpoint (copy from: /lustre/fsw/portfolios/convai/users/ecasanova/Checkpoints/Nemotron-VoiceChat-november/duplex-eartts-2mim_sw_et_eos_dp_eos_dup_fp32-stt-3-december_stt_edresson_model_R_digits_norm_eip_0.1_EA_model_step_9005) + model: /lustre/fsw/llmservice_nemo_speechlm/users/vmendelev/models/ear_tts/duplex-eartts-2mim_sw_et_eos_dp_eos_dup_fp32-stt-3-december_stt_edresson_model_R_digits_norm_eip_0.1_EA_model_step_9005 + server_type: generic + server_gpus: 1 + server_entrypoint: python -m nemo_skills.inference.server.serve_unified + server_args: >- + --backend ear_tts + --speaker_reference /lustre/fsw/llmservice_nemo_speechlm/users/vmendelev/models/ear_tts/Emma_S3_A1_SC7_singleturntarget_21_channel_1_audio_in.wav + --config_path /lustre/fsw/llmservice_nemo_speechlm/users/vmendelev/models/ear_tts/nanov2_demo_model_eartts_updated.yaml + --batch_size 1 + --batch_timeout 0.1 + data_dir: /lustre/fsw/llmservice_nemo_speechlm/users/vmendelev/tmp/data_dir + num_chunks: 8 + gpus_per_node: 1 + extra_args: ++server.server_type=vllm_multimodal + +# Scoring settings +scoring: + sv_model: titanet + asr_model_name: nvidia/parakeet-tdt-1.1b + language: en + with_utmosv2: true + gpus: 1 From 18981f94d982e5443b1e1cc8d4c8e2d63105475e Mon Sep 17 00:00:00 2001 From: Valentin Mendelev Date: Fri, 30 Jan 2026 12:00:00 -0800 Subject: [PATCH 15/26] Fix MagpieTTS backend when no context audio is provided Create a small dummy context wav for requests without context_audio_filepath to prevent dataloader failures (missing d*.wav) and 500s from the unified server. --- recipes/multimodal/server/backends/magpie_tts_backend.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/recipes/multimodal/server/backends/magpie_tts_backend.py b/recipes/multimodal/server/backends/magpie_tts_backend.py index e11187f71a..9f1f052e27 100644 --- a/recipes/multimodal/server/backends/magpie_tts_backend.py +++ b/recipes/multimodal/server/backends/magpie_tts_backend.py @@ -183,6 +183,15 @@ def generate(self, requests: List[GenerationRequest]) -> List[GenerationResult]: os.symlink(ctx, link_path) else: link_name = f"d{i}.wav" + # Magpie inference expects a readable "context" wav for every manifest entry. + # When callers don't provide one (common for pure TTS-from-text prompts), + # create a tiny dummy wav so the dataloader doesn't crash. + link_path = os.path.join(audio_dir, link_name) + if not os.path.exists(link_path): + sr = int(getattr(self.tts_config, "output_sample_rate", 22050) or 22050) + dur_s = 0.1 + n = max(1, int(sr * dur_s)) + sf.write(link_path, [0.0] * n, sr) f.write( json.dumps( { From 8b9c22f546a0fca23db4e2dd734e32f29ad26ec7 Mon Sep 17 00:00:00 2001 From: Valentin Mendelev Date: Fri, 30 Jan 2026 13:51:50 -0800 Subject: [PATCH 16/26] Reset MagpieTTS decoder cache per request batch Avoid KV-cache shape mismatches when batch sizes vary between requests in the unified server. 
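For illustration only (not part of this change): a minimal, self-contained sketch of
the reset-at-batch-boundary pattern. StatefulDecoder below is a hypothetical stand-in,
not the NeMo decoder API; only the reset_cache(use_cache=...) call mirrors the code in
the diff.

    import torch

    class StatefulDecoder:
        """Hypothetical stand-in for a decoder with an internal KV cache."""

        def __init__(self):
            self.use_cache = True
            self.self_k = None  # cached keys, shape [batch, steps, dim]

        def reset_cache(self, use_cache: bool = True):
            # Drop state accumulated for the previous request batch; `use_cache`
            # only controls whether the next decode stores new entries.
            self.use_cache = use_cache
            self.self_k = None

        def step(self, x):
            # Concatenating onto a cache built for a different batch size is the
            # kind of shape mismatch this commit avoids.
            if not self.use_cache or self.self_k is None:
                self.self_k = x
            else:
                self.self_k = torch.cat([self.self_k, x], dim=1)
            return self.self_k

    decoder = StatefulDecoder()
    decoder.step(torch.zeros(4, 1, 8))   # first batch: 4 requests
    decoder.reset_cache(use_cache=True)  # without this, the next call fails
    decoder.step(torch.zeros(2, 1, 8))   # next batch: 2 requests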
--- .../server/backends/magpie_tts_backend.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/recipes/multimodal/server/backends/magpie_tts_backend.py b/recipes/multimodal/server/backends/magpie_tts_backend.py index 9f1f052e27..3ed09b02e6 100644 --- a/recipes/multimodal/server/backends/magpie_tts_backend.py +++ b/recipes/multimodal/server/backends/magpie_tts_backend.py @@ -164,6 +164,21 @@ def generate(self, requests: List[GenerationRequest]) -> List[GenerationResult]: os.makedirs(output_dir, exist_ok=True) try: + # MagpieTTS uses KV caching internally during decoding. When the unified server + # batches requests, consecutive calls to this backend can have different batch + # sizes, and stale KV caches can trigger shape mismatches (e.g. cat on self_k). + # Reset caches at the start of each request batch to avoid cross-request leakage. + try: + if self._model is not None: + decoder = getattr(self._model, "decoder", None) + if decoder is not None and hasattr(decoder, "reset_cache"): + # Keep caching enabled for decoding speed, but start from a clean cache. + decoder.reset_cache(use_cache=True) + except Exception: + # Best-effort: if cache reset fails for any reason, continue and let the + # underlying stack surface the real error. + pass + # Parse requests, extracting JSON from text (skips non-JSON prefixes) parsed = [self._extract_json(r.text) for r in requests] From dfb522f027d6931ef6987784972b7c3ae093e842 Mon Sep 17 00:00:00 2001 From: Valentin Mendelev Date: Fri, 30 Jan 2026 14:01:15 -0800 Subject: [PATCH 17/26] Cache HF resolve URL loads in MagpieTTS backend Route HuggingFace resolve URLs used by NeMo audio codec checkpoints through huggingface_hub download so multi-rank server startup avoids repeated downloads and 429s. --- .../server/backends/magpie_tts_backend.py | 42 +++++++++++++++++++ 1 file changed, 42 insertions(+) diff --git a/recipes/multimodal/server/backends/magpie_tts_backend.py b/recipes/multimodal/server/backends/magpie_tts_backend.py index 3ed09b02e6..9ac2faabe5 100644 --- a/recipes/multimodal/server/backends/magpie_tts_backend.py +++ b/recipes/multimodal/server/backends/magpie_tts_backend.py @@ -93,6 +93,48 @@ def __init__(self, config: BackendConfig): self._model = self._runner = self._temp_dir = self._checkpoint_name = None def load_model(self) -> None: + # Some dependencies inside NeMo's audio codec model load weights via raw + # HuggingFace "resolve" URLs using fsspec's HTTP filesystem, which does + # not cache and can easily hit 429s when many ranks start concurrently. + # + # Patch NeMo's `load_fsspec()` at runtime to route HuggingFace resolve + # URLs through `huggingface_hub.hf_hub_download()` (uses file locks and + # local caching under HF_HOME/HF_HUB_CACHE), then load from the local path. 
+ try: # best-effort; do not fail server if patching is unavailable + import os + import re + + import nemo.collections.tts.modules.audio_codec_modules as _acm # type: ignore + + _orig_load_fsspec = getattr(_acm, "load_fsspec", None) + if callable(_orig_load_fsspec) and not getattr(_acm, "_hf_load_fsspec_patched", False): + try: + from huggingface_hub import hf_hub_download # type: ignore + + def _hf_resolve_to_local(url: str) -> str | None: + m = re.match(r"^https?://huggingface\\.co/([^/]+)/([^/]+)/resolve/([^/]+)/(.+)$", url) + if not m: + return None + repo_id = f"{m.group(1)}/{m.group(2)}" + revision = m.group(3) + filename = m.group(4) + token = os.environ.get("HF_TOKEN") or None + return hf_hub_download(repo_id=repo_id, filename=filename, revision=revision, token=token) + + def _load_fsspec_patched(path: str, map_location: str = None, **kwargs): + if isinstance(path, str) and path.startswith("http"): + local = _hf_resolve_to_local(path) + if local: + return _orig_load_fsspec(local, map_location=map_location, **kwargs) + return _orig_load_fsspec(path, map_location=map_location, **kwargs) + + _acm.load_fsspec = _load_fsspec_patched # type: ignore[assignment] + _acm._hf_load_fsspec_patched = True # type: ignore[attr-defined] + except Exception: + pass + except Exception: + pass + from nemo.collections.tts.modules.magpietts_inference.inference import InferenceConfig, MagpieInferenceRunner from nemo.collections.tts.modules.magpietts_inference.utils import ModelLoadConfig, load_magpie_model From 80a2d7c001172b245defacc310d6865ae9cc1f62 Mon Sep 17 00:00:00 2001 From: Valentin Mendelev Date: Fri, 30 Jan 2026 14:24:46 -0800 Subject: [PATCH 18/26] Disable MagpieTTS KV cache to avoid shape mismatches Longform decoding with the transformer cache path can produce sequence-length mismatches; disable cache per request batch to prevent 500s in serve_unified. --- recipes/multimodal/server/backends/magpie_tts_backend.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/recipes/multimodal/server/backends/magpie_tts_backend.py b/recipes/multimodal/server/backends/magpie_tts_backend.py index 9ac2faabe5..b794de0a3f 100644 --- a/recipes/multimodal/server/backends/magpie_tts_backend.py +++ b/recipes/multimodal/server/backends/magpie_tts_backend.py @@ -214,8 +214,10 @@ def generate(self, requests: List[GenerationRequest]) -> List[GenerationResult]: if self._model is not None: decoder = getattr(self._model, "decoder", None) if decoder is not None and hasattr(decoder, "reset_cache"): - # Keep caching enabled for decoding speed, but start from a clean cache. - decoder.reset_cache(use_cache=True) + # Disable KV caching: NeMo's transformer cache path can produce + # sequence-length mismatches under longform decoding when the + # server batches/streams requests. + decoder.reset_cache(use_cache=False) except Exception: # Best-effort: if cache reset fails for any reason, continue and let the # underlying stack surface the real error. From 9fe27034283854f14f6a6909defd94486fa622eb Mon Sep 17 00:00:00 2001 From: Valentin Mendelev Date: Fri, 30 Jan 2026 15:11:32 -0800 Subject: [PATCH 19/26] Fix HF resolve URL caching in MagpieTTS backend Correct HuggingFace resolve URL matching so downloads go through hf_hub_download() and avoid multi-rank 429s. 
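For reference, a standalone sketch of what the corrected matching does. The URL below
is a made-up placeholder (some-org/some-codec); hf_hub_download() is the real
huggingface_hub API the backend now routes through.

    import os
    import re

    from huggingface_hub import hf_hub_download

    # Placeholder URL for illustration; real callers pass NeMo codec checkpoint URLs.
    url = "https://huggingface.co/some-org/some-codec/resolve/main/model.ckpt?download=true"

    # Strip query parameters, then split the resolve URL into repo / revision / file.
    url_no_q = url.split("?", 1)[0]
    m = re.match(r"^https?://huggingface\.co/([^/]+)/([^/]+)/resolve/([^/]+)/(.+)$", url_no_q)
    if m:
        local_path = hf_hub_download(
            repo_id=f"{m.group(1)}/{m.group(2)}",   # "some-org/some-codec"
            filename=m.group(4),                    # "model.ckpt"
            revision=m.group(3),                    # "main"
            token=os.environ.get("HF_TOKEN") or None,
        )
        # local_path now points into the shared HF cache, so concurrent ranks
        # reuse one download instead of each hitting huggingface.co directly.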
--- recipes/multimodal/server/backends/magpie_tts_backend.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/recipes/multimodal/server/backends/magpie_tts_backend.py b/recipes/multimodal/server/backends/magpie_tts_backend.py index b794de0a3f..4c2eee7477 100644 --- a/recipes/multimodal/server/backends/magpie_tts_backend.py +++ b/recipes/multimodal/server/backends/magpie_tts_backend.py @@ -112,7 +112,12 @@ def load_model(self) -> None: from huggingface_hub import hf_hub_download # type: ignore def _hf_resolve_to_local(url: str) -> str | None: - m = re.match(r"^https?://huggingface\\.co/([^/]+)/([^/]+)/resolve/([^/]+)/(.+)$", url) + # Some NeMo deps pass raw HF "resolve" URLs via fsspec http. + # Route them through HF Hub caching + file locks to avoid 429s. + if not isinstance(url, str): + return None + url_no_q = url.split("?", 1)[0] + m = re.match(r"^https?://huggingface\.co/([^/]+)/([^/]+)/resolve/([^/]+)/(.+)$", url_no_q) if not m: return None repo_id = f"{m.group(1)}/{m.group(2)}" From 136af12ec4b11030a8290f4a5133cc7edace3ee6 Mon Sep 17 00:00:00 2001 From: Valentin Mendelev Date: Fri, 30 Jan 2026 15:39:46 -0800 Subject: [PATCH 20/26] Avoid killing multi-instance tasks via srun --wait Stop setting srun --wait by default; allow opt-in via cluster_config.srun_wait_seconds. --- nemo_skills/pipeline/utils/exp.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/nemo_skills/pipeline/utils/exp.py b/nemo_skills/pipeline/utils/exp.py index 3eed966b33..c8729f89b3 100644 --- a/nemo_skills/pipeline/utils/exp.py +++ b/nemo_skills/pipeline/utils/exp.py @@ -312,13 +312,23 @@ def get_executor( srun_args = [ "--no-container-mount-home", "--mpi=pmix", - "--wait=240", # wait up to 4 minutes for slower tasks to complete (important for multi-instance mode) # we need to be explicit about this in srun as commands might need to run in parallel f"--ntasks-per-node={tasks_per_node}", f"--nodes={num_nodes}", # NeMo-run should take care of this, but we'll put it here temporarily f"--container-env={','.join([k.strip() for k in env_vars.keys()])}", ] + # IMPORTANT: + # Slurm's `srun --wait=` terminates the job step if other tasks are still + # running seconds after the first task exits. For multi-instance runs + # (e.g., chunked evaluation), task runtimes can differ widely, and a low wait + # will kill long-running tasks (observed with `--wait=240`). + # + # If you need this behavior, configure it explicitly in the cluster config: + # srun_wait_seconds: + srun_wait_seconds = cluster_config.get("srun_wait_seconds") + if srun_wait_seconds is not None: + srun_args.append(f"--wait={int(srun_wait_seconds)}") if overlap: srun_args.append("--overlap") if not cluster_config.get("disable_gpus_per_node", False) and gpus_per_node is not None: From 8f6d68ff9164c16d6928f963445eedc3c03e2822 Mon Sep 17 00:00:00 2001 From: Valentin Mendelev Date: Fri, 30 Jan 2026 15:53:19 -0800 Subject: [PATCH 21/26] Override srun wait for multi-instance jobs Add a large srun --wait for multi-instance runs to override nemo_run's default --wait=60, preventing premature termination when some ranks finish earlier. 
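For clarity, the selection logic condensed into a standalone sketch
(resolve_srun_wait is illustrative naming; the behaviour matches the change below):

    def resolve_srun_wait(cluster_config: dict, tasks_per_node: int) -> int | None:
        """Return the value for srun --wait, or None to not append the flag."""
        # An explicit cluster-config setting always wins.
        wait = cluster_config.get("srun_wait_seconds")
        if wait is not None:
            return int(wait)
        # Multi-instance runs get a large default so ranks that are still working
        # are not killed shortly after the first rank exits.
        if tasks_per_node > 1:
            return 24 * 60 * 60
        # Single-task runs: don't append an extra --wait here.
        return None

    assert resolve_srun_wait({}, tasks_per_node=1) is None
    assert resolve_srun_wait({}, tasks_per_node=8) == 24 * 60 * 60
    assert resolve_srun_wait({"srun_wait_seconds": 300}, tasks_per_node=8) == 300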
--- nemo_skills/pipeline/utils/exp.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/nemo_skills/pipeline/utils/exp.py b/nemo_skills/pipeline/utils/exp.py index c8729f89b3..7d1dc31487 100644 --- a/nemo_skills/pipeline/utils/exp.py +++ b/nemo_skills/pipeline/utils/exp.py @@ -320,13 +320,20 @@ def get_executor( ] # IMPORTANT: # Slurm's `srun --wait=` terminates the job step if other tasks are still - # running seconds after the first task exits. For multi-instance runs - # (e.g., chunked evaluation), task runtimes can differ widely, and a low wait - # will kill long-running tasks (observed with `--wait=240`). + # running seconds after the first task exits. # - # If you need this behavior, configure it explicitly in the cluster config: + # `nemo_run` adds `--wait=60` by default; for multi-instance runs (e.g., chunked + # evaluation) tasks can finish at very different times (some may exit quickly + # due to `++skip_filled=True`), which causes Slurm to kill still-running tasks. + # + # We override this with a large wait by default for multi-instance mode. + # You can customize via cluster config: # srun_wait_seconds: srun_wait_seconds = cluster_config.get("srun_wait_seconds") + if srun_wait_seconds is None and tasks_per_node > 1: + # Use a very large wait (1 day) so long-running ranks aren't killed just + # because other ranks finished earlier. + srun_wait_seconds = 24 * 60 * 60 if srun_wait_seconds is not None: srun_args.append(f"--wait={int(srun_wait_seconds)}") if overlap: From c23805d0e531119d295115787f2a74d4a29bcb39 Mon Sep 17 00:00:00 2001 From: Valentin Mendelev Date: Fri, 30 Jan 2026 16:10:34 -0800 Subject: [PATCH 22/26] Reduce MagpieTTS inference batch size Lower Magpie inference runner batch size to reduce memory/latency spikes under multi-instance load. --- recipes/multimodal/server/backends/magpie_tts_backend.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/recipes/multimodal/server/backends/magpie_tts_backend.py b/recipes/multimodal/server/backends/magpie_tts_backend.py index 4c2eee7477..0e42b752b2 100644 --- a/recipes/multimodal/server/backends/magpie_tts_backend.py +++ b/recipes/multimodal/server/backends/magpie_tts_backend.py @@ -175,7 +175,7 @@ def _load_fsspec_patched(path: str, map_location: str = None, **kwargs): use_cfg=self.tts_config.use_cfg, cfg_scale=self.tts_config.cfg_scale, use_local_transformer=self.tts_config.use_local_transformer, - batch_size=32, + batch_size=16, ), ) From c4824123816b4daf64916a6281715c1096668138 Mon Sep 17 00:00:00 2001 From: Valentin Mendelev Date: Fri, 30 Jan 2026 16:35:08 -0800 Subject: [PATCH 23/26] Set multi-instance srun wait to 1 hour Use a 1-hour default srun --wait for multi-instance runs to avoid premature task termination when chunk runtimes differ. --- nemo_skills/pipeline/utils/exp.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/nemo_skills/pipeline/utils/exp.py b/nemo_skills/pipeline/utils/exp.py index 7d1dc31487..6166ec3763 100644 --- a/nemo_skills/pipeline/utils/exp.py +++ b/nemo_skills/pipeline/utils/exp.py @@ -331,9 +331,9 @@ def get_executor( # srun_wait_seconds: srun_wait_seconds = cluster_config.get("srun_wait_seconds") if srun_wait_seconds is None and tasks_per_node > 1: - # Use a very large wait (1 day) so long-running ranks aren't killed just + # Use a reasonably large wait (1 hour) so long-running ranks aren't killed just # because other ranks finished earlier. 
- srun_wait_seconds = 24 * 60 * 60 + srun_wait_seconds = 60 * 60 if srun_wait_seconds is not None: srun_args.append(f"--wait={int(srun_wait_seconds)}") if overlap: From 5d104d344fb0b3ea85cceac1ba07ed4c3629b75b Mon Sep 17 00:00:00 2001 From: Valentin Mendelev Date: Sat, 31 Jan 2026 14:23:06 -0800 Subject: [PATCH 24/26] Add emergent_tts dataset + eval scripts Introduce the emergent_tts dataset package with prepare/generate/score helpers and default configs to run EmergentTTS evaluation via NeMo-Skills. Co-authored-by: Cursor --- nemo_skills/dataset/emergent_tts/__init__.py | 6 + .../dataset/emergent_tts/emergent/__init__.py | 3 + nemo_skills/dataset/emergent_tts/prepare.py | 238 +++++++++++++++++ .../dataset/emergent_tts/scripts/__init__.py | 2 + .../emergent_tts/scripts/check_deps.py | 95 +++++++ .../emergent_tts/scripts/config/default.yaml | 61 +++++ .../scripts/config/interactive_10.yaml | 22 ++ .../scripts/config/local_interactive_10.yaml | 30 +++ .../config/local_interactive_10_base.yaml | 26 ++ .../scripts/convert_ns_outputs_to_emergent.py | 92 +++++++ .../emergent_tts/scripts/run_tts_eval.py | 168 ++++++++++++ .../dataset/emergent_tts/scripts/score.py | 252 ++++++++++++++++++ 12 files changed, 995 insertions(+) create mode 100644 nemo_skills/dataset/emergent_tts/__init__.py create mode 100644 nemo_skills/dataset/emergent_tts/emergent/__init__.py create mode 100644 nemo_skills/dataset/emergent_tts/prepare.py create mode 100644 nemo_skills/dataset/emergent_tts/scripts/__init__.py create mode 100644 nemo_skills/dataset/emergent_tts/scripts/check_deps.py create mode 100644 nemo_skills/dataset/emergent_tts/scripts/config/default.yaml create mode 100644 nemo_skills/dataset/emergent_tts/scripts/config/interactive_10.yaml create mode 100644 nemo_skills/dataset/emergent_tts/scripts/config/local_interactive_10.yaml create mode 100644 nemo_skills/dataset/emergent_tts/scripts/config/local_interactive_10_base.yaml create mode 100644 nemo_skills/dataset/emergent_tts/scripts/convert_ns_outputs_to_emergent.py create mode 100644 nemo_skills/dataset/emergent_tts/scripts/run_tts_eval.py create mode 100644 nemo_skills/dataset/emergent_tts/scripts/score.py diff --git a/nemo_skills/dataset/emergent_tts/__init__.py b/nemo_skills/dataset/emergent_tts/__init__.py new file mode 100644 index 0000000000..c95f451485 --- /dev/null +++ b/nemo_skills/dataset/emergent_tts/__init__.py @@ -0,0 +1,6 @@ +"""EmergentTTS-Eval dataset integration for NeMo-Skills. + +This package contains tooling to prepare the EmergentTTS-Eval benchmark for +NeMo-Skills evaluation runs. +""" + diff --git a/nemo_skills/dataset/emergent_tts/emergent/__init__.py b/nemo_skills/dataset/emergent_tts/emergent/__init__.py new file mode 100644 index 0000000000..13edac0edf --- /dev/null +++ b/nemo_skills/dataset/emergent_tts/emergent/__init__.py @@ -0,0 +1,3 @@ +# EmergentTTS-Eval benchmark (NeMo-Skills) + +GENERATION_ARGS = "++prompt_format=openai" diff --git a/nemo_skills/dataset/emergent_tts/prepare.py b/nemo_skills/dataset/emergent_tts/prepare.py new file mode 100644 index 0000000000..f3616cc2d8 --- /dev/null +++ b/nemo_skills/dataset/emergent_tts/prepare.py @@ -0,0 +1,238 @@ +#!/usr/bin/env python3 +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Prepare EmergentTTS-Eval benchmark for NeMo-Skills. + +This script: +1) Downloads the EmergentTTS-Eval HF dataset +2) Saves baseline audios to wav files +3) Writes `data/emergent_tts_eval_data.jsonl` in Emergent's expected schema +4) Downloads `data/wv_mos.ckpt` +5) Writes NeMo-Skills `test.jsonl` for generation (OpenAI prompt format) + +Typical usage (to create everything under your shared NeMo-Skills data dir): + python prepare.py --output_dir /lustre/.../data_dir/emergent_tts +""" + +from __future__ import annotations + +import argparse +import json +import os +import time +import urllib.request +from urllib.error import ContentTooShortError +from pathlib import Path + + +SYSTEM_MESSAGE = "You are a helpful assistant." +DEFAULT_DATASET = "bosonai/EmergentTTS-Eval" +DEFAULT_SPLIT = "train" +WV_MOS_URL = "https://zenodo.org/record/6201162/files/wav2vec2.ckpt?download=1" + + +def _require_deps(): + try: + import numpy as np # noqa: F401 + from datasets import load_dataset # noqa: F401 + import librosa # noqa: F401 + import soundfile # noqa: F401 + from pydub import AudioSegment # noqa: F401 + from tqdm import tqdm # noqa: F401 + except Exception as e: # pragma: no cover + raise RuntimeError( + "Missing dependencies for EmergentTTS-Eval preparation.\n\n" + "Install into the repo venv:\n" + " cd /home/vmendelev/workspace/expressiveness/src/nemo-skills-tts-eval\n" + " . ./.venv/bin/activate\n" + " pip install datasets numpy pydub tqdm librosa soundfile\n" + ) from e + + +def _download_wv_mos(dst_path: Path, overwrite: bool) -> None: + dst_path.parent.mkdir(parents=True, exist_ok=True) + if dst_path.exists() and not overwrite: + return + tmp_path = dst_path.with_suffix(dst_path.suffix + ".tmp") + + # Zenodo downloads can occasionally fail with ContentTooShortError; retry. + max_attempts = 5 + for attempt in range(1, max_attempts + 1): + if tmp_path.exists(): + tmp_path.unlink() + try: + urllib.request.urlretrieve(WV_MOS_URL, str(tmp_path)) + tmp_path.replace(dst_path) + return + except ContentTooShortError as e: + # Partial download: wait and retry. + wait_s = min(5 * attempt, 30) + print(f"Warning: partial download for wv_mos.ckpt (attempt {attempt}/{max_attempts}): {e}") + time.sleep(wait_s) + except Exception as e: + wait_s = min(5 * attempt, 30) + print(f"Warning: failed downloading wv_mos.ckpt (attempt {attempt}/{max_attempts}): {e}") + time.sleep(wait_s) + + raise RuntimeError(f"Failed to download wv_mos.ckpt after {max_attempts} attempts: {WV_MOS_URL}") + + +def _write_benchmark_init(bench_dir: Path) -> None: + bench_dir.mkdir(parents=True, exist_ok=True) + init_path = bench_dir / "__init__.py" + init_path.write_text( + ( + "# EmergentTTS-Eval benchmark (NeMo-Skills)\n\n" + 'GENERATION_ARGS = "++prompt_format=openai"\n' + ), + encoding="utf-8", + ) + + +def _to_nemo_skills_entry(sample: dict) -> dict: + # MagpieTTS backend expects JSON with at least `text`. We also keep Emergent + # metadata to enable deterministic conversion/scoring later. 
+ payload = { + "text": sample["text_to_synthesize"], + "text_to_synthesize": sample["text_to_synthesize"], + "category": sample["category"], + "evolution_depth": sample["evolution_depth"], + "language": sample["language"], + "unique_id_eval": sample["unique_id_eval"], + # Optional fields used by MagpieTTS evaluation code paths. + "context_audio_filepath": "", + "duration": 5.0, + "context_audio_duration": 5.0, + } + return { + "problem": "", + "messages": [ + {"role": "system", "content": SYSTEM_MESSAGE}, + {"role": "user", "content": json.dumps(payload, ensure_ascii=False)}, + ], + } + + +def main(): + _require_deps() + import numpy as np + from datasets import load_dataset + from pydub import AudioSegment + from tqdm import tqdm + + parser = argparse.ArgumentParser(description="Prepare EmergentTTS-Eval for NeMo-Skills") + parser.add_argument( + "--output_dir", + type=str, + default=str(Path(__file__).parent), + help="Where to create emergent_tts module structure (default: folder containing this script).", + ) + parser.add_argument("--dataset", type=str, default=DEFAULT_DATASET, help="HF dataset name") + parser.add_argument("--split", type=str, default=DEFAULT_SPLIT, help="HF split to download (train contains 1645)") + parser.add_argument( + "--overwrite", + action="store_true", + help="Overwrite existing files (baseline audios, jsonl, wv_mos.ckpt, test.jsonl).", + ) + parser.add_argument( + "--num_samples", + type=int, + default=None, + help="Optional: limit number of samples (debug). If set, takes the first N rows.", + ) + args = parser.parse_args() + + output_dir = Path(args.output_dir).resolve() + data_dir = output_dir / "data" + baseline_audios_dir = data_dir / "baseline_audios" + baseline_audios_dir.mkdir(parents=True, exist_ok=True) + + # Emergent expected files + emergent_jsonl_path = data_dir / "emergent_tts_eval_data.jsonl" + wv_mos_path = data_dir / "wv_mos.ckpt" + + # NeMo-Skills benchmark module structure + bench_dir = output_dir / "emergent" + test_jsonl_path = bench_dir / "test.jsonl" + _write_benchmark_init(bench_dir) + + # Download dataset + dataset_hf = load_dataset(args.dataset, split=args.split) + total = len(dataset_hf) if args.num_samples is None else min(args.num_samples, len(dataset_hf)) + + if emergent_jsonl_path.exists() and test_jsonl_path.exists() and not args.overwrite: + print(f"Found existing outputs under {output_dir}. 
Use --overwrite to rebuild.") + else: + if args.overwrite: + for p in [emergent_jsonl_path, test_jsonl_path]: + if p.exists(): + p.unlink() + + emergent_records: list[dict] = [] + + # Build emergent jsonl + baseline audios + for i in tqdm(range(total), desc="Preparing EmergentTTS-Eval"): + curr = dataset_hf[i] + unique_id = i + + # Save baseline audio + wav_path = baseline_audios_dir / f"{unique_id}.wav" + if args.overwrite or not wav_path.exists(): + audio_array = curr["audio"]["array"] + audio_sr = int(curr["audio"]["sampling_rate"]) + audio_array_int16 = np.int16(audio_array * 32767) + audio_segment = AudioSegment( + audio_array_int16.tobytes(), + frame_rate=audio_sr, + sample_width=2, + channels=1, + ) + audio_segment.export(str(wav_path), format="wav") + + emergent_records.append( + { + "unique_id_eval": unique_id, + "category": curr["category"], + "text_to_synthesize": curr["text_to_synthesize"], + "evolution_depth": curr["evolution_depth"], + "language": curr["language"], + } + ) + + # Write emergent jsonl data file + emergent_jsonl_path.parent.mkdir(parents=True, exist_ok=True) + with open(emergent_jsonl_path, "w", encoding="utf-8") as f: + for rec in emergent_records: + f.write(json.dumps(rec, ensure_ascii=False) + "\n") + + # Write NeMo-Skills test.jsonl + with open(test_jsonl_path, "w", encoding="utf-8") as f: + for rec in emergent_records: + f.write(json.dumps(_to_nemo_skills_entry(rec), ensure_ascii=False) + "\n") + + # Download MOS model checkpoint (used by Emergent scoring) + _download_wv_mos(wv_mos_path, overwrite=args.overwrite) + + print("\nPrepared EmergentTTS-Eval:") + print(f" - data dir: {data_dir}") + print(f" - baseline audios: {baseline_audios_dir}") + print(f" - emergent jsonl: {emergent_jsonl_path}") + print(f" - wv_mos.ckpt: {wv_mos_path}") + print(f" - nemo-skills test.jsonl: {test_jsonl_path}") + + +if __name__ == "__main__": + main() + diff --git a/nemo_skills/dataset/emergent_tts/scripts/__init__.py b/nemo_skills/dataset/emergent_tts/scripts/__init__.py new file mode 100644 index 0000000000..b1989f6c3b --- /dev/null +++ b/nemo_skills/dataset/emergent_tts/scripts/__init__.py @@ -0,0 +1,2 @@ +"""Scripts for running EmergentTTS-Eval via NeMo-Skills.""" + diff --git a/nemo_skills/dataset/emergent_tts/scripts/check_deps.py b/nemo_skills/dataset/emergent_tts/scripts/check_deps.py new file mode 100644 index 0000000000..459cfc3311 --- /dev/null +++ b/nemo_skills/dataset/emergent_tts/scripts/check_deps.py @@ -0,0 +1,95 @@ +#!/usr/bin/env python3 + +"""Dependency checker for EmergentTTS-Eval integration. + +This script is meant to fail fast with a clear actionable message when you are +missing Python packages needed for: +- dataset preparation (`prepare.py`) +- scoring (EmergentTTS-Eval-public `inference.py`) +""" + +from __future__ import annotations + +import argparse +import importlib +import os +from pathlib import Path + + +def _try_import(module: str) -> str | None: + try: + importlib.import_module(module) + return None + except Exception as e: + return f"{module} ({type(e).__name__}: {e})" + + +def _venv_install_hint(*, emergent_repo_path: str | None) -> str: + repo_root = Path(__file__).resolve().parents[4] # .../nemo_skills/dataset/emergent_tts/scripts + lines = [ + "To install missing deps into the repo venv:", + f" cd {repo_root}", + " . 
./.venv/bin/activate", + " pip install -e .", + " pip install librosa soundfile", + ] + if emergent_repo_path: + lines.append(f" pip install -r {Path(emergent_repo_path).resolve()}/requirements.txt") + else: + lines.append(" pip install -r /path/to/EmergentTTS-Eval-public/requirements.txt") + return "\n".join(lines) + + +def main(): + p = argparse.ArgumentParser(description="Check dependencies for EmergentTTS-Eval integration") + p.add_argument("--stage", choices=["prepare", "scoring", "all"], default="all") + p.add_argument( + "--emergent_repo_path", + default=os.environ.get("EMERGENT_TTS_EVAL_REPO", ""), + help="Path to EmergentTTS-Eval-public (used only to print install hint)", + ) + args = p.parse_args() + + emergent_repo_path = args.emergent_repo_path or None + + missing: list[str] = [] + + if args.stage in ("prepare", "all"): + for mod in ["datasets", "numpy", "pydub", "tqdm", "librosa", "soundfile"]: + err = _try_import(mod) + if err: + missing.append(err) + + if args.stage in ("scoring", "all"): + # Minimal set required by EmergentTTS-Eval-public scoring path (fetch-audios mode) + for mod in [ + "torch", + "transformers", + "editdistance", + "whisper_normalizer", + "json_repair", + "tenacity", + "openai", + "google.genai", + "pydub", + "librosa", + "soundfile", + ]: + err = _try_import(mod) + if err: + missing.append(err) + + if missing: + print("Missing required dependencies:\n") + for m in missing: + print(f"- {m}") + print() + print(_venv_install_hint(emergent_repo_path=emergent_repo_path)) + raise SystemExit(2) + + print("All required dependencies are available.") + + +if __name__ == "__main__": + main() + diff --git a/nemo_skills/dataset/emergent_tts/scripts/config/default.yaml b/nemo_skills/dataset/emergent_tts/scripts/config/default.yaml new file mode 100644 index 0000000000..5015e4151c --- /dev/null +++ b/nemo_skills/dataset/emergent_tts/scripts/config/default.yaml @@ -0,0 +1,61 @@ +# EmergentTTS-Eval pipeline configuration (example) +# +# NOTE: Before running generation, create the dataset under generation.data_dir: +# python nemo_skills/dataset/emergent_tts/prepare.py --output_dir /emergent_tts +# +# Then run: +# python -m nemo_skills.dataset.emergent_tts.scripts.run_tts_eval --config --stage all + +cluster: eos +container: /lustre/fsw/llmservice_nemo_speechlm/users/vmendelev/containters/nemo-25.11.sqsh +partition: batch +mount_paths: /lustre:/lustre + +# Where NeMo-Skills will write eval-results/ and eval-logs/ +output_dir: /lustre/fsw/llmservice_nemo_speechlm/users/vmendelev/tmp/emergent_tts_eval_full_8chunks + +# NeMo source checkout on EOS (needed for MagpieTTS inference modules). +nemo_code_path: /lustre/fsw/llmservice_nemo_speechlm/users/vmendelev/experimenta/tts_eval/NeMo + +generation: + benchmarks: emergent_tts.emergent + model: nvidia/magpie_tts_multilingual_357m + server_type: generic + # One GPU for the server process. + server_gpus: 1 + server_entrypoint: python -m nemo_skills.inference.server.serve_unified + server_args: --backend magpie_tts --codec_model nvidia/nemo-nano-codec-22khz-1.89kbps-21.5fps --batch_size 16 --batch_timeout 0.1 --use_cfg --cfg_scale 2.5 + + # Shared NeMo-Skills data_dir. Must contain emergent_tts/emergent/test.jsonl and emergent_tts/emergent/__init__.py + data_dir: /lustre/fsw/llmservice_nemo_speechlm/users/vmendelev/tmp/data_dir + + # Full run: split across 8 chunks. + num_chunks: 8 + # Request 8 GPUs per node for the generation job allocation. + # (Generation itself calls the server; this matches nv_tts scheduling expectations.) 
+ gpus_per_node: 8 + extra_args: ++server.server_type=vllm_multimodal + +scoring: + gpus: 1 + # Container for scoring jobs (conversion + Emergent eval). Use the same container + # as the generation "main" job (not the Magpie server container). + container: /lustre/fsw/llmservice_nemo_speechlm/users/vmendelev/containters/nemo-25.11.sqsh + # Install missing Python deps at job start (runs inside the scoring container). + # Keep this conservative: avoid upgrading core deps inside the base container. + installation_command: pip install editdistance whisper-normalizer json-repair tenacity + + # Path to EmergentTTS-Eval-public on the cluster (added to PYTHONPATH) + scoring_code_path: /lustre/fsw/llmservice_nemo_speechlm/users/vmendelev/code/EmergentTTS-Eval-public + + # Path to Emergent data directory created by prepare.py: + # /emergent_tts/data + emergent_data_dir: /lustre/fsw/llmservice_nemo_speechlm/users/vmendelev/tmp/data_dir/emergent_tts/data + + # Judge configuration (OpenAI-compatible via NVIDIA Inference API) + judge_model: gcp/google/gemini-2.5-pro + judger_base_url: https://inference-api.nvidia.com/v1/chat/completions + num_threads: 8 + evaluate_function: win_rate + strong_prompting: false + diff --git a/nemo_skills/dataset/emergent_tts/scripts/config/interactive_10.yaml b/nemo_skills/dataset/emergent_tts/scripts/config/interactive_10.yaml new file mode 100644 index 0000000000..dfef60facf --- /dev/null +++ b/nemo_skills/dataset/emergent_tts/scripts/config/interactive_10.yaml @@ -0,0 +1,22 @@ +# EmergentTTS-Eval: interactive 10-sample generation smoke test + +cluster: eos +container: /lustre/fsw/llmservice_nemo_speechlm/users/vmendelev/containters/nemo-25.11.sqsh +partition: interactive +mount_paths: /lustre:/lustre + +output_dir: /lustre/fsw/llmservice_nemo_speechlm/users/vmendelev/tmp/emergent_tts_smoke10 +nemo_code_path: /lustre/fsw/llmservice_nemo_speechlm/users/vmendelev/experimenta/tts_eval/NeMo + +generation: + benchmarks: emergent_tts.emergent + model: nvidia/magpie_tts_multilingual_357m + server_type: generic + server_gpus: 1 + server_entrypoint: python -m nemo_skills.inference.server.serve_unified + server_args: --backend magpie_tts --codec_model nvidia/nemo-nano-codec-22khz-1.89kbps-21.5fps --batch_size 32 --batch_timeout 0.1 --use_cfg --cfg_scale 2.5 + data_dir: /lustre/fsw/llmservice_nemo_speechlm/users/vmendelev/tmp/data_dir + num_chunks: 1 + gpus_per_node: 1 + extra_args: ++max_samples=10 ++server.server_type=vllm_multimodal + diff --git a/nemo_skills/dataset/emergent_tts/scripts/config/local_interactive_10.yaml b/nemo_skills/dataset/emergent_tts/scripts/config/local_interactive_10.yaml new file mode 100644 index 0000000000..e50bca8e2f --- /dev/null +++ b/nemo_skills/dataset/emergent_tts/scripts/config/local_interactive_10.yaml @@ -0,0 +1,30 @@ +# EmergentTTS-Eval: local (docker) 10-sample generation smoke test +# +# Usage: +# export NEMO_SKILLS_CONFIG_DIR=/home/vmendelev/workspace/expressiveness/src/nemo-skills-tts-eval/cluster_configs +# python -m nemo_skills.dataset.emergent_tts.scripts.run_tts_eval --config --stage generation + +cluster: local_nemo_25_11 +partition: interactive + +output_dir: /home/vmendelev/workspace/expressiveness/src/nemo-skills-tts-eval/_local_runs/emergent_tts_smoke10 + +# Optional: if your local setup needs a NeMo source checkout for MagpieTTS inference, +# set this to an absolute host path and ensure it's mounted in the local cluster config. 
+nemo_code_path: /home/vmendelev/workspace/expressiveness/src/NeMo + +generation: + benchmarks: emergent_tts.emergent + model: nvidia/magpie_tts_multilingual_357m + server_type: generic + server_gpus: 1 + server_entrypoint: python -m nemo_skills.inference.server.serve_unified + server_args: --backend magpie_tts --codec_model nvidia/nemo-nano-codec-22khz-1.89kbps-21.5fps --batch_size 8 --batch_timeout 0.1 --use_cfg --cfg_scale 2.5 + + # Use the repo dataset folder (contains emergent_tts/emergent/test.jsonl). + data_dir: /home/vmendelev/workspace/expressiveness/src/nemo-skills-tts-eval/nemo_skills/dataset + + num_chunks: 1 + gpus_per_node: 1 + extra_args: ++max_samples=10 ++server.server_type=vllm_multimodal + diff --git a/nemo_skills/dataset/emergent_tts/scripts/config/local_interactive_10_base.yaml b/nemo_skills/dataset/emergent_tts/scripts/config/local_interactive_10_base.yaml new file mode 100644 index 0000000000..6816e5b523 --- /dev/null +++ b/nemo_skills/dataset/emergent_tts/scripts/config/local_interactive_10_base.yaml @@ -0,0 +1,26 @@ +# EmergentTTS-Eval: local (docker) 10-sample generation smoke test +# Using the base NeMo container (nvcr.io/nvidia/nemo:25.11). + +cluster: local_nemo_25_11_base +partition: interactive + +# For local docker runs, set the container to use for the *server* task. +# (The main task uses cluster_config.containers.nemo-skills.) +container: nvcr.io/nvidia/nemo:25.11 + +output_dir: /home/vmendelev/workspace/expressiveness/src/nemo-skills-tts-eval/_local_runs/emergent_tts_smoke10_base + +nemo_code_path: /home/vmendelev/workspace/expressiveness/src/NeMo + +generation: + benchmarks: emergent_tts.emergent + model: nvidia/magpie_tts_multilingual_357m + server_type: generic + server_gpus: 1 + server_entrypoint: python -m nemo_skills.inference.server.serve_unified + server_args: --backend magpie_tts --codec_model nvidia/nemo-nano-codec-22khz-1.89kbps-21.5fps --batch_size 8 --batch_timeout 0.1 --use_cfg --cfg_scale 2.5 + data_dir: /home/vmendelev/workspace/expressiveness/src/nemo-skills-tts-eval/nemo_skills/dataset + num_chunks: 1 + gpus_per_node: 1 + extra_args: ++max_samples=10 ++server.server_type=vllm_multimodal + diff --git a/nemo_skills/dataset/emergent_tts/scripts/convert_ns_outputs_to_emergent.py b/nemo_skills/dataset/emergent_tts/scripts/convert_ns_outputs_to_emergent.py new file mode 100644 index 0000000000..66ab564990 --- /dev/null +++ b/nemo_skills/dataset/emergent_tts/scripts/convert_ns_outputs_to_emergent.py @@ -0,0 +1,92 @@ +#!/usr/bin/env python3 +# Copyright (c) 2026, NVIDIA CORPORATION. 
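+
+"""Convert NeMo-Skills TTS outputs into the Emergent audio layout.
+
+Reads a NeMo-Skills `output.jsonl`, recovers `unique_id_eval` from each record's
+user message, and symlinks (or copies) the generated audio into
+`--out_dir/<unique_id_eval>.wav`, which is the layout EmergentTTS-Eval scoring expects.
+"""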
+ +from __future__ import annotations + +import argparse +import json +import os +import shutil +from pathlib import Path + + +def _extract_user_json(record: dict) -> dict | None: + for msg in record.get("messages", []): + if msg.get("role") != "user": + continue + content = msg.get("content") + if isinstance(content, dict): + return content + if isinstance(content, str): + try: + return json.loads(content) + except json.JSONDecodeError: + return None + return None + + +def _link_or_copy(src: str, dst: str, mode: str): + if mode == "symlink": + if os.path.islink(dst): + if os.readlink(dst) == src: + return + os.unlink(dst) + elif os.path.exists(dst): + os.unlink(dst) + os.symlink(src, dst) + return + + if mode == "copy": + shutil.copyfile(src, dst) + return + + raise ValueError(f"Unknown mode: {mode}") + + +def main(): + p = argparse.ArgumentParser(description="Convert NeMo-Skills TTS outputs into Emergent audio layout") + p.add_argument("--ns_output_jsonl", required=True, help="Path to NeMo-Skills output.jsonl") + p.add_argument("--out_dir", required=True, help="Destination directory for .wav") + p.add_argument("--mode", choices=["symlink", "copy"], default="symlink") + p.add_argument("--overwrite", action="store_true") + args = p.parse_args() + + out_dir = Path(args.out_dir) + out_dir.mkdir(parents=True, exist_ok=True) + + converted = 0 + skipped = 0 + missing = 0 + + with open(args.ns_output_jsonl, "r", encoding="utf-8") as f: + for line in f: + line = line.strip() + if not line: + continue + record = json.loads(line) + user_json = _extract_user_json(record) or {} + unique_id = user_json.get("unique_id_eval", record.get("unique_id_eval")) + audio_path = (record.get("audio") or {}).get("path") + + if unique_id is None: + skipped += 1 + continue + if not audio_path or not os.path.exists(audio_path): + missing += 1 + continue + + dst = out_dir / f"{unique_id}.wav" + if dst.exists() and not args.overwrite: + continue + _link_or_copy(audio_path, str(dst), args.mode) + converted += 1 + + print( + f"Converted {converted} files into {out_dir}. " + f"skipped(no unique_id_eval)={skipped}, missing_audio={missing}" + ) + + +if __name__ == "__main__": + main() + diff --git a/nemo_skills/dataset/emergent_tts/scripts/run_tts_eval.py b/nemo_skills/dataset/emergent_tts/scripts/run_tts_eval.py new file mode 100644 index 0000000000..19d08f3497 --- /dev/null +++ b/nemo_skills/dataset/emergent_tts/scripts/run_tts_eval.py @@ -0,0 +1,168 @@ +#!/usr/bin/env python3 +# Copyright (c) 2026, NVIDIA CORPORATION. + +""" +Emergent TTS Pipeline: Generation -> Scoring (-> Aggregation) + +This mirrors `nemo_skills/dataset/nv_tts/scripts/run_tts_eval.py` but uses +EmergentTTS-Eval scoring logic. +""" + +import argparse +import os + +import yaml + +from nemo_skills.pipeline.eval import eval as ns_eval +from nemo_skills.pipeline.run_cmd import run_cmd as ns_run_cmd + + +class MockContext: + """Mock typer.Context for programmatic calls.""" + + def __init__(self, extra_args=None): + self.args = extra_args or [] + + +def load_config(config_path: str) -> dict: + with open(config_path) as f: + return yaml.safe_load(f) + + +def run_generation(cfg: dict, expname: str): + gen = cfg["generation"] + # Mirror nv_tts behavior: allow injecting a NeMo source checkout into PYTHONPATH + # for the unified server (MagpieTTS inference code lives in NeMo). 
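+    # e.g. with `nemo_code_path: /path/to/NeMo` in the config, " --code_path /path/to/NeMo"
+    # is appended to the configured server_args below.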
+ server_args = gen["server_args"] + generation_code_path = cfg.get("generation_code_path") or cfg.get("nemo_code_path") + if generation_code_path: + server_args += f" --code_path {generation_code_path}" + + extra_args = gen.get("extra_args", "").split() if gen.get("extra_args") else [] + ctx = MockContext(extra_args) + return ns_eval( + ctx=ctx, + cluster=cfg["cluster"], + output_dir=cfg["output_dir"], + benchmarks=gen["benchmarks"], + model=gen["model"], + server_type=gen["server_type"], + server_gpus=gen["server_gpus"], + # Local executor doesn't require explicit container/mount_paths in the run YAML. + # For slurm clusters these are required and should be present in the config. + server_container=cfg.get("container", ""), + mount_paths=cfg.get("mount_paths", ""), + server_entrypoint=gen["server_entrypoint"], + server_args=server_args, + data_dir=gen["data_dir"], + num_chunks=gen["num_chunks"], + gpus_per_node=gen.get("gpus_per_node", 1), + partition=cfg["partition"], + expname=expname, + auto_summarize_results=False, + ) + + +def main(): + parser = argparse.ArgumentParser(description="Emergent TTS Eval Pipeline") + parser.add_argument("--config", required=True) + parser.add_argument( + "--stage", + choices=["all", "generation", "scoring", "aggregation"], + default="all", + ) + parser.add_argument("--expname", default="emergent_tts_eval") + args = parser.parse_args() + + cfg = load_config(args.config) + scoring = cfg.get("scoring", {}) + output_dir = cfg["output_dir"] + + gen_exp_name = None + + if args.stage in ("all", "generation"): + print("\n" + "=" * 60) + print("Stage 1: GENERATION") + print("=" * 60) + gen_exp = run_generation(cfg, args.expname) + gen_exp_name = args.expname + print(f"Generation submitted: {gen_exp}") + + if args.stage in ("all", "scoring"): + print("\n" + "=" * 60) + print("Stage 2: SCORING (EmergentTTS-Eval)") + print("=" * 60) + + benchmarks = cfg["generation"]["benchmarks"].split(",") + run_after = [gen_exp_name] if args.stage == "all" and gen_exp_name else None + + scoring_code_path = scoring.get("scoring_code_path", "") + emergent_data_dir = scoring.get("emergent_data_dir", "") + install_cmd = scoring.get("installation_command") + scoring_container = scoring.get("container") or "nemo-skills" + + # Required by Emergent's judge clients + judger_api_key = ( + os.environ.get("JUDGER_API_KEY") + or os.environ.get("NVIDIA_API_KEY") + or os.environ.get("OPENAI_API_KEY") + or "" + ) + if not judger_api_key: + print("Warning: JUDGER_API_KEY/NVIDIA_API_KEY/OPENAI_API_KEY not set; win_rate judging may fail.") + + for benchmark in benchmarks: + benchmark = benchmark.strip() + short_name = benchmark.split(".")[-1] + score_cmd = ( + f"JUDGER_API_KEY={judger_api_key} " + f"PYTHONPATH={scoring_code_path}:$PYTHONPATH " + f"python -m nemo_skills.dataset.emergent_tts.scripts.score " + f"--results_dir {output_dir} " + f"--benchmark {benchmark} " + f"--emergent_data_dir {emergent_data_dir} " + f"--judge_model {scoring.get('judge_model', 'gcp/google/gemini-2.5-pro')} " + f"--judger_base_url {scoring.get('judger_base_url', 'https://inference-api.nvidia.com/v1/chat/completions')} " + f"--num_threads {int(scoring.get('num_threads', 8))} " + f"--evaluate_function {scoring.get('evaluate_function', 'win_rate')}" + ) + if scoring.get("strong_prompting"): + score_cmd += " --strong_prompting" + + ns_run_cmd( + ctx=MockContext(), + cluster=cfg["cluster"], + container=scoring_container, + partition=cfg["partition"], + num_gpus=int(scoring.get("gpus", 1)), + mount_paths=cfg["mount_paths"], 
+ command=score_cmd, + installation_command=install_cmd, + run_after=run_after, + expname=f"{args.expname}_score_{short_name}", + log_dir=f"{output_dir}/eval-logs", + ) + + if args.stage == "aggregation": + print("\n" + "=" * 60) + print("Stage 3: AGGREGATION") + print("=" * 60) + agg_cmd = f"python -m nemo_skills.dataset.emergent_tts.scripts.score --results_dir {output_dir} --aggregation_only" + ns_run_cmd( + ctx=MockContext(), + cluster=cfg["cluster"], + container=cfg["container"], + partition=cfg["partition"], + num_gpus=0, + mount_paths=cfg["mount_paths"], + command=agg_cmd, + expname=f"{args.expname}_agg", + log_dir=f"{output_dir}/eval-logs", + ) + + print("\nDone!") + + +if __name__ == "__main__": + main() + diff --git a/nemo_skills/dataset/emergent_tts/scripts/score.py b/nemo_skills/dataset/emergent_tts/scripts/score.py new file mode 100644 index 0000000000..ec5c77c58b --- /dev/null +++ b/nemo_skills/dataset/emergent_tts/scripts/score.py @@ -0,0 +1,252 @@ +#!/usr/bin/env python3 +# Copyright (c) 2026, NVIDIA CORPORATION. + +"""Run EmergentTTS-Eval scoring on NeMo-Skills generated audio. + +This script expects NeMo-Skills generation output layout: + /eval-results//output.jsonl + +It will: + 1) Convert NeMo-Skills `output.jsonl` audio paths into Emergent layout + (/emergent-tts-eval_output-audios/.wav) + 2) Run Emergent scoring in fetch-audios mode (no re-generation) + 3) Write `metrics.json` in the benchmark folder for consistency with other evals +""" + +from __future__ import annotations + +import argparse +import json +import os +from pathlib import Path + + +def _benchmarks_dir(results_dir: str) -> Path: + p = Path(results_dir) / "eval-results" + return p if p.exists() else Path(results_dir) + + +def _normalize_openai_base_url(url: str) -> str: + # Some callers pass the full endpoint; OpenAI client expects base URL. + suffix = "/v1/chat/completions" + if url.endswith(suffix): + return url[: -len("/chat/completions")] + return url + + +class _NoopModelClient: + """A minimal Emergent model_client for scoring-only runs.""" + + def prepare_emergent_tts_sample(self, text_to_synthesize, category, strong_prompting, prompting_object, **kwargs): + if strong_prompting: + user_message = ( + prompting_object.USER_MESSAGE_STRONG_TEMPLATE.replace( + "{{{descriptions}}}", prompting_object.ALL_DESCRIPTIONS[category] + ).replace("{{{text_to_synthesize}}}", text_to_synthesize) + ) + else: + user_message = prompting_object.USER_MESSAGE_DEFAULT_TEMPLATE.replace( + "{{{text_to_synthesize}}}", text_to_synthesize + ) + return prompting_object.SYSTEM_PROMPT_DEFAULT, user_message + + +def _convert(ns_output_jsonl: Path, out_dir: Path, overwrite: bool) -> None: + from nemo_skills.dataset.emergent_tts.scripts.convert_ns_outputs_to_emergent import main as convert_main + + # Reuse converter as a library via argv. + import sys + + argv = sys.argv + try: + sys.argv = [ + argv[0], + "--ns_output_jsonl", + str(ns_output_jsonl), + "--out_dir", + str(out_dir), + "--mode", + "symlink", + ] + (["--overwrite"] if overwrite else []) + convert_main() + finally: + sys.argv = argv + + +def _run_emergent_scoring( + *, + benchmark_dir: Path, + emergent_data_base_path: Path, + fetch_audios_from_path: Path, + baseline_audios_path: Path, + judge_model: str, + judger_base_url: str, + num_threads: int, + depths_to_evaluate: str, + categories_to_evaluate: str, + evaluate_function: str, + strong_prompting: bool, +): + # Import from EmergentTTS-Eval-public (caller should add it to PYTHONPATH). 
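+    # e.g. the scoring stage in run_tts_eval.py invokes this module as
+    #   PYTHONPATH=<EmergentTTS-Eval-public>:$PYTHONPATH python -m nemo_skills.dataset.emergent_tts.scripts.score ...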
+ import inference as emergent_inference # type: ignore + + # Tell Emergent code where to find `emergent_tts_eval_data.jsonl` and `wv_mos.ckpt`. + os.environ["EMERGENT_TTS_DATA_BASE_PATH"] = str(emergent_data_base_path) + + emergent_inference.eval_api_closed_model( + model_client=_NoopModelClient(), + accelerator=None, + depths_to_evaluate=depths_to_evaluate, + categories_to_evaluate=categories_to_evaluate, + seed=42, + output_dir=str(benchmark_dir), + num_samples=None, + baseline_audios_path=str(baseline_audios_path), + fetch_audios_from_path=str(fetch_audios_from_path), + judge_model=judge_model, + temperature=0.0, + evaluate_function=evaluate_function, + strong_prompting=strong_prompting, + judger_base_url=_normalize_openai_base_url(judger_base_url) if judger_base_url else None, + num_threads=num_threads, + model_name="nemo-skills-generated", + ) + + +def run_scoring( + *, + results_dir: str, + benchmark: str | None, + emergent_data_dir: str, + judge_model: str, + judger_base_url: str, + num_threads: int, + depths_to_evaluate: str, + categories_to_evaluate: str, + evaluate_function: str, + strong_prompting: bool, + overwrite_converted: bool, +): + bdir = _benchmarks_dir(results_dir) + emergent_data_dir_p = Path(emergent_data_dir) + emergent_base = emergent_data_dir_p # expects emergent_tts_eval_data.jsonl and wv_mos.ckpt here + baseline_audios = emergent_data_dir_p / "baseline_audios" + + if benchmark: + benches = [benchmark] + else: + benches = [p.name for p in bdir.iterdir() if p.is_dir()] + + for bench in sorted(benches): + bench_dir = bdir / bench + output_jsonl = bench_dir / "output.jsonl" + if not output_jsonl.exists(): + print(f"Skipping {bench}: output.jsonl not found") + continue + + # Emergent uses this naming convention for generated audio dir (see inference.py). + converted_audio_dir = bench_dir / "emergent-tts-eval_output-audios" + converted_audio_dir.mkdir(parents=True, exist_ok=True) + _convert(output_jsonl, converted_audio_dir, overwrite=overwrite_converted) + + # Run Emergent scoring (writes emergent-tts-eval_* files into bench_dir) + _run_emergent_scoring( + benchmark_dir=bench_dir, + emergent_data_base_path=emergent_base, + fetch_audios_from_path=converted_audio_dir, + baseline_audios_path=baseline_audios, + judge_model=judge_model, + judger_base_url=judger_base_url, + num_threads=num_threads, + depths_to_evaluate=depths_to_evaluate, + categories_to_evaluate=categories_to_evaluate, + evaluate_function=evaluate_function, + strong_prompting=strong_prompting, + ) + + # Convert Emergent metrics file into `metrics.json` for NeMo-Skills conventions. + # Emergent prefix matches inference.py defaults when strong_prompting=False and voice_to_use=None. 
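+        # The copy keeps Emergent's own metric keys (e.g. eval/wer, eval/mos,
+        # eval/win_rate), which run_aggregation() below reads.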
+ emergent_metrics_path = bench_dir / "emergent-tts-eval_evaluation-metrics.json" + if emergent_metrics_path.exists(): + with open(emergent_metrics_path, "r", encoding="utf-8") as f: + metrics = json.load(f) + with open(bench_dir / "metrics.json", "w", encoding="utf-8") as f: + json.dump(metrics, f, indent=2) + print(f"[{bench}] Saved: {bench_dir/'metrics.json'}") + else: + print(f"[{bench}] Warning: Emergent metrics file not found at {emergent_metrics_path}") + + +def run_aggregation(results_dir: str): + bdir = _benchmarks_dir(results_dir) + print("\nAggregated Results (EmergentTTS-Eval):") + for benchmark in sorted([p.name for p in bdir.iterdir() if p.is_dir()]): + metrics_path = bdir / benchmark / "metrics.json" + if not metrics_path.exists(): + continue + with open(metrics_path, "r", encoding="utf-8") as f: + metrics = json.load(f) + # Keep this minimal; Emergent metrics are keyed like eval/wer, eval/mos, eval/win_rate, etc. + wer = metrics.get("eval/wer") + mos = metrics.get("eval/mos") + win = metrics.get("eval/win_rate") + print(f" {benchmark}:") + if wer is not None: + print(f" WER: {wer:.4f}") + if mos is not None: + print(f" MOS: {mos:.4f}") + if win is not None: + print(f" Win-rate: {win:.4f}") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="EmergentTTS-Eval scoring for NeMo-Skills outputs") + parser.add_argument("--results_dir", required=True) + parser.add_argument("--benchmark", default=None, help="Score only this benchmark (e.g. emergent_tts.emergent)") + parser.add_argument("--aggregation_only", action="store_true") + + parser.add_argument( + "--emergent_data_dir", + required=False, + default=None, + help="Path containing Emergent files: emergent_tts_eval_data.jsonl, wv_mos.ckpt, baseline_audios/", + ) + parser.add_argument("--judge_model", default="gcp/google/gemini-2.5-pro") + parser.add_argument("--judger_base_url", default="https://inference-api.nvidia.com/v1/chat/completions") + parser.add_argument("--num_threads", type=int, default=8) + parser.add_argument("--depths_to_evaluate", default="0,1,2,3") + parser.add_argument( + "--categories_to_evaluate", + default="Emotions,Paralinguistics,Syntactic Complexity,Foreign Words,Questions,Pronunciation", + ) + parser.add_argument("--evaluate_function", default="win_rate") + parser.add_argument("--strong_prompting", action="store_true") + parser.add_argument("--overwrite_converted", action="store_true") + args = parser.parse_args() + + if args.aggregation_only: + run_aggregation(args.results_dir) + else: + emergent_data_dir = args.emergent_data_dir + if emergent_data_dir is None: + # Try to derive from NEMO_SKILLS_DATA_DIR (common in cluster configs). 
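+            # e.g. NEMO_SKILLS_DATA_DIR=/shared/data_dir -> /shared/data_dir/emergent_tts/data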
+ emergent_data_dir = os.environ.get("EMERGENT_TTS_DATA_BASE_PATH") or os.environ.get("NEMO_SKILLS_DATA_DIR") + if emergent_data_dir: + emergent_data_dir = str(Path(emergent_data_dir) / "emergent_tts" / "data") + if emergent_data_dir is None: + raise SystemExit("--emergent_data_dir is required (or set EMERGENT_TTS_DATA_BASE_PATH/NEMO_SKILLS_DATA_DIR)") + + run_scoring( + results_dir=args.results_dir, + benchmark=args.benchmark, + emergent_data_dir=emergent_data_dir, + judge_model=args.judge_model, + judger_base_url=args.judger_base_url, + num_threads=args.num_threads, + depths_to_evaluate=args.depths_to_evaluate, + categories_to_evaluate=args.categories_to_evaluate, + evaluate_function=args.evaluate_function, + strong_prompting=args.strong_prompting, + overwrite_converted=args.overwrite_converted, + ) + From 52b6599fe4eed9ec4bc45f4cd7d9daa676ba48b6 Mon Sep 17 00:00:00 2001 From: Valentin Mendelev Date: Tue, 3 Feb 2026 06:13:48 -0800 Subject: [PATCH 25/26] Fix Emergent scoring deps and paths Install google-genai for EmergentTTS-Eval, run scoring from the dataset base dir so relative paths resolve, and avoid shipping large local caches/data. Document EmergentTTS-Eval usage in nv_tts guide. Co-authored-by: Cursor --- .gitignore | 8 +++ .../emergent_tts/scripts/config/default.yaml | 4 +- .../emergent_tts/scripts/run_tts_eval.py | 27 +++++---- .../dataset/emergent_tts/scripts/score.py | 45 ++++++++------ nemo_skills/dataset/nv_tts/TTS_eval.md | 60 +++++++++++++++++++ 5 files changed, 115 insertions(+), 29 deletions(-) diff --git a/.gitignore b/.gitignore index ecb9012331..e2218786e5 100644 --- a/.gitignore +++ b/.gitignore @@ -31,6 +31,14 @@ build .venv *.lock +# Local caches / secrets (never ship to remote via rsync) +.ssh/ +.hf_cache/ +.nemo_run/ + +# Emergent dataset artifacts (large; stored in shared data_dir instead) +nemo_skills/dataset/emergent_tts/data/ + __pycache__ .ipynb_checkpoints diff --git a/nemo_skills/dataset/emergent_tts/scripts/config/default.yaml b/nemo_skills/dataset/emergent_tts/scripts/config/default.yaml index 5015e4151c..9ffb01781f 100644 --- a/nemo_skills/dataset/emergent_tts/scripts/config/default.yaml +++ b/nemo_skills/dataset/emergent_tts/scripts/config/default.yaml @@ -43,7 +43,9 @@ scoring: container: /lustre/fsw/llmservice_nemo_speechlm/users/vmendelev/containters/nemo-25.11.sqsh # Install missing Python deps at job start (runs inside the scoring container). # Keep this conservative: avoid upgrading core deps inside the base container. - installation_command: pip install editdistance whisper-normalizer json-repair tenacity + # EmergentTTS-Eval imports `from google import genai`, so ensure google-genai exists + # but install it without pulling/upgrading transitive deps (to avoid httpx/transformers churn). 
+ installation_command: pip install editdistance whisper-normalizer json-repair tenacity && pip install --no-deps google-genai # Path to EmergentTTS-Eval-public on the cluster (added to PYTHONPATH) scoring_code_path: /lustre/fsw/llmservice_nemo_speechlm/users/vmendelev/code/EmergentTTS-Eval-public diff --git a/nemo_skills/dataset/emergent_tts/scripts/run_tts_eval.py b/nemo_skills/dataset/emergent_tts/scripts/run_tts_eval.py index 19d08f3497..2b12fd87b6 100644 --- a/nemo_skills/dataset/emergent_tts/scripts/run_tts_eval.py +++ b/nemo_skills/dataset/emergent_tts/scripts/run_tts_eval.py @@ -10,6 +10,7 @@ import argparse import os +from pathlib import Path import yaml @@ -100,6 +101,7 @@ def main(): emergent_data_dir = scoring.get("emergent_data_dir", "") install_cmd = scoring.get("installation_command") scoring_container = scoring.get("container") or "nemo-skills" + emergent_data_base_dir = str(Path(emergent_data_dir).parent) if emergent_data_dir else "" # Required by Emergent's judge clients judger_api_key = ( @@ -115,16 +117,17 @@ def main(): benchmark = benchmark.strip() short_name = benchmark.split(".")[-1] score_cmd = ( - f"JUDGER_API_KEY={judger_api_key} " - f"PYTHONPATH={scoring_code_path}:$PYTHONPATH " - f"python -m nemo_skills.dataset.emergent_tts.scripts.score " - f"--results_dir {output_dir} " - f"--benchmark {benchmark} " - f"--emergent_data_dir {emergent_data_dir} " - f"--judge_model {scoring.get('judge_model', 'gcp/google/gemini-2.5-pro')} " - f"--judger_base_url {scoring.get('judger_base_url', 'https://inference-api.nvidia.com/v1/chat/completions')} " - f"--num_threads {int(scoring.get('num_threads', 8))} " - f"--evaluate_function {scoring.get('evaluate_function', 'win_rate')}" + (f"cd {emergent_data_base_dir} && " if emergent_data_base_dir else "") + + f"JUDGER_API_KEY={judger_api_key} " + + f"PYTHONPATH={scoring_code_path}:$PYTHONPATH " + + "python -m nemo_skills.dataset.emergent_tts.scripts.score " + + f"--results_dir {output_dir} " + + f"--benchmark {benchmark} " + + f"--emergent_data_dir {emergent_data_dir} " + + f"--judge_model {scoring.get('judge_model', 'gcp/google/gemini-2.5-pro')} " + + f"--judger_base_url {scoring.get('judger_base_url', 'https://inference-api.nvidia.com/v1/chat/completions')} " + + f"--num_threads {int(scoring.get('num_threads', 8))} " + + f"--evaluate_function {scoring.get('evaluate_function', 'win_rate')}" ) if scoring.get("strong_prompting"): score_cmd += " --strong_prompting" @@ -139,6 +142,9 @@ def main(): command=score_cmd, installation_command=install_cmd, run_after=run_after, + # Ensure we ship the current repo state for scoring jobs. + # (Otherwise nemo_run may reuse an older code snapshot and miss fixes.) + reuse_code=False, expname=f"{args.expname}_score_{short_name}", log_dir=f"{output_dir}/eval-logs", ) @@ -156,6 +162,7 @@ def main(): num_gpus=0, mount_paths=cfg["mount_paths"], command=agg_cmd, + reuse_code=False, expname=f"{args.expname}_agg", log_dir=f"{output_dir}/eval-logs", ) diff --git a/nemo_skills/dataset/emergent_tts/scripts/score.py b/nemo_skills/dataset/emergent_tts/scripts/score.py index ec5c77c58b..3ff0a0ca6b 100644 --- a/nemo_skills/dataset/emergent_tts/scripts/score.py +++ b/nemo_skills/dataset/emergent_tts/scripts/score.py @@ -93,24 +93,33 @@ def _run_emergent_scoring( # Tell Emergent code where to find `emergent_tts_eval_data.jsonl` and `wv_mos.ckpt`. 
os.environ["EMERGENT_TTS_DATA_BASE_PATH"] = str(emergent_data_base_path) - emergent_inference.eval_api_closed_model( - model_client=_NoopModelClient(), - accelerator=None, - depths_to_evaluate=depths_to_evaluate, - categories_to_evaluate=categories_to_evaluate, - seed=42, - output_dir=str(benchmark_dir), - num_samples=None, - baseline_audios_path=str(baseline_audios_path), - fetch_audios_from_path=str(fetch_audios_from_path), - judge_model=judge_model, - temperature=0.0, - evaluate_function=evaluate_function, - strong_prompting=strong_prompting, - judger_base_url=_normalize_openai_base_url(judger_base_url) if judger_base_url else None, - num_threads=num_threads, - model_name="nemo-skills-generated", - ) + # EmergentTTS-Eval expects paths like "data/emergent_tts_eval_data.jsonl" relative + # to its *data base directory* (repo root). We keep the dataset in a shared path: + # <...>/emergent_tts/data/{emergent_tts_eval_data.jsonl,wv_mos.ckpt,baseline_audios/} + # So we temporarily `chdir` into the directory that contains the "data/" folder. + prev_cwd = os.getcwd() + try: + os.chdir(str(emergent_data_base_path.parent)) + emergent_inference.eval_api_closed_model( + model_client=_NoopModelClient(), + accelerator=None, + depths_to_evaluate=depths_to_evaluate, + categories_to_evaluate=categories_to_evaluate, + seed=42, + output_dir=str(benchmark_dir), + num_samples=None, + baseline_audios_path=str(baseline_audios_path), + fetch_audios_from_path=str(fetch_audios_from_path), + judge_model=judge_model, + temperature=0.0, + evaluate_function=evaluate_function, + strong_prompting=strong_prompting, + judger_base_url=_normalize_openai_base_url(judger_base_url) if judger_base_url else None, + num_threads=num_threads, + model_name="nemo-skills-generated", + ) + finally: + os.chdir(prev_cwd) def run_scoring( diff --git a/nemo_skills/dataset/nv_tts/TTS_eval.md b/nemo_skills/dataset/nv_tts/TTS_eval.md index 73864defd7..32dfff6a20 100644 --- a/nemo_skills/dataset/nv_tts/TTS_eval.md +++ b/nemo_skills/dataset/nv_tts/TTS_eval.md @@ -49,6 +49,66 @@ This will prepare `test.jsonl` for each benchmark with pointers to the files on /lustre/fsw/llmservice_nemo_speechlm/users/vmendelev/tmp/data_dir ``` +### EmergentTTS-Eval (new test set) + +EmergentTTS-Eval does **not** rely on cluster-local pre-existing audio paths. Instead, it is prepared by downloading the dataset and writing the NeMo-Skills `test.jsonl` + Emergent artifacts into your shared `data_dir`. + +**Prerequisites:** +- Your environment/container must have Python deps required for dataset prep: `datasets`, `numpy`, `pydub`, `tqdm`. +- `prepare.py` downloads from HuggingFace + Zenodo, so it must run in an environment with network access. 
+ +**Prepare EmergentTTS-Eval under your shared data_dir:** + +```bash +cd /home/vmendelev/workspace/expressiveness/src/ns_eval && source .venv/bin/activate && \ +python nemo_skills/dataset/emergent_tts/prepare.py \ + --output_dir /emergent_tts +``` + +Optional flags: +- `--num_samples 10` (debug: only write first 10) +- `--overwrite` (re-download and regenerate outputs) + +This creates: +- `/emergent_tts/emergent/test.jsonl` (NeMo-Skills generation input) +- `/emergent_tts/data/emergent_tts_eval_data.jsonl` +- `/emergent_tts/data/baseline_audios/.wav` +- `/emergent_tts/data/wv_mos.ckpt` + +**Run Emergent generation/scoring:** + +- Use the example config: `nemo_skills/dataset/emergent_tts/scripts/config/default.yaml` +- Set in the config: + - `generation.data_dir: ` + - `scoring.emergent_data_dir: /emergent_tts/data` + - `scoring.scoring_code_path: ` + +```bash +cd /home/vmendelev/workspace/expressiveness/src/ns_eval && source .venv/bin/activate && \ +NEMO_SKILLS_DISABLE_UNCOMMITTED_CHANGES_CHECK=1 \ +python -m nemo_skills.dataset.emergent_tts.scripts.run_tts_eval \ + --config nemo_skills/dataset/emergent_tts/scripts/config/default.yaml \ + --stage all \ + --expname emergent_eval +``` + +**Required env vars (Emergent scoring):** + +```bash +export JUDGER_API_KEY= +``` + +**Verification flow (recommended):** +- **Generation-only smoke test (10 samples)**: + - set `partition: interactive`, `generation.num_chunks: 1` + - set `generation.extra_args: "++max_samples=10 ++server.server_type=vllm_multimodal"` + - run: `python -m nemo_skills.dataset.emergent_tts.scripts.run_tts_eval --config --stage generation` +- **Full run (~1645)**: + - restore `partition: batch` and your usual `generation.num_chunks` + - run: `python -m nemo_skills.dataset.emergent_tts.scripts.run_tts_eval --config --stage all` +- **Scoring-only**: + - run: `python -m nemo_skills.dataset.emergent_tts.scripts.run_tts_eval --config --stage scoring` + ### 4. Configuration Files Review the config file and ensure all required artifacts are in the specified locations: From 88bd09cde4d290599ee32667044a90fcd81e4ab6 Mon Sep 17 00:00:00 2001 From: Valentin Mendelev Date: Fri, 6 Feb 2026 02:23:50 -0800 Subject: [PATCH 26/26] Add emergent_tts README Document dataset preparation (HF_TOKEN) and evaluation workflow, including cloning and patching EmergentTTS-Eval for NVIDIA Inference API judging. Co-authored-by: Cursor --- nemo_skills/dataset/emergent_tts/README.md | 124 +++++++++++++++++++++ 1 file changed, 124 insertions(+) create mode 100644 nemo_skills/dataset/emergent_tts/README.md diff --git a/nemo_skills/dataset/emergent_tts/README.md b/nemo_skills/dataset/emergent_tts/README.md new file mode 100644 index 0000000000..140dbb5533 --- /dev/null +++ b/nemo_skills/dataset/emergent_tts/README.md @@ -0,0 +1,124 @@ +## EmergentTTS-Eval dataset (`emergent_tts`) + +This dataset integration lets you: + +- **Prepare** the EmergentTTS-Eval test set under a shared `data_dir` (download baseline audios + metadata + MOS model). +- **Generate** TTS outputs with NeMo-Skills (`ns eval` via `run_tts_eval.py`). +- **Score** the generated outputs with EmergentTTS-Eval (WER/MOS/win-rate, depending on config). 
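+
+At a glance (illustrative; exact paths, env vars and configs are detailed in the sections below):
+
+```bash
+# 1) Prepare data (needs HF_TOKEN and network access)
+python nemo_skills/dataset/emergent_tts/prepare.py --output_dir "<shared_data_dir>/emergent_tts"
+
+# 2) Generate + score (needs JUDGER_API_KEY for win-rate judging)
+python -m nemo_skills.dataset.emergent_tts.scripts.run_tts_eval \
+    --config nemo_skills/dataset/emergent_tts/scripts/config/default.yaml \
+    --stage all --expname emergent_eval
+```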
+ +### 1) Prepare the test set (requires `HF_TOKEN`) + +`prepare.py` downloads the dataset and writes all required artifacts into: + +- `/emergent_tts/emergent/test.jsonl` +- `/emergent_tts/data/emergent_tts_eval_data.jsonl` +- `/emergent_tts/data/baseline_audios/*.wav` +- `/emergent_tts/data/wv_mos.ckpt` + +Run it from your dev machine (or any environment with network access): + +```bash +cd /home/vmendelev/workspace/expressiveness/src/nemo-skills-tts-eval +. ./.venv/bin/activate + +export HF_TOKEN="" + +python nemo_skills/dataset/emergent_tts/prepare.py \ + --output_dir "/emergent_tts" +``` + +Optional flags: + +- `--num_samples 10`: write only the first 10 samples (smoke test). +- `--overwrite`: re-download / regenerate outputs. + +### 2) Configure evaluation + +Use the example configs in `nemo_skills/dataset/emergent_tts/scripts/config/`. + +In `scripts/config/default.yaml`, set: + +- `generation.data_dir: ` +- `scoring.emergent_data_dir: /emergent_tts/data` +- `scoring.scoring_code_path: /EmergentTTS-Eval-public` (on the cluster) + +### 3) Clone + patch EmergentTTS-Eval-public for NVIDIA Inference API judging + +On EOS (or wherever you run scoring), clone EmergentTTS-Eval: + +```bash +cd /lustre/fsw/llmservice_nemo_speechlm/users/vmendelev/code +git clone EmergentTTS-Eval-public +``` + +Then update Emergent’s judge client selection so that **Gemini models are called via NVIDIA’s OpenAI-compatible Inference API**. + +Target behavior: + +- **Model name** stays as: `gcp/google/gemini-2.5-pro` (or similar). +- **Base URL** is NVIDIA Inference API: `https://inference-api.nvidia.com/v1` +- **API key** comes from: `JUDGER_API_KEY` (or `NVIDIA_API_KEY`) + +Minimal patch checklist inside `EmergentTTS-Eval-public`: + +- In `api_clients.py` (or wherever the client is chosen), ensure `gcp/google/*` uses an **OpenAI-compatible** client (not the Google SDK client), e.g.: + - `OpenAI(base_url=, api_key=os.getenv("JUDGER_API_KEY"))` +- Thread `judger_base_url` through so calls use `https://inference-api.nvidia.com/v1` (not the full `/v1/chat/completions` endpoint). + +After patching, set these in `scripts/config/default.yaml`: + +- `scoring.judge_model: gcp/google/gemini-2.5-pro` +- `scoring.judger_base_url: https://inference-api.nvidia.com/v1/chat/completions` + +### 3) Run evaluation (generation + scoring) + +From your dev machine, submit jobs to EOS: + +```bash +cd /home/vmendelev/workspace/expressiveness/src/nemo-skills-tts-eval +. ./.venv/bin/activate +mkdir -p .nemo_run + +export NEMORUN_HOME="$PWD/.nemo_run" +export NEMO_SKILLS_CONFIG_DIR=/home/vmendelev/workspace/expressiveness/src/ns_eval/cluster_configs +export NEMO_SKILLS_DISABLE_UNCOMMITTED_CHANGES_CHECK=1 + +# Required for win-rate judging (NVIDIA Inference API key) +export JUDGER_API_KEY="" + +python -m nemo_skills.dataset.emergent_tts.scripts.run_tts_eval \ + --config nemo_skills/dataset/emergent_tts/scripts/config/default.yaml \ + --stage all \ + --expname emergent_eval +``` + +### 4) Smoke test (10 samples, interactive) + +```bash +cd /home/vmendelev/workspace/expressiveness/src/nemo-skills-tts-eval +. 
./.venv/bin/activate +mkdir -p .nemo_run + +export NEMORUN_HOME="$PWD/.nemo_run" +export NEMO_SKILLS_CONFIG_DIR=/home/vmendelev/workspace/expressiveness/src/ns_eval/cluster_configs +export NEMO_SKILLS_DISABLE_UNCOMMITTED_CHANGES_CHECK=1 + +python -m nemo_skills.dataset.emergent_tts.scripts.run_tts_eval \ + --config nemo_skills/dataset/emergent_tts/scripts/config/interactive_10.yaml \ + --stage generation \ + --expname emergent_smoke10 +``` + +### Outputs + +NeMo-Skills generation writes: + +- `/eval-results/emergent_tts.emergent/output.jsonl` +- `/eval-results/emergent_tts.emergent/audio/*.wav` (or equivalent) + +Emergent scoring writes (in the same benchmark folder): + +- `emergent-tts-eval_*_evaluation-predictions.jsonl` +- `emergent-tts-eval_*_evaluation-metrics.json` +- `metrics.json` (a NeMo-Skills-friendly copy of Emergent metrics) +
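+
+To sanity-check the copied metrics after scoring, something along these lines can be used (adjust the path to your `output_dir`):
+
+```bash
+python -m json.tool <output_dir>/eval-results/emergent_tts.emergent/metrics.json
+```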