From 99ef50a38c99243823badbd01b0e7c2b602d5928 Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Thu, 13 Nov 2025 12:18:31 -0800 Subject: [PATCH 01/26] Added audio requests to vLLM models --- .../dataset/mmau-pro/closed_form/__init__.py | 1 + .../dataset/mmau-pro/open_ended/__init__.py | 2 +- nemo_skills/dataset/mmau-pro/prepare.py | 23 ++- .../evaluation/metrics/mmau_pro_metrics.py | 118 +++++++++++-- nemo_skills/inference/generate.py | 17 ++ nemo_skills/inference/model/vllm.py | 35 +++- nemo_skills/prompt/config/judge/mmau-pro.yaml | 30 ++++ nemo_skills/prompt/config/judge/speechlm.yaml | 28 ---- tests/gpu-tests/test_eval.py | 98 +++++++++++ tests/gpu-tests/test_vllm_audio.py | 84 ++++++++++ tests/test_vllm_audio.py | 156 ++++++++++++++++++ 11 files changed, 543 insertions(+), 49 deletions(-) create mode 100644 nemo_skills/prompt/config/judge/mmau-pro.yaml delete mode 100644 nemo_skills/prompt/config/judge/speechlm.yaml create mode 100644 tests/gpu-tests/test_vllm_audio.py create mode 100644 tests/test_vllm_audio.py diff --git a/nemo_skills/dataset/mmau-pro/closed_form/__init__.py b/nemo_skills/dataset/mmau-pro/closed_form/__init__.py index 4e3b424d84..4390c1d887 100644 --- a/nemo_skills/dataset/mmau-pro/closed_form/__init__.py +++ b/nemo_skills/dataset/mmau-pro/closed_form/__init__.py @@ -16,6 +16,7 @@ METRICS_TYPE = "mmau_pro_closed_form" SCORE_MODULE = "nemo_skills.evaluation.metrics.mmau_pro_metrics" GENERATION_ARGS = "++prompt_format=openai" +EVAL_ARGS = "++eval_type=mmau-pro" # NVEmbed judge configuration for closed-form evaluation JUDGE_PIPELINE_ARGS = { diff --git a/nemo_skills/dataset/mmau-pro/open_ended/__init__.py b/nemo_skills/dataset/mmau-pro/open_ended/__init__.py index 22773d6fed..c5f09272d2 100644 --- a/nemo_skills/dataset/mmau-pro/open_ended/__init__.py +++ b/nemo_skills/dataset/mmau-pro/open_ended/__init__.py @@ -23,4 +23,4 @@ "server_type": "openai", "server_address": "https://integrate.api.nvidia.com/v1", } -JUDGE_ARGS = "++prompt_config=judge/speechlm ++generation_key=judgement" +JUDGE_ARGS = "++prompt_config=judge/mmau-pro ++generation_key=judgement" diff --git a/nemo_skills/dataset/mmau-pro/prepare.py b/nemo_skills/dataset/mmau-pro/prepare.py index a6f04d621b..0ea66ec2b7 100644 --- a/nemo_skills/dataset/mmau-pro/prepare.py +++ b/nemo_skills/dataset/mmau-pro/prepare.py @@ -75,8 +75,8 @@ def format_entry(entry, with_audio=False): if category == "open": content = entry["question"] elif choices and len(choices) > 1: - options_text = "\n".join(f"{chr(65 + i)}. {choice}" for i, choice in enumerate(choices)) - content = f"{entry['question']}\n\n{options_text}" + options_text = "\n".join(f"{chr(65 + i)}) {choice}" for i, choice in enumerate(choices)) + content = f"{entry['question']}\n\n{options_text}\n\nRespond with the complete text of the correct option, not just the letter." 
else: content = entry["question"] @@ -84,13 +84,18 @@ def format_entry(entry, with_audio=False): if entry.get("audio_path"): audio_path = entry["audio_path"] - - if isinstance(audio_path, list) and audio_path: - user_message["audios"] = [{"path": path, "duration": 10.0} for path in audio_path] - elif isinstance(audio_path, str): - user_message["audio"] = {"path": audio_path, "duration": 10.0} - - formatted_entry["messages"] = [user_message] + # Prepend /dataset/mmau-pro/ to make paths absolute for cluster + if len(audio_path) == 1: + user_message["audio"] = {"path": f"/dataset/mmau-pro/{audio_path[0]}"} + else: + user_message["audios"] = [{"path": f"/dataset/mmau-pro/{path}"} for path in audio_path] + + # Don't use /no_think for open-ended questions to allow reasoning + system_content = "You are a helpful assistant." + if category != "open": + system_content += " /no_think" + + formatted_entry["messages"] = [{"role": "system", "content": system_content}, user_message] return formatted_entry diff --git a/nemo_skills/evaluation/metrics/mmau_pro_metrics.py b/nemo_skills/evaluation/metrics/mmau_pro_metrics.py index f079049cc1..000dbcf13f 100644 --- a/nemo_skills/evaluation/metrics/mmau_pro_metrics.py +++ b/nemo_skills/evaluation/metrics/mmau_pro_metrics.py @@ -13,14 +13,52 @@ # limitations under the License. import logging +import re + +import numpy as np from nemo_skills.evaluation.metrics.base import BaseMetrics, as_int, as_percentage -from nemo_skills.evaluation.metrics.utils import is_correct_judgement from nemo_skills.utils import get_logger_name LOG = logging.getLogger(get_logger_name(__file__)) +def extract_multicriteria_scores(judgement_text: str) -> dict[str, float]: + """Extract multi-criteria scores (1-5 scale) from LLM judge evaluation. + + Expected format: + CORRECTNESS: [score] - [justification] + RELEVANCE: [score] - [justification] + COMPLETENESS: [score] - [justification] + CLARITY: [score] - [justification] + OVERALL: [score] - [overall assessment] + + Returns: + Dictionary with keys: correctness, relevance, completeness, clarity, overall + Defaults to 3.0 if score not found. 
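+
+    Example (illustrative; criteria missing from the judge output fall back to 3.0):
+        "CORRECTNESS: 4 - Accurate.\nOVERALL: 4 - Good." ->
+        {"correctness": 4.0, "relevance": 3.0, "completeness": 3.0, "clarity": 3.0, "overall": 4.0}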
+ """ + scores = {} + + patterns = { + "correctness": r"CORRECTNESS:\s*(\d+(?:\.\d+)?)", + "relevance": r"RELEVANCE:\s*(\d+(?:\.\d+)?)", + "completeness": r"COMPLETENESS:\s*(\d+(?:\.\d+)?)", + "clarity": r"CLARITY:\s*(\d+(?:\.\d+)?)", + "overall": r"OVERALL:\s*(\d+(?:\.\d+)?)", + } + + for criterion, pattern in patterns.items(): + match = re.search(pattern, judgement_text, re.IGNORECASE) + scores[criterion] = float(match.group(1)) if match else 3.0 + + # Fallback: compute overall if missing or still 3.0 + if "overall" not in scores or scores["overall"] == 3.0: + criteria_scores = [scores.get(k, 3.0) for k in ["correctness", "relevance", "completeness", "clarity"]] + scores["overall"] = sum(criteria_scores) / len(criteria_scores) + + return scores + + class MMAUProMetrics(BaseMetrics): """Metrics class for MMAU-Pro benchmark (all subgroups).""" @@ -28,16 +66,24 @@ def __init__(self, compute_no_answer: bool = True, max_k: int = 1): super().__init__(compute_no_answer=compute_no_answer) self.max_k = max_k + # Track multi-criteria scores for open-ended questions (1-5 scale) + self.multicriteria_scores = { + "correctness": [], + "relevance": [], + "completeness": [], + "clarity": [], + "overall": [], + } + def _get_score_dict(self, prediction: dict) -> dict[str, bool | int | float]: """Extract correctness scores from prediction.""" score_dict = {} - # Open-ended: extract from judge result + # Open-ended: use LLM judge correctness score >= 3 as correct if "judgement" in prediction: - judge_result = is_correct_judgement(prediction["judgement"]) - score_dict["judge_correct"] = judge_result - score_dict["correct"] = judge_result - # Closed-form and instruction following: use is_correct + multicriteria = extract_multicriteria_scores(prediction["judgement"]) + score_dict["correct"] = multicriteria.get("correctness", 3.0) >= 3.0 + # Closed-form / instruction-following: use binary correctness elif "is_correct" in prediction: score_dict["correct"] = prediction["is_correct"] else: @@ -58,24 +104,61 @@ def get_incorrect_sample(self, prediction: dict) -> dict: def update(self, predictions): """Update metrics with new predictions.""" super().update(predictions) + predicted_answers = [pred.get("generation", None).strip() or None for pred in predictions] self._compute_pass_at_k(predictions=predictions, predicted_answers=predicted_answers) self._compute_majority_at_k(predictions=predictions, predicted_answers=predicted_answers) + # Collect multi-criteria scores for open-ended questions + for pred in predictions: + if "judgement" in pred: + multicriteria = extract_multicriteria_scores(pred["judgement"]) + for criterion in self.multicriteria_scores: + self.multicriteria_scores[criterion].append(multicriteria.get(criterion, 3.0)) + def get_metrics(self): """Get computed metrics.""" metrics_dict = super().get_metrics() + for agg_mode, agg_metrics in metrics_dict.items(): - # Ensure avg_tokens is always present for MMAU-Pro + # Ensure avg_tokens is present if "avg_tokens" not in agg_metrics: agg_metrics["avg_tokens"] = 0 if "no_answer" in agg_metrics: agg_metrics["no_answer"] = agg_metrics["no_answer"] / 2.0 - # Set success_rate from correct or judge_correct - if "judge_correct" in agg_metrics: - agg_metrics["success_rate"] = agg_metrics["judge_correct"] + + # Add multi-criteria averages for open-ended (convert 1-5 scale to percentage) + if self.multicriteria_scores["overall"]: + for criterion in self.multicriteria_scores: + scores = self.multicriteria_scores[criterion] + if scores: + # Convert 1-5 scale to 0-100 
percentage scale + avg_score = np.mean(scores) + std_score = np.std(scores) + agg_metrics[f"avg_{criterion}"] = (avg_score / 5.0) * 100 + agg_metrics[f"std_{criterion}"] = (std_score / 5.0) * 100 + + # Set correct and success_rate to avg_correctness for open-ended + agg_metrics["correct"] = agg_metrics["avg_correctness"] + agg_metrics["success_rate"] = agg_metrics["avg_correctness"] + + # Calculate good/poor response rates based on overall >= 4 or <= 2 + overall_scores = self.multicriteria_scores["overall"] + good_responses = sum(1 for score in overall_scores if score >= 4.0) + poor_responses = sum(1 for score in overall_scores if score <= 2.0) + + agg_metrics["good_response_rate"] = (good_responses / len(overall_scores)) * 100 + agg_metrics["poor_response_rate"] = (poor_responses / len(overall_scores)) * 100 + + # For closed-form / instruction-following: use binary correctness elif "correct" in agg_metrics: agg_metrics["success_rate"] = agg_metrics["correct"] + + # Round all numeric values to 2 decimal places + for key, value in agg_metrics.items(): + if isinstance(value, float) and not isinstance(value, bool): + agg_metrics[key] = round(value, 2) + return metrics_dict def metrics_to_print(self): @@ -87,5 +170,20 @@ def metrics_to_print(self): } if self.compute_no_answer: base_metrics["no_answer"] = as_percentage + + # Add multi-criteria metrics for open-ended questions (now in percentage format) + if self.multicriteria_scores["overall"]: + base_metrics.update( + { + "avg_overall": as_percentage, + "avg_correctness": as_percentage, + "avg_relevance": as_percentage, + "avg_completeness": as_percentage, + "avg_clarity": as_percentage, + "good_response_rate": as_percentage, + "poor_response_rate": as_percentage, + } + ) + base_metrics["num_entries"] = as_int return base_metrics diff --git a/nemo_skills/inference/generate.py b/nemo_skills/inference/generate.py index 136375db46..87151a66d6 100644 --- a/nemo_skills/inference/generate.py +++ b/nemo_skills/inference/generate.py @@ -399,6 +399,10 @@ def setup_prompt(self): def setup_llm(self): self.sandbox = get_sandbox(**self.cfg.sandbox) if self.cfg.sandbox is not None else None + self.data_dir = None + if "data_dir" in self.cfg.eval_config and not isinstance(self.cfg.eval_config.get("data_dir"), type(None)): + self.data_dir = self.cfg.eval_config["data_dir"] + if self.cfg.code_execution: llm = get_code_execution_model(**self.cfg.server, tokenizer=self.tokenizer, sandbox=self.sandbox) elif self.cfg.tool_modules is not None: @@ -545,6 +549,16 @@ def dump_outputs(self, outputs, data_points, fout): for output in outputs: fout.write(json.dumps(output) + "\n") + def drop_binary_data(self, output): + """Remove binary data (like base64 audio) from messages to keep output files smaller.""" + for message in output["messages"]: + # Skip if content is not a list (e.g., string content in system messages) + if not isinstance(message.get("content"), list): + continue + + # Filter out audio_url items from list-style content + message["content"] = [content for content in message["content"] if content.get("type") != "audio_url"] + async def postprocess_single_output(self, output, original_data_point): # to make it easier to follow up with other generations and limit accidental errors, we are adding # all of the original data to the output file alongside the new generations @@ -560,6 +574,9 @@ async def postprocess_single_output(self, output, original_data_point): for key in output: original_data_point.pop(key, None) output.update(original_data_point) + + 
self.drop_binary_data(output) + if self.cfg.parse_reasoning: parse_reasoning( output, diff --git a/nemo_skills/inference/model/vllm.py b/nemo_skills/inference/model/vllm.py index e9a2146520..cff46cf0e6 100644 --- a/nemo_skills/inference/model/vllm.py +++ b/nemo_skills/inference/model/vllm.py @@ -12,7 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. +import base64 import logging +import os import requests @@ -24,8 +26,16 @@ LOG = logging.getLogger(get_logger_name(__file__)) +def audio_file_to_base64(audio_file_path: str): + """Encodes an audio file into a base64 string.""" + with open(audio_file_path, "rb") as audio_file: + audio_content = audio_file.read() + return base64.b64encode(audio_content).decode("utf-8") + + class VLLMModel(BaseModel): - def __init__(self, **kwargs): + def __init__(self, data_dir: str = "", **kwargs): + self.data_dir = data_dir super().__init__(**kwargs) def _get_tokenizer_endpoint(self): @@ -99,6 +109,28 @@ def _build_completion_request_params( "extra_body": self._build_request_body(top_k, min_p, repetition_penalty, extra_body=extra_body), } + def content_text_to_list(self, message): + if "audio" in message or "audios" in message: + content = message["content"] + if isinstance(content, str): + message["content"] = [{"type": "text", "text": content}] + elif isinstance(content, list): + message["content"] = content + else: + raise TypeError(str(content)) + + if "audio" in message: + audio = message["audio"] + base64_audio = audio_file_to_base64(os.path.join(self.data_dir, audio["path"])) + audio_message = {"type": "audio_url", "audio_url": {"url": f"data:audio/wav;base64,{base64_audio}"}} + message["content"].append(audio_message) + elif "audios" in message: + for audio in message["audios"]: + base64_audio = audio_file_to_base64(os.path.join(self.data_dir, audio["path"])) + audio_message = {"type": "audio_url", "audio_url": {"url": f"data:audio/wav;base64,{base64_audio}"}} + message["content"].append(audio_message) + return message + def _build_chat_request_params( self, messages: list[dict], @@ -117,6 +149,7 @@ def _build_chat_request_params( tools: list[dict] | None = None, extra_body: dict = None, ) -> dict: + messages = [self.content_text_to_list(message) for message in messages] request = { "messages": messages, "max_tokens": tokens_to_generate, diff --git a/nemo_skills/prompt/config/judge/mmau-pro.yaml b/nemo_skills/prompt/config/judge/mmau-pro.yaml new file mode 100644 index 0000000000..5339e4ab0d --- /dev/null +++ b/nemo_skills/prompt/config/judge/mmau-pro.yaml @@ -0,0 +1,30 @@ +# Judge prompt configuration for Speech/Audio Language Model evaluation +# Used for evaluating open-ended responses in MMAU-Pro benchmark +# Uses multi-criteria scoring on 1-5 scale + +user: |- + You are an expert evaluator for audio and speech-related questions. Please evaluate the quality of a model's response to a question. + + Question: {question} + + Reference Answer: {expected_answer} + + Model Response: {generation} + + Please evaluate the model response on the following criteria and provide scores from 1-5 (where 5 is best): + + 1. **Correctness**: How factually accurate is the response compared to the reference? + 2. **Relevance**: How well does the response address the specific question asked? + 3. **Completeness**: Does the response cover all important aspects mentioned in the reference? + 4. **Clarity**: How clear and well-structured is the response? 
+ + For each criterion, provide: + - A score from 1-5 + - A brief justification (1-2 sentences) + + Format your response as: + CORRECTNESS: [score] - [justification] + RELEVANCE: [score] - [justification] + COMPLETENESS: [score] - [justification] + CLARITY: [score] - [justification] + OVERALL: [average score] - [overall assessment] diff --git a/nemo_skills/prompt/config/judge/speechlm.yaml b/nemo_skills/prompt/config/judge/speechlm.yaml deleted file mode 100644 index 4862558145..0000000000 --- a/nemo_skills/prompt/config/judge/speechlm.yaml +++ /dev/null @@ -1,28 +0,0 @@ -# Judge prompt configuration for Speech/Audio Language Model evaluation -# Used for evaluating open-ended responses in MMAU-Pro benchmark -# Follows nemo-skills standard Yes/No judgement pattern - -user: |- - You are an expert evaluator for audio and speech-related questions. Please evaluate whether the model's response correctly answers the question. - - Question: {question} - - Reference Answer: {expected_answer} - - Model Response: {generation} - - Your task is to determine if the model's response is correct based on the reference answer. Consider: - - 1. **Factual Accuracy**: Is the information in the response factually correct? - 2. **Relevance**: Does the response address the specific question asked? - 3. **Completeness**: Does the response cover the key points from the reference answer? - - Please first explain your reasoning in 2-3 sentences, then provide your final judgement. - - Your final judgement must be either "Yes" or "No": - - "Yes" if the model response is correct and adequately answers the question - - "No" if the model response is incorrect, irrelevant, or inadequate - - Format your response as: - Reasoning: [Your explanation] - Judgement: [Yes or No] diff --git a/tests/gpu-tests/test_eval.py b/tests/gpu-tests/test_eval.py index 47060a1368..ae7a6a4b7e 100644 --- a/tests/gpu-tests/test_eval.py +++ b/tests/gpu-tests/test_eval.py @@ -346,3 +346,101 @@ def test_megatron_eval(): # TODO: something is broken in megatron inference here as this should be 50! 
assert metrics["symbolic_correct"] >= 40 assert metrics["num_entries"] == 5 + + +@pytest.mark.gpu +def test_prepare_and_eval_all_datasets(): + model_path = require_env_var("NEMO_SKILLS_TEST_HF_MODEL") + model_type = require_env_var("NEMO_SKILLS_TEST_MODEL_TYPE") + + config_dir = Path(__file__).absolute().parent + datasets_dir = Path(__file__).absolute().parents[2] / "nemo_skills" / "dataset" + # not testing datasets that don't support max_samples, require explicit parameters or are very heavy to prepare + excluded_datasets = { + "__pycache__", + "ruler", + "bigcodebench", + "livecodebench", + "livebench_coding", + "livecodebench-pro", + "livecodebench-cpp", + "ioi24", + "ioi25", + "bfcl_v3", + "bfcl_v4", + "swe-bench", + "aai", + "human-eval", + "human-eval-infilling", + "mbpp", + "mmau-pro", + } + + dataset_names = sorted( + dataset.name + for dataset in datasets_dir.iterdir() + if dataset.is_dir() and (dataset / "prepare.py").exists() and dataset.name not in excluded_datasets + ) + + assert dataset_names, "No datasets found to prepare and evaluate" + + judge_datasets = [] + for dataset in dataset_names: + dataset_module = import_module(f"nemo_skills.dataset.{dataset}") + # Check if JUDGE_PIPELINE_ARGS exists (even if empty dict, which is falsy) + if hasattr(dataset_module, "JUDGE_PIPELINE_ARGS"): + judge_datasets.append(dataset) + + non_judge_datasets = [dataset for dataset in dataset_names if dataset not in judge_datasets] + + data_dir = Path(f"/tmp/nemo-skills-tests/{model_type}/data") + docker_rm([str(data_dir)]) + + prepare_data( + ctx=wrap_arguments(" ".join(dataset_names)), + cluster="test-local", + config_dir=str(config_dir), + data_dir=str(data_dir), + expname=f"prepare-all-datasets-{model_type}", + ) + + eval_kwargs = dict( + cluster="test-local", + config_dir=str(config_dir), + data_dir=str(data_dir), + model=model_path, + server_type="sglang", + server_gpus=1, + server_nodes=1, + auto_summarize_results=False, + ) + + common_ctx = "++max_samples=2 ++inference.tokens_to_generate=100 ++server.enable_soft_fail=True " + + output_dir = f"/tmp/nemo-skills-tests/{model_type}/all-datasets-eval" + docker_rm([output_dir]) + eval( + ctx=wrap_arguments(common_ctx), + output_dir=output_dir, + benchmarks=",".join(non_judge_datasets), + expname=f"eval-all-datasets-{model_type}", + **eval_kwargs, + ) + + run_cmd( + ctx=wrap_arguments(f"python -m nemo_skills.pipeline.summarize_results {output_dir}"), + cluster="test-local", + config_dir=str(config_dir), + ) + + eval_results_dir = Path(output_dir) / "eval-results" + metrics_path = eval_results_dir / "metrics.json" + assert metrics_path.exists(), "Missing aggregated metrics file" + with metrics_path.open() as f: + metrics = json.load(f) + + for dataset in non_judge_datasets: + assert dataset in metrics, f"Missing metrics for {dataset}" + + # TODO: add same for judge_datasets after generate supports num_jobs + # (otherwise it starts judge every time and takes forever) diff --git a/tests/gpu-tests/test_vllm_audio.py b/tests/gpu-tests/test_vllm_audio.py new file mode 100644 index 0000000000..8183adaa80 --- /dev/null +++ b/tests/gpu-tests/test_vllm_audio.py @@ -0,0 +1,84 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import json +import shutil +import subprocess +import tempfile +from pathlib import Path + +import pytest +from utils import require_env_var + + +@pytest.mark.gpu +def test_vllm_audio_generation(): + """Integration test: Generate with vLLM server using audio input.""" + model_path = require_env_var("NEMO_SKILLS_TEST_HF_MODEL") + model_type = require_env_var("NEMO_SKILLS_TEST_MODEL_TYPE") + + output_dir = f"/tmp/nemo-skills-tests/{model_type}/vllm-audio-generation" + # Clean up output directory + if Path(output_dir).exists(): + shutil.rmtree(output_dir) + + # Create test input file with audio + with tempfile.NamedTemporaryFile(mode='w', suffix='.jsonl', delete=False) as f: + test_data = [ + { + "problem": "Transcribe this audio", + "audio": {"path": "/nemo_run/code/tests/slurm-tests/asr_nim/wavs/t2_16.wav"}, + }, + { + "problem": "What is in this audio?", + "audio": {"path": "/nemo_run/code/tests/slurm-tests/asr_nim/wavs/t3_16.wav"}, + }, + ] + for item in test_data: + f.write(json.dumps(item) + '\n') + input_file = f.name + + try: + cmd = ( + f"ns generate " + f" --cluster test-local --config_dir {Path(__file__).absolute().parent} " + f" --model {model_path} " + f" --output_dir {output_dir} " + f" --server_type vllm " + f" --server_gpus 1 " + f" --server_nodes 1 " + f" --server_args '--enforce-eager' " + f" --input_file={input_file} " + f" ++prompt_config=openai " + f" ++skip_filled=False " + ) + subprocess.run(cmd, shell=True, check=True) + + # Verify output exists and has audio-related generation + with open(f"{output_dir}/output.jsonl") as fin: + lines = fin.readlines() + + assert len(lines) == 2, "Should have 2 output lines" + + for line in lines: + data = json.loads(line) + assert "generation" in data, "Should have generation field" + assert len(data["generation"]) > 0, "Generation should not be empty" + # If model supports audio, generation should contain something + print(f"Generated: {data['generation']}") + + finally: + # Cleanup temp file + Path(input_file).unlink(missing_ok=True) + diff --git a/tests/test_vllm_audio.py b/tests/test_vllm_audio.py new file mode 100644 index 0000000000..56bee85aa2 --- /dev/null +++ b/tests/test_vllm_audio.py @@ -0,0 +1,156 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
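+
+"""Unit tests for audio support in VLLMModel (base64 encoding and chat request
+construction); no GPU or running vLLM server is required."""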
+ +import base64 +import os +import tempfile +from unittest.mock import AsyncMock, patch + +import pytest + +from nemo_skills.inference.model.vllm import VLLMModel, audio_file_to_base64 + + +# ----------------------- +# Unit tests - no server required +# ----------------------- + +def test_audio_file_to_base64(): + """Test basic audio file encoding to base64.""" + with tempfile.NamedTemporaryFile(mode='wb', suffix='.wav', delete=False) as f: + test_content = b'RIFF' + b'\x00' * 100 + f.write(test_content) + temp_path = f.name + + try: + result = audio_file_to_base64(temp_path) + assert isinstance(result, str) + assert len(result) > 0 + decoded = base64.b64decode(result) + assert decoded == test_content + finally: + os.unlink(temp_path) + + +@pytest.fixture +def vllm_model(tmp_path): + """Create a VLLMModel instance for testing.""" + audio_dir = tmp_path / "audio" + audio_dir.mkdir() + model = VLLMModel(model="test-model", data_dir=str(tmp_path), base_url="http://localhost:5000") + return model + + +def test_content_text_to_list_with_audio(vllm_model, tmp_path): + """Test converting string content with audio to list format.""" + audio_path = tmp_path / "audio" / "test.wav" + audio_path.parent.mkdir(exist_ok=True) + with open(audio_path, 'wb') as f: + f.write(b'RIFF' + b'\x00' * 100) + + message = {"role": "user", "content": "Describe this audio", "audio": {"path": "audio/test.wav"}} + + result = vllm_model.content_text_to_list(message) + + assert isinstance(result["content"], list) + assert len(result["content"]) == 2 + assert result["content"][0]["type"] == "text" + assert result["content"][1]["type"] == "audio_url" + assert result["content"][1]["audio_url"]["url"].startswith("data:audio/wav;base64,") + + +def test_content_text_to_list_with_multiple_audios(vllm_model, tmp_path): + """Test handling message with multiple audio files.""" + audio_dir = tmp_path / "audio" + audio_dir.mkdir(exist_ok=True) + + for i in range(2): + with open(audio_dir / f"test_{i}.wav", 'wb') as f: + f.write(b'RIFF' + b'\x00' * 100) + + message = { + "role": "user", + "content": "Compare these", + "audios": [{"path": "audio/test_0.wav"}, {"path": "audio/test_1.wav"}], + } + + result = vllm_model.content_text_to_list(message) + + assert isinstance(result["content"], list) + assert len(result["content"]) == 3 + assert result["content"][0]["type"] == "text" + assert result["content"][1]["type"] == "audio_url" + assert result["content"][2]["type"] == "audio_url" + + +# ----------------------- +# Request building tests with audio +# ----------------------- + +def test_build_chat_request_with_audio(tmp_path, vllm_model): + """Test that chat request params are correctly built with audio content.""" + # Create audio file + audio_path = tmp_path / "audio" / "test.wav" + audio_path.parent.mkdir(exist_ok=True) + with open(audio_path, 'wb') as f: + f.write(b'RIFF' + b'\x00' * 100) + + messages = [{"role": "user", "content": "Test audio", "audio": {"path": "audio/test.wav"}}] + + # Build request params - this doesn't make any network calls + params = vllm_model._build_chat_request_params(messages=messages, stream=False, tokens_to_generate=10) + + # Validate request structure + assert "messages" in params + assert len(params["messages"]) == 1 + content_items = params["messages"][0]["content"] + assert isinstance(content_items, list) + assert len(content_items) == 2 + assert content_items[0]["type"] == "text" + assert content_items[1]["type"] == "audio_url" + + # Verify base64 encoding is valid + audio_url = 
content_items[1]["audio_url"]["url"] + assert audio_url.startswith("data:audio/wav;base64,") + audio_b64 = audio_url.split(",", 1)[1] + decoded = base64.b64decode(audio_b64) + assert decoded.startswith(b'RIFF') + + +@pytest.mark.asyncio +async def test_generate_with_audio_mocked_response(tmp_path, vllm_model): + """Test generate_async with audio by mocking the response (no real server call).""" + # Create audio file + audio_path = tmp_path / "audio" / "test.wav" + audio_path.parent.mkdir(exist_ok=True) + with open(audio_path, 'wb') as f: + f.write(b'RIFF' + b'\x00' * 100) + + messages = [{"role": "user", "content": "Describe this audio", "audio": {"path": "audio/test.wav"}}] + + # Mock the entire generate_async method - no actual API call made + mock_response = {"generation": "This audio contains speech", "num_generated_tokens": 5} + + with patch.object(vllm_model, "generate_async", new_callable=AsyncMock) as mock_generate: + mock_generate.return_value = mock_response + + # Call the mocked method + response = await vllm_model.generate_async(prompt=messages, tokens_to_generate=50, temperature=0.0) + + # Verify the mock was called correctly + assert response["generation"] == "This audio contains speech" + assert response["num_generated_tokens"] == 5 + mock_generate.assert_awaited_once() + + From 8297aed7e8f88eea0ca178f74213cacad14d933c Mon Sep 17 00:00:00 2001 From: Valentin Mendelev Date: Thu, 18 Dec 2025 02:12:14 -0800 Subject: [PATCH 02/26] Intorduced vLLM_multimodal model to save multimodal outputs Signed-off-by: Valentin Mendelev --- nemo_skills/inference/generate.py | 15 ++- nemo_skills/inference/model/__init__.py | 2 + nemo_skills/inference/model/base.py | 5 + .../inference/model/vllm_multimodal.py | 110 ++++++++++++++++++ 4 files changed, 130 insertions(+), 2 deletions(-) create mode 100644 nemo_skills/inference/model/vllm_multimodal.py diff --git a/nemo_skills/inference/generate.py b/nemo_skills/inference/generate.py index 87151a66d6..a98127d834 100644 --- a/nemo_skills/inference/generate.py +++ b/nemo_skills/inference/generate.py @@ -403,8 +403,15 @@ def setup_llm(self): if "data_dir" in self.cfg.eval_config and not isinstance(self.cfg.eval_config.get("data_dir"), type(None)): self.data_dir = self.cfg.eval_config["data_dir"] + output_dir = str(Path(self.cfg.output_file).parent) if self.cfg.code_execution: - llm = get_code_execution_model(**self.cfg.server, tokenizer=self.tokenizer, sandbox=self.sandbox) + llm = get_code_execution_model( + **self.cfg.server, + tokenizer=self.tokenizer, + sandbox=self.sandbox, + data_dir=self.data_dir or "", + output_dir=output_dir, + ) elif self.cfg.tool_modules is not None: llm = get_tool_calling_model( **self.cfg.server, @@ -413,9 +420,13 @@ def setup_llm(self): schema_overrides=self.cfg.schema_overrides, tokenizer=self.tokenizer, additional_config={"sandbox": self.cfg.sandbox}, + data_dir=self.data_dir or "", + output_dir=output_dir, ) else: - llm = get_model(**self.cfg.server, tokenizer=self.tokenizer) + llm = get_model( + **self.cfg.server, tokenizer=self.tokenizer, data_dir=self.data_dir or "", output_dir=output_dir + ) if self.cfg.parallel_thinking.mode is not None: # We don't want to override these key variables which overlap with self.cfg diff --git a/nemo_skills/inference/model/__init__.py b/nemo_skills/inference/model/__init__.py index 164d92fcc8..595d8fd3ee 100644 --- a/nemo_skills/inference/model/__init__.py +++ b/nemo_skills/inference/model/__init__.py @@ -39,6 +39,7 @@ # Utilities from .vllm import VLLMModel +from .vllm_multimodal 
import VLLMMultimodalModel # Model implementations @@ -51,6 +52,7 @@ "azureopenai": AzureOpenAIModel, "gemini": GeminiModel, "vllm": VLLMModel, + "vllm_multimodal": VLLMMultimodalModel, "sglang": SGLangModel, "tts_nim": TTSNIMModel, "asr_nim": ASRNIMModel, diff --git a/nemo_skills/inference/model/base.py b/nemo_skills/inference/model/base.py index 9318bfb475..117096b4c7 100644 --- a/nemo_skills/inference/model/base.py +++ b/nemo_skills/inference/model/base.py @@ -75,9 +75,14 @@ def __init__( enable_soft_fail: bool = False, context_limit_retry_strategy: str | None = None, num_special_tokens_budget: int = 100, + # Directory paths for data and output + data_dir: str = "", + output_dir: str | None = None, ): self._tunnel = None self.model_name_or_path = model + self.data_dir = data_dir + self.output_dir = output_dir self.server_host = host self.server_port = port self.ssh_server = ssh_server diff --git a/nemo_skills/inference/model/vllm_multimodal.py b/nemo_skills/inference/model/vllm_multimodal.py new file mode 100644 index 0000000000..0569c9efd9 --- /dev/null +++ b/nemo_skills/inference/model/vllm_multimodal.py @@ -0,0 +1,110 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import base64 +import json +import logging +import os +import re + +from nemo_skills.utils import get_logger_name + +from .vllm import VLLMModel + +LOG = logging.getLogger(get_logger_name(__file__)) + +# Pattern to extract debug_info from content +DEBUG_INFO_PATTERN = re.compile(r"\n?(.*?)", re.DOTALL) + + +class VLLMMultimodalModel(VLLMModel): + """VLLMModel with support for saving audio responses to disk. + + When the server returns audio in the response, this model will: + 1. Save the audio bytes to a file in output_dir/audio/ + 2. 
Replace the base64 data with the file path in the result + """ + + def __init__(self, **kwargs): + super().__init__(**kwargs) + self.output_audio_dir = None + if self.output_dir: + self.output_audio_dir = os.path.join(self.output_dir, "audio") + os.makedirs(self.output_audio_dir, exist_ok=True) + LOG.info(f"Audio responses will be saved to: {self.output_audio_dir}") + + def _parse_chat_completion_response(self, response, include_response: bool = False, **kwargs) -> dict: + """Parse chat completion response and save any audio to disk.""" + result = super()._parse_chat_completion_response(response, include_response=include_response, **kwargs) + + # Extract debug_info from content (embedded as JSON in tags) + if "generation" in result and result["generation"]: + match = DEBUG_INFO_PATTERN.search(result["generation"]) + if match: + try: + result["debug_info"] = json.loads(match.group(1)) + # Strip debug_info from generation + result["generation"] = DEBUG_INFO_PATTERN.sub("", result["generation"]) + except json.JSONDecodeError: + LOG.warning("Failed to parse debug_info JSON from content") + + choice = response.choices[0] + if hasattr(choice.message, "audio") and choice.message.audio: + audio_result = self._process_audio_response(choice.message.audio, response.id) + result["audio"] = audio_result + + # Strip audio data from serialized_output to avoid duplication + if "serialized_output" in result: + for item in result["serialized_output"]: + if isinstance(item, dict) and "audio" in item: + # Keep only metadata, remove base64 data + if isinstance(item["audio"], dict) and "data" in item["audio"]: + del item["audio"]["data"] + # Also strip debug_info from serialized content + if isinstance(item, dict) and "content" in item and item["content"]: + item["content"] = DEBUG_INFO_PATTERN.sub("", item["content"]) + + return result + + def _process_audio_response(self, audio_data, response_id: str) -> dict: + """Process audio data: save to file and return metadata with path.""" + audio_info = { + "format": getattr(audio_data, "format", "wav"), + "sample_rate": getattr(audio_data, "sample_rate", 22050), + "transcript": getattr(audio_data, "transcript", None), + } + + audio_base64 = getattr(audio_data, "data", None) + if not audio_base64: + return audio_info + + if self.output_audio_dir: + try: + audio_bytes = base64.b64decode(audio_base64) + filename = f"{response_id}.wav" + filepath = os.path.join(self.output_audio_dir, filename) + + with open(filepath, "wb") as f: + f.write(audio_bytes) + + audio_info["path"] = filepath + audio_info["size_bytes"] = len(audio_bytes) + LOG.info(f"Saved audio: {filepath} ({len(audio_bytes)} bytes)") + except Exception as e: + LOG.warning(f"Failed to save audio: {e}") + audio_info["data"] = audio_base64 + else: + audio_info["data"] = audio_base64 + + return audio_info From 25752670474372ac9a0b7530c9e719e196649b43 Mon Sep 17 00:00:00 2001 From: Valentin Mendelev Date: Thu, 18 Dec 2025 06:37:10 -0800 Subject: [PATCH 03/26] generation.py to respect separate server type for the client Signed-off-by: Valentin Mendelev --- nemo_skills/pipeline/utils/generation.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/nemo_skills/pipeline/utils/generation.py b/nemo_skills/pipeline/utils/generation.py index cd576053c1..4b45ed8a39 100644 --- a/nemo_skills/pipeline/utils/generation.py +++ b/nemo_skills/pipeline/utils/generation.py @@ -446,6 +446,9 @@ def configure_client( - server_address: Address of the server. 
- extra_arguments: Updated extra arguments for the command. """ + # Check if user already specified server.server_type in extra_arguments + user_specified_server_type = "++server.server_type=" in extra_arguments + if server_gpus: # we need to host the model server_port = get_free_port(strategy="random") if get_random_port else 5000 assert server_gpus is not None, "Need to specify server_gpus if hosting the model" @@ -462,14 +465,17 @@ def configure_client( } if server_container: server_config["container"] = server_container + # Only add server_type if user didn't specify it (allows vllm_multimodal override) + server_type_arg = "" if user_specified_server_type else f"++server.server_type={server_type} " extra_arguments = ( - f"{extra_arguments} ++server.server_type={server_type} ++server.host=127.0.0.1 " + f"{extra_arguments} {server_type_arg}++server.host=127.0.0.1 " f"++server.port={server_port} ++server.model={model} " ) else: # model is hosted elsewhere server_config = None + # Only add server_type if user didn't specify it + server_type_arg = "" if user_specified_server_type else f"++server.server_type={server_type} " extra_arguments = ( - f"{extra_arguments} ++server.server_type={server_type} " - f"++server.base_url={server_address} ++server.model={model} " + f"{extra_arguments} {server_type_arg}++server.base_url={server_address} ++server.model={model} " ) return server_config, server_address, extra_arguments From b8d95f0195d8a7f26ab724c8af07009d64052b09 Mon Sep 17 00:00:00 2001 From: Valentin Mendelev Date: Sat, 20 Dec 2025 05:39:26 -0800 Subject: [PATCH 04/26] Unified server to work with NeMo models not supported by vLLM Signed-off-by: Valentin Mendelev --- nemo_skills/inference/server/serve_unified.py | 397 ++++++++++ recipes/multimodal/server/__init__.py | 38 + .../multimodal/server/backends/__init__.py | 81 ++ recipes/multimodal/server/backends/base.py | 251 ++++++ recipes/multimodal/server/session_manager.py | 249 ++++++ recipes/multimodal/server/unified_server.py | 745 ++++++++++++++++++ 6 files changed, 1761 insertions(+) create mode 100644 nemo_skills/inference/server/serve_unified.py create mode 100644 recipes/multimodal/server/__init__.py create mode 100644 recipes/multimodal/server/backends/__init__.py create mode 100644 recipes/multimodal/server/backends/base.py create mode 100644 recipes/multimodal/server/session_manager.py create mode 100644 recipes/multimodal/server/unified_server.py diff --git a/nemo_skills/inference/server/serve_unified.py b/nemo_skills/inference/server/serve_unified.py new file mode 100644 index 0000000000..748b7cf044 --- /dev/null +++ b/nemo_skills/inference/server/serve_unified.py @@ -0,0 +1,397 @@ +#!/usr/bin/env python3 +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +CLI wrapper for the Unified NeMo Inference Server. + +This module provides a command-line interface compatible with nemo-skills +server deployment patterns. 
It translates standard vllm-style CLI arguments +to the unified server configuration. + +Usage via NeMo-Skills: + + # SALM backend (speech-augmented language model) + ns eval \\ + --server_type vllm \\ + --server_gpus 1 \\ + --model /path/to/model \\ + --server_entrypoint "-m nemo_skills.inference.server.serve_unified" \\ + --server_args "--backend salm" + + # TTS backend (text-to-speech) + ns eval \\ + --server_type vllm \\ + --server_gpus 1 \\ + --model /path/to/tts_model \\ + --server_entrypoint "-m nemo_skills.inference.server.serve_unified" \\ + --server_args "--backend tts --codec_model /path/to/codec" + + # S2S backend (speech-to-speech) + ns eval \\ + --server_type vllm \\ + --server_gpus 1 \\ + --model /path/to/s2s_model \\ + --server_entrypoint "-m nemo_skills.inference.server.serve_unified" \\ + --server_args "--backend s2s" + +Environment Variables: + UNIFIED_SERVER_HOST: Server host (default: 0.0.0.0) + UNIFIED_SERVER_PORT: Server port (default: 8000) + UNIFIED_SERVER_BACKEND: Backend type (default: salm) + UNIFIED_SERVER_MODEL_PATH: Path to model + UNIFIED_SERVER_CODEC_MODEL_PATH: Path to codec model + UNIFIED_SERVER_BATCH_SIZE: Batch size (default: 8) + UNIFIED_SERVER_BATCH_TIMEOUT: Batch timeout (default: 0.1) + DEBUG: Enable debug mode +""" + +import argparse +import inspect +import os +import shutil +import sys +from typing import Optional + + +def setup_pythonpath(code_path: Optional[str] = None): + """Set up PYTHONPATH for NeMo and the unified server. + + Args: + code_path: Single path or colon-separated paths to add to PYTHONPATH + """ + paths_to_add = [] + + # Add explicit code path(s) if provided (supports colon-separated paths) + if code_path: + for path in code_path.split(":"): + if path and path not in paths_to_add: + paths_to_add.append(path) + + # Add recipes path for unified server imports + # Look for the recipes directory relative to this file + this_dir = os.path.dirname(os.path.abspath(__file__)) + + # Try to find ns_eval root (go up from nemo_skills/inference/server/) + ns_eval_root = os.path.dirname(os.path.dirname(os.path.dirname(this_dir))) + if os.path.exists(os.path.join(ns_eval_root, "recipes")): + paths_to_add.append(ns_eval_root) + + # Also check /nemo_run/code pattern used in containers + if os.path.exists("/nemo_run/code"): + paths_to_add.append("/nemo_run/code") + + # Update PYTHONPATH + current_path = os.environ.get("PYTHONPATH", "") + for path in paths_to_add: + if path not in current_path.split(":"): + current_path = f"{path}:{current_path}" if current_path else path + + os.environ["PYTHONPATH"] = current_path + + # Also add to sys.path for immediate imports + for path in paths_to_add: + if path not in sys.path: + sys.path.insert(0, path) + + +def apply_safetensors_patch(hack_path: Optional[str]): + """Apply safetensors patch if provided (for some NeMo models).""" + if not hack_path or not os.path.exists(hack_path): + return + + try: + import safetensors.torch as st_torch + + dest_path = inspect.getfile(st_torch) + os.makedirs(os.path.dirname(dest_path), exist_ok=True) + shutil.copyfile(hack_path, dest_path) + print(f"[serve_unified] Applied safetensors patch: {hack_path} -> {dest_path}") + except Exception as e: + print(f"[serve_unified] Warning: Failed to apply safetensors patch: {e}") + + +def main(): + parser = argparse.ArgumentParser( + description="Unified NeMo Inference Server CLI wrapper", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=__doc__, + ) + + # Standard vllm-style arguments (for nemo-skills compatibility) + 
parser.add_argument("--model", required=True, help="Path to the model") + parser.add_argument("--num_gpus", type=int, default=1, help="Number of GPUs to use") + parser.add_argument("--port", type=int, default=8000, help="Server port") + + # Backend selection + parser.add_argument( + "--backend", + default="salm", + choices=["salm", "tts", "s2s", "s2s_incremental", "s2s_session"], + help="Backend type: salm (speech-augmented LM), tts (text-to-speech), s2s (speech-to-speech offline), s2s_incremental (frame-by-frame processing), s2s_session (session-aware multi-turn)", + ) + + # Backend-specific model paths + parser.add_argument("--codec_model", default=None, help="Path to codec model (required for TTS, optional for S2S)") + + # Server configuration + parser.add_argument("--host", default="0.0.0.0", help="Server host") + parser.add_argument("--batch_size", type=int, default=8, help="Maximum batch size") + parser.add_argument( + "--batch_timeout", type=float, default=0.1, help="Batch timeout in seconds (0 for no batching delay)" + ) + + # Generation defaults + parser.add_argument("--max_new_tokens", type=int, default=512, help="Max tokens to generate") + parser.add_argument("--temperature", type=float, default=1.0, help="Generation temperature") + parser.add_argument("--top_p", type=float, default=1.0, help="Top-p sampling") + + # Model configuration + parser.add_argument("--device", default="cuda", help="Device to use") + parser.add_argument("--dtype", default="bfloat16", help="Model dtype") + + # Backend-specific options + parser.add_argument("--prompt_format", default=None, help="Prompt format (SALM backend)") + parser.add_argument( + "--phoneme_input_type", default="predicted", help="Phoneme input type: predicted or gt (TTS backend)" + ) + parser.add_argument( + "--decoder_only_model", action="store_true", help="Use decoder-only model architecture (TTS backend)" + ) + parser.add_argument("--use_local_transformer", action="store_true", help="Use local transformer (TTS backend)") + parser.add_argument("--top_k", type=int, default=None, help="Top-k sampling (TTS backend)") + + # Environment setup + parser.add_argument("--code_path", default=None, help="Path to NeMo source code to add to PYTHONPATH") + parser.add_argument("--hack_path", default=None, help="Path to safetensors/torch.py patch file") + + # S2S backend options + parser.add_argument( + "--ignore_system_prompt", + action="store_true", + help="Ignore system prompts from requests (for models that don't support them)", + ) + parser.add_argument( + "--silence_padding_sec", + type=float, + default=5.0, + help="Seconds of silence to append after audio (S2S backends)", + ) + + # S2S Incremental backend options + parser.add_argument( + "--config_path", + default=None, + help="Path to YAML config file (s2s_incremental backend)", + ) + parser.add_argument( + "--llm_checkpoint_path", + default=None, + help="Path to LLM checkpoint (s2s_incremental backend)", + ) + parser.add_argument( + "--tts_checkpoint_path", + default=None, + help="Path to TTS checkpoint (s2s_incremental backend)", + ) + parser.add_argument( + "--speaker_reference", + default=None, + help="Path to speaker reference audio for TTS (s2s_incremental backend)", + ) + parser.add_argument( + "--num_frames_per_inference", + type=int, + default=1, + help="Frames per inference step (s2s_incremental backend)", + ) + parser.add_argument( + "--no_decode_audio", + action="store_true", + help="Disable audio output (s2s_incremental backend)", + ) + + # Session management options 
(s2s_session backend) + parser.add_argument( + "--session_ttl", + type=float, + default=300.0, + help="Session time-to-live in seconds (s2s_session backend)", + ) + parser.add_argument( + "--max_sessions", + type=int, + default=100, + help="Maximum number of concurrent sessions (s2s_session backend)", + ) + parser.add_argument( + "--session_artifacts_dir", + type=str, + default=None, + help="Directory to save session artifacts (input/output audio, JSON). Default: /tmp/s2s_sessions", + ) + parser.add_argument( + "--no_save_session_artifacts", + action="store_true", + help="Disable saving session artifacts to disk", + ) + parser.add_argument( + "--output_frame_alignment", + action="store_true", + help="Include per-frame alignment data in debug output (user/agent/ASR per frame)", + ) + + # Debug + parser.add_argument("--debug", action="store_true", help="Enable debug mode") + + # Parse known args, allowing extra args to be passed through + args, extra_args = parser.parse_known_args() + + # Setup environment + setup_pythonpath(args.code_path) + apply_safetensors_patch(args.hack_path) + + # Set environment variables + os.environ["UNIFIED_SERVER_HOST"] = args.host + os.environ["UNIFIED_SERVER_PORT"] = str(args.port) + os.environ["UNIFIED_SERVER_BACKEND"] = args.backend + os.environ["UNIFIED_SERVER_MODEL_PATH"] = args.model + os.environ["UNIFIED_SERVER_BATCH_SIZE"] = str(args.batch_size) + os.environ["UNIFIED_SERVER_BATCH_TIMEOUT"] = str(args.batch_timeout) + os.environ["UNIFIED_SERVER_MAX_NEW_TOKENS"] = str(args.max_new_tokens) + os.environ["UNIFIED_SERVER_TEMPERATURE"] = str(args.temperature) + os.environ["UNIFIED_SERVER_TOP_P"] = str(args.top_p) + + if args.codec_model: + os.environ["UNIFIED_SERVER_CODEC_MODEL_PATH"] = args.codec_model + + if args.debug: + os.environ["DEBUG"] = "1" + + # Set CUDA devices + os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(str(i) for i in range(args.num_gpus)) + + # Build extra config for backend-specific options + extra_config = {} + + if args.prompt_format: + extra_config["prompt_format"] = args.prompt_format + + if args.backend == "tts": + extra_config["decoder_only_model"] = args.decoder_only_model + extra_config["phoneme_input_type"] = args.phoneme_input_type + extra_config["use_local_transformer"] = args.use_local_transformer + if args.top_k: + extra_config["top_k"] = args.top_k + + # S2S backend options + if args.backend in ("s2s", "s2s_incremental", "s2s_session"): + extra_config["ignore_system_prompt"] = args.ignore_system_prompt + if args.silence_padding_sec != 5.0: + extra_config["silence_padding_sec"] = args.silence_padding_sec + + # S2S Incremental/Session backend options (shared config) + if args.backend in ("s2s_incremental", "s2s_session"): + if args.config_path: + extra_config["config_path"] = args.config_path + if args.llm_checkpoint_path: + extra_config["llm_checkpoint_path"] = args.llm_checkpoint_path + if args.tts_checkpoint_path: + extra_config["tts_checkpoint_path"] = args.tts_checkpoint_path + if args.speaker_reference: + extra_config["speaker_reference"] = args.speaker_reference + if args.num_frames_per_inference != 1: + extra_config["num_frames_per_inference"] = args.num_frames_per_inference + if args.no_decode_audio: + extra_config["decode_audio"] = False + # Artifacts and alignment (available for both backends) + if args.session_artifacts_dir: + extra_config["session_artifacts_dir"] = args.session_artifacts_dir + extra_config["save_session_artifacts"] = not args.no_save_session_artifacts + extra_config["output_frame_alignment"] = 
args.output_frame_alignment + + # S2S Session backend options + if args.backend == "s2s_session": + extra_config["session_ttl"] = args.session_ttl + extra_config["max_sessions"] = args.max_sessions + + # Print configuration + print("=" * 60) + print("[serve_unified] Starting Unified NeMo Inference Server") + print("=" * 60) + print(f" Backend: {args.backend}") + print(f" Model: {args.model}") + if args.codec_model: + print(f" Codec Model: {args.codec_model}") + print(f" Port: {args.port}") + print(f" GPUs: {args.num_gpus}") + print(f" Batch Size: {args.batch_size}") + print(f" Batch Timeout: {args.batch_timeout}s") + print(f" Device: {args.device}") + print(f" Dtype: {args.dtype}") + if args.backend in ("s2s_incremental", "s2s_session"): + if args.config_path: + print(f" Config Path: {args.config_path}") + if args.llm_checkpoint_path: + print(f" LLM Checkpoint: {args.llm_checkpoint_path}") + if args.speaker_reference: + print(f" Speaker Reference: {args.speaker_reference}") + print(f" Frames per Inference: {args.num_frames_per_inference}") + print(f" Decode Audio: {not args.no_decode_audio}") + print(f" Save Artifacts: {not args.no_save_session_artifacts}") + if args.session_artifacts_dir: + print(f" Artifacts Dir: {args.session_artifacts_dir}") + else: + print(" Artifacts Dir: /tmp/s2s_sessions (default)") + print(f" Output Frame Alignment: {args.output_frame_alignment}") + if args.backend == "s2s_session": + print(f" Session TTL: {args.session_ttl}s") + print(f" Max Sessions: {args.max_sessions}") + if extra_config: + print(f" Extra Config: {extra_config}") + print("=" * 60) + + # Import and run the unified server + try: + import uvicorn + + from recipes.multimodal.server.unified_server import create_app + + app = create_app( + backend_type=args.backend, + model_path=args.model, + codec_model_path=args.codec_model or "", + batch_size=args.batch_size, + batch_timeout=args.batch_timeout, + device=args.device, + dtype=args.dtype, + extra_config=extra_config if extra_config else None, + ) + + uvicorn.run(app, host=args.host, port=args.port, log_level="info") + + except ImportError as e: + print(f"[serve_unified] Error: Failed to import unified server: {e}") + print("[serve_unified] Make sure the recipes.multimodal.server package is in PYTHONPATH") + sys.exit(1) + except Exception as e: + print(f"[serve_unified] Error: {e}") + import traceback + + traceback.print_exc() + sys.exit(1) + + +if __name__ == "__main__": + main() diff --git a/recipes/multimodal/server/__init__.py b/recipes/multimodal/server/__init__.py new file mode 100644 index 0000000000..89a349346e --- /dev/null +++ b/recipes/multimodal/server/__init__.py @@ -0,0 +1,38 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Unified NeMo Inference Server package. 
+ +Provides a pluggable FastAPI server that supports multiple NeMo model backends: +- SALM: Speech-Augmented Language Model (text output from text/audio input) +- TTS: Text-to-Speech (audio output from text input) +- S2S: Speech-to-Speech (text+audio output from audio input) +""" + +from .backends import ( + BackendConfig, + GenerationRequest, + GenerationResult, + InferenceBackend, + get_backend, +) + +__all__ = [ + "InferenceBackend", + "GenerationRequest", + "GenerationResult", + "BackendConfig", + "get_backend", +] diff --git a/recipes/multimodal/server/backends/__init__.py b/recipes/multimodal/server/backends/__init__.py new file mode 100644 index 0000000000..861b330f00 --- /dev/null +++ b/recipes/multimodal/server/backends/__init__.py @@ -0,0 +1,81 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Backend implementations for the Unified NeMo Inference Server. + +Available backends: +- salm: Speech-Augmented Language Model (text output from text/audio input) +- tts: Text-to-Speech using MagpieTTS (audio output from text input) +- s2s: Speech-to-Speech using DuplexS2S offline (text output from audio input) +- s2s_incremental: Speech-to-Speech using NemotronVoiceChat incremental (text+audio from audio) +- s2s_session: Speech-to-Speech with session support for multi-turn conversations +""" + +from .base import BackendConfig, GenerationRequest, GenerationResult, InferenceBackend, Modality + +__all__ = [ + "InferenceBackend", + "GenerationRequest", + "GenerationResult", + "BackendConfig", + "Modality", + "get_backend", + "list_backends", +] + +# Registry of available backends +BACKEND_REGISTRY = { + "salm": ("salm_backend", "SALMBackend"), + "tts": ("tts_backend", "TTSBackend"), + "s2s": ("s2s_backend", "S2SBackend"), + "s2s_incremental": ("s2s_incremental_backend", "S2SIncrementalBackend"), + "s2s_session": ("s2s_session_backend", "S2SSessionBackend"), +} + + +def list_backends() -> list: + """Return list of available backend names.""" + return list(BACKEND_REGISTRY.keys()) + + +def get_backend(backend_name: str) -> type: + """ + Get backend class by name with lazy loading. + + Args: + backend_name: One of 'salm', 'tts', 's2s' + + Returns: + Backend class (not instance) + + Raises: + ValueError: If backend name is unknown + ImportError: If backend dependencies are not available + """ + if backend_name not in BACKEND_REGISTRY: + available = ", ".join(BACKEND_REGISTRY.keys()) + raise ValueError(f"Unknown backend: '{backend_name}'. Available backends: {available}") + + module_name, class_name = BACKEND_REGISTRY[backend_name] + + import importlib + + try: + module = importlib.import_module(f".{module_name}", package=__name__) + return getattr(module, class_name) + except ImportError as e: + raise ImportError( + f"Failed to import backend '{backend_name}'. Make sure required dependencies are installed. 
Error: {e}" + ) from e diff --git a/recipes/multimodal/server/backends/base.py b/recipes/multimodal/server/backends/base.py new file mode 100644 index 0000000000..e1d62c9765 --- /dev/null +++ b/recipes/multimodal/server/backends/base.py @@ -0,0 +1,251 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Abstract base class for inference backends. + +All model backends (SALM, TTS, S2S, etc.) must implement this interface +to be usable with the unified inference server. +""" + +from abc import ABC, abstractmethod +from dataclasses import dataclass, field +from enum import Enum +from typing import Any, Dict, List, Optional, Set + + +class Modality(str, Enum): + """Supported input/output modalities.""" + + TEXT = "text" + AUDIO_IN = "audio_in" + AUDIO_OUT = "audio_out" + + +@dataclass +class BackendConfig: + """Base configuration for all backends.""" + + model_path: str + device: str = "cuda" + dtype: str = "bfloat16" + + # Generation defaults + max_new_tokens: int = 512 + temperature: float = 1.0 + top_p: float = 1.0 + top_k: Optional[int] = None + + # Additional model-specific configs passed through + extra_config: Dict[str, Any] = field(default_factory=dict) + + @classmethod + def from_dict(cls, d: Dict[str, Any]) -> "BackendConfig": + """Create config from dictionary, extracting known fields.""" + known_fields = {f.name for f in cls.__dataclass_fields__.values()} + known = {k: v for k, v in d.items() if k in known_fields and k != "extra_config"} + extra = {k: v for k, v in d.items() if k not in known_fields} + return cls(**known, extra_config=extra) + + +@dataclass +class GenerationRequest: + """ + A single generation request. + + Supports text and/or audio inputs depending on the backend's capabilities. + """ + + # Text inputs + text: Optional[str] = None + system_prompt: Optional[str] = None + user_prompt: Optional[str] = None + + # Audio input (raw bytes or file path) + audio_bytes: Optional[bytes] = None + audio_path: Optional[str] = None + sample_rate: int = 16000 + + # Multi-turn audio inputs (list of audio bytes or paths) + audio_bytes_list: Optional[List[bytes]] = None + audio_paths: Optional[List[str]] = None + + # Generation parameters (override backend defaults) + max_new_tokens: Optional[int] = None + temperature: Optional[float] = None + top_p: Optional[float] = None + top_k: Optional[int] = None + seed: Optional[int] = None + + # Additional parameters + extra_params: Dict[str, Any] = field(default_factory=dict) + + # Request tracking + request_id: Optional[str] = None + + +@dataclass +class GenerationResult: + """ + Result from a generation request. + + Contains text output and optionally audio output, plus metadata. 
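+
+    Backends record failures in the error field instead of raising, so every
+    request still yields a result object; is_success() simply checks that no
+    error was set.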
+ """ + + # Text output + text: str = "" + + # Audio output (raw bytes, can be encoded to base64 for JSON) + audio_bytes: Optional[bytes] = None + audio_sample_rate: int = 16000 + audio_format: str = "wav" + + # Metadata + request_id: Optional[str] = None + num_tokens_generated: int = 0 + generation_time_ms: float = 0.0 + + # Debug info (optional, backend-specific) + debug_info: Optional[Dict[str, Any]] = None + + # Error handling + error: Optional[str] = None + + def is_success(self) -> bool: + return self.error is None + + +class InferenceBackend(ABC): + """ + Abstract base class for inference backends. + + Implementations must provide: + - load_model(): Initialize the model from config + - generate(): Run inference on a batch of requests + - supported_modalities: What input/output types are supported + + The unified server uses this interface to handle any backend uniformly. + """ + + def __init__(self, config: BackendConfig): + """ + Initialize the backend with configuration. + + Args: + config: Backend configuration including model path and generation defaults + """ + self.config = config + self._model = None + self._is_loaded = False + + @property + @abstractmethod + def name(self) -> str: + """Return the backend name (e.g., 'salm', 'tts', 's2s').""" + pass + + @property + @abstractmethod + def supported_modalities(self) -> Set[Modality]: + """ + Return the set of supported modalities. + + Examples: + - SALM: {TEXT, AUDIO_IN} - text output from text/audio input + - TTS: {TEXT, AUDIO_OUT} - audio output from text input + - S2S: {TEXT, AUDIO_IN, AUDIO_OUT} - audio+text output from audio input + """ + pass + + @abstractmethod + def load_model(self) -> None: + """ + Load and initialize the model. + + Should set self._model and self._is_loaded = True on success. + Called once during server startup. + + Raises: + RuntimeError: If model loading fails + """ + pass + + @abstractmethod + def generate(self, requests: List[GenerationRequest]) -> List[GenerationResult]: + """ + Run inference on a batch of requests. + + Args: + requests: List of generation requests to process + + Returns: + List of generation results, one per request (same order) + + Note: + - Implementations should handle batching internally + - Each result should have request_id matching the input + - On error, set result.error instead of raising + """ + pass + + @property + def is_loaded(self) -> bool: + """Check if the model is loaded and ready.""" + return self._is_loaded + + def health_check(self) -> Dict[str, Any]: + """ + Return health status information. + + Override to add backend-specific health info. + """ + return { + "backend": self.name, + "model_loaded": self._is_loaded, + "model_path": self.config.model_path, + "device": self.config.device, + "modalities": [m.value for m in self.supported_modalities], + } + + def get_generation_params(self, request: GenerationRequest) -> Dict[str, Any]: + """ + Get effective generation parameters, merging request with config defaults. + """ + return { + "max_new_tokens": request.max_new_tokens or self.config.max_new_tokens, + "temperature": request.temperature or self.config.temperature, + "top_p": request.top_p or self.config.top_p, + "top_k": request.top_k or self.config.top_k, + } + + def validate_request(self, request: GenerationRequest) -> Optional[str]: + """ + Validate a request against supported modalities. 
+ + Returns: + Error message if invalid, None if valid + """ + modalities = self.supported_modalities + + has_text_input = request.text is not None + has_audio_input = request.audio_bytes is not None or request.audio_path is not None + + # Check input modalities + if has_audio_input and Modality.AUDIO_IN not in modalities: + return f"Backend '{self.name}' does not support audio input" + + if not has_text_input and not has_audio_input: + return "Request must have either text or audio input" + + return None diff --git a/recipes/multimodal/server/session_manager.py b/recipes/multimodal/server/session_manager.py new file mode 100644 index 0000000000..113735b8ba --- /dev/null +++ b/recipes/multimodal/server/session_manager.py @@ -0,0 +1,249 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Session manager for S2S session backend. + +Manages session state (LLM KV cache, frame index, etc.) across HTTP requests +to enable multi-turn conversations. +""" + +import threading +import time +import uuid +from dataclasses import dataclass, field +from typing import Any, Dict, List, Optional + +import torch + + +@dataclass +class TurnData: + """Data for a single turn in a conversation.""" + + turn_idx: int + user_audio_bytes: Optional[bytes] = None # Input audio from user + agent_audio_bytes: Optional[bytes] = None # Output audio from agent + agent_text: str = "" # Text response for this turn + user_duration_sec: float = 0.0 # Duration of user audio + agent_duration_sec: float = 0.0 # Duration of agent audio + + +@dataclass +class SessionState: + """State that persists between turns in a session.""" + + session_id: str + + # LLM state + llm_cache: Any = None # DynamicCache (for non-Mamba models) + input_embeds_history: Any = None # List of embeddings (for Mamba models) + frame_idx: int = 0 + + # Token history (for turn-taking logic) + gen_text: Optional[torch.Tensor] = None + gen_asr_text: Optional[torch.Tensor] = None + + # Audio buffer state + audio_buffer: Optional[torch.Tensor] = None + buffer_fill_level: int = 0 + + # Turn tracking + turn_count: int = 0 + + # Per-turn data for session audio generation + turns: List[TurnData] = field(default_factory=list) + + # Timestamps + created_at: float = field(default_factory=time.time) + last_accessed: float = field(default_factory=time.time) + + def touch(self): + """Update last_accessed timestamp.""" + self.last_accessed = time.time() + + +class SessionManager: + """ + Manages session state for S2S multi-turn conversations. + + Thread-safe implementation with TTL-based cleanup. + """ + + def __init__(self, ttl_seconds: float = 300.0, max_sessions: int = 100): + """ + Initialize SessionManager. 
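+
+        Typical flow (a sketch of how the unified server drives the manager;
+        session_id may be None for a fresh conversation):
+
+            manager = SessionManager(ttl_seconds=300.0, max_sessions=100)
+            state = manager.get_or_create_session(session_id)
+            # ... run generation against `state` ...
+            manager.save_session(state.session_id, state)
+            manager.delete_session(state.session_id)  # once the conversation ends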
+ + Args: + ttl_seconds: Time-to-live for sessions in seconds (default: 5 minutes) + max_sessions: Maximum number of concurrent sessions + """ + self.ttl_seconds = ttl_seconds + self.max_sessions = max_sessions + self.sessions: Dict[str, SessionState] = {} + self._lock = threading.RLock() + + def create_session(self, session_id: Optional[str] = None) -> SessionState: + """ + Create a new session. + + Args: + session_id: Optional session ID. If None, generates a UUID. + + Returns: + New SessionState object + """ + with self._lock: + if session_id is None: + session_id = str(uuid.uuid4()) + + # Clean up expired sessions first + self._cleanup_expired_locked() + + # Evict oldest if at capacity + if len(self.sessions) >= self.max_sessions: + self._evict_oldest_locked() + + state = SessionState(session_id=session_id) + self.sessions[session_id] = state + print(f"[SessionManager] Created session: {session_id}") + return state + + def get_session(self, session_id: str) -> Optional[SessionState]: + """ + Get existing session by ID. + + Args: + session_id: Session ID to look up + + Returns: + SessionState if found and not expired, None otherwise + """ + with self._lock: + state = self.sessions.get(session_id) + if state is None: + return None + + # Check if expired + if time.time() - state.last_accessed > self.ttl_seconds: + print(f"[SessionManager] Session expired: {session_id}") + del self.sessions[session_id] + return None + + state.touch() + return state + + def get_or_create_session(self, session_id: Optional[str] = None) -> SessionState: + """ + Get existing session or create new one. + + Args: + session_id: Session ID. If None, creates new session. + + Returns: + SessionState (existing or new) + """ + if session_id: + state = self.get_session(session_id) + if state is not None: + return state + + return self.create_session(session_id) + + def save_session(self, session_id: str, state: SessionState): + """ + Save/update session state. + + Args: + session_id: Session ID + state: SessionState to save + """ + with self._lock: + state.touch() + self.sessions[session_id] = state + + def delete_session(self, session_id: str) -> bool: + """ + Delete a session. + + Args: + session_id: Session ID to delete + + Returns: + True if session was deleted, False if not found + """ + with self._lock: + if session_id in self.sessions: + del self.sessions[session_id] + print(f"[SessionManager] Deleted session: {session_id}") + return True + return False + + def get_session_info(self, session_id: str) -> Optional[Dict[str, Any]]: + """ + Get session info without full state. 
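+
+        Returns only JSON-serializable metadata (no tensors such as the LLM
+        cache), so it is safe to expose via the /v1/sessions/{session_id}
+        endpoint.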
+ + Args: + session_id: Session ID + + Returns: + Dict with session metadata or None + """ + with self._lock: + state = self.sessions.get(session_id) + if state is None: + return None + + return { + "session_id": state.session_id, + "frame_idx": state.frame_idx, + "turn_count": state.turn_count, + "created_at": state.created_at, + "last_accessed": state.last_accessed, + "has_llm_cache": state.llm_cache is not None, + "has_input_embeds_history": state.input_embeds_history is not None + and len(state.input_embeds_history) > 0, + } + + def list_sessions(self) -> list: + """List all active session IDs.""" + with self._lock: + return list(self.sessions.keys()) + + def cleanup_expired(self): + """Clean up expired sessions (called periodically).""" + with self._lock: + self._cleanup_expired_locked() + + def _cleanup_expired_locked(self): + """Clean up expired sessions (must hold lock).""" + now = time.time() + expired = [sid for sid, state in self.sessions.items() if now - state.last_accessed > self.ttl_seconds] + for sid in expired: + print(f"[SessionManager] Cleaning up expired session: {sid}") + del self.sessions[sid] + + def _evict_oldest_locked(self): + """Evict oldest session to make room (must hold lock).""" + if not self.sessions: + return + + oldest_id = min(self.sessions.keys(), key=lambda sid: self.sessions[sid].last_accessed) + print(f"[SessionManager] Evicting oldest session: {oldest_id}") + del self.sessions[oldest_id] + + def __len__(self) -> int: + """Return number of active sessions.""" + with self._lock: + return len(self.sessions) diff --git a/recipes/multimodal/server/unified_server.py b/recipes/multimodal/server/unified_server.py new file mode 100644 index 0000000000..2cf989656c --- /dev/null +++ b/recipes/multimodal/server/unified_server.py @@ -0,0 +1,745 @@ +#!/usr/bin/env python3 +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Unified NeMo Inference Server with OpenAI-compatible API. + +Supports multiple NeMo model backends: +- SALM: Speech-Augmented Language Model +- TTS: Text-to-Speech (MagpieTTS) +- S2S: Speech-to-Speech (Duplex) + +Exposes only /v1/chat/completions endpoint for OpenAI compatibility. 
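+(In addition, /health, /v1/models and, for the s2s_session backend, the
+/v1/sessions endpoints are served for monitoring and session management.)
+
+Minimal client sketch (the host/port and audio file name are assumptions;
+the payload shape matches what chat_completions parses):
+
+    import base64, requests
+
+    audio_b64 = base64.b64encode(open("question.wav", "rb").read()).decode()
+    resp = requests.post("http://localhost:8000/v1/chat/completions", json={
+        "messages": [{"role": "user", "content": [
+            {"type": "text", "text": "Answer the question in the audio."},
+            {"type": "audio_url", "audio_url": {"url": f"data:audio/wav;base64,{audio_b64}"}},
+        ]}],
+        "max_tokens": 256,
+    })
+    print(resp.json()["choices"][0]["message"]["content"])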
+ +Usage: + python unified_server.py --backend s2s --model /path/to/model +""" + +import asyncio +import base64 +import hashlib +import json +import os +import re +import time +from dataclasses import dataclass +from datetime import datetime +from typing import Any, Dict, List, Optional + +import uvicorn +from fastapi import FastAPI, HTTPException +from fastapi.responses import JSONResponse + +from .backends import BackendConfig, GenerationRequest, GenerationResult, get_backend +from .session_manager import SessionManager + +# Configuration from environment +HOST = os.getenv("UNIFIED_SERVER_HOST", "0.0.0.0") +PORT = int(os.getenv("UNIFIED_SERVER_PORT", "8000")) +BACKEND_TYPE = os.getenv("UNIFIED_SERVER_BACKEND", "salm") +MODEL_PATH = os.getenv("UNIFIED_SERVER_MODEL_PATH", "") +CODEC_MODEL_PATH = os.getenv("UNIFIED_SERVER_CODEC_MODEL_PATH", "") + +# Batching configuration +BATCH_SIZE = int(os.getenv("UNIFIED_SERVER_BATCH_SIZE", "8")) +BATCH_TIMEOUT = float(os.getenv("UNIFIED_SERVER_BATCH_TIMEOUT", "0.1")) + +# Generation defaults +MAX_NEW_TOKENS = int(os.getenv("UNIFIED_SERVER_MAX_NEW_TOKENS", "512")) +TEMPERATURE = float(os.getenv("UNIFIED_SERVER_TEMPERATURE", "1.0")) +TOP_P = float(os.getenv("UNIFIED_SERVER_TOP_P", "1.0")) + +# Debug +DEBUG = os.getenv("DEBUG", "").lower() in ("true", "1", "yes", "on") + + +@dataclass +class PendingRequest: + """Container for a pending batched request.""" + + request: GenerationRequest + future: asyncio.Future + timestamp: float + + +class RequestBatcher: + """Manages request batching with configurable delay.""" + + def __init__(self, backend, batch_size: int, batch_timeout: float): + self.backend = backend + self.batch_size = batch_size + self.batch_timeout = batch_timeout + self.pending_requests: List[PendingRequest] = [] + self.lock = asyncio.Lock() + self.timeout_task: Optional[asyncio.Task] = None + self.processing = False + + # Stats + self.total_requests = 0 + self.total_batches = 0 + + async def add_request(self, request: GenerationRequest) -> GenerationResult: + """Add a request and wait for result.""" + future = asyncio.Future() + pending = PendingRequest(request=request, future=future, timestamp=time.time()) + + async with self.lock: + self.pending_requests.append(pending) + + # Check if we should process immediately + if len(self.pending_requests) >= self.batch_size: + if DEBUG: + print(f"[Batcher] Batch full ({self.batch_size}), processing immediately") + asyncio.create_task(self._process_batch()) + elif self.batch_timeout == 0: + # No delay mode + asyncio.create_task(self._process_batch()) + elif self.timeout_task is None or self.timeout_task.done(): + # Schedule timeout + self.timeout_task = asyncio.create_task(self._timeout_handler()) + + return await future + + async def _timeout_handler(self): + """Handle batch timeout.""" + await asyncio.sleep(self.batch_timeout) + async with self.lock: + if self.pending_requests and not self.processing: + if DEBUG: + print(f"[Batcher] Timeout, processing {len(self.pending_requests)} requests") + asyncio.create_task(self._process_batch()) + + async def _process_batch(self): + """Process pending requests as a batch.""" + async with self.lock: + if not self.pending_requests or self.processing: + return + + self.processing = True + batch = self.pending_requests[: self.batch_size] + self.pending_requests = self.pending_requests[self.batch_size :] + + try: + # Extract requests + requests = [p.request for p in batch] + + if DEBUG: + print(f"[Batcher] Processing batch of {len(requests)} requests") + + # Run 
inference in thread pool to not block event loop + loop = asyncio.get_event_loop() + results = await loop.run_in_executor(None, self.backend.generate, requests) + + # Complete futures + for pending, result in zip(batch, results): + if not pending.future.done(): + pending.future.set_result(result) + + # Update stats + self.total_requests += len(batch) + self.total_batches += 1 + + except Exception as e: + # Set exception for all pending requests + for pending in batch: + if not pending.future.done(): + pending.future.set_exception(e) + finally: + async with self.lock: + self.processing = False + # Process more if pending + if self.pending_requests: + if self.batch_timeout == 0 or len(self.pending_requests) >= self.batch_size: + asyncio.create_task(self._process_batch()) + elif self.timeout_task is None or self.timeout_task.done(): + self.timeout_task = asyncio.create_task(self._timeout_handler()) + + +# Global state +backend_instance = None +request_batcher = None +session_manager = None +server_config = {} + + +def extract_audio_from_messages(messages: List[Dict[str, Any]]) -> List[bytes]: + """Extract all audio bytes from OpenAI-format messages. + + Looks for audio_url in message content with format: + {"type": "audio_url", "audio_url": {"url": "data:audio/wav;base64,..."}} + + Returns a list of audio bytes (one per audio_url found), preserving message order. + """ + audio_list = [] + for message in messages: + content = message.get("content") + if isinstance(content, list): + for item in content: + if isinstance(item, dict) and item.get("type") == "audio_url": + audio_url = item.get("audio_url", {}) + url = audio_url.get("url", "") + # Parse data URL: data:audio/wav;base64, + match = re.match(r"data:audio/\w+;base64,(.+)", url) + if match: + audio_list.append(base64.b64decode(match.group(1))) + return audio_list + + +def extract_text_from_messages(messages: List[Dict[str, Any]]) -> str: + """Extract text content from OpenAI-format messages.""" + texts = [] + for message in messages: + content = message.get("content") + if isinstance(content, str): + if content: + texts.append(content) + elif isinstance(content, list): + for item in content: + if isinstance(item, dict) and item.get("type") == "text": + text = item.get("text", "") + if text: + texts.append(text) + elif isinstance(item, str): + texts.append(item) + return " ".join(texts) + + +def extract_system_prompt(messages: List[Dict[str, Any]]) -> Optional[str]: + """Extract system prompt from messages.""" + for message in messages: + if message.get("role") == "system": + content = message.get("content") + if isinstance(content, str): + return content + elif isinstance(content, list): + texts = [ + item.get("text", "") for item in content if isinstance(item, dict) and item.get("type") == "text" + ] + return " ".join(texts) if texts else None + return None + + +def create_app( + backend_type: str = BACKEND_TYPE, + model_path: str = MODEL_PATH, + codec_model_path: str = CODEC_MODEL_PATH, + batch_size: int = BATCH_SIZE, + batch_timeout: float = BATCH_TIMEOUT, + device: str = "cuda", + dtype: str = "bfloat16", + extra_config: Dict[str, Any] = None, +) -> FastAPI: + """Create and configure the FastAPI app.""" + global backend_instance, request_batcher, session_manager, server_config + + # Extract server-level config from extra_config + ignore_system_prompt = extra_config.pop("ignore_system_prompt", False) if extra_config else False + session_ttl = extra_config.pop("session_ttl", 300.0) if extra_config else 300.0 + max_sessions = 
extra_config.pop("max_sessions", 100) if extra_config else 100 + + app = FastAPI( + title="Unified NeMo Inference Server", + description=f"OpenAI-compatible API for NeMo model inference ({backend_type} backend)", + version="1.0.0", + ) + + # Store config + server_config = { + "backend_type": backend_type, + "model_path": model_path, + "codec_model_path": codec_model_path, + "batch_size": batch_size, + "batch_timeout": batch_timeout, + "device": device, + "dtype": dtype, + "ignore_system_prompt": ignore_system_prompt, + "session_ttl": session_ttl, + "max_sessions": max_sessions, + } + + @app.on_event("startup") + async def startup(): + global backend_instance, request_batcher, session_manager + + # Build backend config + config_dict = { + "model_path": model_path, + "device": device, + "dtype": dtype, + "max_new_tokens": MAX_NEW_TOKENS, + "temperature": TEMPERATURE, + "top_p": TOP_P, + } + + # Add backend-specific config + if codec_model_path: + config_dict["codec_model_path"] = codec_model_path + + if extra_config: + config_dict.update(extra_config) + + config = BackendConfig.from_dict(config_dict) + + # Get and instantiate backend + print(f"[Server] Initializing {backend_type} backend...") + BackendClass = get_backend(backend_type) + backend_instance = BackendClass(config) + + # Load model + backend_instance.load_model() + + # Create batcher + request_batcher = RequestBatcher(backend_instance, batch_size, batch_timeout) + + # Initialize session manager for session-aware backends + if backend_type == "s2s_session": + session_manager = SessionManager(ttl_seconds=session_ttl, max_sessions=max_sessions) + print(f"[Server] Session manager initialized (TTL: {session_ttl}s, max: {max_sessions})") + + print("[Server] Ready!") + print(f" Backend: {backend_type}") + print(f" Model: {model_path}") + print(f" Batch size: {batch_size}") + print(f" Batch timeout: {batch_timeout}s") + if ignore_system_prompt: + print(" System prompts: IGNORED") + + @app.get("/") + async def root(): + """Root endpoint with server info.""" + endpoints = ["/v1/chat/completions", "/health"] + if backend_type == "s2s_session": + endpoints.extend(["/v1/sessions", "/v1/sessions/{session_id}"]) + return { + "service": "Unified NeMo Inference Server", + "version": "1.0.0", + "backend": server_config.get("backend_type"), + "model": server_config.get("model_path"), + "endpoints": endpoints, + } + + # Session management endpoints (only for s2s_session backend) + @app.get("/v1/sessions") + async def list_sessions(): + """List all active sessions.""" + if session_manager is None: + raise HTTPException(status_code=404, detail="Session management not enabled for this backend") + return { + "sessions": session_manager.list_sessions(), + "count": len(session_manager), + "ttl_seconds": session_manager.ttl_seconds, + } + + @app.get("/v1/sessions/{session_id}") + async def get_session(session_id: str): + """Get session info.""" + if session_manager is None: + raise HTTPException(status_code=404, detail="Session management not enabled for this backend") + info = session_manager.get_session_info(session_id) + if info is None: + raise HTTPException(status_code=404, detail=f"Session not found: {session_id}") + return info + + @app.delete("/v1/sessions/{session_id}") + async def delete_session(session_id: str): + """Delete a session and generate final session audio.""" + if session_manager is None: + raise HTTPException(status_code=404, detail="Session management not enabled for this backend") + + # Get session state before deleting + 
session_state = session_manager.get_session(session_id) + if session_state is None: + raise HTTPException(status_code=404, detail=f"Session not found: {session_id}") + + # Call on_session_close to generate session audio + close_result = {} + if backend_instance is not None and hasattr(backend_instance, "on_session_close"): + try: + close_result = backend_instance.on_session_close(session_state) + except Exception as e: + print(f"[Server] Error in on_session_close: {e}") + import traceback + + traceback.print_exc() + + # Now delete the session + session_manager.delete_session(session_id) + + return {"success": True, "session_id": session_id, **close_result} + + @app.get("/health") + async def health(): + """Health check endpoint.""" + if backend_instance is None: + return JSONResponse(status_code=503, content={"status": "not_ready", "error": "Backend not initialized"}) + + health_info = backend_instance.health_check() + health_info["status"] = "healthy" if backend_instance.is_loaded else "not_ready" + health_info["timestamp"] = datetime.now().isoformat() + + return health_info + + @app.get("/v1/models") + async def list_models(): + """OpenAI-compatible models endpoint.""" + model_id = server_config.get("model_path", "unknown") if server_config else "unknown" + return { + "object": "list", + "data": [ + { + "id": model_id, + "object": "model", + "created": int(time.time()), + "owned_by": "nvidia", + } + ], + } + + @app.post("/v1/chat/completions") + async def chat_completions(request: Dict[str, Any]): + """OpenAI-compatible chat completions endpoint with audio support. + + Accepts messages in OpenAI format with audio_url for audio content: + { + "messages": [ + {"role": "system", "content": "You are a helpful assistant."}, + {"role": "user", "content": [ + {"type": "text", "text": "..."}, + {"type": "audio_url", "audio_url": {"url": "data:audio/wav;base64,..."}} + ]} + ], + "max_tokens": 512, + "temperature": 1.0, + "extra_body": {"session_id": "optional-session-id"} + } + """ + if backend_instance is None or not backend_instance.is_loaded: + raise HTTPException(status_code=503, detail="Model not loaded") + + try: + messages = request.get("messages", []) + if not messages: + raise HTTPException(status_code=400, detail="No messages provided") + + # Extract session_id from extra_body (for s2s_session backend) + extra_body = request.get("extra_body", {}) + session_id = extra_body.get("session_id") if isinstance(extra_body, dict) else None + + # Extract components from messages + audio_bytes_list = extract_audio_from_messages(messages) + text = extract_text_from_messages(messages) + system_prompt = extract_system_prompt(messages) + + # Honor ignore_system_prompt setting + if server_config.get("ignore_system_prompt", False): + system_prompt = None + + # Get generation parameters + max_tokens = request.get("max_tokens", MAX_NEW_TOKENS) + temperature = request.get("temperature", TEMPERATURE) + top_p = request.get("top_p", TOP_P) + seed = request.get("seed") + + # Create generation request + # Use audio_bytes_list for multi-turn, or single audio_bytes for backwards compat + gen_request = GenerationRequest( + text=text if text else None, + system_prompt=system_prompt, + audio_bytes=audio_bytes_list[0] if len(audio_bytes_list) == 1 else None, + audio_bytes_list=audio_bytes_list if len(audio_bytes_list) > 1 else None, + max_new_tokens=max_tokens, + temperature=temperature, + top_p=top_p, + seed=seed, + request_id=hashlib.md5(f"{time.time()}".encode()).hexdigest()[:8], + ) + + # Validate request + error 
= backend_instance.validate_request(gen_request) + if error: + raise HTTPException(status_code=400, detail=error) + + # Handle s2s_session backend with session support + if backend_type == "s2s_session" and session_manager is not None: + # Get or create session + session_state = session_manager.get_or_create_session(session_id) + session_id = session_state.session_id + + # Run inference with session in thread pool + loop = asyncio.get_event_loop() + result, updated_session = await loop.run_in_executor( + None, + backend_instance.generate_with_session, + gen_request, + session_state, + ) + + # Save updated session state + if updated_session is not None: + session_manager.save_session(session_id, updated_session) + else: + # Process through batcher (non-session path) + result = await request_batcher.add_request(gen_request) + session_id = None + + if not result.is_success(): + raise HTTPException(status_code=500, detail=result.error) + + # Build OpenAI-compatible response + response_id = f"chatcmpl-{hashlib.md5(str(time.time()).encode()).hexdigest()[:8]}" + + # Build message content + message_content = result.text or "" + + # Save outputs to files before sending response (in case client times out) + import json as json_lib + import os + from datetime import datetime + + save_dir = os.environ.get( + "AUDIO_SAVE_DIR", "/lustre/fsw/portfolios/llmservice/users/vmendelev/tmp/voicebench_test" + ) + os.makedirs(save_dir, exist_ok=True) + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + base_filename = f"response_{timestamp}_{response_id}" + + saved_audio_path = None + saved_json_path = None + + # Save JSON with text and debug info + try: + saved_json_path = os.path.join(save_dir, f"{base_filename}.json") + json_output = { + "response_id": response_id, + "timestamp": timestamp, + "text": message_content, + "debug_info": result.debug_info, + "generation_time_ms": result.generation_time_ms, + "num_tokens_generated": result.num_tokens_generated, + } + with open(saved_json_path, "w") as f: + json_lib.dump(json_output, f, indent=2) + print(f"[Server] JSON saved to: {saved_json_path}") + except Exception as e: + print(f"[Server] Warning: Failed to save JSON: {e}") + + # Include audio output if available (base64 encoded) + audio_output = None + if result.audio_bytes: + # Save audio file + try: + saved_audio_path = os.path.join(save_dir, f"{base_filename}.wav") + with open(saved_audio_path, "wb") as f: + f.write(result.audio_bytes) + print(f"[Server] Audio saved to: {saved_audio_path} ({len(result.audio_bytes)} bytes)") + except Exception as e: + print(f"[Server] Warning: Failed to save audio: {e}") + + audio_output = { + "data": base64.b64encode(result.audio_bytes).decode("utf-8"), + "format": result.audio_format or "wav", + "sample_rate": result.audio_sample_rate, + "expires_at": int(time.time()) + 3600, # 1 hour expiry + "transcript": result.text or "", # Text transcript of the audio + } + + # Embed debug_info in content as JSON (OpenAI-compatible) + final_content = message_content + if result.debug_info: + final_content = f"{message_content}\n{json.dumps(result.debug_info)}" + + response = { + "id": response_id, + "object": "chat.completion", + "created": int(time.time()), + "model": server_config.get("model_path"), + "choices": [ + { + "index": 0, + "message": { + "role": "assistant", + "content": final_content, + }, + "finish_reason": "stop", + } + ], + "usage": { + "prompt_tokens": -1, + "completion_tokens": result.num_tokens_generated or -1, + "total_tokens": -1, + }, + } + + # Add audio to 
response if available + if audio_output: + response["choices"][0]["message"]["audio"] = audio_output + + # Add debug info at top level too (for non-litellm clients) + if result.debug_info: + response["debug_info"] = result.debug_info + + # Add saved file paths if available + if saved_audio_path: + response["saved_audio_path"] = saved_audio_path + if saved_json_path: + response["saved_json_path"] = saved_json_path + + # Add session_id for session-aware backends + if session_id: + response["session_id"] = session_id + + return response + + except HTTPException: + raise + except Exception as e: + import traceback + + traceback.print_exc() + raise HTTPException(status_code=500, detail=str(e)) + + return app + + +def main(): + """Run the server from command line.""" + import argparse + + parser = argparse.ArgumentParser(description="Unified NeMo Inference Server") + parser.add_argument( + "--backend", + default=BACKEND_TYPE, + choices=["salm", "tts", "s2s", "s2s_incremental", "s2s_session"], + help="Backend type to use", + ) + parser.add_argument("--model", default=MODEL_PATH, help="Path to model") + parser.add_argument("--codec_model", default=CODEC_MODEL_PATH, help="Path to codec model (for TTS/S2S)") + parser.add_argument("--host", default=HOST, help="Server host") + parser.add_argument("--port", type=int, default=PORT, help="Server port") + parser.add_argument("--batch_size", type=int, default=BATCH_SIZE, help="Batch size") + parser.add_argument( + "--batch_timeout", type=float, default=BATCH_TIMEOUT, help="Batch timeout in seconds (0 for no delay)" + ) + parser.add_argument("--device", default="cuda", help="Device to use") + parser.add_argument("--dtype", default="bfloat16", help="Model dtype") + parser.add_argument("--debug", action="store_true", help="Enable debug mode") + + # Backend-specific arguments + parser.add_argument("--prompt_format", default=None, help="Prompt format (SALM)") + parser.add_argument("--phoneme_input_type", default="predicted", help="Phoneme input type (TTS)") + parser.add_argument("--decoder_only_model", action="store_true", help="Use decoder-only model (TTS)") + parser.add_argument( + "--ignore_system_prompt", + action="store_true", + help="Ignore system prompts from requests (for models that don't support them)", + ) + parser.add_argument( + "--silence_padding_sec", + type=float, + default=5.0, + help="Seconds of silence to append after audio (S2S backend)", + ) + + # S2S Incremental backend arguments + parser.add_argument( + "--config_path", + type=str, + default=None, + help="Path to YAML config file (s2s_incremental backend)", + ) + parser.add_argument( + "--llm_checkpoint_path", + type=str, + default=None, + help="Path to LLM checkpoint (s2s_incremental backend)", + ) + parser.add_argument( + "--tts_checkpoint_path", + type=str, + default=None, + help="Path to TTS checkpoint (s2s_incremental backend)", + ) + parser.add_argument( + "--speaker_reference", + type=str, + default=None, + help="Path to speaker reference audio for TTS (s2s_incremental backend)", + ) + parser.add_argument( + "--num_frames_per_inference", + type=int, + default=1, + help="Frames per inference step (s2s_incremental backend)", + ) + parser.add_argument( + "--decode_audio", + action="store_true", + default=True, + help="Enable audio output via TTS (s2s_incremental backend)", + ) + parser.add_argument( + "--no_decode_audio", + action="store_true", + help="Disable audio output (s2s_incremental backend)", + ) + + args = parser.parse_args() + + if args.debug: + global DEBUG + DEBUG = True + 
+ # Build extra config from backend-specific args + extra_config = {} + if args.prompt_format: + extra_config["prompt_format"] = args.prompt_format + if args.phoneme_input_type: + extra_config["phoneme_input_type"] = args.phoneme_input_type + if args.decoder_only_model: + extra_config["decoder_only_model"] = True + if args.silence_padding_sec != 5.0: # Only add if different from default + extra_config["silence_padding_sec"] = args.silence_padding_sec + extra_config["ignore_system_prompt"] = args.ignore_system_prompt + + # S2S Incremental backend config + if args.config_path: + extra_config["config_path"] = args.config_path + if args.llm_checkpoint_path: + extra_config["llm_checkpoint_path"] = args.llm_checkpoint_path + if args.tts_checkpoint_path: + extra_config["tts_checkpoint_path"] = args.tts_checkpoint_path + if args.speaker_reference: + extra_config["speaker_reference"] = args.speaker_reference + if args.num_frames_per_inference != 1: + extra_config["num_frames_per_inference"] = args.num_frames_per_inference + if args.no_decode_audio: + extra_config["decode_audio"] = False + + app = create_app( + backend_type=args.backend, + model_path=args.model, + codec_model_path=args.codec_model, + batch_size=args.batch_size, + batch_timeout=args.batch_timeout, + device=args.device, + dtype=args.dtype, + extra_config=extra_config if extra_config else None, + ) + + uvicorn.run(app, host=args.host, port=args.port, log_level="info") + + +if __name__ == "__main__": + main() From 66667b0a7e8de7b68a74ece8afe9f655e7f1e820 Mon Sep 17 00:00:00 2001 From: Valentin Mendelev Date: Fri, 26 Dec 2025 13:42:29 -0800 Subject: [PATCH 05/26] Magpie TTS backend --- nemo_skills/inference/server/serve_unified.py | 23 +- .../multimodal/server/backends/__init__.py | 4 +- .../server/backends/magpie_tts_backend.py | 253 ++++++++++++++++++ recipes/multimodal/server/unified_server.py | 71 ++--- 4 files changed, 308 insertions(+), 43 deletions(-) create mode 100644 recipes/multimodal/server/backends/magpie_tts_backend.py diff --git a/nemo_skills/inference/server/serve_unified.py b/nemo_skills/inference/server/serve_unified.py index 748b7cf044..1b02e652e6 100644 --- a/nemo_skills/inference/server/serve_unified.py +++ b/nemo_skills/inference/server/serve_unified.py @@ -30,13 +30,13 @@ --server_entrypoint "-m nemo_skills.inference.server.serve_unified" \\ --server_args "--backend salm" - # TTS backend (text-to-speech) + # MagpieTTS backend (text-to-speech with RTF metrics) ns eval \\ --server_type vllm \\ --server_gpus 1 \\ --model /path/to/tts_model \\ --server_entrypoint "-m nemo_skills.inference.server.serve_unified" \\ - --server_args "--backend tts --codec_model /path/to/codec" + --server_args "--backend magpie_tts --codec_model /path/to/codec" # S2S backend (speech-to-speech) ns eval \\ @@ -138,8 +138,8 @@ def main(): parser.add_argument( "--backend", default="salm", - choices=["salm", "tts", "s2s", "s2s_incremental", "s2s_session"], - help="Backend type: salm (speech-augmented LM), tts (text-to-speech), s2s (speech-to-speech offline), s2s_incremental (frame-by-frame processing), s2s_session (session-aware multi-turn)", + choices=["salm", "magpie_tts", "s2s", "s2s_incremental", "s2s_session"], + help="Backend type: salm (speech-augmented LM), magpie_tts (MagpieTTS with RTF metrics), s2s (speech-to-speech offline), s2s_incremental (frame-by-frame processing), s2s_session (session-aware multi-turn)", ) # Backend-specific model paths @@ -170,7 +170,9 @@ def main(): "--decoder_only_model", action="store_true", help="Use 
decoder-only model architecture (TTS backend)" ) parser.add_argument("--use_local_transformer", action="store_true", help="Use local transformer (TTS backend)") - parser.add_argument("--top_k", type=int, default=None, help="Top-k sampling (TTS backend)") + parser.add_argument("--top_k", type=int, default=80, help="Top-k sampling (TTS backend)") + parser.add_argument("--use_cfg", action="store_true", help="Enable classifier-free guidance (TTS backend)") + parser.add_argument("--cfg_scale", type=float, default=2.5, help="CFG scale factor (TTS backend)") # Environment setup parser.add_argument("--code_path", default=None, help="Path to NeMo source code to add to PYTHONPATH") @@ -288,12 +290,13 @@ def main(): if args.prompt_format: extra_config["prompt_format"] = args.prompt_format - if args.backend == "tts": + if args.backend == "magpie_tts": extra_config["decoder_only_model"] = args.decoder_only_model extra_config["phoneme_input_type"] = args.phoneme_input_type extra_config["use_local_transformer"] = args.use_local_transformer - if args.top_k: - extra_config["top_k"] = args.top_k + extra_config["top_k"] = args.top_k + extra_config["use_cfg"] = args.use_cfg + extra_config["cfg_scale"] = args.cfg_scale # S2S backend options if args.backend in ("s2s", "s2s_incremental", "s2s_session"): @@ -340,6 +343,10 @@ def main(): print(f" Batch Timeout: {args.batch_timeout}s") print(f" Device: {args.device}") print(f" Dtype: {args.dtype}") + if args.backend == "magpie_tts": + print(f" Top-k: {args.top_k}") + print(f" CFG: {args.use_cfg} (scale: {args.cfg_scale})") + print(f" Local Transformer: {args.use_local_transformer}") if args.backend in ("s2s_incremental", "s2s_session"): if args.config_path: print(f" Config Path: {args.config_path}") diff --git a/recipes/multimodal/server/backends/__init__.py b/recipes/multimodal/server/backends/__init__.py index 861b330f00..fe3c4c1abd 100644 --- a/recipes/multimodal/server/backends/__init__.py +++ b/recipes/multimodal/server/backends/__init__.py @@ -17,7 +17,7 @@ Available backends: - salm: Speech-Augmented Language Model (text output from text/audio input) -- tts: Text-to-Speech using MagpieTTS (audio output from text input) +- magpie_tts: MagpieTTS using MagpieInferenceRunner with RTF metrics (audio output from text input) - s2s: Speech-to-Speech using DuplexS2S offline (text output from audio input) - s2s_incremental: Speech-to-Speech using NemotronVoiceChat incremental (text+audio from audio) - s2s_session: Speech-to-Speech with session support for multi-turn conversations @@ -38,7 +38,7 @@ # Registry of available backends BACKEND_REGISTRY = { "salm": ("salm_backend", "SALMBackend"), - "tts": ("tts_backend", "TTSBackend"), + "magpie_tts": ("magpie_tts_backend", "MagpieTTSBackend"), "s2s": ("s2s_backend", "S2SBackend"), "s2s_incremental": ("s2s_incremental_backend", "S2SIncrementalBackend"), "s2s_session": ("s2s_session_backend", "S2SSessionBackend"), diff --git a/recipes/multimodal/server/backends/magpie_tts_backend.py b/recipes/multimodal/server/backends/magpie_tts_backend.py new file mode 100644 index 0000000000..61b4fd32d6 --- /dev/null +++ b/recipes/multimodal/server/backends/magpie_tts_backend.py @@ -0,0 +1,253 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 + +"""MagpieTTS backend using MagpieInferenceRunner with RTF metrics.""" + +import io +import json +import os +import shutil +import tempfile +import time +from dataclasses import dataclass +from typing import Any, Dict, List, Optional, Set + +import soundfile as sf + +from .base import BackendConfig, GenerationRequest, GenerationResult, InferenceBackend, Modality + + +@dataclass +class MagpieTTSConfig(BackendConfig): + codec_model_path: Optional[str] = None + top_k: int = 80 + temperature: float = 0.6 + use_cfg: bool = True + cfg_scale: float = 2.5 + max_decoder_steps: int = 440 + use_local_transformer: bool = False + output_sample_rate: int = 22050 + + @classmethod + def from_dict(cls, d: Dict[str, Any]) -> "MagpieTTSConfig": + known = { + "model_path", + "device", + "dtype", + "max_new_tokens", + "temperature", + "top_p", + "top_k", + "codec_model_path", + "use_cfg", + "cfg_scale", + "max_decoder_steps", + "use_local_transformer", + "output_sample_rate", + } + return cls( + **{k: v for k, v in d.items() if k in known}, extra_config={k: v for k, v in d.items() if k not in known} + ) + + +class MagpieTTSBackend(InferenceBackend): + """MagpieTTS backend. Input: JSON with 'text' and 'context_audio_filepath'.""" + + @property + def name(self) -> str: + return "magpie_tts" + + @property + def supported_modalities(self) -> Set[Modality]: + return {Modality.TEXT, Modality.AUDIO_OUT} + + def __init__(self, config: BackendConfig): + self.tts_config = ( + config + if isinstance(config, MagpieTTSConfig) + else MagpieTTSConfig.from_dict( + { + **{ + k: getattr(config, k) + for k in ["model_path", "device", "dtype", "max_new_tokens", "temperature", "top_p", "top_k"] + if hasattr(config, k) + }, + **config.extra_config, + } + ) + ) + super().__init__(self.tts_config) + self._model = self._runner = self._temp_dir = self._checkpoint_name = None + + def load_model(self) -> None: + from nemo.collections.tts.modules.magpietts_inference.inference import InferenceConfig, MagpieInferenceRunner + from nemo.collections.tts.modules.magpietts_inference.utils import ModelLoadConfig, load_magpie_model + + if not self.tts_config.codec_model_path: + raise ValueError("codec_model_path required") + + model_path = self.config.model_path + cfg = ModelLoadConfig(nemo_file=model_path, codecmodel_path=self.tts_config.codec_model_path) + self._model, self._checkpoint_name = load_magpie_model(cfg, device=self.config.device) + + self._runner = MagpieInferenceRunner( + self._model, + InferenceConfig( + temperature=self.tts_config.temperature, + topk=self.tts_config.top_k, + max_decoder_steps=self.tts_config.max_decoder_steps, + use_cfg=self.tts_config.use_cfg, + cfg_scale=self.tts_config.cfg_scale, + use_local_transformer=self.tts_config.use_local_transformer, + batch_size=32, + ), + ) + + self._temp_dir = tempfile.mkdtemp(prefix="magpie_tts_") + self.tts_config.output_sample_rate = self._model.sample_rate + self._is_loaded = True + print( + f"[MagpieTTSBackend] Loaded: {self._checkpoint_name}, sr={self._model.sample_rate}, cfg={self.tts_config.use_cfg}" + ) + + def _extract_json(self, text: str) -> dict: + """Extract JSON object from text, skipping non-JSON parts.""" + if not text: + return {"text": ""} + # Find first { and try to parse from there + idx = text.find("{") + if idx >= 0: + try: + return json.loads(text[idx:]) + except json.JSONDecodeError: + pass + return {"text": text} + + def generate(self, requests: List[GenerationRequest]) 
-> List[GenerationResult]: + if not self._is_loaded: + return [GenerationResult(error="Model not loaded", request_id=r.request_id) for r in requests] + if not requests: + return [] + + start_time = time.time() + batch_dir = os.path.join(self._temp_dir, f"batch_{int(time.time() * 1000)}") + output_dir = os.path.join(batch_dir, "output") + os.makedirs(output_dir, exist_ok=True) + + try: + # Parse requests, extracting JSON from text (skips non-JSON prefixes) + parsed = [self._extract_json(r.text) for r in requests] + + # Create audio_dir with symlinks to all context audio files (they may be in different dirs) + audio_dir = os.path.join(batch_dir, "audio") + os.makedirs(audio_dir, exist_ok=True) + + manifest_path = os.path.join(batch_dir, "manifest.json") + with open(manifest_path, "w") as f: + for i, p in enumerate(parsed): + ctx = p.get("context_audio_filepath", "") + if ctx and os.path.exists(ctx): + # Create unique symlink name to avoid collisions + link_name = f"ctx_{i}_{os.path.basename(ctx)}" + link_path = os.path.join(audio_dir, link_name) + if not os.path.exists(link_path): + os.symlink(ctx, link_path) + else: + link_name = f"d{i}.wav" + f.write( + json.dumps( + { + "text": p.get("text", ""), + "audio_filepath": link_name, + "context_audio_filepath": link_name, + "duration": p.get("duration", 5.0), + "context_audio_duration": p.get("context_audio_duration", 5.0), + } + ) + + "\n" + ) + + config_path = os.path.join(batch_dir, "config.json") + with open(config_path, "w") as f: + json.dump({"batch": {"manifest_path": manifest_path, "audio_dir": audio_dir}}, f) + + # Run inference + from nemo.collections.tts.modules.magpietts_inference.evaluate_generated_audio import load_evalset_config + + dataset = self._runner.create_dataset(load_evalset_config(config_path)) + rtf_list, _ = self._runner.run_inference_on_dataset( + dataset, output_dir, save_cross_attention_maps=False, save_context_audio=False + ) + + gen_time = time.time() - start_time + batch_metrics = { + "total_time_sec": gen_time, + "num_samples": len(requests), + **self._runner.compute_mean_rtf_metrics(rtf_list), + } + + # Build results + results = [] + for i, req in enumerate(requests): + path = os.path.join(output_dir, f"predicted_audio_{i}.wav") + if os.path.exists(path): + audio, sr = sf.read(path) + buf = io.BytesIO() + sf.write(buf, audio, sr, format="WAV") + buf.seek(0) + dur = len(audio) / sr + results.append( + GenerationResult( + text=parsed[i].get("text", ""), + audio_bytes=buf.read(), + audio_sample_rate=self.tts_config.output_sample_rate, + audio_format="wav", + request_id=req.request_id, + generation_time_ms=gen_time * 1000 / len(requests), + debug_info={ + "checkpoint": self._checkpoint_name, + "audio_duration_sec": dur, + "rtf": gen_time / len(requests) / dur if dur else 0, + "config": { + "temp": self.tts_config.temperature, + "top_k": self.tts_config.top_k, + "cfg": self.tts_config.use_cfg, + "cfg_scale": self.tts_config.cfg_scale, + }, + "batch_metrics": batch_metrics, + }, + ) + ) + else: + results.append(GenerationResult(error=f"Audio not found: {path}", request_id=req.request_id)) + return results + except Exception as e: + import traceback + + traceback.print_exc() + return [GenerationResult(error=str(e), request_id=r.request_id) for r in requests] + finally: + shutil.rmtree(batch_dir, ignore_errors=True) + + def validate_request(self, request: GenerationRequest) -> Optional[str]: + return "Text required" if not request.text else None + + def health_check(self) -> Dict[str, Any]: + h = super().health_check() 
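+        # Augment the generic health payload with TTS-specific details once the model is loaded.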
+ if self._is_loaded: + h.update( + { + "checkpoint": self._checkpoint_name, + "codec": self.tts_config.codec_model_path, + "cfg": self.tts_config.use_cfg, + "cfg_scale": self.tts_config.cfg_scale, + "sample_rate": self.tts_config.output_sample_rate, + } + ) + return h + + def __del__(self): + if getattr(self, "_temp_dir", None) and os.path.exists(self._temp_dir): + shutil.rmtree(self._temp_dir, ignore_errors=True) diff --git a/recipes/multimodal/server/unified_server.py b/recipes/multimodal/server/unified_server.py index 2cf989656c..096a78cee8 100644 --- a/recipes/multimodal/server/unified_server.py +++ b/recipes/multimodal/server/unified_server.py @@ -508,45 +508,50 @@ async def chat_completions(request: Dict[str, Any]): import os from datetime import datetime - save_dir = os.environ.get( - "AUDIO_SAVE_DIR", "/lustre/fsw/portfolios/llmservice/users/vmendelev/tmp/voicebench_test" - ) - os.makedirs(save_dir, exist_ok=True) - timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") - base_filename = f"response_{timestamp}_{response_id}" - + save_dir = os.environ.get("AUDIO_SAVE_DIR", "") + if save_dir: + try: + os.makedirs(save_dir, exist_ok=True) + except PermissionError: + save_dir = "" # Fall through to skip saving saved_audio_path = None saved_json_path = None - # Save JSON with text and debug info - try: - saved_json_path = os.path.join(save_dir, f"{base_filename}.json") - json_output = { - "response_id": response_id, - "timestamp": timestamp, - "text": message_content, - "debug_info": result.debug_info, - "generation_time_ms": result.generation_time_ms, - "num_tokens_generated": result.num_tokens_generated, - } - with open(saved_json_path, "w") as f: - json_lib.dump(json_output, f, indent=2) - print(f"[Server] JSON saved to: {saved_json_path}") - except Exception as e: - print(f"[Server] Warning: Failed to save JSON: {e}") + # Save outputs if AUDIO_SAVE_DIR is set and writable + if save_dir: + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + base_filename = f"response_{timestamp}_{response_id}" - # Include audio output if available (base64 encoded) - audio_output = None - if result.audio_bytes: - # Save audio file + # Save JSON with text and debug info try: - saved_audio_path = os.path.join(save_dir, f"{base_filename}.wav") - with open(saved_audio_path, "wb") as f: - f.write(result.audio_bytes) - print(f"[Server] Audio saved to: {saved_audio_path} ({len(result.audio_bytes)} bytes)") + saved_json_path = os.path.join(save_dir, f"{base_filename}.json") + json_output = { + "response_id": response_id, + "timestamp": timestamp, + "text": message_content, + "debug_info": result.debug_info, + "generation_time_ms": result.generation_time_ms, + "num_tokens_generated": result.num_tokens_generated, + } + with open(saved_json_path, "w") as f: + json_lib.dump(json_output, f, indent=2) + print(f"[Server] JSON saved to: {saved_json_path}") except Exception as e: - print(f"[Server] Warning: Failed to save audio: {e}") + print(f"[Server] Warning: Failed to save JSON: {e}") + + # Save audio file if available + if result.audio_bytes: + try: + saved_audio_path = os.path.join(save_dir, f"{base_filename}.wav") + with open(saved_audio_path, "wb") as f: + f.write(result.audio_bytes) + print(f"[Server] Audio saved to: {saved_audio_path} ({len(result.audio_bytes)} bytes)") + except Exception as e: + print(f"[Server] Warning: Failed to save audio: {e}") + # Include audio output if available (base64 encoded) + audio_output = None + if result.audio_bytes: audio_output = { "data": 
base64.b64encode(result.audio_bytes).decode("utf-8"), "format": result.audio_format or "wav", @@ -621,7 +626,7 @@ def main(): parser.add_argument( "--backend", default=BACKEND_TYPE, - choices=["salm", "tts", "s2s", "s2s_incremental", "s2s_session"], + choices=["salm", "magpie_tts", "s2s", "s2s_incremental", "s2s_session"], help="Backend type to use", ) parser.add_argument("--model", default=MODEL_PATH, help="Path to model") From d916e10617c0a7c2c9f0686cd9bca1231c0fc12a Mon Sep 17 00:00:00 2001 From: Valentin Mendelev Date: Sat, 27 Dec 2025 09:07:04 -0800 Subject: [PATCH 06/26] nv_tts eval scripts --- .../dataset/nv_tts/scripts/__init__.py | 15 ++ nemo_skills/dataset/nv_tts/scripts/score.py | 168 ++++++++++++++++++ 2 files changed, 183 insertions(+) create mode 100644 nemo_skills/dataset/nv_tts/scripts/__init__.py create mode 100644 nemo_skills/dataset/nv_tts/scripts/score.py diff --git a/nemo_skills/dataset/nv_tts/scripts/__init__.py b/nemo_skills/dataset/nv_tts/scripts/__init__.py new file mode 100644 index 0000000000..9b5c777b89 --- /dev/null +++ b/nemo_skills/dataset/nv_tts/scripts/__init__.py @@ -0,0 +1,15 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""NV TTS evaluation scripts.""" diff --git a/nemo_skills/dataset/nv_tts/scripts/score.py b/nemo_skills/dataset/nv_tts/scripts/score.py new file mode 100644 index 0000000000..3d98adee42 --- /dev/null +++ b/nemo_skills/dataset/nv_tts/scripts/score.py @@ -0,0 +1,168 @@ +#!/usr/bin/env python3 +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Scoring and aggregation functions for TTS evaluation.""" + +import argparse +import json +import os +import tempfile + +from nemo.collections.tts.modules.magpietts_inference.evaluate_generated_audio import evaluate + + +def run_scoring( + results_dir: str, + sv_model: str = "titanet", + asr_model_name: str = "nvidia/parakeet-tdt-1.1b", + language: str = "en", + with_utmosv2: bool = False, +) -> None: + """Run NeMo scoring on all benchmarks in results_dir.""" + benchmarks_dir = os.path.join(results_dir, "eval-results") + if not os.path.exists(benchmarks_dir): + benchmarks_dir = results_dir + + scoring_cfg = { + "sv_model": sv_model, + "asr_model_name": asr_model_name, + "language": language, + "with_utmosv2": with_utmosv2, + } + + for benchmark in os.listdir(benchmarks_dir): + benchmark_dir = os.path.join(benchmarks_dir, benchmark) + if not os.path.isdir(benchmark_dir): + continue + + output_jsonl = os.path.join(benchmark_dir, "output.jsonl") + if not os.path.exists(output_jsonl): + continue + + print(f"\nScoring: {benchmark}") + metrics = score_benchmark(output_jsonl, scoring_cfg) + + # Save metrics.json + metrics_path = os.path.join(benchmark_dir, "metrics.json") + with open(metrics_path, "w") as f: + json.dump(metrics, f, indent=2) + print(f"Saved: {metrics_path}") + print(f" CER: {metrics.get('cer_cumulative', 'N/A'):.4f}") + print(f" WER: {metrics.get('wer_cumulative', 'N/A'):.4f}") + if "utmosv2_avg" in metrics: + print(f" UTMOSv2: {metrics.get('utmosv2_avg', 'N/A'):.4f}") + + +def score_benchmark(output_jsonl: str, scoring_cfg: dict) -> dict: + """Score a single benchmark.""" + # Parse output.jsonl + entries = [] + records = [] + with open(output_jsonl) as f: + for line in f: + if not line.strip(): + continue + record = json.loads(line) + records.append(record) + + # Extract manifest from user message + manifest_entry = None + for msg in record.get("messages", []): + if msg.get("role") == "user": + content = msg.get("content", "") + manifest_entry = json.loads(content) if isinstance(content, str) else content + break + + audio_path = record.get("audio", {}).get("path") + if audio_path and manifest_entry: + entries.append((manifest_entry, audio_path)) + + if not entries: + return {} + + # Create temp dir with manifest and symlinks + with tempfile.TemporaryDirectory(prefix="tts_scoring_") as tmp_dir: + manifest_path = os.path.join(tmp_dir, "manifest.json") + gen_audio_dir = os.path.join(tmp_dir, "generated") + os.makedirs(gen_audio_dir) + + with open(manifest_path, "w") as f: + for i, (manifest_entry, audio_path) in enumerate(entries): + f.write(json.dumps(manifest_entry) + "\n") + dst = os.path.join(gen_audio_dir, f"predicted_audio_{i}.wav") + if os.path.exists(audio_path): + os.symlink(audio_path, dst) + + avg_metrics, filewise_metrics = evaluate( + manifest_path=manifest_path, + audio_dir=None, + generated_audio_dir=gen_audio_dir, + language=scoring_cfg.get("language", "en"), + sv_model_type=scoring_cfg.get("sv_model", "titanet"), + asr_model_name=scoring_cfg.get("asr_model_name", "nvidia/parakeet-tdt-1.1b"), + with_utmosv2=scoring_cfg.get("with_utmosv2", False), + ) + + # Save output_with_metrics.jsonl + output_with_metrics_path = output_jsonl.replace("output.jsonl", "output_with_metrics.jsonl") + with open(output_with_metrics_path, "w") as f: + for i, record in enumerate(records): + if i < len(filewise_metrics): + record["metrics"] = filewise_metrics[i] + f.write(json.dumps(record) + "\n") + print(f"Saved: {output_with_metrics_path}") + + return avg_metrics + + +def 
run_aggregation(results_dir: str) -> None: + """Print summary of all metrics.""" + benchmarks_dir = os.path.join(results_dir, "eval-results") + if not os.path.exists(benchmarks_dir): + benchmarks_dir = results_dir + + print("\nAggregated Results:") + for benchmark in sorted(os.listdir(benchmarks_dir)): + metrics_path = os.path.join(benchmarks_dir, benchmark, "metrics.json") + if os.path.exists(metrics_path): + with open(metrics_path) as f: + metrics = json.load(f) + print(f" {benchmark}:") + print(f" CER: {metrics.get('cer_cumulative', 'N/A'):.4f}") + print(f" WER: {metrics.get('wer_cumulative', 'N/A'):.4f}") + if "utmosv2_avg" in metrics: + print(f" UTMOSv2: {metrics.get('utmosv2_avg', 'N/A'):.4f}") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="TTS Scoring") + parser.add_argument("--results_dir", required=True) + parser.add_argument("--sv_model", default="titanet") + parser.add_argument("--asr_model_name", default="nvidia/parakeet-tdt-1.1b") + parser.add_argument("--language", default="en") + parser.add_argument("--with_utmosv2", action="store_true") + parser.add_argument("--aggregation_only", action="store_true") + args = parser.parse_args() + + if args.aggregation_only: + run_aggregation(args.results_dir) + else: + run_scoring( + args.results_dir, + sv_model=args.sv_model, + asr_model_name=args.asr_model_name, + language=args.language, + with_utmosv2=args.with_utmosv2, + ) From 14523f7a1ee86e83db7a5cb19eabf8a69604ff16 Mon Sep 17 00:00:00 2001 From: Valentin Mendelev Date: Sun, 28 Dec 2025 00:35:13 -0800 Subject: [PATCH 07/26] Checkpoint + hparams input instead of nemo --- nemo_skills/inference/server/serve_unified.py | 27 ++++++++++++++++ .../server/backends/magpie_tts_backend.py | 31 +++++++++++++++++-- 2 files changed, 56 insertions(+), 2 deletions(-) diff --git a/nemo_skills/inference/server/serve_unified.py b/nemo_skills/inference/server/serve_unified.py index 1b02e652e6..c6aa1764de 100644 --- a/nemo_skills/inference/server/serve_unified.py +++ b/nemo_skills/inference/server/serve_unified.py @@ -174,6 +174,15 @@ def main(): parser.add_argument("--use_cfg", action="store_true", help="Enable classifier-free guidance (TTS backend)") parser.add_argument("--cfg_scale", type=float, default=2.5, help="CFG scale factor (TTS backend)") + # Checkpoint loading options (for magpie_tts backend - alternative to --model .nemo) + parser.add_argument("--hparams_file", default=None, help="Path to hparams.yaml (use with --checkpoint_file)") + parser.add_argument("--checkpoint_file", default=None, help="Path to .ckpt checkpoint (use with --hparams_file)") + parser.add_argument( + "--legacy_codebooks", action="store_true", help="Use legacy codebook indices for old checkpoints" + ) + parser.add_argument("--legacy_text_conditioning", action="store_true", help="Use legacy text conditioning") + parser.add_argument("--hparams_from_wandb", action="store_true", help="hparams file was exported from wandb") + # Environment setup parser.add_argument("--code_path", default=None, help="Path to NeMo source code to add to PYTHONPATH") parser.add_argument("--hack_path", default=None, help="Path to safetensors/torch.py patch file") @@ -297,6 +306,17 @@ def main(): extra_config["top_k"] = args.top_k extra_config["use_cfg"] = args.use_cfg extra_config["cfg_scale"] = args.cfg_scale + # Checkpoint loading options + if args.hparams_file: + extra_config["hparams_file"] = args.hparams_file + if args.checkpoint_file: + extra_config["checkpoint_file"] = args.checkpoint_file + if 
args.legacy_codebooks: + extra_config["legacy_codebooks"] = True + if args.legacy_text_conditioning: + extra_config["legacy_text_conditioning"] = True + if args.hparams_from_wandb: + extra_config["hparams_from_wandb"] = True # S2S backend options if args.backend in ("s2s", "s2s_incremental", "s2s_session"): @@ -347,6 +367,13 @@ def main(): print(f" Top-k: {args.top_k}") print(f" CFG: {args.use_cfg} (scale: {args.cfg_scale})") print(f" Local Transformer: {args.use_local_transformer}") + if args.hparams_file and args.checkpoint_file: + print(f" Hparams: {args.hparams_file}") + print(f" Checkpoint: {args.checkpoint_file}") + if args.legacy_codebooks: + print(" Legacy Codebooks: True") + if args.legacy_text_conditioning: + print(" Legacy Text Conditioning: True") if args.backend in ("s2s_incremental", "s2s_session"): if args.config_path: print(f" Config Path: {args.config_path}") diff --git a/recipes/multimodal/server/backends/magpie_tts_backend.py b/recipes/multimodal/server/backends/magpie_tts_backend.py index 61b4fd32d6..e11187f71a 100644 --- a/recipes/multimodal/server/backends/magpie_tts_backend.py +++ b/recipes/multimodal/server/backends/magpie_tts_backend.py @@ -29,6 +29,12 @@ class MagpieTTSConfig(BackendConfig): max_decoder_steps: int = 440 use_local_transformer: bool = False output_sample_rate: int = 22050 + # Checkpoint loading options (alternative to model_path .nemo file) + hparams_file: Optional[str] = None + checkpoint_file: Optional[str] = None + legacy_codebooks: bool = False + legacy_text_conditioning: bool = False + hparams_from_wandb: bool = False @classmethod def from_dict(cls, d: Dict[str, Any]) -> "MagpieTTSConfig": @@ -46,6 +52,11 @@ def from_dict(cls, d: Dict[str, Any]) -> "MagpieTTSConfig": "max_decoder_steps", "use_local_transformer", "output_sample_rate", + "hparams_file", + "checkpoint_file", + "legacy_codebooks", + "legacy_text_conditioning", + "hparams_from_wandb", } return cls( **{k: v for k, v in d.items() if k in known}, extra_config={k: v for k, v in d.items() if k not in known} @@ -88,8 +99,24 @@ def load_model(self) -> None: if not self.tts_config.codec_model_path: raise ValueError("codec_model_path required") - model_path = self.config.model_path - cfg = ModelLoadConfig(nemo_file=model_path, codecmodel_path=self.tts_config.codec_model_path) + # Support both checkpoint mode (hparams + ckpt) and nemo mode + has_ckpt_mode = self.tts_config.hparams_file and self.tts_config.checkpoint_file + if has_ckpt_mode: + cfg = ModelLoadConfig( + hparams_file=self.tts_config.hparams_file, + checkpoint_file=self.tts_config.checkpoint_file, + codecmodel_path=self.tts_config.codec_model_path, + legacy_codebooks=self.tts_config.legacy_codebooks, + legacy_text_conditioning=self.tts_config.legacy_text_conditioning, + hparams_from_wandb=self.tts_config.hparams_from_wandb, + ) + else: + cfg = ModelLoadConfig( + nemo_file=self.config.model_path, + codecmodel_path=self.tts_config.codec_model_path, + legacy_codebooks=self.tts_config.legacy_codebooks, + legacy_text_conditioning=self.tts_config.legacy_text_conditioning, + ) self._model, self._checkpoint_name = load_magpie_model(cfg, device=self.config.device) self._runner = MagpieInferenceRunner( From 4aa3a2deb7804b9398ae0517e62597232194566a Mon Sep 17 00:00:00 2001 From: Valentin Mendelev Date: Sun, 28 Dec 2025 05:30:59 -0800 Subject: [PATCH 08/26] Per benchmark scoring jobs --- nemo_skills/dataset/nv_tts/scripts/score.py | 22 +++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git 
a/nemo_skills/dataset/nv_tts/scripts/score.py b/nemo_skills/dataset/nv_tts/scripts/score.py index 3d98adee42..13e6fc6210 100644 --- a/nemo_skills/dataset/nv_tts/scripts/score.py +++ b/nemo_skills/dataset/nv_tts/scripts/score.py @@ -29,8 +29,13 @@ def run_scoring( asr_model_name: str = "nvidia/parakeet-tdt-1.1b", language: str = "en", with_utmosv2: bool = False, + benchmark: str = None, ) -> None: - """Run NeMo scoring on all benchmarks in results_dir.""" + """Run NeMo scoring on benchmarks in results_dir. + + Args: + benchmark: If provided, score only this benchmark. Otherwise score all. + """ benchmarks_dir = os.path.join(results_dir, "eval-results") if not os.path.exists(benchmarks_dir): benchmarks_dir = results_dir @@ -42,16 +47,23 @@ def run_scoring( "with_utmosv2": with_utmosv2, } - for benchmark in os.listdir(benchmarks_dir): - benchmark_dir = os.path.join(benchmarks_dir, benchmark) + # Determine which benchmarks to score + if benchmark: + benchmarks_to_score = [benchmark] + else: + benchmarks_to_score = os.listdir(benchmarks_dir) + + for bench in benchmarks_to_score: + benchmark_dir = os.path.join(benchmarks_dir, bench) if not os.path.isdir(benchmark_dir): continue output_jsonl = os.path.join(benchmark_dir, "output.jsonl") if not os.path.exists(output_jsonl): + print(f"Skipping {bench}: output.jsonl not found") continue - print(f"\nScoring: {benchmark}") + print(f"\nScoring: {bench}") metrics = score_benchmark(output_jsonl, scoring_cfg) # Save metrics.json @@ -154,6 +166,7 @@ def run_aggregation(results_dir: str) -> None: parser.add_argument("--language", default="en") parser.add_argument("--with_utmosv2", action="store_true") parser.add_argument("--aggregation_only", action="store_true") + parser.add_argument("--benchmark", default=None, help="Score only this benchmark (e.g. nv_tts.libritts_seen)") args = parser.parse_args() if args.aggregation_only: @@ -165,4 +178,5 @@ def run_aggregation(results_dir: str) -> None: asr_model_name=args.asr_model_name, language=args.language, with_utmosv2=args.with_utmosv2, + benchmark=args.benchmark, ) From 5372f7e011d0d1aaeb26264916fe91c7506db407 Mon Sep 17 00:00:00 2001 From: Valentin Mendelev Date: Sun, 28 Dec 2025 06:31:13 -0800 Subject: [PATCH 09/26] nv_tts benchmarks and scripts to run them --- nemo_skills/dataset/nv_tts/__init__.py | 24 ++ nemo_skills/dataset/nv_tts/prepare.py | 235 ++++++++++++++++++ .../nv_tts/scripts/config/default.yaml | 31 +++ .../scripts/config/grpo_small_step1100.yaml | 39 +++ .../dataset/nv_tts/scripts/run_tts_eval.py | 184 ++++++++++++++ nemo_skills/dataset/nv_tts/scripts/score.py | 6 +- 6 files changed, 518 insertions(+), 1 deletion(-) create mode 100644 nemo_skills/dataset/nv_tts/__init__.py create mode 100644 nemo_skills/dataset/nv_tts/prepare.py create mode 100644 nemo_skills/dataset/nv_tts/scripts/config/default.yaml create mode 100644 nemo_skills/dataset/nv_tts/scripts/config/grpo_small_step1100.yaml create mode 100644 nemo_skills/dataset/nv_tts/scripts/run_tts_eval.py diff --git a/nemo_skills/dataset/nv_tts/__init__.py b/nemo_skills/dataset/nv_tts/__init__.py new file mode 100644 index 0000000000..b5f0290153 --- /dev/null +++ b/nemo_skills/dataset/nv_tts/__init__.py @@ -0,0 +1,24 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# NV TTS evaluation dataset - for testing TTS models + +DATASET_GROUP = "tts" +IS_BENCHMARK_GROUP = True + +# BENCHMARKS will be populated dynamically based on the config file +# Example: {"nv_tts.libritts_seen": {}, "nv_tts.riva_hard_digits": {}} +BENCHMARKS = {} + +GENERATION_ARGS = "++prompt_format=openai" diff --git a/nemo_skills/dataset/nv_tts/prepare.py b/nemo_skills/dataset/nv_tts/prepare.py new file mode 100644 index 0000000000..6fa68d09b9 --- /dev/null +++ b/nemo_skills/dataset/nv_tts/prepare.py @@ -0,0 +1,235 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Prepare NV TTS evaluation datasets. + +Reads a config JSON file (local or remote) containing subtest definitions, +fetches manifest JSONL files, and generates nemo-skills test.jsonl files +with manifest content embedded as JSON in user message content. + +Usage: + python prepare.py --config login-eos.nvidia.com:/path/to/evalset_config.json + python prepare.py --config /local/path/to/evalset_config.json +""" + +import argparse +import json +import os +import subprocess +import tempfile +from pathlib import Path + +SYSTEM_MESSAGE = "You are a helpful assistant." + +# Template for subtest __init__.py files +INIT_TEMPLATE = """# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# NV TTS subtest: {subtest_name} + +GENERATION_ARGS = "++prompt_format=openai" +""" + + +def is_remote_path(path: str) -> bool: + """Check if path is a remote path (host:/path format).""" + return ":" in path and not path.startswith("/") and not path.startswith(".") + + +def fetch_remote_file(remote_path: str, local_path: str) -> None: + """Fetch a file from a remote host using scp.""" + result = subprocess.run( + ["scp", remote_path, local_path], + capture_output=True, + text=True, + ) + if result.returncode != 0: + raise RuntimeError(f"Failed to fetch {remote_path}: {result.stderr}") + + +def read_file_content(path: str) -> str: + """Read file content, handling both local and remote paths.""" + if is_remote_path(path): + with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".tmp") as tmp: + tmp_path = tmp.name + try: + fetch_remote_file(path, tmp_path) + with open(tmp_path, "r", encoding="utf-8") as f: + return f.read() + finally: + if os.path.exists(tmp_path): + os.unlink(tmp_path) + else: + with open(path, "r", encoding="utf-8") as f: + return f.read() + + +def get_remote_host(path: str) -> str: + """Extract host from a remote path.""" + if is_remote_path(path): + return path.split(":")[0] + return "" + + +def make_remote_path(host: str, path: str) -> str: + """Create a remote path from host and path.""" + if host: + return f"{host}:{path}" + return path + + +def format_manifest_entry(entry: dict, audio_dir: str) -> dict: + """Format a manifest entry into nemo-skills format. + + Args: + entry: Manifest entry with fields like text, context_audio_filepath, etc. + audio_dir: Base directory for audio files to make paths absolute. + + Returns: + Formatted entry with messages containing the manifest as JSON string. + """ + # Make audio paths absolute by combining with audio_dir + entry_with_absolute_paths = entry.copy() + + if "context_audio_filepath" in entry_with_absolute_paths and audio_dir: + entry_with_absolute_paths["context_audio_filepath"] = os.path.join( + audio_dir, entry_with_absolute_paths["context_audio_filepath"] + ) + + if "audio_filepath" in entry_with_absolute_paths and audio_dir: + entry_with_absolute_paths["audio_filepath"] = os.path.join( + audio_dir, entry_with_absolute_paths["audio_filepath"] + ) + + # Create the nemo-skills format entry + content = json.dumps(entry_with_absolute_paths) + + return { + "problem": "", + "messages": [ + {"role": "system", "content": SYSTEM_MESSAGE}, + {"role": "user", "content": content}, + ], + } + + +def create_subtest_init(subtest_dir: Path, subtest_name: str) -> None: + """Create __init__.py for a subtest directory.""" + content = INIT_TEMPLATE.format(subtest_name=subtest_name) + with open(subtest_dir / "__init__.py", "w", encoding="utf-8") as f: + f.write(content) + + +def process_subtest( + subtest_name: str, + config: dict, + output_dir: Path, + remote_host: str, +) -> int: + """Process a single subtest and generate test.jsonl. + + Args: + subtest_name: Name of the subtest (e.g., "libritts_seen"). + config: Subtest config with manifest_path, audio_dir, feature_dir. + output_dir: Base output directory for the dataset. + remote_host: Remote host for fetching files (empty for local). + + Returns: + Number of entries processed. 
+ """ + subtest_dir = output_dir / subtest_name + subtest_dir.mkdir(parents=True, exist_ok=True) + + manifest_path = config["manifest_path"] + audio_dir = config.get("audio_dir", "") + + # Fetch manifest file + if remote_host: + manifest_remote = make_remote_path(remote_host, manifest_path) + print(f"Fetching manifest from {manifest_remote}...") + manifest_content = read_file_content(manifest_remote) + else: + print(f"Reading manifest from {manifest_path}...") + manifest_content = read_file_content(manifest_path) + + # Process manifest entries + output_file = subtest_dir / "test.jsonl" + count = 0 + + with open(output_file, "w", encoding="utf-8") as fout: + for line in manifest_content.strip().split("\n"): + if not line.strip(): + continue + + try: + entry = json.loads(line) + except json.JSONDecodeError as e: + print(f" Warning: Skipping invalid JSON line: {e}") + continue + + formatted = format_manifest_entry(entry, audio_dir) + fout.write(json.dumps(formatted) + "\n") + count += 1 + + # Create __init__.py + create_subtest_init(subtest_dir, subtest_name) + + print(f" Wrote {count} entries to {output_file}") + return count + + +def main(): + parser = argparse.ArgumentParser(description="Prepare NV TTS evaluation datasets") + parser.add_argument( + "--config", + required=True, + help="Path to config JSON file (local or remote: host:/path/to/config.json)", + ) + args = parser.parse_args() + + output_dir = Path(__file__).parent + + # Determine if config is remote and extract host + config_path = args.config + remote_host = get_remote_host(config_path) + + # Read config file + print(f"Reading config from {config_path}...") + config_content = read_file_content(config_path) + config = json.loads(config_content) + + print(f"Found {len(config)} subtests: {list(config.keys())}") + + total_entries = 0 + for subtest_name, subtest_config in config.items(): + print(f"\nProcessing {subtest_name}...") + count = process_subtest(subtest_name, subtest_config, output_dir, remote_host) + total_entries += count + + print(f"\nDone! 
Processed {total_entries} total entries across {len(config)} subtests.") + + +if __name__ == "__main__": + main() diff --git a/nemo_skills/dataset/nv_tts/scripts/config/default.yaml b/nemo_skills/dataset/nv_tts/scripts/config/default.yaml new file mode 100644 index 0000000000..4539b46402 --- /dev/null +++ b/nemo_skills/dataset/nv_tts/scripts/config/default.yaml @@ -0,0 +1,31 @@ +# TTS Pipeline Configuration + +# Cluster and execution settings (shared across all stages) +cluster: eos +container: /lustre/fsw/llmservice_nemo_speechlm/users/vmendelev/containters/nemo-25.11.sqsh +partition: batch +mount_paths: /lustre:/lustre +output_dir: /lustre/fsw/llmservice_nemo_speechlm/users/vmendelev/tmp/nv_tts_eval_full_a3 + +# NeMo code path +nemo_code_path: /lustre/fsw/llmservice_nemo_speechlm/users/vmendelev/experimenta/tts_eval/NeMo + +# Generation settings (ns eval arguments) +generation: + benchmarks: nv_tts.libritts_seen,nv_tts.libritts_test_clean,nv_tts.riva_hard_digits,nv_tts.riva_hard_letters,nv_tts.riva_hard_money,nv_tts.riva_hard_short,nv_tts.vctk + model: nvidia/magpie_tts_multilingual_357m + server_type: generic + server_gpus: 1 + server_entrypoint: python -m nemo_skills.inference.server.serve_unified + server_args: --backend magpie_tts --codec_model nvidia/nemo-nano-codec-22khz-1.89kbps-21.5fps --batch_size 32 --batch_timeout 0.1 --use_cfg --cfg_scale 2.5 + data_dir: /lustre/fsw/llmservice_nemo_speechlm/users/vmendelev/tmp/data_dir + num_chunks: 2 + extra_args: ++server.server_type=vllm_multimodal + +# Scoring settings +scoring: + sv_model: titanet + asr_model_name: nvidia/parakeet-tdt-1.1b + language: en + with_utmosv2: true + gpus: 1 diff --git a/nemo_skills/dataset/nv_tts/scripts/config/grpo_small_step1100.yaml b/nemo_skills/dataset/nv_tts/scripts/config/grpo_small_step1100.yaml new file mode 100644 index 0000000000..3499e16924 --- /dev/null +++ b/nemo_skills/dataset/nv_tts/scripts/config/grpo_small_step1100.yaml @@ -0,0 +1,39 @@ +# TTS Pipeline Configuration - GRPO Small Step 1100 + +# Cluster and execution settings (shared across all stages) +cluster: eos +container: /lustre/fsw/llmservice_nemo_speechlm/users/vmendelev/containters/nemo-25.11.sqsh +partition: batch +mount_paths: /lustre:/lustre +output_dir: /lustre/fsw/llmservice_nemo_speechlm/users/vmendelev/tmp/nv_tts_eval_grpo_small_step1100_full + +# NeMo code path +nemo_code_path: /lustre/fsw/llmservice_nemo_speechlm/users/vmendelev/experimenta/tts_eval/NeMo + +# Generation settings (ns eval arguments) +generation: + benchmarks: nv_tts.libritts_seen,nv_tts.libritts_test_clean,nv_tts.riva_hard_digits,nv_tts.riva_hard_letters,nv_tts.riva_hard_money,nv_tts.riva_hard_short,nv_tts.vctk + model: grpo_small_step1100 # name for logging, actual model loaded via hparams/checkpoint in server_args + server_type: generic + server_gpus: 1 + server_entrypoint: "cd /lustre/fsw/llmservice_nemo_speechlm/users/vmendelev/experimenta/tts_eval/NeMo && python -m nemo_skills.inference.server.serve_unified" + server_args: >- + --backend magpie_tts + --codec_model nvidia/nemo-nano-codec-22khz-1.89kbps-21.5fps + --batch_size 32 + --batch_timeout 0.1 + --use_cfg + --cfg_scale 2.5 + --hparams_file /lustre/fsw/llmservice_nemo_speechlm/users/vmendelev/experimenta/tts_eval/small_hparams.yaml + --checkpoint_file /lustre/fsw/llmservice_nemo_speechlm/users/agorodetskii/checkpoints/N2512_English_TTSArena/grpo_small_from_ft_step1100.ckpt + data_dir: /lustre/fsw/llmservice_nemo_speechlm/users/vmendelev/tmp/data_dir + num_chunks: 2 + extra_args: 
++server.server_type=vllm_multimodal + +# Scoring settings +scoring: + sv_model: titanet + asr_model_name: nvidia/parakeet-tdt-1.1b + language: en + with_utmosv2: true + gpus: 1 diff --git a/nemo_skills/dataset/nv_tts/scripts/run_tts_eval.py b/nemo_skills/dataset/nv_tts/scripts/run_tts_eval.py new file mode 100644 index 0000000000..73e3b2b234 --- /dev/null +++ b/nemo_skills/dataset/nv_tts/scripts/run_tts_eval.py @@ -0,0 +1,184 @@ +#!/usr/bin/env python3 +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +TTS Pipeline: Generation -> Scoring (-> Aggregation) + +Usage: + python run_tts_eval.py --config config.yaml + python run_tts_eval.py --config config.yaml --stage scoring + python run_tts_eval.py --config config.yaml --stage aggregation +""" + +import argparse +import os + +import yaml + +from nemo_skills.pipeline.eval import eval as ns_eval +from nemo_skills.pipeline.run_cmd import run_cmd as ns_run_cmd + + +class MockContext: + """Mock typer.Context for programmatic calls.""" + + def __init__(self, extra_args=None): + self.args = extra_args or [] + + +def load_config(config_path: str) -> dict: + with open(config_path) as f: + return yaml.safe_load(f) + + +def run_generation(cfg: dict, expname: str): + """Run generation stage using ns eval, returns experiment object.""" + gen = cfg["generation"] + + # Add nemo_code_path to server_args + server_args = gen["server_args"] + if cfg.get("nemo_code_path"): + server_args += f" --code_path {cfg['nemo_code_path']}" + + # Parse extra_args for the context + extra_args = gen.get("extra_args", "").split() if gen.get("extra_args") else [] + ctx = MockContext(extra_args) + + # Call eval programmatically + return ns_eval( + ctx=ctx, + cluster=cfg["cluster"], + output_dir=cfg["output_dir"], + benchmarks=gen["benchmarks"], + model=gen["model"], + server_type=gen["server_type"], + server_gpus=gen["server_gpus"], + server_container=cfg["container"], + mount_paths=cfg["mount_paths"], + server_entrypoint=gen["server_entrypoint"], + server_args=server_args, + data_dir=gen["data_dir"], + num_chunks=gen["num_chunks"], + partition=cfg["partition"], + expname=expname, + auto_summarize_results=False, + ) + + +def main(): + parser = argparse.ArgumentParser(description="TTS Pipeline") + parser.add_argument("--config", required=True) + parser.add_argument( + "--stage", + choices=["all", "generation", "scoring", "aggregation"], + default="all", + help="Stage to run. 
'all' runs generation+scoring (no aggregation)", + ) + parser.add_argument("--expname", default="tts_eval", help="Base experiment name for job tracking") + args = parser.parse_args() + + cfg = load_config(args.config) + scoring = cfg.get("scoring", {}) + hf_token = os.environ.get("HF_TOKEN", "") + nemo_path = cfg["nemo_code_path"] + output_dir = cfg["output_dir"] + + gen_exp_name = None + + # Stage 1: Generation + if args.stage in ("all", "generation"): + print("\n" + "=" * 60) + print("Stage 1: GENERATION") + print("=" * 60) + gen_exp = run_generation(cfg, args.expname) + # Extract experiment name/id for dependency tracking + gen_exp_name = args.expname # The expname we passed to ns_eval + print(f"Generation submitted: {gen_exp}") + + # Stage 2: Scoring (one job per benchmark, depends on generation) + if args.stage in ("all", "scoring"): + print("\n" + "=" * 60) + print("Stage 2: SCORING") + print("=" * 60) + + # Parse benchmarks list + benchmarks = cfg["generation"]["benchmarks"].split(",") + + install_cmd = None + if scoring.get("with_utmosv2"): + install_cmd = "pip install git+https://github.com/sarulab-speech/UTMOSv2.git@v1.2.1" + + # When running both stages, scoring depends on generation experiment (by name) + run_after = [gen_exp_name] if args.stage == "all" and gen_exp_name else None + + for benchmark in benchmarks: + benchmark = benchmark.strip() + # Benchmark dir in eval-results keeps dot notation (nv_tts.libritts_seen) + benchmark_dir = benchmark + + scoring_cmd = ( + f"HF_TOKEN={hf_token} " + f"PYTHONPATH={nemo_path}:$PYTHONPATH " + f"python -m nemo_skills.dataset.nv_tts.scripts.score " + f"--results_dir {output_dir} " + f"--benchmark {benchmark_dir} " + f"--sv_model {scoring.get('sv_model', 'titanet')} " + f"--asr_model_name {scoring.get('asr_model_name', 'nvidia/parakeet-tdt-1.1b')} " + f"--language {scoring.get('language', 'en')}" + ) + if scoring.get("with_utmosv2"): + scoring_cmd += " --with_utmosv2" + + # Short name for job (e.g. 
libritts_seen from nv_tts.libritts_seen) + short_name = benchmark.split(".")[-1] + print(f" Submitting scoring job for: {benchmark}") + + ns_run_cmd( + ctx=MockContext(), + cluster=cfg["cluster"], + container=cfg["container"], + partition=cfg["partition"], + num_gpus=scoring.get("gpus", 1), + mount_paths=cfg["mount_paths"], + command=scoring_cmd, + installation_command=install_cmd, + run_after=run_after, + expname=f"{args.expname}_score_{short_name}", + log_dir=f"{output_dir}/eval-logs", + ) + + # Stage 3: Aggregation (only if explicitly requested) + if args.stage == "aggregation": + print("\n" + "=" * 60) + print("Stage 3: AGGREGATION") + print("=" * 60) + agg_cmd = f"python -m nemo_skills.dataset.nv_tts.scripts.score --results_dir {output_dir} --aggregation_only" + ns_run_cmd( + ctx=MockContext(), + cluster=cfg["cluster"], + container=cfg["container"], + partition=cfg["partition"], + num_gpus=0, + mount_paths=cfg["mount_paths"], + command=agg_cmd, + expname=f"{args.expname}_agg", + log_dir=f"{output_dir}/eval-logs", + ) + + print("\nDone!") + + +if __name__ == "__main__": + main() diff --git a/nemo_skills/dataset/nv_tts/scripts/score.py b/nemo_skills/dataset/nv_tts/scripts/score.py index 13e6fc6210..215d332405 100644 --- a/nemo_skills/dataset/nv_tts/scripts/score.py +++ b/nemo_skills/dataset/nv_tts/scripts/score.py @@ -63,11 +63,15 @@ def run_scoring( print(f"Skipping {bench}: output.jsonl not found") continue + metrics_path = os.path.join(benchmark_dir, "metrics.json") + if os.path.exists(metrics_path): + print(f"Skipping {bench}: metrics.json already exists") + continue + print(f"\nScoring: {bench}") metrics = score_benchmark(output_jsonl, scoring_cfg) # Save metrics.json - metrics_path = os.path.join(benchmark_dir, "metrics.json") with open(metrics_path, "w") as f: json.dump(metrics, f, indent=2) print(f"Saved: {metrics_path}") From db37cff0cb9d07da7ffdebd85921abcd8a045912 Mon Sep 17 00:00:00 2001 From: Valentin Mendelev Date: Sun, 28 Dec 2025 09:46:38 -0800 Subject: [PATCH 10/26] EOS FIX 8 chunks per node --- .../nv_tts/scripts/config/default.yaml | 3 +- .../scripts/config/grpo_small_step1100.yaml | 5 +- .../dataset/nv_tts/scripts/run_tts_eval.py | 1 + nemo_skills/inference/server/serve_unified.py | 5 +- nemo_skills/pipeline/eval.py | 13 +++++ nemo_skills/pipeline/utils/eval.py | 39 +++++++++++++-- nemo_skills/pipeline/utils/exp.py | 8 ++-- nemo_skills/pipeline/utils/generation.py | 47 +++++++++++++++---- nemo_skills/pipeline/utils/server.py | 33 +++++++++---- 9 files changed, 124 insertions(+), 30 deletions(-) diff --git a/nemo_skills/dataset/nv_tts/scripts/config/default.yaml b/nemo_skills/dataset/nv_tts/scripts/config/default.yaml index 4539b46402..3bbc3b7703 100644 --- a/nemo_skills/dataset/nv_tts/scripts/config/default.yaml +++ b/nemo_skills/dataset/nv_tts/scripts/config/default.yaml @@ -19,7 +19,8 @@ generation: server_entrypoint: python -m nemo_skills.inference.server.serve_unified server_args: --backend magpie_tts --codec_model nvidia/nemo-nano-codec-22khz-1.89kbps-21.5fps --batch_size 32 --batch_timeout 0.1 --use_cfg --cfg_scale 2.5 data_dir: /lustre/fsw/llmservice_nemo_speechlm/users/vmendelev/tmp/data_dir - num_chunks: 2 + num_chunks: 8 + gpus_per_node: 8 # set to 1 for single-GPU mode, or 8 for multi-instance mode (num_chunks must be multiple of gpus_per_node) extra_args: ++server.server_type=vllm_multimodal # Scoring settings diff --git a/nemo_skills/dataset/nv_tts/scripts/config/grpo_small_step1100.yaml 
b/nemo_skills/dataset/nv_tts/scripts/config/grpo_small_step1100.yaml index 3499e16924..ad8cf100e7 100644 --- a/nemo_skills/dataset/nv_tts/scripts/config/grpo_small_step1100.yaml +++ b/nemo_skills/dataset/nv_tts/scripts/config/grpo_small_step1100.yaml @@ -5,7 +5,7 @@ cluster: eos container: /lustre/fsw/llmservice_nemo_speechlm/users/vmendelev/containters/nemo-25.11.sqsh partition: batch mount_paths: /lustre:/lustre -output_dir: /lustre/fsw/llmservice_nemo_speechlm/users/vmendelev/tmp/nv_tts_eval_grpo_small_step1100_full +output_dir: /lustre/fsw/llmservice_nemo_speechlm/users/vmendelev/tmp/nv_tts_eval_grpo_small_step1100_full_8c_a3 # NeMo code path nemo_code_path: /lustre/fsw/llmservice_nemo_speechlm/users/vmendelev/experimenta/tts_eval/NeMo @@ -27,7 +27,8 @@ generation: --hparams_file /lustre/fsw/llmservice_nemo_speechlm/users/vmendelev/experimenta/tts_eval/small_hparams.yaml --checkpoint_file /lustre/fsw/llmservice_nemo_speechlm/users/agorodetskii/checkpoints/N2512_English_TTSArena/grpo_small_from_ft_step1100.ckpt data_dir: /lustre/fsw/llmservice_nemo_speechlm/users/vmendelev/tmp/data_dir - num_chunks: 2 + num_chunks: 8 + gpus_per_node: 8 extra_args: ++server.server_type=vllm_multimodal # Scoring settings diff --git a/nemo_skills/dataset/nv_tts/scripts/run_tts_eval.py b/nemo_skills/dataset/nv_tts/scripts/run_tts_eval.py index 73e3b2b234..c4548d3305 100644 --- a/nemo_skills/dataset/nv_tts/scripts/run_tts_eval.py +++ b/nemo_skills/dataset/nv_tts/scripts/run_tts_eval.py @@ -71,6 +71,7 @@ def run_generation(cfg: dict, expname: str): server_args=server_args, data_dir=gen["data_dir"], num_chunks=gen["num_chunks"], + gpus_per_node=gen.get("gpus_per_node", 1), partition=cfg["partition"], expname=expname, auto_summarize_results=False, diff --git a/nemo_skills/inference/server/serve_unified.py b/nemo_skills/inference/server/serve_unified.py index c6aa1764de..47feb62f2a 100644 --- a/nemo_skills/inference/server/serve_unified.py +++ b/nemo_skills/inference/server/serve_unified.py @@ -290,8 +290,9 @@ def main(): if args.debug: os.environ["DEBUG"] = "1" - # Set CUDA devices - os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(str(i) for i in range(args.num_gpus)) + # Set CUDA devices (only if not already set by the environment, e.g., SLURM) + if "CUDA_VISIBLE_DEVICES" not in os.environ: + os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(str(i) for i in range(args.num_gpus)) # Build extra config for backend-specific options extra_config = {} diff --git a/nemo_skills/pipeline/eval.py b/nemo_skills/pipeline/eval.py index df273cfc19..4d493e61c5 100644 --- a/nemo_skills/pipeline/eval.py +++ b/nemo_skills/pipeline/eval.py @@ -288,6 +288,12 @@ def eval( None, help="Number of chunks to split the dataset into. If None, will not chunk the dataset.", ), + gpus_per_node: int = typer.Option( + 1, + help="Number of GPUs per node for multi-instance mode. " + "When > 1, launches multiple server instances (one per GPU) within a single job. " + "Requires num_chunks to be a multiple of gpus_per_node.", + ), chunk_ids: str = typer.Option( None, help="List of explicit chunk ids to run. Separate with , or .. to specify range. 
" @@ -493,6 +499,7 @@ def eval( eval_requires_judge=eval_requires_judge, generation_type=generation_type, generation_module=generation_module, + gpus_per_node=gpus_per_node, ) sbatch_kwargs = parse_kwargs(sbatch_kwargs, exclusive=exclusive, qos=qos, time_min=time_min) @@ -517,9 +524,14 @@ def eval( job_server_address, job_server_command, job_sandbox_env_overrides, + job_gpus_per_node, ) = job_args prev_tasks = _task_dependencies + # Add gpus_per_node to server config for multi-instance mode + if job_server_config and job_gpus_per_node > 1: + job_server_config["gpus_per_node"] = job_gpus_per_node + for _ in range(dependent_jobs + 1): has_tasks = True new_task = pipeline_utils.add_task( @@ -529,6 +541,7 @@ def eval( log_dir=log_dir, container=cluster_config["containers"]["nemo-skills"], cluster_config=cluster_config, + num_tasks=job_gpus_per_node, partition=partition, server_config=job_server_config, with_sandbox=job_needs_sandbox or with_sandbox, diff --git a/nemo_skills/pipeline/utils/eval.py b/nemo_skills/pipeline/utils/eval.py index 736d1c7cb6..7674c94f81 100644 --- a/nemo_skills/pipeline/utils/eval.py +++ b/nemo_skills/pipeline/utils/eval.py @@ -267,11 +267,20 @@ def prepare_eval_commands( eval_requires_judge, generation_type=None, generation_module=None, + gpus_per_node: int = 1, ): # TODO: there is a bit too much code duplication here and logic is quite dense, should try to refactor # TODO: should we allow setting num chunks per benchmark when not using groups? Maybe benchmark:rs_num:num_chunks? + # Validate gpus_per_node for multi-instance mode + if gpus_per_node > 1: + if num_chunks is None: + raise ValueError("gpus_per_node > 1 requires num_chunks to be specified") + if num_chunks % gpus_per_node != 0: + raise ValueError(f"num_chunks ({num_chunks}) must be a multiple of gpus_per_node ({gpus_per_node})") + LOG.info(f"Multi-instance mode: {gpus_per_node} GPUs per node, {num_chunks // gpus_per_node} jobs") + if generation_type is not None: if generation_module is not None: raise ValueError("Cannot specify both generation_module and generation_type. 
") @@ -354,7 +363,12 @@ def prepare_eval_commands( rerun_done=rerun_done, ) for seed_idx, (seed, benchmark_chunk_ids) in enumerate(benchmark_args.remaining_jobs.items()): - total_evals += len(benchmark_chunk_ids) + # Multi-instance mode: count unique base chunks (each base chunk = 1 job) + if gpus_per_node > 1: + base_chunks = set((cid // gpus_per_node) * gpus_per_node for cid in benchmark_chunk_ids) + total_evals += len(base_chunks) + else: + total_evals += len(benchmark_chunk_ids) if num_jobs < 0: # if num_jobs is -1, we run all benchmarks in parallel @@ -376,6 +390,7 @@ def prepare_eval_commands( **server_parameters, extra_arguments=extra_arguments, get_random_port=get_random_port, + gpus_per_node=gpus_per_node, ) cur_eval = 0 @@ -398,7 +413,18 @@ def prepare_eval_commands( random_seed=seed, chunk_id=None, ) - for chunk_id in benchmark_chunk_ids: + # Multi-instance mode: compute which base chunks need to run + # If ANY chunk in a batch is incomplete, we run the entire batch (base_chunk) + if gpus_per_node > 1: + base_chunks_to_run = set() + for cid in benchmark_chunk_ids: + base_chunk = (cid // gpus_per_node) * gpus_per_node + base_chunks_to_run.add(base_chunk) + chunks_to_process = sorted(base_chunks_to_run) + else: + chunks_to_process = benchmark_chunk_ids + + for chunk_id in chunks_to_process: job_benchmarks.add(benchmark) effective_generation_module = generation_module or benchmark_args.generation_module @@ -430,12 +456,17 @@ def prepare_eval_commands( f"{job_extra_arguments} " ) + # Multi-instance mode: use shell expression for chunk_id + effective_chunk_id = chunk_id + if gpus_per_node > 1: + effective_chunk_id = f"$(({chunk_id} + $SLURM_LOCALID))" + cmd = pipeline_utils.get_generation_cmd( input_file=benchmark_args.input_file, output_dir=benchmark_output_dir, extra_arguments=full_extra_arguments, random_seed=seed, - chunk_id=chunk_id, + chunk_id=effective_chunk_id, num_chunks=benchmark_args.num_chunks, script=generation_module or benchmark_args.generation_module, # only logging for the first seed @@ -478,12 +509,14 @@ def prepare_eval_commands( # a check above guarantees that this is the same for all tasks in a job generation_task.get_server_command_fn(), job_sandbox_env_overrides, + gpus_per_node, # client num_tasks for multi-instance mode ) ) job_server_config, job_server_address, job_extra_arguments = pipeline_utils.configure_client( **server_parameters, extra_arguments=extra_arguments, get_random_port=get_random_port, + gpus_per_node=gpus_per_node, ) for job_benchmark in job_benchmarks: benchmarks_dict[job_benchmark].job_ids.append(cur_job_idx) diff --git a/nemo_skills/pipeline/utils/exp.py b/nemo_skills/pipeline/utils/exp.py index 5e8642f0a2..3eed966b33 100644 --- a/nemo_skills/pipeline/utils/exp.py +++ b/nemo_skills/pipeline/utils/exp.py @@ -126,7 +126,7 @@ def stdout(self) -> Path: @property def srun_stdout(self) -> Path: - return Path(self.folder) / f"{self.srun_prefix}%j_srun.log" + return Path(self.folder) / f"{self.srun_prefix}%j_%t_srun.log" @property def stderr(self) -> Path: @@ -134,7 +134,7 @@ def stderr(self) -> Path: @property def srun_stderr(self) -> Path: - return Path(self.folder) / f"{self.srun_prefix}%j_srun.log" + return Path(self.folder) / f"{self.srun_prefix}%j_%t_srun.log" @property def ls_term(self) -> str: @@ -143,7 +143,7 @@ def ls_term(self) -> str: The command used to list the files is ls -1 {ls_term} 2> /dev/null """ assert self.folder - return os.path.join(self.folder, "*%j_srun.log") + return os.path.join(self.folder, "*%j_*_srun.log") 
@dataclass(kw_only=True) @@ -312,7 +312,7 @@ def get_executor( srun_args = [ "--no-container-mount-home", "--mpi=pmix", - "--wait=10", + "--wait=240", # wait up to 4 minutes for slower tasks to complete (important for multi-instance mode) # we need to be explicit about this in srun as commands might need to run in parallel f"--ntasks-per-node={tasks_per_node}", f"--nodes={num_nodes}", diff --git a/nemo_skills/pipeline/utils/generation.py b/nemo_skills/pipeline/utils/generation.py index 4b45ed8a39..337b8e87e5 100644 --- a/nemo_skills/pipeline/utils/generation.py +++ b/nemo_skills/pipeline/utils/generation.py @@ -341,8 +341,15 @@ def get_generation_cmd( cmd += "++wait_for_sandbox=true " if chunk_id is not None: - cmd += f" ++num_chunks={num_chunks} ++chunk_id={chunk_id} " - output_file = get_chunked_rs_filename(output_dir, random_seed=random_seed, chunk_id=chunk_id) + # Check if chunk_id is a shell expression (e.g., "$((0 + $SLURM_LOCALID))") + is_shell_expr = isinstance(chunk_id, str) and "$" in str(chunk_id) + + if is_shell_expr: + # For shell expressions, use double quotes so shell expands the expression + cmd += f' ++num_chunks={num_chunks} "++chunk_id={chunk_id}" ' + else: + cmd += f" ++num_chunks={num_chunks} ++chunk_id={chunk_id} " + donefiles = [] # we are always waiting for all chunks in num_chunks, no matter chunk_ids in # the current run (as we don't want to merge partial jobs) @@ -351,10 +358,23 @@ def get_generation_cmd( donefile = f"{filename}.done" donefiles.append(donefile) - if job_end_cmd: - job_end_cmd += f" && touch {donefiles[chunk_id]} " + if is_shell_expr: + # For shell expression, compute the donefile path at runtime + # Get the base pattern with _chunk_0 and replace with shell expression + base_donefile = donefiles[0] # e.g., /path/output_chunk_0.jsonl.done + # Replace "_chunk_0.jsonl" with "_chunk_$((expr)).jsonl" where expr is expanded by shell + # Extract the expression part (e.g., "0 + $SLURM_LOCALID" from "$((0 + $SLURM_LOCALID))") + donefile_pattern = base_donefile.replace("_chunk_0.jsonl", f"_chunk_{chunk_id}.jsonl") + if job_end_cmd: + job_end_cmd += f' && touch "{donefile_pattern}" ' + else: + job_end_cmd = f'touch "{donefile_pattern}" ' else: - job_end_cmd = f"touch {donefiles[chunk_id]} " + output_file = get_chunked_rs_filename(output_dir, random_seed=random_seed, chunk_id=chunk_id) + if job_end_cmd: + job_end_cmd += f" && touch {donefiles[chunk_id]} " + else: + job_end_cmd = f"touch {donefiles[chunk_id]} " # getting file name as if there is no chunking since that's where we want to merge merged_output_file = get_chunked_rs_filename(output_dir=output_dir, random_seed=random_seed) @@ -424,6 +444,7 @@ def configure_client( get_random_port: bool, extra_arguments: str, server_container: str | None = None, + gpus_per_node: int = 1, ): """ Utility function to configure a client for the model inference server. @@ -439,6 +460,7 @@ def configure_client( get_random_port: Whether to get a random port for the server. extra_arguments: Extra arguments to pass to the command. server_container: Container to use for the server. + gpus_per_node: Number of GPUs per node for multi-instance mode. 
Returns: A tuple containing: @@ -467,10 +489,17 @@ def configure_client( server_config["container"] = server_container # Only add server_type if user didn't specify it (allows vllm_multimodal override) server_type_arg = "" if user_specified_server_type else f"++server.server_type={server_type} " - extra_arguments = ( - f"{extra_arguments} {server_type_arg}++server.host=127.0.0.1 " - f"++server.port={server_port} ++server.model={model} " - ) + if gpus_per_node > 1: + # Multi-instance mode: port is computed at runtime based on SLURM_LOCALID + extra_arguments = ( + f"{extra_arguments} {server_type_arg}++server.host=127.0.0.1 " + f'"++server.port=$(({server_port} + $SLURM_LOCALID))" ++server.model={model} ' + ) + else: + extra_arguments = ( + f"{extra_arguments} {server_type_arg}++server.host=127.0.0.1 " + f"++server.port={server_port} ++server.model={model} " + ) else: # model is hosted elsewhere server_config = None # Only add server_type if user didn't specify it diff --git a/nemo_skills/pipeline/utils/server.py b/nemo_skills/pipeline/utils/server.py index e9258e4e7b..631dfe7610 100644 --- a/nemo_skills/pipeline/utils/server.py +++ b/nemo_skills/pipeline/utils/server.py @@ -120,6 +120,7 @@ def get_server_command( server_port: int, server_args: str = "", server_entrypoint: str | None = None, + gpus_per_node: int = 1, ): num_tasks = num_gpus @@ -209,15 +210,29 @@ def get_server_command( elif server_type == "generic": if not server_entrypoint: raise ValueError("For 'generic' server type, 'server_entrypoint' must be specified.") - server_start_cmd = ( - f"{server_entrypoint} " - f" --model {model_path} " - f" --num_gpus {num_gpus} " - f" --num_nodes {num_nodes} " - f" --port {server_port} " - f" {server_args} " - ) - num_tasks = 1 + if gpus_per_node > 1: + # Multi-instance mode: each SLURM task gets its own GPU and port + server_start_cmd = ( + f"echo 'SLURM_LOCALID='$SLURM_LOCALID' SLURM_PROCID='$SLURM_PROCID && " + f"export CUDA_VISIBLE_DEVICES=${{SLURM_LOCALID:-0}} && " + f"{server_entrypoint} " + f" --model {model_path} " + f" --num_gpus 1 " + f" --num_nodes 1 " + f" --port $(({server_port} + ${{SLURM_LOCALID:-0}})) " + f" {server_args} " + ) + num_tasks = gpus_per_node + else: + server_start_cmd = ( + f"{server_entrypoint} " + f" --model {model_path} " + f" --num_gpus {num_gpus} " + f" --num_nodes {num_nodes} " + f" --port {server_port} " + f" {server_args} " + ) + num_tasks = 1 else: raise ValueError(f"Server type '{server_type}' not supported for model inference.") From a11456e19d01775b73061af1bcd9335221ad1fa5 Mon Sep 17 00:00:00 2001 From: Valentin Mendelev Date: Sun, 28 Dec 2025 11:55:45 -0800 Subject: [PATCH 11/26] Documentation and comparison script --- nemo_skills/dataset/nv_tts/TTS_eval.md | 123 +++++ .../scripts/compare_tts_eval_results.py | 425 ++++++++++++++++++ .../nv_tts/scripts/tts_comparison_report.md | 115 +++++ 3 files changed, 663 insertions(+) create mode 100644 nemo_skills/dataset/nv_tts/TTS_eval.md create mode 100644 nemo_skills/dataset/nv_tts/scripts/compare_tts_eval_results.py create mode 100644 nemo_skills/dataset/nv_tts/scripts/tts_comparison_report.md diff --git a/nemo_skills/dataset/nv_tts/TTS_eval.md b/nemo_skills/dataset/nv_tts/TTS_eval.md new file mode 100644 index 0000000000..73864defd7 --- /dev/null +++ b/nemo_skills/dataset/nv_tts/TTS_eval.md @@ -0,0 +1,123 @@ +# TTS Evaluation Based on NeMo-Skills + +This is an adaptation of `examples/tts/magpietts_inference.py` into NeMo-Skills. 
The generation and scoring are separated into 2 stages and can be effectively parallelized. The same code as in `magpietts_inference.py` is used for both stages.
+
+The test sets are also borrowed from the current evaluation setup.
+
+## Getting Started
+
+### 1. Clone and Setup
+
+```bash
+# Clone this branch
+git clone
+cd ns_eval
+
+# Create a virtual environment and install nemo-skills
+python -m venv .venv
+source .venv/bin/activate
+pip install -e .
+```
+
+### 2. Cluster Configuration
+
+Decide which cluster you want to work on and set up the corresponding cluster configuration.
+
+- An example configuration for EOS is provided in `cluster_configs/eos_example.yaml`
+- You can get more configurations from the [NeMo-Skills cluster configs](https://github.com/NVIDIA/NeMo-Skills/tree/main/cluster_configs)
+- Update the username in the configuration file
+
+Note that NeMo-Skills' standard unit of resource allocation is 1 GPU. The EOS cluster is special because it only allows assigning full nodes (not, e.g., 2 GPUs out of 8), so I had to write a fix that is not clean and probably won't be merged as is. You can remove the "EOS FIX 8 chunks per node" commit if running on other clusters; this may entail small changes in the config.
+
+### 3. Prepare Test Data
+
+You can either prepare a new test set or reuse an existing data directory.
+
+**To prepare a new test set:**
+
+```bash
+cd /home/vmendelev/workspace/expressiveness/src/ns_eval && source .venv/bin/activate && \
+python nemo_skills/dataset/nv_tts/prepare.py \
+    --config :/eval_config_full_fixed.json
+```
+
+This will prepare `test.jsonl` for each benchmark with pointers to the files on the cluster.
+
+**To reuse an existing data directory (EOS):**
+
+```
+/lustre/fsw/llmservice_nemo_speechlm/users/vmendelev/tmp/data_dir
+```
+
+### 4. Configuration Files
+
+Review the config file and ensure all required artifacts are in the specified locations:
+
+| Config | Description |
+|--------|-------------|
+| `nemo_skills/dataset/nv_tts/scripts/config/default.yaml` | For `.nemo` model input |
+| `nemo_skills/dataset/nv_tts/scripts/config/grpo_small_step1100.yaml` | For checkpoint + hparams input |
+
+### 5. Environment Setup
+
+Make sure `HF_TOKEN` is present in the environment:
+
+```bash
+export HF_TOKEN=
+# or source from your .env file
+. ~/.env && export HF_TOKEN=$HF_READ_ONLY
+```
+
+## Running Evaluation
+
+### Full Evaluation (Generation + Scoring)
+
+```bash
+cd /home/vmendelev/workspace/expressiveness/src/ns_eval && source .venv/bin/activate && \
+NEMO_SKILLS_DISABLE_UNCOMMITTED_CHANGES_CHECK=1 \
+python -m nemo_skills.dataset.nv_tts.scripts.run_tts_eval \
+    --config nemo_skills/dataset/nv_tts/scripts/config/default.yaml \
+    --stage all \
+    --expname default_eval
+```
+
+### Stage Options
+
+| Stage | Description |
+|-------|-------------|
+| `all` | Run both generation and scoring |
+| `generation` | Run only TTS generation |
+| `scoring` | Run only scoring (requires completed generation) |
+| `aggregation` | Print summary of all metrics |
+
+## Comparing Results
+
+To produce a comparison report between different evaluation runs, pass each run's `eval-results` folder as `path:name` (or `host:path:name` for remote folders over SSH):
+
+```bash
+cd /home/vmendelev/workspace/expressiveness/src/ns_eval && source .venv/bin/activate && \
+python nemo_skills/dataset/nv_tts/scripts/compare_tts_eval_results.py \
+    --eval_folders "<baseline_eval_results>:Baseline" "<new_eval_results>:Candidate" \
+    --output tts_comparison_report.md
+```
+
+See [example report](scripts/tts_comparison_report.md) for sample output.
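+
+Under the hood, the report is built from each benchmark's `metrics.json`. As a minimal illustration of the data the comparison consumes (a sketch, not part of the repository; the paths are placeholders and only a few of the keys written by `score.py` are shown):
+
+```python
+import json
+
+def load_metrics(path):
+    with open(path) as f:
+        return json.load(f)
+
+# Hypothetical paths to the same benchmark scored in two different runs.
+baseline = load_metrics("baseline/eval-results/nv_tts.vctk/metrics.json")
+candidate = load_metrics("candidate/eval-results/nv_tts.vctk/metrics.json")
+
+for key in ("wer_cumulative", "cer_cumulative", "utmosv2_avg"):
+    if key in baseline and key in candidate:
+        delta = candidate[key] - baseline[key]
+        print(f"{key}: {baseline[key]:.4f} -> {candidate[key]:.4f} (delta {delta:+.4f})")
+```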
+ +## Output Structure + +Results are saved to `output_dir/eval-results/` with the following structure: + +``` +output_dir/ +├── eval-results/ +│ ├── nv_tts.libritts_seen/ +│ │ ├── output.jsonl # Generated audio paths + metadata +│ │ ├── output_with_metrics.jsonl # With per-file metrics +│ │ ├── metrics.json # Aggregate metrics (CER, WER, UTMOSv2) +│ │ └── audio/ # Generated audio files +│ ├── nv_tts.vctk/ +│ │ └── ... +│ └── ... +└── eval-logs/ # Job logs +``` diff --git a/nemo_skills/dataset/nv_tts/scripts/compare_tts_eval_results.py b/nemo_skills/dataset/nv_tts/scripts/compare_tts_eval_results.py new file mode 100644 index 0000000000..b8780d8f1f --- /dev/null +++ b/nemo_skills/dataset/nv_tts/scripts/compare_tts_eval_results.py @@ -0,0 +1,425 @@ +#!/usr/bin/env python3 +""" +Compare multiple TTS evaluation results and generate a Markdown report. + +Usage: + python compare_tts_eval_results.py \ + --eval_folders "/path/to/eval1:Model A" "/path/to/eval2:Model B" \ + --output report.md + +Supports remote folders via SSH: + python compare_tts_eval_results.py \ + --eval_folders "host1:/path/to/eval1:Model A" "host2:/path/to/eval2:Model B" \ + --output report.md + +Each eval folder should contain subdirectories for different test sets, +each with a metrics.json file. +""" + +import argparse +import json +import os +import subprocess +from typing import Optional + + +def run_remote_cmd(host: str, cmd: str, timeout: int = 30) -> Optional[str]: + """Run a command on a remote host via SSH.""" + try: + result = subprocess.run(["ssh", host, cmd], capture_output=True, text=True, timeout=timeout) + if result.returncode != 0: + return None + return result.stdout.strip() + except (subprocess.TimeoutExpired, Exception): + return None + + +def list_test_sets_local(eval_folder: str) -> list[str]: + """List test set subdirectories in a local eval folder.""" + if not os.path.isdir(eval_folder): + return [] + return [d for d in os.listdir(eval_folder) if os.path.isdir(os.path.join(eval_folder, d))] + + +def list_test_sets_remote(host: str, eval_folder: str) -> list[str]: + """List test set subdirectories in a remote eval folder via SSH.""" + output = run_remote_cmd(host, f"ls -1 {eval_folder}") + if output is None: + return [] + return [d.strip() for d in output.split("\n") if d.strip()] + + +def load_metrics_local(metrics_path: str) -> Optional[dict]: + """Load metrics from a local JSON file.""" + if not os.path.exists(metrics_path): + return None + with open(metrics_path, "r") as f: + return json.load(f) + + +def load_metrics_remote(host: str, metrics_path: str) -> Optional[dict]: + """Load metrics from a remote JSON file via SSH.""" + output = run_remote_cmd(host, f"cat {metrics_path}") + if output is None: + return None + try: + return json.loads(output) + except json.JSONDecodeError: + return None + + +def load_metrics(path: str, host: Optional[str] = None) -> Optional[dict]: + """Load metrics from a local or remote JSON file.""" + if host: + return load_metrics_remote(host, path) + return load_metrics_local(path) + + +def list_test_sets(eval_folder: str, host: Optional[str] = None) -> list[str]: + """List test set subdirectories.""" + if host: + return list_test_sets_remote(host, eval_folder) + return list_test_sets_local(eval_folder) + + +def format_value(value, format_spec: str = ".4f", na_str: str = "N/A") -> str: + """Format a value for display.""" + if value is None: + return na_str + try: + return f"{value:{format_spec}}" + except (ValueError, TypeError): + return str(value) + + +def 
format_percent(value, na_str: str = "N/A") -> str: + """Format a value as percentage.""" + if value is None: + return na_str + try: + return f"{value * 100:.2f}%" + except (ValueError, TypeError): + return str(value) + + +def parse_folder_spec(spec: str) -> tuple[Optional[str], str, str]: + """ + Parse folder specification in format: + - 'path:name' (local) + - 'path' (local, auto-name) + - 'host:path:name' (remote) + - 'host:path' (remote, auto-name) + """ + parts = spec.split(":") + if len(parts) == 1: + path = parts[0] + name = os.path.basename(path.rstrip("/")) + return None, path, name + if len(parts) == 2: + if parts[0].startswith("/") or parts[0].startswith("."): + return None, parts[0], parts[1] + else: + host, path = parts[0], parts[1] + name = os.path.basename(path.rstrip("/")) + return host, path, name + if len(parts) == 3: + return parts[0], parts[1], parts[2] + if len(parts) > 3: + host = parts[0] + name = parts[-1] + path = ":".join(parts[1:-1]) + return host, path, name + return None, spec, os.path.basename(spec.rstrip("/")) + + +# TTS metrics: (key, display_name, format_func, higher_is_better) +TTS_METRICS = [ + ("wer_cumulative", "WER (cumulative)", format_percent, False), + ("cer_cumulative", "CER (cumulative)", format_percent, False), + ("wer_filewise_avg", "WER (filewise avg)", format_percent, False), + ("cer_filewise_avg", "CER (filewise avg)", format_percent, False), + ("utmosv2_avg", "UTMOS v2", lambda v: format_value(v, ".3f"), True), + ("ssim_pred_gt_avg", "SSIM (pred vs GT)", lambda v: format_value(v, ".4f"), True), + ("ssim_pred_context_avg", "SSIM (pred vs context)", lambda v: format_value(v, ".4f"), True), + ("total_gen_audio_seconds", "Total audio (sec)", lambda v: format_value(v, ".1f"), None), +] + + +def generate_test_set_table( + test_set: str, + models: list[tuple[str, dict]], +) -> list[str]: + """Generate a comparison table for a single test set.""" + lines = [] + lines.append(f"### {test_set}\n") + + # Table header + header = "| Metric | " + " | ".join(name for name, _ in models) + " |" + separator = "|" + "|".join(["---"] * (len(models) + 1)) + "|" + lines.append(header) + lines.append(separator) + + for metric_key, display_name, fmt_func, higher_better in TTS_METRICS: + values = [] + raw_values = [] + for _, m in models: + val = m.get(metric_key) + raw_values.append(val) + values.append(fmt_func(val)) + + # Highlight best value + if higher_better is not None: + valid_vals = [(i, v) for i, v in enumerate(raw_values) if v is not None] + if len(valid_vals) >= 2: + if higher_better: + best_idx = max(valid_vals, key=lambda x: x[1])[0] + else: + best_idx = min(valid_vals, key=lambda x: x[1])[0] + values[best_idx] = f"**{values[best_idx]}**" + + row = f"| {display_name} | " + " | ".join(values) + " |" + lines.append(row) + + lines.append("") + return lines + + +def compute_summary_metrics( + all_test_sets: list[str], + model_metrics: dict[str, dict[str, dict]], +) -> dict[str, dict[str, float]]: + """Compute average metrics across all test sets for each model.""" + summary = {} + for model_name, test_data in model_metrics.items(): + totals = {} + counts = {} + for test_set in all_test_sets: + if test_set not in test_data: + continue + m = test_data[test_set] + for key, _, _, _ in TTS_METRICS: + if key in m and m[key] is not None: + totals[key] = totals.get(key, 0) + m[key] + counts[key] = counts.get(key, 0) + 1 + summary[model_name] = {k: totals[k] / counts[k] for k in totals if counts.get(k, 0) > 0} + return summary + + +def determine_best_model(models: 
list[tuple[str, dict]]) -> tuple[str, str]: + """Determine the best model based on summary metrics.""" + if len(models) < 2: + return "", "Need at least 2 models to compare." + + scores = {name: 0.0 for name, _ in models} + comparisons = [] + + # Weight metrics + weights = { + "wer_cumulative": 2.0, + "cer_cumulative": 1.5, + "utmosv2_avg": 2.0, + "ssim_pred_gt_avg": 1.0, + } + + for metric_key, metric_name, _, higher_better in TTS_METRICS: + if higher_better is None: + continue + weight = weights.get(metric_key, 1.0) + valid = [(name, m.get(metric_key)) for name, m in models if m.get(metric_key) is not None] + if len(valid) < 2: + continue + + if higher_better: + best_val = max(v for _, v in valid) + worst_val = min(v for _, v in valid) + else: + best_val = min(v for _, v in valid) + worst_val = max(v for _, v in valid) + + if best_val == worst_val: + continue + + best_name = [n for n, v in valid if v == best_val][0] + for name, val in valid: + if higher_better: + normalized = (val - worst_val) / (best_val - worst_val) + else: + normalized = (worst_val - val) / (worst_val - best_val) + scores[name] += normalized * weight + comparisons.append(f"{best_name} leads in {metric_name}") + + if not any(scores.values()): + return "", "Unable to determine best model: insufficient metrics." + + sorted_scores = sorted(scores.items(), key=lambda x: x[1], reverse=True) + best_name = sorted_scores[0][0] + return best_name, ( + f"**{best_name}** performs best overall. " + f"Key advantages: {'; '.join(comparisons[:4]) if comparisons else 'balanced performance'}." + ) + + +def generate_report( + model_names: list[str], + all_test_sets: list[str], + model_metrics: dict[str, dict[str, dict]], + output_path: str, +): + """Generate the full Markdown comparison report.""" + lines = [] + lines.append("# TTS Evaluation Comparison Report\n") + lines.append(f"Comparing {len(model_names)} model(s): " + ", ".join(model_names) + "\n") + + # Summary section + lines.append("## Summary (Averaged Across Test Sets)\n") + summary = compute_summary_metrics(all_test_sets, model_metrics) + summary_models = [(name, summary.get(name, {})) for name in model_names] + + header = "| Metric | " + " | ".join(model_names) + " |" + separator = "|" + "|".join(["---"] * (len(model_names) + 1)) + "|" + lines.append(header) + lines.append(separator) + + for metric_key, display_name, fmt_func, higher_better in TTS_METRICS: + if metric_key == "total_gen_audio_seconds": + continue # Skip total audio in summary + values = [] + raw_values = [] + for name in model_names: + val = summary.get(name, {}).get(metric_key) + raw_values.append(val) + values.append(fmt_func(val)) + + if higher_better is not None: + valid_vals = [(i, v) for i, v in enumerate(raw_values) if v is not None] + if len(valid_vals) >= 2: + if higher_better: + best_idx = max(valid_vals, key=lambda x: x[1])[0] + else: + best_idx = min(valid_vals, key=lambda x: x[1])[0] + values[best_idx] = f"**{values[best_idx]}**" + + row = f"| {display_name} | " + " | ".join(values) + " |" + lines.append(row) + + lines.append("") + + # Best model analysis + best_name, explanation = determine_best_model(summary_models) + lines.append("### Analysis\n") + lines.append(explanation) + lines.append("") + + # Per-test-set sections + lines.append("## Per-Test-Set Results\n") + + for test_set in sorted(all_test_sets): + test_models = [] + for name in model_names: + m = model_metrics.get(name, {}).get(test_set) + if m is not None: + test_models.append((name, m)) + + if not test_models: + continue + + 
lines.extend(generate_test_set_table(test_set, test_models)) + + # Legend + lines.append("---") + lines.append("*Lower WER/CER is better, higher UTMOS/SSIM is better. **bold** = best value.*") + + report = "\n".join(lines) + + with open(output_path, "w") as f: + f.write(report) + + print(f"Report saved to: {output_path}") + print("\n" + "=" * 60) + print(report) + + +def main(): + parser = argparse.ArgumentParser( + description="Compare TTS evaluation results from multiple folders", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +Examples: + # Remote folders (via SSH): + python compare_tts_eval_results.py \\ + --eval_folders "login-eos:/path/to/eval1/eval-results:Model A" \\ + "login-eos:/path/to/eval2/eval-results:Model B" \\ + --output comparison_report.md + + # Local folders: + python compare_tts_eval_results.py \\ + --eval_folders "/path/to/eval1:Model A" "/path/to/eval2:Model B" \\ + --output comparison_report.md + +Each eval folder should contain subdirectories for different test sets +(e.g., nv_tts.vctk, nv_tts.libritts_test_clean), each with a metrics.json. + """, + ) + parser.add_argument( + "--eval_folders", + nargs="+", + required=True, + help="Evaluation folders: 'path:name', 'host:path:name', or 'host:path'", + ) + script_dir = os.path.dirname(os.path.abspath(__file__)) + default_output = os.path.join(script_dir, "tts_comparison_report.md") + parser.add_argument("--output", type=str, default=default_output, help="Output Markdown file path") + + args = parser.parse_args() + + # Parse folder specs + folder_specs = [] + for spec in args.eval_folders: + host, path, name = parse_folder_spec(spec) + folder_specs.append((host, path, name)) + + # Discover all test sets across all models + all_test_sets = set() + model_test_sets = {} + for host, path, name in folder_specs: + test_sets = list_test_sets(path, host) + model_test_sets[name] = (host, path, test_sets) + all_test_sets.update(test_sets) + loc = f"{host}:{path}" if host else path + print(f"Found {len(test_sets)} test sets for {name} at {loc}") + + if not all_test_sets: + print("Error: No test sets found.") + return 1 + + # Load metrics for each model and test set + model_metrics: dict[str, dict[str, dict]] = {} + model_names = [] + + for host, path, name in folder_specs: + model_names.append(name) + model_metrics[name] = {} + test_sets = model_test_sets[name][2] + + for test_set in test_sets: + metrics_path = f"{path}/{test_set}/metrics.json" + metrics = load_metrics(metrics_path, host) + if metrics is not None: + model_metrics[name][test_set] = metrics + print(f" Loaded {test_set} for {name}") + else: + print(f" Skipping {test_set} for {name} (no metrics.json)") + + # Filter to test sets that have metrics for at least one model + valid_test_sets = [ts for ts in all_test_sets if any(ts in model_metrics.get(n, {}) for n in model_names)] + + if not valid_test_sets: + print("Error: No valid metrics found for any test set.") + return 1 + + generate_report(model_names, valid_test_sets, model_metrics, args.output) + return 0 + + +if __name__ == "__main__": + exit(main()) diff --git a/nemo_skills/dataset/nv_tts/scripts/tts_comparison_report.md b/nemo_skills/dataset/nv_tts/scripts/tts_comparison_report.md new file mode 100644 index 0000000000..361673d849 --- /dev/null +++ b/nemo_skills/dataset/nv_tts/scripts/tts_comparison_report.md @@ -0,0 +1,115 @@ +# TTS Evaluation Comparison Report + +Comparing 2 model(s): GRPO Step 1100, A3 Baseline + +## Summary (Averaged Across Test Sets) + +| Metric | GRPO Step 1100 | A3 
Baseline | +|---|---|---| +| WER (cumulative) | 4.56% | **3.65%** | +| CER (cumulative) | 2.75% | **2.24%** | +| WER (filewise avg) | 4.49% | **3.54%** | +| CER (filewise avg) | 3.14% | **2.28%** | +| UTMOS v2 | **3.124** | 3.005 | +| SSIM (pred vs GT) | **0.6599** | 0.0255 | +| SSIM (pred vs context) | **0.6709** | 0.0259 | + +### Analysis + +**A3 Baseline** performs best overall. Key advantages: A3 Baseline leads in WER (cumulative); A3 Baseline leads in CER (cumulative); A3 Baseline leads in WER (filewise avg); A3 Baseline leads in CER (filewise avg). + +## Per-Test-Set Results + +### nv_tts.libritts_seen + +| Metric | GRPO Step 1100 | +|---|---| +| WER (cumulative) | 2.76% | +| CER (cumulative) | 1.92% | +| WER (filewise avg) | 2.69% | +| CER (filewise avg) | 1.83% | +| UTMOS v2 | 3.322 | +| SSIM (pred vs GT) | 0.8022 | +| SSIM (pred vs context) | 0.8022 | +| Total audio (sec) | 1314.7 | + +### nv_tts.libritts_test_clean + +| Metric | GRPO Step 1100 | A3 Baseline | +|---|---|---| +| WER (cumulative) | 1.37% | **1.27%** | +| CER (cumulative) | 0.46% | **0.41%** | +| WER (filewise avg) | 1.47% | **1.30%** | +| CER (filewise avg) | 0.51% | **0.42%** | +| UTMOS v2 | **3.279** | 3.153 | +| SSIM (pred vs GT) | **0.8378** | -0.0138 | +| SSIM (pred vs context) | **0.8378** | -0.0138 | +| Total audio (sec) | 12454.1 | 15257.6 | + +### nv_tts.riva_hard_digits + +| Metric | GRPO Step 1100 | A3 Baseline | +|---|---|---| +| WER (cumulative) | 3.52% | **2.13%** | +| CER (cumulative) | 2.58% | **1.41%** | +| WER (filewise avg) | 3.27% | **1.96%** | +| CER (filewise avg) | 2.43% | **1.32%** | +| UTMOS v2 | **3.119** | 3.035 | +| SSIM (pred vs GT) | **0.6976** | 0.0460 | +| SSIM (pred vs context) | **0.6976** | 0.0460 | +| Total audio (sec) | 2462.9 | 3022.4 | + +### nv_tts.riva_hard_letters + +| Metric | GRPO Step 1100 | A3 Baseline | +|---|---|---| +| WER (cumulative) | **5.34%** | 6.17% | +| CER (cumulative) | **2.92%** | 4.51% | +| WER (filewise avg) | **5.00%** | 5.82% | +| CER (filewise avg) | **2.82%** | 4.26% | +| UTMOS v2 | 2.988 | **2.991** | +| SSIM (pred vs GT) | **0.6505** | 0.0432 | +| SSIM (pred vs context) | **0.6505** | 0.0432 | +| Total audio (sec) | 1984.2 | 2432.8 | + +### nv_tts.riva_hard_money + +| Metric | GRPO Step 1100 | A3 Baseline | +|---|---|---| +| WER (cumulative) | 2.92% | **0.92%** | +| CER (cumulative) | 2.00% | **0.55%** | +| WER (filewise avg) | 2.86% | **0.86%** | +| CER (filewise avg) | 1.96% | **0.49%** | +| UTMOS v2 | **3.191** | 3.076 | +| SSIM (pred vs GT) | **0.7075** | 0.0428 | +| SSIM (pred vs context) | **0.7075** | 0.0428 | +| Total audio (sec) | 2635.0 | 3149.5 | + +### nv_tts.riva_hard_short + +| Metric | GRPO Step 1100 | A3 Baseline | +|---|---|---| +| WER (cumulative) | 15.66% | **9.84%** | +| CER (cumulative) | 9.24% | **6.11%** | +| WER (filewise avg) | 15.66% | **9.84%** | +| CER (filewise avg) | 12.32% | **6.76%** | +| UTMOS v2 | 2.525 | **2.544** | +| SSIM (pred vs GT) | **0.3004** | 0.0373 | +| SSIM (pred vs context) | **0.3004** | 0.0373 | +| Total audio (sec) | 312.4 | 573.5 | + +### nv_tts.vctk + +| Metric | GRPO Step 1100 | A3 Baseline | +|---|---|---| +| WER (cumulative) | **0.36%** | 1.55% | +| CER (cumulative) | **0.09%** | 0.46% | +| WER (filewise avg) | **0.47%** | 1.43% | +| CER (filewise avg) | **0.10%** | 0.45% | +| UTMOS v2 | **3.441** | 3.229 | +| SSIM (pred vs GT) | **0.6236** | -0.0028 | +| SSIM (pred vs context) | **0.7002** | -0.0004 | +| Total audio (sec) | 310.6 | 334.6 | + +--- +*Lower WER/CER is better, higher UTMOS/SSIM is 
better. **bold** = best value.* From e80448eb9dfcf92c85f730adc3cb6048347ff71d Mon Sep 17 00:00:00 2001 From: Valentin Mendelev Date: Sun, 28 Dec 2025 12:08:20 -0800 Subject: [PATCH 12/26] eos config example --- cluster_configs/eos_example.yaml | 48 ++++++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) create mode 100644 cluster_configs/eos_example.yaml diff --git a/cluster_configs/eos_example.yaml b/cluster_configs/eos_example.yaml new file mode 100644 index 0000000000..124f85eb0d --- /dev/null +++ b/cluster_configs/eos_example.yaml @@ -0,0 +1,48 @@ +executor: slurm + +ssh_tunnel: + host: login-eos.nvidia.com + # ------------------------------- Fill this up! ------------------------------- + user: your_username + job_dir: /lustre/fsw/llmservice_nemo_speechlm/users/your_username/code/nemo-run + identity: "" + # ----------------------------------------------------------------------------- + +# if you're running directly from cluster, you only need to define job_dir and shouldn't use ssh_tunnel +# job_dir: + +account: llmservice_nemo_speechlm +partition: batch +job_name_prefix: "" + +disable_gpus_per_node: True + +containers: + trtllm: /lustre/share/llmservice_nemo_reasoning/images/nemo-skills-trtllm-latest.sqsh + vllm: /lustre/share/llmservice_nemo_reasoning/images/nemo-skills-vllm-latest.sqsh + sglang: /lustre/share/llmservice_nemo_reasoning/images/nemo-skills-sglang-latest.sqsh + nemo-rl: /lustre/share/llmservice_nemo_reasoning/images/nemo-skills-nemo-rl-latest.sqsh + megatron: /lustre/share/llmservice_nemo_reasoning/images/nemo-skills-megatron-latest.sqsh + sandbox: /lustre/share/llmservice_nemo_reasoning/images/nemo-skills-sandbox-latest.sqsh + nemo-skills: /lustre/share/llmservice_nemo_reasoning/images/nemo-skills-latest.sqsh + verl: /lustre/share/llmservice_nemo_reasoning/images/nemo-skills-verl-latest.sqsh + +mounts: + # - /lustre/fsw/llmservice_nemo_reasoning/hf_models:/hf_models + # - /lustre/fsw/llmservice_nemo_reasoning/images/swe-bench:/swe-bench-images + - /lustre/fsw/llmservice_nemo_speechlm:/lustre/fsw/llmservice_nemo_speechlm + + # you also need to mount your own workspace folder (or any other folder you need) + # - /lustre/fsw/llmservice_nemo_reasoning/users/igitman/:/workspace + +env_vars: + # ------------------------------- Fill this up! 
------------------------------- + - HF_HOME=/lustre/fsw/llmservice_nemo_speechlm/users/your_username/hfcache + # ----------------------------------------------------------------------------- + +timeouts: + batch: 04:00:00 + interactive: 02:00:00 + +mail_type: FAIL +mail_user: # From 547c9120623a7f6c83a183c1c9a810d5e55d2d2f Mon Sep 17 00:00:00 2001 From: Valentin Mendelev Date: Fri, 9 Jan 2026 08:31:02 -0800 Subject: [PATCH 13/26] EAR TTS backend --- .../dataset/nv_tts/scripts/run_tts_eval.py | 10 +- nemo_skills/inference/server/serve_unified.py | 24 +- .../multimodal/server/backends/__init__.py | 6 +- .../server/backends/ear_tts_backend.py | 530 ++++++++++++++++++ 4 files changed, 562 insertions(+), 8 deletions(-) create mode 100644 recipes/multimodal/server/backends/ear_tts_backend.py diff --git a/nemo_skills/dataset/nv_tts/scripts/run_tts_eval.py b/nemo_skills/dataset/nv_tts/scripts/run_tts_eval.py index c4548d3305..2e51e9c2de 100644 --- a/nemo_skills/dataset/nv_tts/scripts/run_tts_eval.py +++ b/nemo_skills/dataset/nv_tts/scripts/run_tts_eval.py @@ -47,10 +47,10 @@ def run_generation(cfg: dict, expname: str): """Run generation stage using ns eval, returns experiment object.""" gen = cfg["generation"] - # Add nemo_code_path to server_args + # Add generation_code_path to server_args server_args = gen["server_args"] - if cfg.get("nemo_code_path"): - server_args += f" --code_path {cfg['nemo_code_path']}" + if cfg.get("generation_code_path"): + server_args += f" --code_path {cfg['generation_code_path']}" # Parse extra_args for the context extra_args = gen.get("extra_args", "").split() if gen.get("extra_args") else [] @@ -93,7 +93,7 @@ def main(): cfg = load_config(args.config) scoring = cfg.get("scoring", {}) hf_token = os.environ.get("HF_TOKEN", "") - nemo_path = cfg["nemo_code_path"] + scoring_code_path = cfg.get("scoring_code_path", "") output_dir = cfg["output_dir"] gen_exp_name = None @@ -131,7 +131,7 @@ def main(): scoring_cmd = ( f"HF_TOKEN={hf_token} " - f"PYTHONPATH={nemo_path}:$PYTHONPATH " + f"PYTHONPATH={scoring_code_path}:$PYTHONPATH " f"python -m nemo_skills.dataset.nv_tts.scripts.score " f"--results_dir {output_dir} " f"--benchmark {benchmark_dir} " diff --git a/nemo_skills/inference/server/serve_unified.py b/nemo_skills/inference/server/serve_unified.py index 47feb62f2a..76a4b5524a 100644 --- a/nemo_skills/inference/server/serve_unified.py +++ b/nemo_skills/inference/server/serve_unified.py @@ -138,8 +138,8 @@ def main(): parser.add_argument( "--backend", default="salm", - choices=["salm", "magpie_tts", "s2s", "s2s_incremental", "s2s_session"], - help="Backend type: salm (speech-augmented LM), magpie_tts (MagpieTTS with RTF metrics), s2s (speech-to-speech offline), s2s_incremental (frame-by-frame processing), s2s_session (session-aware multi-turn)", + choices=["salm", "magpie_tts", "ear_tts", "ear_tts_batch", "s2s", "s2s_incremental", "s2s_session"], + help="Backend type: salm (speech-augmented LM), magpie_tts (MagpieTTS with RTF metrics), ear_tts (EAR TTS streaming decode), ear_tts_batch (EAR TTS batch decode), s2s (speech-to-speech offline), s2s_incremental (frame-by-frame processing), s2s_session (session-aware multi-turn)", ) # Backend-specific model paths @@ -273,6 +273,10 @@ def main(): setup_pythonpath(args.code_path) apply_safetensors_patch(args.hack_path) + # Store code_path for backends that may need to add paths late + if args.code_path: + os.environ["UNIFIED_SERVER_CODE_PATH"] = args.code_path + # Set environment variables os.environ["UNIFIED_SERVER_HOST"] = 
args.host os.environ["UNIFIED_SERVER_PORT"] = str(args.port) @@ -325,6 +329,15 @@ def main(): if args.silence_padding_sec != 5.0: extra_config["silence_padding_sec"] = args.silence_padding_sec + # EAR TTS backend options + if args.backend in ("ear_tts", "ear_tts_batch"): + if args.config_path: + extra_config["config_path"] = args.config_path + if args.tts_checkpoint_path: + extra_config["tts_checkpoint_path"] = args.tts_checkpoint_path + if args.speaker_reference: + extra_config["speaker_reference"] = args.speaker_reference + # S2S Incremental/Session backend options (shared config) if args.backend in ("s2s_incremental", "s2s_session"): if args.config_path: @@ -375,6 +388,13 @@ def main(): print(" Legacy Codebooks: True") if args.legacy_text_conditioning: print(" Legacy Text Conditioning: True") + if args.backend in ("ear_tts", "ear_tts_batch"): + if args.config_path: + print(f" Config Path: {args.config_path}") + if args.speaker_reference: + print(f" Speaker Reference: {args.speaker_reference}") + if args.tts_checkpoint_path: + print(f" TTS Checkpoint: {args.tts_checkpoint_path}") if args.backend in ("s2s_incremental", "s2s_session"): if args.config_path: print(f" Config Path: {args.config_path}") diff --git a/recipes/multimodal/server/backends/__init__.py b/recipes/multimodal/server/backends/__init__.py index fe3c4c1abd..f31032bfea 100644 --- a/recipes/multimodal/server/backends/__init__.py +++ b/recipes/multimodal/server/backends/__init__.py @@ -18,6 +18,8 @@ Available backends: - salm: Speech-Augmented Language Model (text output from text/audio input) - magpie_tts: MagpieTTS using MagpieInferenceRunner with RTF metrics (audio output from text input) +- ear_tts: EAR TTS using NemotronVoiceChat TTS model (audio output from text input, streaming decode) +- ear_tts_batch: EAR TTS optimized version (audio output from text input, batch decode at end) - s2s: Speech-to-Speech using DuplexS2S offline (text output from audio input) - s2s_incremental: Speech-to-Speech using NemotronVoiceChat incremental (text+audio from audio) - s2s_session: Speech-to-Speech with session support for multi-turn conversations @@ -39,8 +41,10 @@ BACKEND_REGISTRY = { "salm": ("salm_backend", "SALMBackend"), "magpie_tts": ("magpie_tts_backend", "MagpieTTSBackend"), + "ear_tts": ("ear_tts_backend", "EarTTSBackend"), + "ear_tts_batch": ("ear_tts_backend", "EarTTSBatchBackend"), "s2s": ("s2s_backend", "S2SBackend"), - "s2s_incremental": ("s2s_incremental_backend", "S2SIncrementalBackend"), + "s2s_incremental": ("s2s_incremental_backend_c", "S2SIncrementalBackend"), "s2s_session": ("s2s_session_backend", "S2SSessionBackend"), } diff --git a/recipes/multimodal/server/backends/ear_tts_backend.py b/recipes/multimodal/server/backends/ear_tts_backend.py new file mode 100644 index 0000000000..7b9f4daf3f --- /dev/null +++ b/recipes/multimodal/server/backends/ear_tts_backend.py @@ -0,0 +1,530 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 + +"""EAR TTS backend using NemotronVoiceChat's TTS model for text-to-speech synthesis.""" + +import io +import json +import os +import time +from dataclasses import dataclass +from typing import Any, Dict, List, Optional + +import numpy as np +import soundfile as sf +import torch +import torchaudio +from omegaconf import DictConfig, OmegaConf + +from .base import BackendConfig, GenerationRequest, GenerationResult +from .magpie_tts_backend import MagpieTTSBackend, MagpieTTSConfig + +# TTS constants +TTS_SAMPLE_RATE = 22050 +FRAME_SIZE_SEC = 0.08 # 80ms per frame +DEFAULT_CODEC_TOKEN_HISTORY_SIZE = 60 +SILENCE_THRESHOLD = 0.1 # Max magnitude threshold for silence detection +SILENCE_DURATION_SEC = 2.0 # Stop if last N seconds are silent + + +@dataclass +class EarTTSConfig(MagpieTTSConfig): + """Configuration for EAR TTS backend - extends MagpieTTSConfig.""" + + # EAR TTS specific paths + tts_checkpoint_path: Optional[str] = None # Path to TTS checkpoint (safetensors) + speaker_reference: Optional[str] = None # Speaker reference audio for voice cloning + config_path: Optional[str] = None # Optional YAML config path + + # TTS parameters + codec_token_history_size: int = DEFAULT_CODEC_TOKEN_HISTORY_SIZE + guidance_enabled: bool = True + + @classmethod + def from_dict(cls, d: Dict[str, Any]) -> "EarTTSConfig": + known_fields = { + "model_path", + "device", + "dtype", + "max_new_tokens", + "temperature", + "top_p", + "top_k", + "codec_model_path", + "use_cfg", + "cfg_scale", + "max_decoder_steps", + "use_local_transformer", + "output_sample_rate", + "hparams_file", + "checkpoint_file", + "legacy_codebooks", + "legacy_text_conditioning", + "hparams_from_wandb", + # EAR TTS specific + "tts_checkpoint_path", + "speaker_reference", + "config_path", + "codec_token_history_size", + "guidance_enabled", + } + known = {k: v for k, v in d.items() if k in known_fields} + extra = {k: v for k, v in d.items() if k not in known_fields} + return cls(**known, extra_config=extra) + + +class EarTTSBackend(MagpieTTSBackend): + """ + EAR TTS backend using NemotronVoiceChat's TTS model. + + Inherits from MagpieTTSBackend and overrides load_model() and generate() + to use the EAR TTS model instead of MagpieTTS. 
+ """ + + @property + def name(self) -> str: + return "ear_tts" + + def __init__(self, config: BackendConfig): + # Convert to EarTTSConfig + if isinstance(config, EarTTSConfig): + self.ear_config = config + else: + self.ear_config = EarTTSConfig.from_dict( + { + **{ + k: getattr(config, k) + for k in ["model_path", "device", "dtype", "max_new_tokens", "temperature", "top_p", "top_k"] + if hasattr(config, k) + }, + **config.extra_config, + } + ) + + # Call grandparent __init__ to skip MagpieTTSBackend's init + from .base import InferenceBackend + + InferenceBackend.__init__(self, self.ear_config) + self.tts_config = self.ear_config + + self._model = None + self._model_cfg = None + self._tokenizer = None + + # TTS state + self.first_context_subword_id = None + self.generation_config = None + self.first_tts_code_input = None + self.first_tts_past_key_values_input = None + self.target_sample_rate = TTS_SAMPLE_RATE + self.target_fps = None + + def _clone_cache(self, cache): + """Deep clone cache structures.""" + if cache is None: + return None + if isinstance(cache, torch.Tensor): + return cache.detach().clone() + if isinstance(cache, (list, tuple)): + return type(cache)(self._clone_cache(x) for x in cache) + if isinstance(cache, dict): + return {k: self._clone_cache(v) for k, v in cache.items()} + if hasattr(cache, "__dict__"): + import copy + + return copy.deepcopy(cache) + return cache + + def load_model(self) -> None: + """Load the EAR TTS model from NemotronVoiceChat.""" + import sys + + from safetensors.torch import load_file + + print(f"[EarTTS] Loading model from {self.config.model_path}...") + + # Clear cached nemo modules FIRST to force reimport from our paths + nemo_modules = [k for k in sys.modules.keys() if k.startswith("nemo")] + for mod in nemo_modules: + del sys.modules[mod] + print(f"[EarTTS] Cleared {len(nemo_modules)} cached nemo modules") + + # Ensure code path is FIRST in sys.path (for speechlm2 module) + code_path = os.environ.get("UNIFIED_SERVER_CODE_PATH", "") + print(f"[EarTTS] UNIFIED_SERVER_CODE_PATH = '{code_path}'") + if code_path: + paths = [p for p in code_path.split(":") if p] + # Remove existing entries and re-add at front + for path in paths: + while path in sys.path: + sys.path.remove(path) + for path in reversed(paths): + sys.path.insert(0, path) + print(f"[EarTTS] Added to sys.path: {path}") + else: + print("[EarTTS] WARNING: No code path found in env!") + + # Debug: show current path + print(f"[EarTTS] sys.path (first 5): {sys.path[:5]}") + + try: + from nemo.collections.speechlm2.models.nemotron_voicechat import NemotronVoiceChat + from nemo.collections.speechlm2.parts.pretrained import set_model_dict_for_partial_init + except ImportError as e: + raise RuntimeError(f"Failed to import NemotronVoiceChat. 
Error: {e}") + + torch.backends.cudnn.allow_tf32 = True + torch.backends.cuda.matmul.allow_tf32 = True + torch.set_float32_matmul_precision("high") + + # Load config + tts_path = self.ear_config.tts_checkpoint_path or self.config.model_path + config_file = os.path.join(tts_path, "config.json") + print(f"[EarTTS] Loading config: {config_file}") + + with open(config_file, "r") as f: + cfg = DictConfig(json.load(f)) + + # Set speaker reference + speaker_ref = self.ear_config.speaker_reference + if not speaker_ref and self.ear_config.config_path: + yaml_cfg = OmegaConf.load(self.ear_config.config_path) + speaker_ref = yaml_cfg.get("model", {}).get("inference_speaker_reference") + if speaker_ref: + if "model" not in cfg: + cfg.model = {} + cfg.model.inference_speaker_reference = speaker_ref + + self._model_cfg = cfg + + # Disable pretrained model loading + if hasattr(cfg.model, "speech_generation") and hasattr(cfg.model.speech_generation, "model"): + cfg.model.speech_generation.model.pretrained_model = None + if hasattr(cfg.model, "stt") and hasattr(cfg.model.stt, "model"): + cfg.model.stt.model.pretrained_s2s_model = None + + # Initialize and load model + print("[EarTTS] Initializing model structure...") + self._model = NemotronVoiceChat(OmegaConf.to_container(cfg, resolve=True)) + + safetensors_path = os.path.join(tts_path, "model.safetensors") + if os.path.exists(safetensors_path): + print(f"[EarTTS] Loading TTS weights from: {tts_path}") + state_dict = load_file(safetensors_path) + tts_only = {k: v for k, v in state_dict.items() if k.startswith("tts_model.")} + print(f"[EarTTS] Loading {len(tts_only)} TTS parameters") + tts_only = set_model_dict_for_partial_init(tts_only, self._model.state_dict()) + self._model.load_state_dict(tts_only, strict=False) + else: + raise ValueError(f"TTS weights not found at {safetensors_path}") + + self._model.to(self.config.device) + self._model.eval() + self._tokenizer = self._model.stt_model.tokenizer + + if hasattr(self._model, "tts_model"): + self.target_fps = self._model.tts_model.target_fps + self.target_sample_rate = self._model.tts_model.target_sample_rate + print(f"[EarTTS] TTS: fps={self.target_fps}, sample_rate={self.target_sample_rate}") + self._prepare_tts_initial_state() + + self._is_loaded = True + print("[EarTTS] Model loaded successfully") + + def _prepare_tts_initial_state(self): + """Prepare TTS warmup state with speaker reference.""" + from nemo.collections.audio.parts.utils.resampling import resample + from nemo.collections.speechlm2.parts.precision import fp32_precision + + if not hasattr(self._model, "tts_model"): + return + + speaker_ref = self._model_cfg.model.get("inference_speaker_reference") if self._model_cfg else None + if not speaker_ref: + speaker_ref = self.ear_config.speaker_reference + if not speaker_ref: + print("[EarTTS] Warning: No speaker reference") + return + + print(f"[EarTTS] Preparing TTS with speaker: {speaker_ref}") + + with fp32_precision(): + speaker_audio, speaker_sr = torchaudio.load(speaker_ref) + speaker_audio = resample(speaker_audio, speaker_sr, self._model.tts_model.target_sample_rate) + + speaker_audio = speaker_audio.to(self.config.device) + speaker_audio_lens = torch.tensor([speaker_audio.size(1)], device=self.config.device).long() + + self._model.tts_model.set_init_inputs(speaker_audio=speaker_audio, speaker_audio_lens=speaker_audio_lens) + init_inputs = self._model.tts_model.get_init_inputs(B=1) + self.generation_config = self._model.tts_model._get_generation_config( + 
guidance_enabled=self.ear_config.guidance_enabled + ) + init_inputs.update({"use_cache": True, "past_key_values": None, "guidance_enabled": True}) + + with torch.no_grad(): + outputs = self._model.tts_model.tts_model(**init_inputs) + code = init_inputs["code"][:, -1:] + + self.first_context_subword_id = init_inputs["subword_ids"][:, -1].unsqueeze(-1) + self.first_tts_code_input = code.detach().clone() + self.first_tts_past_key_values_input = self._clone_cache(outputs.past_key_values) + print("[EarTTS] TTS warmup state prepared") + + @torch.no_grad() + def _synthesize_text(self, text: str) -> Optional[np.ndarray]: + """Synthesize audio from text using EAR TTS. + + Generates audio frames until the last 2 seconds have max magnitude below threshold, + indicating the model has finished speaking. + """ + from nemo.collections.speechlm2.parts.precision import fp32_precision + + if not text or not self.generation_config: + return None + + device = self.config.device + token_ids = self._tokenizer.text_to_ids(text) + if not token_ids: + return None + + num_tokens = len(token_ids) + token_tensor = torch.tensor(token_ids, dtype=torch.long, device=device).unsqueeze(0) + + # Max frames: generous upper bound (10x tokens should be plenty) + max_frames = num_tokens * 10 + + # Samples needed to check for silence (2 seconds) + sample_rate = self.target_sample_rate or TTS_SAMPLE_RATE + samples_for_silence_check = int(sample_rate * SILENCE_DURATION_SEC) + + # Initialize TTS state + past_key_values = self._clone_cache(self.first_tts_past_key_values_input) + code = self.first_tts_code_input.detach().clone() + codec_history_size = self.ear_config.codec_token_history_size + audio_toks_buffer = ( + self._model.tts_model.codec_silence_tokens.view(1, 1, -1).expand(-1, codec_history_size, -1).to(device) + ) + + audio_segments = [] + samples_per_frame = int(float(sample_rate) * FRAME_SIZE_SEC) + total_samples = 0 + + for frame_idx in range(max_frames): + # Cycle through tokens, repeating the last token after we've used all + token_idx = min(frame_idx, num_tokens - 1) + current_subword_id = token_tensor[:, token_idx].unsqueeze(-1) + + if frame_idx == 0: + prev_subword_id = self.first_context_subword_id + else: + prev_token_idx = min(frame_idx - 1, num_tokens - 1) + prev_subword_id = token_tensor[:, prev_token_idx].unsqueeze(-1) + + code, past_key_values = self._model.tts_model.infer_codes_one_step( + current_subword_id=current_subword_id, + prev_subword_id=prev_subword_id, + current_subword_mask=torch.ones(1, 1, device=device, dtype=torch.bool), + prev_audio_tokens=code, + past_key_values=past_key_values, + guidance_enabled=self.ear_config.guidance_enabled, + generation_config=self.generation_config, + ignore_eos_flag_stop=True, + ) + + audio_toks_buffer = torch.cat([audio_toks_buffer[:, 1:], code], dim=1) + + with fp32_precision(): + decoded_audio, _ = self._model.tts_model.audio_codec.decode( + audio_toks_buffer, torch.tensor([codec_history_size], dtype=torch.long, device=device) + ) + frame_audio = decoded_audio[:, :, -samples_per_frame:] + audio_segments.append(frame_audio) + total_samples += samples_per_frame + + # Check for silence after we have enough samples + if total_samples >= samples_for_silence_check: + # Get last 2 seconds of audio + recent_audio = torch.cat(audio_segments, dim=-1)[:, :, -samples_for_silence_check:] + max_magnitude = recent_audio.abs().max().item() + + if max_magnitude < SILENCE_THRESHOLD: + # Silence detected - stop generating + break + + if audio_segments: + audio_tensor = 
torch.cat(audio_segments, dim=-1) + + # Trim trailing silence + audio_np = audio_tensor.float().cpu().numpy().squeeze() + + # Find where audio becomes silent (from the end) + window_size = int(sample_rate * 0.1) # 100ms window + for trim_point in range(len(audio_np) - window_size, 0, -window_size): + window_max = np.abs(audio_np[trim_point : trim_point + window_size]).max() + if window_max >= SILENCE_THRESHOLD: + # Found non-silent audio, trim after this point + small buffer + audio_np = audio_np[: trim_point + window_size * 2] + break + + max_val = np.abs(audio_np).max() + if max_val > 0: + audio_np = audio_np / max_val * 0.95 + return audio_np + return None + + def generate(self, requests: List[GenerationRequest]) -> List[GenerationResult]: + """Generate audio from text requests.""" + if not self._is_loaded: + return [GenerationResult(error="Model not loaded", request_id=r.request_id) for r in requests] + if not requests: + return [] + + results = [] + for req in requests: + start_time = time.time() + try: + parsed = self._extract_json(req.text) + text = parsed.get("text", "") + + if not text: + results.append(GenerationResult(error="No text provided", request_id=req.request_id)) + continue + + audio_np = self._synthesize_text(text) + if audio_np is None: + results.append(GenerationResult(error="Failed to synthesize audio", request_id=req.request_id)) + continue + + wav_buffer = io.BytesIO() + sf.write(wav_buffer, audio_np, self.target_sample_rate, format="WAV") + elapsed_ms = (time.time() - start_time) * 1000 + audio_duration = len(audio_np) / self.target_sample_rate + + results.append( + GenerationResult( + text=text, + audio_bytes=wav_buffer.getvalue(), + audio_sample_rate=self.target_sample_rate, + audio_format="wav", + request_id=req.request_id, + generation_time_ms=elapsed_ms, + debug_info={ + "audio_duration_sec": audio_duration, + "rtf": elapsed_ms / 1000 / audio_duration if audio_duration > 0 else 0, + }, + ) + ) + except Exception as e: + import traceback + + traceback.print_exc() + results.append(GenerationResult(error=str(e), request_id=req.request_id)) + return results + + def health_check(self) -> Dict[str, Any]: + """Return health status.""" + h = super().health_check() + if self._is_loaded: + h.update( + { + "sample_rate": self.target_sample_rate, + "fps": self.target_fps, + "tts_enabled": self.generation_config is not None, + "speaker_reference": self.ear_config.speaker_reference, + } + ) + return h + + +class EarTTSBatchBackend(EarTTSBackend): + """ + Optimized EAR TTS backend that decodes audio only once at the end. + + This version generates all codes first (token-by-token), then decodes + the entire sequence in one batch operation - significantly faster than + decoding after every token. + """ + + @property + def name(self) -> str: + return "ear_tts_batch" + + @torch.no_grad() + def _synthesize_text(self, text: str) -> Optional[np.ndarray]: + """Synthesize audio from text - optimized batch decoding version. + + Generates 10x tokens worth of frames, decodes all at once, then trims trailing silence. 
+ """ + from nemo.collections.speechlm2.parts.precision import fp32_precision + + if not text or not self.generation_config: + return None + + device = self.config.device + token_ids = self._tokenizer.text_to_ids(text) + if not token_ids: + return None + + num_tokens = len(token_ids) + max_frames = num_tokens * 10 # Generate more frames than tokens + token_tensor = torch.tensor(token_ids, dtype=torch.long, device=device).unsqueeze(0) + + # Initialize TTS state + past_key_values = self._clone_cache(self.first_tts_past_key_values_input) + code = self.first_tts_code_input.detach().clone() + + # Generate ALL codes first (no decoding in the loop) + all_codes = [] + + for frame_idx in range(max_frames): + # Cycle through tokens, repeating the last token after we've used all + token_idx = min(frame_idx, num_tokens - 1) + current_subword_id = token_tensor[:, token_idx].unsqueeze(-1) + + if frame_idx == 0: + prev_subword_id = self.first_context_subword_id + else: + prev_token_idx = min(frame_idx - 1, num_tokens - 1) + prev_subword_id = token_tensor[:, prev_token_idx].unsqueeze(-1) + + code, past_key_values = self._model.tts_model.infer_codes_one_step( + current_subword_id=current_subword_id, + prev_subword_id=prev_subword_id, + current_subword_mask=torch.ones(1, 1, device=device, dtype=torch.bool), + prev_audio_tokens=code, + past_key_values=past_key_values, + guidance_enabled=self.ear_config.guidance_enabled, + generation_config=self.generation_config, + ignore_eos_flag_stop=True, + ) + all_codes.append(code) + + # Decode ALL codes at once at the end + if all_codes: + all_codes_tensor = torch.cat(all_codes, dim=1) # [B, max_frames, codebook_dim] + len_codes = torch.tensor([max_frames], dtype=torch.long, device=device) + + with fp32_precision(): + decoded_audio, _ = self._model.tts_model.audio_codec.decode(all_codes_tensor, len_codes) + + audio_np = decoded_audio.float().cpu().numpy().squeeze() + + # Trim trailing silence + sample_rate = self.target_sample_rate or TTS_SAMPLE_RATE + window_size = int(sample_rate * 0.1) # 100ms window + for trim_point in range(len(audio_np) - window_size, 0, -window_size): + window_max = np.abs(audio_np[trim_point : trim_point + window_size]).max() + if window_max >= SILENCE_THRESHOLD: + # Found non-silent audio, trim after this point + small buffer + audio_np = audio_np[: trim_point + window_size * 2] + break + + max_val = np.abs(audio_np).max() + if max_val > 0: + audio_np = audio_np / max_val * 0.95 + return audio_np + + return None From d318068c90a8ecac5e8b1a04b966a0591eeab618 Mon Sep 17 00:00:00 2001 From: Valentin Mendelev Date: Fri, 9 Jan 2026 14:38:46 -0800 Subject: [PATCH 14/26] EAR TTS config --- .../scripts/config/ear_tts_hard_digits.yaml | 42 +++++++++++++++++++ 1 file changed, 42 insertions(+) create mode 100644 nemo_skills/dataset/nv_tts/scripts/config/ear_tts_hard_digits.yaml diff --git a/nemo_skills/dataset/nv_tts/scripts/config/ear_tts_hard_digits.yaml b/nemo_skills/dataset/nv_tts/scripts/config/ear_tts_hard_digits.yaml new file mode 100644 index 0000000000..e8e5eff9e0 --- /dev/null +++ b/nemo_skills/dataset/nv_tts/scripts/config/ear_tts_hard_digits.yaml @@ -0,0 +1,42 @@ +# TTS Pipeline Configuration - EAR TTS with Hard Digits + +# Cluster and execution settings +cluster: eos +partition: batch +mount_paths: /lustre:/lustre +output_dir: /lustre/fsw/llmservice_nemo_speechlm/users/vmendelev/tmp/nv_tts_eval_ear_tts_hard_digits_a2 + +# === COPY THESE FROM OCI TO EOS === +# Container (copy from: 
/lustre/fsw/portfolios/convai/users/ecasanova/docker_images/nemo_duplex_november_eartts.sqsh) +container: /lustre/fsw/llmservice_nemo_speechlm/users/vmendelev/containters/nemo_duplex_november_eartts.sqsh + +# NeMo code paths (separate for generation and scoring) +generation_code_path: /lustre/fsw/llmservice_nemo_speechlm/users/vmendelev/code/NeMo-release_not_rebased +scoring_code_path: /lustre/fsw/llmservice_nemo_speechlm/users/vmendelev/experimenta/tts_eval/NeMo + +# Generation settings +generation: + benchmarks: nv_tts.riva_hard_digits + # Model checkpoint (copy from: /lustre/fsw/portfolios/convai/users/ecasanova/Checkpoints/Nemotron-VoiceChat-november/duplex-eartts-2mim_sw_et_eos_dp_eos_dup_fp32-stt-3-december_stt_edresson_model_R_digits_norm_eip_0.1_EA_model_step_9005) + model: /lustre/fsw/llmservice_nemo_speechlm/users/vmendelev/models/ear_tts/duplex-eartts-2mim_sw_et_eos_dp_eos_dup_fp32-stt-3-december_stt_edresson_model_R_digits_norm_eip_0.1_EA_model_step_9005 + server_type: generic + server_gpus: 1 + server_entrypoint: python -m nemo_skills.inference.server.serve_unified + server_args: >- + --backend ear_tts + --speaker_reference /lustre/fsw/llmservice_nemo_speechlm/users/vmendelev/models/ear_tts/Emma_S3_A1_SC7_singleturntarget_21_channel_1_audio_in.wav + --config_path /lustre/fsw/llmservice_nemo_speechlm/users/vmendelev/models/ear_tts/nanov2_demo_model_eartts_updated.yaml + --batch_size 1 + --batch_timeout 0.1 + data_dir: /lustre/fsw/llmservice_nemo_speechlm/users/vmendelev/tmp/data_dir + num_chunks: 8 + gpus_per_node: 1 + extra_args: ++server.server_type=vllm_multimodal + +# Scoring settings +scoring: + sv_model: titanet + asr_model_name: nvidia/parakeet-tdt-1.1b + language: en + with_utmosv2: true + gpus: 1 From 18981f94d982e5443b1e1cc8d4c8e2d63105475e Mon Sep 17 00:00:00 2001 From: Valentin Mendelev Date: Fri, 30 Jan 2026 12:00:00 -0800 Subject: [PATCH 15/26] Fix MagpieTTS backend when no context audio is provided Create a small dummy context wav for requests without context_audio_filepath to prevent dataloader failures (missing d*.wav) and 500s from the unified server. --- recipes/multimodal/server/backends/magpie_tts_backend.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/recipes/multimodal/server/backends/magpie_tts_backend.py b/recipes/multimodal/server/backends/magpie_tts_backend.py index e11187f71a..9f1f052e27 100644 --- a/recipes/multimodal/server/backends/magpie_tts_backend.py +++ b/recipes/multimodal/server/backends/magpie_tts_backend.py @@ -183,6 +183,15 @@ def generate(self, requests: List[GenerationRequest]) -> List[GenerationResult]: os.symlink(ctx, link_path) else: link_name = f"d{i}.wav" + # Magpie inference expects a readable "context" wav for every manifest entry. + # When callers don't provide one (common for pure TTS-from-text prompts), + # create a tiny dummy wav so the dataloader doesn't crash. + link_path = os.path.join(audio_dir, link_name) + if not os.path.exists(link_path): + sr = int(getattr(self.tts_config, "output_sample_rate", 22050) or 22050) + dur_s = 0.1 + n = max(1, int(sr * dur_s)) + sf.write(link_path, [0.0] * n, sr) f.write( json.dumps( { From 8b9c22f546a0fca23db4e2dd734e32f29ad26ec7 Mon Sep 17 00:00:00 2001 From: Valentin Mendelev Date: Fri, 30 Jan 2026 13:51:50 -0800 Subject: [PATCH 16/26] Reset MagpieTTS decoder cache per request batch Avoid KV-cache shape mismatches when batch sizes vary between requests in the unified server. 
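For illustration only (not part of this change): a minimal, self-contained sketch of
the reset-at-batch-boundary pattern. StatefulDecoder below is a hypothetical stand-in,
not the NeMo decoder API; only the reset_cache(use_cache=...) call mirrors the code in
the diff.

    import torch

    class StatefulDecoder:
        """Hypothetical stand-in for a decoder with an internal KV cache."""

        def __init__(self):
            self.use_cache = True
            self.self_k = None  # cached keys, shape [batch, steps, dim]

        def reset_cache(self, use_cache: bool = True):
            # Drop state accumulated for the previous request batch; `use_cache`
            # only controls whether the next decode stores new entries.
            self.use_cache = use_cache
            self.self_k = None

        def step(self, x):
            # Concatenating onto a cache built for a different batch size is the
            # kind of shape mismatch this commit avoids.
            if not self.use_cache or self.self_k is None:
                self.self_k = x
            else:
                self.self_k = torch.cat([self.self_k, x], dim=1)
            return self.self_k

    decoder = StatefulDecoder()
    decoder.step(torch.zeros(4, 1, 8))   # first batch: 4 requests
    decoder.reset_cache(use_cache=True)  # without this, the next call fails
    decoder.step(torch.zeros(2, 1, 8))   # next batch: 2 requests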
--- .../server/backends/magpie_tts_backend.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/recipes/multimodal/server/backends/magpie_tts_backend.py b/recipes/multimodal/server/backends/magpie_tts_backend.py index 9f1f052e27..3ed09b02e6 100644 --- a/recipes/multimodal/server/backends/magpie_tts_backend.py +++ b/recipes/multimodal/server/backends/magpie_tts_backend.py @@ -164,6 +164,21 @@ def generate(self, requests: List[GenerationRequest]) -> List[GenerationResult]: os.makedirs(output_dir, exist_ok=True) try: + # MagpieTTS uses KV caching internally during decoding. When the unified server + # batches requests, consecutive calls to this backend can have different batch + # sizes, and stale KV caches can trigger shape mismatches (e.g. cat on self_k). + # Reset caches at the start of each request batch to avoid cross-request leakage. + try: + if self._model is not None: + decoder = getattr(self._model, "decoder", None) + if decoder is not None and hasattr(decoder, "reset_cache"): + # Keep caching enabled for decoding speed, but start from a clean cache. + decoder.reset_cache(use_cache=True) + except Exception: + # Best-effort: if cache reset fails for any reason, continue and let the + # underlying stack surface the real error. + pass + # Parse requests, extracting JSON from text (skips non-JSON prefixes) parsed = [self._extract_json(r.text) for r in requests] From dfb522f027d6931ef6987784972b7c3ae093e842 Mon Sep 17 00:00:00 2001 From: Valentin Mendelev Date: Fri, 30 Jan 2026 14:01:15 -0800 Subject: [PATCH 17/26] Cache HF resolve URL loads in MagpieTTS backend Route HuggingFace resolve URLs used by NeMo audio codec checkpoints through huggingface_hub download so multi-rank server startup avoids repeated downloads and 429s. --- .../server/backends/magpie_tts_backend.py | 42 +++++++++++++++++++ 1 file changed, 42 insertions(+) diff --git a/recipes/multimodal/server/backends/magpie_tts_backend.py b/recipes/multimodal/server/backends/magpie_tts_backend.py index 3ed09b02e6..9ac2faabe5 100644 --- a/recipes/multimodal/server/backends/magpie_tts_backend.py +++ b/recipes/multimodal/server/backends/magpie_tts_backend.py @@ -93,6 +93,48 @@ def __init__(self, config: BackendConfig): self._model = self._runner = self._temp_dir = self._checkpoint_name = None def load_model(self) -> None: + # Some dependencies inside NeMo's audio codec model load weights via raw + # HuggingFace "resolve" URLs using fsspec's HTTP filesystem, which does + # not cache and can easily hit 429s when many ranks start concurrently. + # + # Patch NeMo's `load_fsspec()` at runtime to route HuggingFace resolve + # URLs through `huggingface_hub.hf_hub_download()` (uses file locks and + # local caching under HF_HOME/HF_HUB_CACHE), then load from the local path. 
+ try: # best-effort; do not fail server if patching is unavailable + import os + import re + + import nemo.collections.tts.modules.audio_codec_modules as _acm # type: ignore + + _orig_load_fsspec = getattr(_acm, "load_fsspec", None) + if callable(_orig_load_fsspec) and not getattr(_acm, "_hf_load_fsspec_patched", False): + try: + from huggingface_hub import hf_hub_download # type: ignore + + def _hf_resolve_to_local(url: str) -> str | None: + m = re.match(r"^https?://huggingface\\.co/([^/]+)/([^/]+)/resolve/([^/]+)/(.+)$", url) + if not m: + return None + repo_id = f"{m.group(1)}/{m.group(2)}" + revision = m.group(3) + filename = m.group(4) + token = os.environ.get("HF_TOKEN") or None + return hf_hub_download(repo_id=repo_id, filename=filename, revision=revision, token=token) + + def _load_fsspec_patched(path: str, map_location: str = None, **kwargs): + if isinstance(path, str) and path.startswith("http"): + local = _hf_resolve_to_local(path) + if local: + return _orig_load_fsspec(local, map_location=map_location, **kwargs) + return _orig_load_fsspec(path, map_location=map_location, **kwargs) + + _acm.load_fsspec = _load_fsspec_patched # type: ignore[assignment] + _acm._hf_load_fsspec_patched = True # type: ignore[attr-defined] + except Exception: + pass + except Exception: + pass + from nemo.collections.tts.modules.magpietts_inference.inference import InferenceConfig, MagpieInferenceRunner from nemo.collections.tts.modules.magpietts_inference.utils import ModelLoadConfig, load_magpie_model From 80a2d7c001172b245defacc310d6865ae9cc1f62 Mon Sep 17 00:00:00 2001 From: Valentin Mendelev Date: Fri, 30 Jan 2026 14:24:46 -0800 Subject: [PATCH 18/26] Disable MagpieTTS KV cache to avoid shape mismatches Longform decoding with the transformer cache path can produce sequence-length mismatches; disable cache per request batch to prevent 500s in serve_unified. --- recipes/multimodal/server/backends/magpie_tts_backend.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/recipes/multimodal/server/backends/magpie_tts_backend.py b/recipes/multimodal/server/backends/magpie_tts_backend.py index 9ac2faabe5..b794de0a3f 100644 --- a/recipes/multimodal/server/backends/magpie_tts_backend.py +++ b/recipes/multimodal/server/backends/magpie_tts_backend.py @@ -214,8 +214,10 @@ def generate(self, requests: List[GenerationRequest]) -> List[GenerationResult]: if self._model is not None: decoder = getattr(self._model, "decoder", None) if decoder is not None and hasattr(decoder, "reset_cache"): - # Keep caching enabled for decoding speed, but start from a clean cache. - decoder.reset_cache(use_cache=True) + # Disable KV caching: NeMo's transformer cache path can produce + # sequence-length mismatches under longform decoding when the + # server batches/streams requests. + decoder.reset_cache(use_cache=False) except Exception: # Best-effort: if cache reset fails for any reason, continue and let the # underlying stack surface the real error. From 9fe27034283854f14f6a6909defd94486fa622eb Mon Sep 17 00:00:00 2001 From: Valentin Mendelev Date: Fri, 30 Jan 2026 15:11:32 -0800 Subject: [PATCH 19/26] Fix HF resolve URL caching in MagpieTTS backend Correct HuggingFace resolve URL matching so downloads go through hf_hub_download() and avoid multi-rank 429s. 
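For reference, a standalone sketch of what the corrected matching does. The URL below
is a made-up placeholder (some-org/some-codec); hf_hub_download() is the real
huggingface_hub API the backend now routes through.

    import os
    import re

    from huggingface_hub import hf_hub_download

    # Placeholder URL for illustration; real callers pass NeMo codec checkpoint URLs.
    url = "https://huggingface.co/some-org/some-codec/resolve/main/model.ckpt?download=true"

    # Strip query parameters, then split the resolve URL into repo / revision / file.
    url_no_q = url.split("?", 1)[0]
    m = re.match(r"^https?://huggingface\.co/([^/]+)/([^/]+)/resolve/([^/]+)/(.+)$", url_no_q)
    if m:
        local_path = hf_hub_download(
            repo_id=f"{m.group(1)}/{m.group(2)}",   # "some-org/some-codec"
            filename=m.group(4),                    # "model.ckpt"
            revision=m.group(3),                    # "main"
            token=os.environ.get("HF_TOKEN") or None,
        )
        # local_path now points into the shared HF cache, so concurrent ranks
        # reuse one download instead of each hitting huggingface.co directly.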
--- recipes/multimodal/server/backends/magpie_tts_backend.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/recipes/multimodal/server/backends/magpie_tts_backend.py b/recipes/multimodal/server/backends/magpie_tts_backend.py index b794de0a3f..4c2eee7477 100644 --- a/recipes/multimodal/server/backends/magpie_tts_backend.py +++ b/recipes/multimodal/server/backends/magpie_tts_backend.py @@ -112,7 +112,12 @@ def load_model(self) -> None: from huggingface_hub import hf_hub_download # type: ignore def _hf_resolve_to_local(url: str) -> str | None: - m = re.match(r"^https?://huggingface\\.co/([^/]+)/([^/]+)/resolve/([^/]+)/(.+)$", url) + # Some NeMo deps pass raw HF "resolve" URLs via fsspec http. + # Route them through HF Hub caching + file locks to avoid 429s. + if not isinstance(url, str): + return None + url_no_q = url.split("?", 1)[0] + m = re.match(r"^https?://huggingface\.co/([^/]+)/([^/]+)/resolve/([^/]+)/(.+)$", url_no_q) if not m: return None repo_id = f"{m.group(1)}/{m.group(2)}" From 136af12ec4b11030a8290f4a5133cc7edace3ee6 Mon Sep 17 00:00:00 2001 From: Valentin Mendelev Date: Fri, 30 Jan 2026 15:39:46 -0800 Subject: [PATCH 20/26] Avoid killing multi-instance tasks via srun --wait Stop setting srun --wait by default; allow opt-in via cluster_config.srun_wait_seconds. --- nemo_skills/pipeline/utils/exp.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/nemo_skills/pipeline/utils/exp.py b/nemo_skills/pipeline/utils/exp.py index 3eed966b33..c8729f89b3 100644 --- a/nemo_skills/pipeline/utils/exp.py +++ b/nemo_skills/pipeline/utils/exp.py @@ -312,13 +312,23 @@ def get_executor( srun_args = [ "--no-container-mount-home", "--mpi=pmix", - "--wait=240", # wait up to 4 minutes for slower tasks to complete (important for multi-instance mode) # we need to be explicit about this in srun as commands might need to run in parallel f"--ntasks-per-node={tasks_per_node}", f"--nodes={num_nodes}", # NeMo-run should take care of this, but we'll put it here temporarily f"--container-env={','.join([k.strip() for k in env_vars.keys()])}", ] + # IMPORTANT: + # Slurm's `srun --wait=` terminates the job step if other tasks are still + # running seconds after the first task exits. For multi-instance runs + # (e.g., chunked evaluation), task runtimes can differ widely, and a low wait + # will kill long-running tasks (observed with `--wait=240`). + # + # If you need this behavior, configure it explicitly in the cluster config: + # srun_wait_seconds: + srun_wait_seconds = cluster_config.get("srun_wait_seconds") + if srun_wait_seconds is not None: + srun_args.append(f"--wait={int(srun_wait_seconds)}") if overlap: srun_args.append("--overlap") if not cluster_config.get("disable_gpus_per_node", False) and gpus_per_node is not None: From 8f6d68ff9164c16d6928f963445eedc3c03e2822 Mon Sep 17 00:00:00 2001 From: Valentin Mendelev Date: Fri, 30 Jan 2026 15:53:19 -0800 Subject: [PATCH 21/26] Override srun wait for multi-instance jobs Add a large srun --wait for multi-instance runs to override nemo_run's default --wait=60, preventing premature termination when some ranks finish earlier. 
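For clarity, the selection logic condensed into a standalone sketch
(resolve_srun_wait is illustrative naming; the behaviour matches the change below):

    def resolve_srun_wait(cluster_config: dict, tasks_per_node: int) -> int | None:
        """Return the value for srun --wait, or None to not append the flag."""
        # An explicit cluster-config setting always wins.
        wait = cluster_config.get("srun_wait_seconds")
        if wait is not None:
            return int(wait)
        # Multi-instance runs get a large default so ranks that are still working
        # are not killed shortly after the first rank exits.
        if tasks_per_node > 1:
            return 24 * 60 * 60
        # Single-task runs: don't append an extra --wait here.
        return None

    assert resolve_srun_wait({}, tasks_per_node=1) is None
    assert resolve_srun_wait({}, tasks_per_node=8) == 24 * 60 * 60
    assert resolve_srun_wait({"srun_wait_seconds": 300}, tasks_per_node=8) == 300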
--- nemo_skills/pipeline/utils/exp.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/nemo_skills/pipeline/utils/exp.py b/nemo_skills/pipeline/utils/exp.py index c8729f89b3..7d1dc31487 100644 --- a/nemo_skills/pipeline/utils/exp.py +++ b/nemo_skills/pipeline/utils/exp.py @@ -320,13 +320,20 @@ def get_executor( ] # IMPORTANT: # Slurm's `srun --wait=` terminates the job step if other tasks are still - # running seconds after the first task exits. For multi-instance runs - # (e.g., chunked evaluation), task runtimes can differ widely, and a low wait - # will kill long-running tasks (observed with `--wait=240`). + # running seconds after the first task exits. # - # If you need this behavior, configure it explicitly in the cluster config: + # `nemo_run` adds `--wait=60` by default; for multi-instance runs (e.g., chunked + # evaluation) tasks can finish at very different times (some may exit quickly + # due to `++skip_filled=True`), which causes Slurm to kill still-running tasks. + # + # We override this with a large wait by default for multi-instance mode. + # You can customize via cluster config: # srun_wait_seconds: srun_wait_seconds = cluster_config.get("srun_wait_seconds") + if srun_wait_seconds is None and tasks_per_node > 1: + # Use a very large wait (1 day) so long-running ranks aren't killed just + # because other ranks finished earlier. + srun_wait_seconds = 24 * 60 * 60 if srun_wait_seconds is not None: srun_args.append(f"--wait={int(srun_wait_seconds)}") if overlap: From c23805d0e531119d295115787f2a74d4a29bcb39 Mon Sep 17 00:00:00 2001 From: Valentin Mendelev Date: Fri, 30 Jan 2026 16:10:34 -0800 Subject: [PATCH 22/26] Reduce MagpieTTS inference batch size Lower Magpie inference runner batch size to reduce memory/latency spikes under multi-instance load. --- recipes/multimodal/server/backends/magpie_tts_backend.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/recipes/multimodal/server/backends/magpie_tts_backend.py b/recipes/multimodal/server/backends/magpie_tts_backend.py index 4c2eee7477..0e42b752b2 100644 --- a/recipes/multimodal/server/backends/magpie_tts_backend.py +++ b/recipes/multimodal/server/backends/magpie_tts_backend.py @@ -175,7 +175,7 @@ def _load_fsspec_patched(path: str, map_location: str = None, **kwargs): use_cfg=self.tts_config.use_cfg, cfg_scale=self.tts_config.cfg_scale, use_local_transformer=self.tts_config.use_local_transformer, - batch_size=32, + batch_size=16, ), ) From c4824123816b4daf64916a6281715c1096668138 Mon Sep 17 00:00:00 2001 From: Valentin Mendelev Date: Fri, 30 Jan 2026 16:35:08 -0800 Subject: [PATCH 23/26] Set multi-instance srun wait to 1 hour Use a 1-hour default srun --wait for multi-instance runs to avoid premature task termination when chunk runtimes differ. --- nemo_skills/pipeline/utils/exp.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/nemo_skills/pipeline/utils/exp.py b/nemo_skills/pipeline/utils/exp.py index 7d1dc31487..6166ec3763 100644 --- a/nemo_skills/pipeline/utils/exp.py +++ b/nemo_skills/pipeline/utils/exp.py @@ -331,9 +331,9 @@ def get_executor( # srun_wait_seconds: srun_wait_seconds = cluster_config.get("srun_wait_seconds") if srun_wait_seconds is None and tasks_per_node > 1: - # Use a very large wait (1 day) so long-running ranks aren't killed just + # Use a reasonably large wait (1 hour) so long-running ranks aren't killed just # because other ranks finished earlier. 
- srun_wait_seconds = 24 * 60 * 60 + srun_wait_seconds = 60 * 60 if srun_wait_seconds is not None: srun_args.append(f"--wait={int(srun_wait_seconds)}") if overlap: From 5d104d344fb0b3ea85cceac1ba07ed4c3629b75b Mon Sep 17 00:00:00 2001 From: Valentin Mendelev Date: Sat, 31 Jan 2026 14:23:06 -0800 Subject: [PATCH 24/26] Add emergent_tts dataset + eval scripts Introduce the emergent_tts dataset package with prepare/generate/score helpers and default configs to run EmergentTTS evaluation via NeMo-Skills. Co-authored-by: Cursor --- nemo_skills/dataset/emergent_tts/__init__.py | 6 + .../dataset/emergent_tts/emergent/__init__.py | 3 + nemo_skills/dataset/emergent_tts/prepare.py | 238 +++++++++++++++++ .../dataset/emergent_tts/scripts/__init__.py | 2 + .../emergent_tts/scripts/check_deps.py | 95 +++++++ .../emergent_tts/scripts/config/default.yaml | 61 +++++ .../scripts/config/interactive_10.yaml | 22 ++ .../scripts/config/local_interactive_10.yaml | 30 +++ .../config/local_interactive_10_base.yaml | 26 ++ .../scripts/convert_ns_outputs_to_emergent.py | 92 +++++++ .../emergent_tts/scripts/run_tts_eval.py | 168 ++++++++++++ .../dataset/emergent_tts/scripts/score.py | 252 ++++++++++++++++++ 12 files changed, 995 insertions(+) create mode 100644 nemo_skills/dataset/emergent_tts/__init__.py create mode 100644 nemo_skills/dataset/emergent_tts/emergent/__init__.py create mode 100644 nemo_skills/dataset/emergent_tts/prepare.py create mode 100644 nemo_skills/dataset/emergent_tts/scripts/__init__.py create mode 100644 nemo_skills/dataset/emergent_tts/scripts/check_deps.py create mode 100644 nemo_skills/dataset/emergent_tts/scripts/config/default.yaml create mode 100644 nemo_skills/dataset/emergent_tts/scripts/config/interactive_10.yaml create mode 100644 nemo_skills/dataset/emergent_tts/scripts/config/local_interactive_10.yaml create mode 100644 nemo_skills/dataset/emergent_tts/scripts/config/local_interactive_10_base.yaml create mode 100644 nemo_skills/dataset/emergent_tts/scripts/convert_ns_outputs_to_emergent.py create mode 100644 nemo_skills/dataset/emergent_tts/scripts/run_tts_eval.py create mode 100644 nemo_skills/dataset/emergent_tts/scripts/score.py diff --git a/nemo_skills/dataset/emergent_tts/__init__.py b/nemo_skills/dataset/emergent_tts/__init__.py new file mode 100644 index 0000000000..c95f451485 --- /dev/null +++ b/nemo_skills/dataset/emergent_tts/__init__.py @@ -0,0 +1,6 @@ +"""EmergentTTS-Eval dataset integration for NeMo-Skills. + +This package contains tooling to prepare the EmergentTTS-Eval benchmark for +NeMo-Skills evaluation runs. +""" + diff --git a/nemo_skills/dataset/emergent_tts/emergent/__init__.py b/nemo_skills/dataset/emergent_tts/emergent/__init__.py new file mode 100644 index 0000000000..13edac0edf --- /dev/null +++ b/nemo_skills/dataset/emergent_tts/emergent/__init__.py @@ -0,0 +1,3 @@ +# EmergentTTS-Eval benchmark (NeMo-Skills) + +GENERATION_ARGS = "++prompt_format=openai" diff --git a/nemo_skills/dataset/emergent_tts/prepare.py b/nemo_skills/dataset/emergent_tts/prepare.py new file mode 100644 index 0000000000..f3616cc2d8 --- /dev/null +++ b/nemo_skills/dataset/emergent_tts/prepare.py @@ -0,0 +1,238 @@ +#!/usr/bin/env python3 +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Prepare EmergentTTS-Eval benchmark for NeMo-Skills. + +This script: +1) Downloads the EmergentTTS-Eval HF dataset +2) Saves baseline audios to wav files +3) Writes `data/emergent_tts_eval_data.jsonl` in Emergent's expected schema +4) Downloads `data/wv_mos.ckpt` +5) Writes NeMo-Skills `test.jsonl` for generation (OpenAI prompt format) + +Typical usage (to create everything under your shared NeMo-Skills data dir): + python prepare.py --output_dir /lustre/.../data_dir/emergent_tts +""" + +from __future__ import annotations + +import argparse +import json +import os +import time +import urllib.request +from urllib.error import ContentTooShortError +from pathlib import Path + + +SYSTEM_MESSAGE = "You are a helpful assistant." +DEFAULT_DATASET = "bosonai/EmergentTTS-Eval" +DEFAULT_SPLIT = "train" +WV_MOS_URL = "https://zenodo.org/record/6201162/files/wav2vec2.ckpt?download=1" + + +def _require_deps(): + try: + import numpy as np # noqa: F401 + from datasets import load_dataset # noqa: F401 + import librosa # noqa: F401 + import soundfile # noqa: F401 + from pydub import AudioSegment # noqa: F401 + from tqdm import tqdm # noqa: F401 + except Exception as e: # pragma: no cover + raise RuntimeError( + "Missing dependencies for EmergentTTS-Eval preparation.\n\n" + "Install into the repo venv:\n" + " cd /home/vmendelev/workspace/expressiveness/src/nemo-skills-tts-eval\n" + " . ./.venv/bin/activate\n" + " pip install datasets numpy pydub tqdm librosa soundfile\n" + ) from e + + +def _download_wv_mos(dst_path: Path, overwrite: bool) -> None: + dst_path.parent.mkdir(parents=True, exist_ok=True) + if dst_path.exists() and not overwrite: + return + tmp_path = dst_path.with_suffix(dst_path.suffix + ".tmp") + + # Zenodo downloads can occasionally fail with ContentTooShortError; retry. + max_attempts = 5 + for attempt in range(1, max_attempts + 1): + if tmp_path.exists(): + tmp_path.unlink() + try: + urllib.request.urlretrieve(WV_MOS_URL, str(tmp_path)) + tmp_path.replace(dst_path) + return + except ContentTooShortError as e: + # Partial download: wait and retry. + wait_s = min(5 * attempt, 30) + print(f"Warning: partial download for wv_mos.ckpt (attempt {attempt}/{max_attempts}): {e}") + time.sleep(wait_s) + except Exception as e: + wait_s = min(5 * attempt, 30) + print(f"Warning: failed downloading wv_mos.ckpt (attempt {attempt}/{max_attempts}): {e}") + time.sleep(wait_s) + + raise RuntimeError(f"Failed to download wv_mos.ckpt after {max_attempts} attempts: {WV_MOS_URL}") + + +def _write_benchmark_init(bench_dir: Path) -> None: + bench_dir.mkdir(parents=True, exist_ok=True) + init_path = bench_dir / "__init__.py" + init_path.write_text( + ( + "# EmergentTTS-Eval benchmark (NeMo-Skills)\n\n" + 'GENERATION_ARGS = "++prompt_format=openai"\n' + ), + encoding="utf-8", + ) + + +def _to_nemo_skills_entry(sample: dict) -> dict: + # MagpieTTS backend expects JSON with at least `text`. We also keep Emergent + # metadata to enable deterministic conversion/scoring later. 
+ payload = { + "text": sample["text_to_synthesize"], + "text_to_synthesize": sample["text_to_synthesize"], + "category": sample["category"], + "evolution_depth": sample["evolution_depth"], + "language": sample["language"], + "unique_id_eval": sample["unique_id_eval"], + # Optional fields used by MagpieTTS evaluation code paths. + "context_audio_filepath": "", + "duration": 5.0, + "context_audio_duration": 5.0, + } + return { + "problem": "", + "messages": [ + {"role": "system", "content": SYSTEM_MESSAGE}, + {"role": "user", "content": json.dumps(payload, ensure_ascii=False)}, + ], + } + + +def main(): + _require_deps() + import numpy as np + from datasets import load_dataset + from pydub import AudioSegment + from tqdm import tqdm + + parser = argparse.ArgumentParser(description="Prepare EmergentTTS-Eval for NeMo-Skills") + parser.add_argument( + "--output_dir", + type=str, + default=str(Path(__file__).parent), + help="Where to create emergent_tts module structure (default: folder containing this script).", + ) + parser.add_argument("--dataset", type=str, default=DEFAULT_DATASET, help="HF dataset name") + parser.add_argument("--split", type=str, default=DEFAULT_SPLIT, help="HF split to download (train contains 1645)") + parser.add_argument( + "--overwrite", + action="store_true", + help="Overwrite existing files (baseline audios, jsonl, wv_mos.ckpt, test.jsonl).", + ) + parser.add_argument( + "--num_samples", + type=int, + default=None, + help="Optional: limit number of samples (debug). If set, takes the first N rows.", + ) + args = parser.parse_args() + + output_dir = Path(args.output_dir).resolve() + data_dir = output_dir / "data" + baseline_audios_dir = data_dir / "baseline_audios" + baseline_audios_dir.mkdir(parents=True, exist_ok=True) + + # Emergent expected files + emergent_jsonl_path = data_dir / "emergent_tts_eval_data.jsonl" + wv_mos_path = data_dir / "wv_mos.ckpt" + + # NeMo-Skills benchmark module structure + bench_dir = output_dir / "emergent" + test_jsonl_path = bench_dir / "test.jsonl" + _write_benchmark_init(bench_dir) + + # Download dataset + dataset_hf = load_dataset(args.dataset, split=args.split) + total = len(dataset_hf) if args.num_samples is None else min(args.num_samples, len(dataset_hf)) + + if emergent_jsonl_path.exists() and test_jsonl_path.exists() and not args.overwrite: + print(f"Found existing outputs under {output_dir}. 
Use --overwrite to rebuild.") + else: + if args.overwrite: + for p in [emergent_jsonl_path, test_jsonl_path]: + if p.exists(): + p.unlink() + + emergent_records: list[dict] = [] + + # Build emergent jsonl + baseline audios + for i in tqdm(range(total), desc="Preparing EmergentTTS-Eval"): + curr = dataset_hf[i] + unique_id = i + + # Save baseline audio + wav_path = baseline_audios_dir / f"{unique_id}.wav" + if args.overwrite or not wav_path.exists(): + audio_array = curr["audio"]["array"] + audio_sr = int(curr["audio"]["sampling_rate"]) + audio_array_int16 = np.int16(audio_array * 32767) + audio_segment = AudioSegment( + audio_array_int16.tobytes(), + frame_rate=audio_sr, + sample_width=2, + channels=1, + ) + audio_segment.export(str(wav_path), format="wav") + + emergent_records.append( + { + "unique_id_eval": unique_id, + "category": curr["category"], + "text_to_synthesize": curr["text_to_synthesize"], + "evolution_depth": curr["evolution_depth"], + "language": curr["language"], + } + ) + + # Write emergent jsonl data file + emergent_jsonl_path.parent.mkdir(parents=True, exist_ok=True) + with open(emergent_jsonl_path, "w", encoding="utf-8") as f: + for rec in emergent_records: + f.write(json.dumps(rec, ensure_ascii=False) + "\n") + + # Write NeMo-Skills test.jsonl + with open(test_jsonl_path, "w", encoding="utf-8") as f: + for rec in emergent_records: + f.write(json.dumps(_to_nemo_skills_entry(rec), ensure_ascii=False) + "\n") + + # Download MOS model checkpoint (used by Emergent scoring) + _download_wv_mos(wv_mos_path, overwrite=args.overwrite) + + print("\nPrepared EmergentTTS-Eval:") + print(f" - data dir: {data_dir}") + print(f" - baseline audios: {baseline_audios_dir}") + print(f" - emergent jsonl: {emergent_jsonl_path}") + print(f" - wv_mos.ckpt: {wv_mos_path}") + print(f" - nemo-skills test.jsonl: {test_jsonl_path}") + + +if __name__ == "__main__": + main() + diff --git a/nemo_skills/dataset/emergent_tts/scripts/__init__.py b/nemo_skills/dataset/emergent_tts/scripts/__init__.py new file mode 100644 index 0000000000..b1989f6c3b --- /dev/null +++ b/nemo_skills/dataset/emergent_tts/scripts/__init__.py @@ -0,0 +1,2 @@ +"""Scripts for running EmergentTTS-Eval via NeMo-Skills.""" + diff --git a/nemo_skills/dataset/emergent_tts/scripts/check_deps.py b/nemo_skills/dataset/emergent_tts/scripts/check_deps.py new file mode 100644 index 0000000000..459cfc3311 --- /dev/null +++ b/nemo_skills/dataset/emergent_tts/scripts/check_deps.py @@ -0,0 +1,95 @@ +#!/usr/bin/env python3 + +"""Dependency checker for EmergentTTS-Eval integration. + +This script is meant to fail fast with a clear actionable message when you are +missing Python packages needed for: +- dataset preparation (`prepare.py`) +- scoring (EmergentTTS-Eval-public `inference.py`) +""" + +from __future__ import annotations + +import argparse +import importlib +import os +from pathlib import Path + + +def _try_import(module: str) -> str | None: + try: + importlib.import_module(module) + return None + except Exception as e: + return f"{module} ({type(e).__name__}: {e})" + + +def _venv_install_hint(*, emergent_repo_path: str | None) -> str: + repo_root = Path(__file__).resolve().parents[4] # .../nemo_skills/dataset/emergent_tts/scripts + lines = [ + "To install missing deps into the repo venv:", + f" cd {repo_root}", + " . 
./.venv/bin/activate", + " pip install -e .", + " pip install librosa soundfile", + ] + if emergent_repo_path: + lines.append(f" pip install -r {Path(emergent_repo_path).resolve()}/requirements.txt") + else: + lines.append(" pip install -r /path/to/EmergentTTS-Eval-public/requirements.txt") + return "\n".join(lines) + + +def main(): + p = argparse.ArgumentParser(description="Check dependencies for EmergentTTS-Eval integration") + p.add_argument("--stage", choices=["prepare", "scoring", "all"], default="all") + p.add_argument( + "--emergent_repo_path", + default=os.environ.get("EMERGENT_TTS_EVAL_REPO", ""), + help="Path to EmergentTTS-Eval-public (used only to print install hint)", + ) + args = p.parse_args() + + emergent_repo_path = args.emergent_repo_path or None + + missing: list[str] = [] + + if args.stage in ("prepare", "all"): + for mod in ["datasets", "numpy", "pydub", "tqdm", "librosa", "soundfile"]: + err = _try_import(mod) + if err: + missing.append(err) + + if args.stage in ("scoring", "all"): + # Minimal set required by EmergentTTS-Eval-public scoring path (fetch-audios mode) + for mod in [ + "torch", + "transformers", + "editdistance", + "whisper_normalizer", + "json_repair", + "tenacity", + "openai", + "google.genai", + "pydub", + "librosa", + "soundfile", + ]: + err = _try_import(mod) + if err: + missing.append(err) + + if missing: + print("Missing required dependencies:\n") + for m in missing: + print(f"- {m}") + print() + print(_venv_install_hint(emergent_repo_path=emergent_repo_path)) + raise SystemExit(2) + + print("All required dependencies are available.") + + +if __name__ == "__main__": + main() + diff --git a/nemo_skills/dataset/emergent_tts/scripts/config/default.yaml b/nemo_skills/dataset/emergent_tts/scripts/config/default.yaml new file mode 100644 index 0000000000..5015e4151c --- /dev/null +++ b/nemo_skills/dataset/emergent_tts/scripts/config/default.yaml @@ -0,0 +1,61 @@ +# EmergentTTS-Eval pipeline configuration (example) +# +# NOTE: Before running generation, create the dataset under generation.data_dir: +# python nemo_skills/dataset/emergent_tts/prepare.py --output_dir /emergent_tts +# +# Then run: +# python -m nemo_skills.dataset.emergent_tts.scripts.run_tts_eval --config --stage all + +cluster: eos +container: /lustre/fsw/llmservice_nemo_speechlm/users/vmendelev/containters/nemo-25.11.sqsh +partition: batch +mount_paths: /lustre:/lustre + +# Where NeMo-Skills will write eval-results/ and eval-logs/ +output_dir: /lustre/fsw/llmservice_nemo_speechlm/users/vmendelev/tmp/emergent_tts_eval_full_8chunks + +# NeMo source checkout on EOS (needed for MagpieTTS inference modules). +nemo_code_path: /lustre/fsw/llmservice_nemo_speechlm/users/vmendelev/experimenta/tts_eval/NeMo + +generation: + benchmarks: emergent_tts.emergent + model: nvidia/magpie_tts_multilingual_357m + server_type: generic + # One GPU for the server process. + server_gpus: 1 + server_entrypoint: python -m nemo_skills.inference.server.serve_unified + server_args: --backend magpie_tts --codec_model nvidia/nemo-nano-codec-22khz-1.89kbps-21.5fps --batch_size 16 --batch_timeout 0.1 --use_cfg --cfg_scale 2.5 + + # Shared NeMo-Skills data_dir. Must contain emergent_tts/emergent/test.jsonl and emergent_tts/emergent/__init__.py + data_dir: /lustre/fsw/llmservice_nemo_speechlm/users/vmendelev/tmp/data_dir + + # Full run: split across 8 chunks. + num_chunks: 8 + # Request 8 GPUs per node for the generation job allocation. + # (Generation itself calls the server; this matches nv_tts scheduling expectations.) 
+ gpus_per_node: 8 + extra_args: ++server.server_type=vllm_multimodal + +scoring: + gpus: 1 + # Container for scoring jobs (conversion + Emergent eval). Use the same container + # as the generation "main" job (not the Magpie server container). + container: /lustre/fsw/llmservice_nemo_speechlm/users/vmendelev/containters/nemo-25.11.sqsh + # Install missing Python deps at job start (runs inside the scoring container). + # Keep this conservative: avoid upgrading core deps inside the base container. + installation_command: pip install editdistance whisper-normalizer json-repair tenacity + + # Path to EmergentTTS-Eval-public on the cluster (added to PYTHONPATH) + scoring_code_path: /lustre/fsw/llmservice_nemo_speechlm/users/vmendelev/code/EmergentTTS-Eval-public + + # Path to Emergent data directory created by prepare.py: + # /emergent_tts/data + emergent_data_dir: /lustre/fsw/llmservice_nemo_speechlm/users/vmendelev/tmp/data_dir/emergent_tts/data + + # Judge configuration (OpenAI-compatible via NVIDIA Inference API) + judge_model: gcp/google/gemini-2.5-pro + judger_base_url: https://inference-api.nvidia.com/v1/chat/completions + num_threads: 8 + evaluate_function: win_rate + strong_prompting: false + diff --git a/nemo_skills/dataset/emergent_tts/scripts/config/interactive_10.yaml b/nemo_skills/dataset/emergent_tts/scripts/config/interactive_10.yaml new file mode 100644 index 0000000000..dfef60facf --- /dev/null +++ b/nemo_skills/dataset/emergent_tts/scripts/config/interactive_10.yaml @@ -0,0 +1,22 @@ +# EmergentTTS-Eval: interactive 10-sample generation smoke test + +cluster: eos +container: /lustre/fsw/llmservice_nemo_speechlm/users/vmendelev/containters/nemo-25.11.sqsh +partition: interactive +mount_paths: /lustre:/lustre + +output_dir: /lustre/fsw/llmservice_nemo_speechlm/users/vmendelev/tmp/emergent_tts_smoke10 +nemo_code_path: /lustre/fsw/llmservice_nemo_speechlm/users/vmendelev/experimenta/tts_eval/NeMo + +generation: + benchmarks: emergent_tts.emergent + model: nvidia/magpie_tts_multilingual_357m + server_type: generic + server_gpus: 1 + server_entrypoint: python -m nemo_skills.inference.server.serve_unified + server_args: --backend magpie_tts --codec_model nvidia/nemo-nano-codec-22khz-1.89kbps-21.5fps --batch_size 32 --batch_timeout 0.1 --use_cfg --cfg_scale 2.5 + data_dir: /lustre/fsw/llmservice_nemo_speechlm/users/vmendelev/tmp/data_dir + num_chunks: 1 + gpus_per_node: 1 + extra_args: ++max_samples=10 ++server.server_type=vllm_multimodal + diff --git a/nemo_skills/dataset/emergent_tts/scripts/config/local_interactive_10.yaml b/nemo_skills/dataset/emergent_tts/scripts/config/local_interactive_10.yaml new file mode 100644 index 0000000000..e50bca8e2f --- /dev/null +++ b/nemo_skills/dataset/emergent_tts/scripts/config/local_interactive_10.yaml @@ -0,0 +1,30 @@ +# EmergentTTS-Eval: local (docker) 10-sample generation smoke test +# +# Usage: +# export NEMO_SKILLS_CONFIG_DIR=/home/vmendelev/workspace/expressiveness/src/nemo-skills-tts-eval/cluster_configs +# python -m nemo_skills.dataset.emergent_tts.scripts.run_tts_eval --config --stage generation + +cluster: local_nemo_25_11 +partition: interactive + +output_dir: /home/vmendelev/workspace/expressiveness/src/nemo-skills-tts-eval/_local_runs/emergent_tts_smoke10 + +# Optional: if your local setup needs a NeMo source checkout for MagpieTTS inference, +# set this to an absolute host path and ensure it's mounted in the local cluster config. 
+nemo_code_path: /home/vmendelev/workspace/expressiveness/src/NeMo + +generation: + benchmarks: emergent_tts.emergent + model: nvidia/magpie_tts_multilingual_357m + server_type: generic + server_gpus: 1 + server_entrypoint: python -m nemo_skills.inference.server.serve_unified + server_args: --backend magpie_tts --codec_model nvidia/nemo-nano-codec-22khz-1.89kbps-21.5fps --batch_size 8 --batch_timeout 0.1 --use_cfg --cfg_scale 2.5 + + # Use the repo dataset folder (contains emergent_tts/emergent/test.jsonl). + data_dir: /home/vmendelev/workspace/expressiveness/src/nemo-skills-tts-eval/nemo_skills/dataset + + num_chunks: 1 + gpus_per_node: 1 + extra_args: ++max_samples=10 ++server.server_type=vllm_multimodal + diff --git a/nemo_skills/dataset/emergent_tts/scripts/config/local_interactive_10_base.yaml b/nemo_skills/dataset/emergent_tts/scripts/config/local_interactive_10_base.yaml new file mode 100644 index 0000000000..6816e5b523 --- /dev/null +++ b/nemo_skills/dataset/emergent_tts/scripts/config/local_interactive_10_base.yaml @@ -0,0 +1,26 @@ +# EmergentTTS-Eval: local (docker) 10-sample generation smoke test +# Using the base NeMo container (nvcr.io/nvidia/nemo:25.11). + +cluster: local_nemo_25_11_base +partition: interactive + +# For local docker runs, set the container to use for the *server* task. +# (The main task uses cluster_config.containers.nemo-skills.) +container: nvcr.io/nvidia/nemo:25.11 + +output_dir: /home/vmendelev/workspace/expressiveness/src/nemo-skills-tts-eval/_local_runs/emergent_tts_smoke10_base + +nemo_code_path: /home/vmendelev/workspace/expressiveness/src/NeMo + +generation: + benchmarks: emergent_tts.emergent + model: nvidia/magpie_tts_multilingual_357m + server_type: generic + server_gpus: 1 + server_entrypoint: python -m nemo_skills.inference.server.serve_unified + server_args: --backend magpie_tts --codec_model nvidia/nemo-nano-codec-22khz-1.89kbps-21.5fps --batch_size 8 --batch_timeout 0.1 --use_cfg --cfg_scale 2.5 + data_dir: /home/vmendelev/workspace/expressiveness/src/nemo-skills-tts-eval/nemo_skills/dataset + num_chunks: 1 + gpus_per_node: 1 + extra_args: ++max_samples=10 ++server.server_type=vllm_multimodal + diff --git a/nemo_skills/dataset/emergent_tts/scripts/convert_ns_outputs_to_emergent.py b/nemo_skills/dataset/emergent_tts/scripts/convert_ns_outputs_to_emergent.py new file mode 100644 index 0000000000..66ab564990 --- /dev/null +++ b/nemo_skills/dataset/emergent_tts/scripts/convert_ns_outputs_to_emergent.py @@ -0,0 +1,92 @@ +#!/usr/bin/env python3 +# Copyright (c) 2026, NVIDIA CORPORATION. 
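+
+"""Convert NeMo-Skills TTS outputs into the Emergent audio layout.
+
+Reads a NeMo-Skills `output.jsonl`, recovers `unique_id_eval` from each record's
+user message, and symlinks (or copies) the generated audio into
+`--out_dir/<unique_id_eval>.wav`, which is the layout EmergentTTS-Eval scoring expects.
+"""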
+ +from __future__ import annotations + +import argparse +import json +import os +import shutil +from pathlib import Path + + +def _extract_user_json(record: dict) -> dict | None: + for msg in record.get("messages", []): + if msg.get("role") != "user": + continue + content = msg.get("content") + if isinstance(content, dict): + return content + if isinstance(content, str): + try: + return json.loads(content) + except json.JSONDecodeError: + return None + return None + + +def _link_or_copy(src: str, dst: str, mode: str): + if mode == "symlink": + if os.path.islink(dst): + if os.readlink(dst) == src: + return + os.unlink(dst) + elif os.path.exists(dst): + os.unlink(dst) + os.symlink(src, dst) + return + + if mode == "copy": + shutil.copyfile(src, dst) + return + + raise ValueError(f"Unknown mode: {mode}") + + +def main(): + p = argparse.ArgumentParser(description="Convert NeMo-Skills TTS outputs into Emergent audio layout") + p.add_argument("--ns_output_jsonl", required=True, help="Path to NeMo-Skills output.jsonl") + p.add_argument("--out_dir", required=True, help="Destination directory for .wav") + p.add_argument("--mode", choices=["symlink", "copy"], default="symlink") + p.add_argument("--overwrite", action="store_true") + args = p.parse_args() + + out_dir = Path(args.out_dir) + out_dir.mkdir(parents=True, exist_ok=True) + + converted = 0 + skipped = 0 + missing = 0 + + with open(args.ns_output_jsonl, "r", encoding="utf-8") as f: + for line in f: + line = line.strip() + if not line: + continue + record = json.loads(line) + user_json = _extract_user_json(record) or {} + unique_id = user_json.get("unique_id_eval", record.get("unique_id_eval")) + audio_path = (record.get("audio") or {}).get("path") + + if unique_id is None: + skipped += 1 + continue + if not audio_path or not os.path.exists(audio_path): + missing += 1 + continue + + dst = out_dir / f"{unique_id}.wav" + if dst.exists() and not args.overwrite: + continue + _link_or_copy(audio_path, str(dst), args.mode) + converted += 1 + + print( + f"Converted {converted} files into {out_dir}. " + f"skipped(no unique_id_eval)={skipped}, missing_audio={missing}" + ) + + +if __name__ == "__main__": + main() + diff --git a/nemo_skills/dataset/emergent_tts/scripts/run_tts_eval.py b/nemo_skills/dataset/emergent_tts/scripts/run_tts_eval.py new file mode 100644 index 0000000000..19d08f3497 --- /dev/null +++ b/nemo_skills/dataset/emergent_tts/scripts/run_tts_eval.py @@ -0,0 +1,168 @@ +#!/usr/bin/env python3 +# Copyright (c) 2026, NVIDIA CORPORATION. + +""" +Emergent TTS Pipeline: Generation -> Scoring (-> Aggregation) + +This mirrors `nemo_skills/dataset/nv_tts/scripts/run_tts_eval.py` but uses +EmergentTTS-Eval scoring logic. +""" + +import argparse +import os + +import yaml + +from nemo_skills.pipeline.eval import eval as ns_eval +from nemo_skills.pipeline.run_cmd import run_cmd as ns_run_cmd + + +class MockContext: + """Mock typer.Context for programmatic calls.""" + + def __init__(self, extra_args=None): + self.args = extra_args or [] + + +def load_config(config_path: str) -> dict: + with open(config_path) as f: + return yaml.safe_load(f) + + +def run_generation(cfg: dict, expname: str): + gen = cfg["generation"] + # Mirror nv_tts behavior: allow injecting a NeMo source checkout into PYTHONPATH + # for the unified server (MagpieTTS inference code lives in NeMo). 
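+    # e.g. with `nemo_code_path: /path/to/NeMo` in the config, " --code_path /path/to/NeMo"
+    # is appended to the configured server_args below.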
+ server_args = gen["server_args"] + generation_code_path = cfg.get("generation_code_path") or cfg.get("nemo_code_path") + if generation_code_path: + server_args += f" --code_path {generation_code_path}" + + extra_args = gen.get("extra_args", "").split() if gen.get("extra_args") else [] + ctx = MockContext(extra_args) + return ns_eval( + ctx=ctx, + cluster=cfg["cluster"], + output_dir=cfg["output_dir"], + benchmarks=gen["benchmarks"], + model=gen["model"], + server_type=gen["server_type"], + server_gpus=gen["server_gpus"], + # Local executor doesn't require explicit container/mount_paths in the run YAML. + # For slurm clusters these are required and should be present in the config. + server_container=cfg.get("container", ""), + mount_paths=cfg.get("mount_paths", ""), + server_entrypoint=gen["server_entrypoint"], + server_args=server_args, + data_dir=gen["data_dir"], + num_chunks=gen["num_chunks"], + gpus_per_node=gen.get("gpus_per_node", 1), + partition=cfg["partition"], + expname=expname, + auto_summarize_results=False, + ) + + +def main(): + parser = argparse.ArgumentParser(description="Emergent TTS Eval Pipeline") + parser.add_argument("--config", required=True) + parser.add_argument( + "--stage", + choices=["all", "generation", "scoring", "aggregation"], + default="all", + ) + parser.add_argument("--expname", default="emergent_tts_eval") + args = parser.parse_args() + + cfg = load_config(args.config) + scoring = cfg.get("scoring", {}) + output_dir = cfg["output_dir"] + + gen_exp_name = None + + if args.stage in ("all", "generation"): + print("\n" + "=" * 60) + print("Stage 1: GENERATION") + print("=" * 60) + gen_exp = run_generation(cfg, args.expname) + gen_exp_name = args.expname + print(f"Generation submitted: {gen_exp}") + + if args.stage in ("all", "scoring"): + print("\n" + "=" * 60) + print("Stage 2: SCORING (EmergentTTS-Eval)") + print("=" * 60) + + benchmarks = cfg["generation"]["benchmarks"].split(",") + run_after = [gen_exp_name] if args.stage == "all" and gen_exp_name else None + + scoring_code_path = scoring.get("scoring_code_path", "") + emergent_data_dir = scoring.get("emergent_data_dir", "") + install_cmd = scoring.get("installation_command") + scoring_container = scoring.get("container") or "nemo-skills" + + # Required by Emergent's judge clients + judger_api_key = ( + os.environ.get("JUDGER_API_KEY") + or os.environ.get("NVIDIA_API_KEY") + or os.environ.get("OPENAI_API_KEY") + or "" + ) + if not judger_api_key: + print("Warning: JUDGER_API_KEY/NVIDIA_API_KEY/OPENAI_API_KEY not set; win_rate judging may fail.") + + for benchmark in benchmarks: + benchmark = benchmark.strip() + short_name = benchmark.split(".")[-1] + score_cmd = ( + f"JUDGER_API_KEY={judger_api_key} " + f"PYTHONPATH={scoring_code_path}:$PYTHONPATH " + f"python -m nemo_skills.dataset.emergent_tts.scripts.score " + f"--results_dir {output_dir} " + f"--benchmark {benchmark} " + f"--emergent_data_dir {emergent_data_dir} " + f"--judge_model {scoring.get('judge_model', 'gcp/google/gemini-2.5-pro')} " + f"--judger_base_url {scoring.get('judger_base_url', 'https://inference-api.nvidia.com/v1/chat/completions')} " + f"--num_threads {int(scoring.get('num_threads', 8))} " + f"--evaluate_function {scoring.get('evaluate_function', 'win_rate')}" + ) + if scoring.get("strong_prompting"): + score_cmd += " --strong_prompting" + + ns_run_cmd( + ctx=MockContext(), + cluster=cfg["cluster"], + container=scoring_container, + partition=cfg["partition"], + num_gpus=int(scoring.get("gpus", 1)), + mount_paths=cfg["mount_paths"], 
+ command=score_cmd, + installation_command=install_cmd, + run_after=run_after, + expname=f"{args.expname}_score_{short_name}", + log_dir=f"{output_dir}/eval-logs", + ) + + if args.stage == "aggregation": + print("\n" + "=" * 60) + print("Stage 3: AGGREGATION") + print("=" * 60) + agg_cmd = f"python -m nemo_skills.dataset.emergent_tts.scripts.score --results_dir {output_dir} --aggregation_only" + ns_run_cmd( + ctx=MockContext(), + cluster=cfg["cluster"], + container=cfg["container"], + partition=cfg["partition"], + num_gpus=0, + mount_paths=cfg["mount_paths"], + command=agg_cmd, + expname=f"{args.expname}_agg", + log_dir=f"{output_dir}/eval-logs", + ) + + print("\nDone!") + + +if __name__ == "__main__": + main() + diff --git a/nemo_skills/dataset/emergent_tts/scripts/score.py b/nemo_skills/dataset/emergent_tts/scripts/score.py new file mode 100644 index 0000000000..ec5c77c58b --- /dev/null +++ b/nemo_skills/dataset/emergent_tts/scripts/score.py @@ -0,0 +1,252 @@ +#!/usr/bin/env python3 +# Copyright (c) 2026, NVIDIA CORPORATION. + +"""Run EmergentTTS-Eval scoring on NeMo-Skills generated audio. + +This script expects NeMo-Skills generation output layout: + /eval-results//output.jsonl + +It will: + 1) Convert NeMo-Skills `output.jsonl` audio paths into Emergent layout + (/emergent-tts-eval_output-audios/.wav) + 2) Run Emergent scoring in fetch-audios mode (no re-generation) + 3) Write `metrics.json` in the benchmark folder for consistency with other evals +""" + +from __future__ import annotations + +import argparse +import json +import os +from pathlib import Path + + +def _benchmarks_dir(results_dir: str) -> Path: + p = Path(results_dir) / "eval-results" + return p if p.exists() else Path(results_dir) + + +def _normalize_openai_base_url(url: str) -> str: + # Some callers pass the full endpoint; OpenAI client expects base URL. + suffix = "/v1/chat/completions" + if url.endswith(suffix): + return url[: -len("/chat/completions")] + return url + + +class _NoopModelClient: + """A minimal Emergent model_client for scoring-only runs.""" + + def prepare_emergent_tts_sample(self, text_to_synthesize, category, strong_prompting, prompting_object, **kwargs): + if strong_prompting: + user_message = ( + prompting_object.USER_MESSAGE_STRONG_TEMPLATE.replace( + "{{{descriptions}}}", prompting_object.ALL_DESCRIPTIONS[category] + ).replace("{{{text_to_synthesize}}}", text_to_synthesize) + ) + else: + user_message = prompting_object.USER_MESSAGE_DEFAULT_TEMPLATE.replace( + "{{{text_to_synthesize}}}", text_to_synthesize + ) + return prompting_object.SYSTEM_PROMPT_DEFAULT, user_message + + +def _convert(ns_output_jsonl: Path, out_dir: Path, overwrite: bool) -> None: + from nemo_skills.dataset.emergent_tts.scripts.convert_ns_outputs_to_emergent import main as convert_main + + # Reuse converter as a library via argv. + import sys + + argv = sys.argv + try: + sys.argv = [ + argv[0], + "--ns_output_jsonl", + str(ns_output_jsonl), + "--out_dir", + str(out_dir), + "--mode", + "symlink", + ] + (["--overwrite"] if overwrite else []) + convert_main() + finally: + sys.argv = argv + + +def _run_emergent_scoring( + *, + benchmark_dir: Path, + emergent_data_base_path: Path, + fetch_audios_from_path: Path, + baseline_audios_path: Path, + judge_model: str, + judger_base_url: str, + num_threads: int, + depths_to_evaluate: str, + categories_to_evaluate: str, + evaluate_function: str, + strong_prompting: bool, +): + # Import from EmergentTTS-Eval-public (caller should add it to PYTHONPATH). 
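+    # e.g. the scoring stage in run_tts_eval.py invokes this module as
+    #   PYTHONPATH=<EmergentTTS-Eval-public>:$PYTHONPATH python -m nemo_skills.dataset.emergent_tts.scripts.score ...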
+ import inference as emergent_inference # type: ignore + + # Tell Emergent code where to find `emergent_tts_eval_data.jsonl` and `wv_mos.ckpt`. + os.environ["EMERGENT_TTS_DATA_BASE_PATH"] = str(emergent_data_base_path) + + emergent_inference.eval_api_closed_model( + model_client=_NoopModelClient(), + accelerator=None, + depths_to_evaluate=depths_to_evaluate, + categories_to_evaluate=categories_to_evaluate, + seed=42, + output_dir=str(benchmark_dir), + num_samples=None, + baseline_audios_path=str(baseline_audios_path), + fetch_audios_from_path=str(fetch_audios_from_path), + judge_model=judge_model, + temperature=0.0, + evaluate_function=evaluate_function, + strong_prompting=strong_prompting, + judger_base_url=_normalize_openai_base_url(judger_base_url) if judger_base_url else None, + num_threads=num_threads, + model_name="nemo-skills-generated", + ) + + +def run_scoring( + *, + results_dir: str, + benchmark: str | None, + emergent_data_dir: str, + judge_model: str, + judger_base_url: str, + num_threads: int, + depths_to_evaluate: str, + categories_to_evaluate: str, + evaluate_function: str, + strong_prompting: bool, + overwrite_converted: bool, +): + bdir = _benchmarks_dir(results_dir) + emergent_data_dir_p = Path(emergent_data_dir) + emergent_base = emergent_data_dir_p # expects emergent_tts_eval_data.jsonl and wv_mos.ckpt here + baseline_audios = emergent_data_dir_p / "baseline_audios" + + if benchmark: + benches = [benchmark] + else: + benches = [p.name for p in bdir.iterdir() if p.is_dir()] + + for bench in sorted(benches): + bench_dir = bdir / bench + output_jsonl = bench_dir / "output.jsonl" + if not output_jsonl.exists(): + print(f"Skipping {bench}: output.jsonl not found") + continue + + # Emergent uses this naming convention for generated audio dir (see inference.py). + converted_audio_dir = bench_dir / "emergent-tts-eval_output-audios" + converted_audio_dir.mkdir(parents=True, exist_ok=True) + _convert(output_jsonl, converted_audio_dir, overwrite=overwrite_converted) + + # Run Emergent scoring (writes emergent-tts-eval_* files into bench_dir) + _run_emergent_scoring( + benchmark_dir=bench_dir, + emergent_data_base_path=emergent_base, + fetch_audios_from_path=converted_audio_dir, + baseline_audios_path=baseline_audios, + judge_model=judge_model, + judger_base_url=judger_base_url, + num_threads=num_threads, + depths_to_evaluate=depths_to_evaluate, + categories_to_evaluate=categories_to_evaluate, + evaluate_function=evaluate_function, + strong_prompting=strong_prompting, + ) + + # Convert Emergent metrics file into `metrics.json` for NeMo-Skills conventions. + # Emergent prefix matches inference.py defaults when strong_prompting=False and voice_to_use=None. 
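+        # The copy keeps Emergent's own metric keys (e.g. eval/wer, eval/mos,
+        # eval/win_rate), which run_aggregation() below reads.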
+ emergent_metrics_path = bench_dir / "emergent-tts-eval_evaluation-metrics.json" + if emergent_metrics_path.exists(): + with open(emergent_metrics_path, "r", encoding="utf-8") as f: + metrics = json.load(f) + with open(bench_dir / "metrics.json", "w", encoding="utf-8") as f: + json.dump(metrics, f, indent=2) + print(f"[{bench}] Saved: {bench_dir/'metrics.json'}") + else: + print(f"[{bench}] Warning: Emergent metrics file not found at {emergent_metrics_path}") + + +def run_aggregation(results_dir: str): + bdir = _benchmarks_dir(results_dir) + print("\nAggregated Results (EmergentTTS-Eval):") + for benchmark in sorted([p.name for p in bdir.iterdir() if p.is_dir()]): + metrics_path = bdir / benchmark / "metrics.json" + if not metrics_path.exists(): + continue + with open(metrics_path, "r", encoding="utf-8") as f: + metrics = json.load(f) + # Keep this minimal; Emergent metrics are keyed like eval/wer, eval/mos, eval/win_rate, etc. + wer = metrics.get("eval/wer") + mos = metrics.get("eval/mos") + win = metrics.get("eval/win_rate") + print(f" {benchmark}:") + if wer is not None: + print(f" WER: {wer:.4f}") + if mos is not None: + print(f" MOS: {mos:.4f}") + if win is not None: + print(f" Win-rate: {win:.4f}") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="EmergentTTS-Eval scoring for NeMo-Skills outputs") + parser.add_argument("--results_dir", required=True) + parser.add_argument("--benchmark", default=None, help="Score only this benchmark (e.g. emergent_tts.emergent)") + parser.add_argument("--aggregation_only", action="store_true") + + parser.add_argument( + "--emergent_data_dir", + required=False, + default=None, + help="Path containing Emergent files: emergent_tts_eval_data.jsonl, wv_mos.ckpt, baseline_audios/", + ) + parser.add_argument("--judge_model", default="gcp/google/gemini-2.5-pro") + parser.add_argument("--judger_base_url", default="https://inference-api.nvidia.com/v1/chat/completions") + parser.add_argument("--num_threads", type=int, default=8) + parser.add_argument("--depths_to_evaluate", default="0,1,2,3") + parser.add_argument( + "--categories_to_evaluate", + default="Emotions,Paralinguistics,Syntactic Complexity,Foreign Words,Questions,Pronunciation", + ) + parser.add_argument("--evaluate_function", default="win_rate") + parser.add_argument("--strong_prompting", action="store_true") + parser.add_argument("--overwrite_converted", action="store_true") + args = parser.parse_args() + + if args.aggregation_only: + run_aggregation(args.results_dir) + else: + emergent_data_dir = args.emergent_data_dir + if emergent_data_dir is None: + # Try to derive from NEMO_SKILLS_DATA_DIR (common in cluster configs). 
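+            # e.g. NEMO_SKILLS_DATA_DIR=/shared/data_dir -> /shared/data_dir/emergent_tts/data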
+ emergent_data_dir = os.environ.get("EMERGENT_TTS_DATA_BASE_PATH") or os.environ.get("NEMO_SKILLS_DATA_DIR") + if emergent_data_dir: + emergent_data_dir = str(Path(emergent_data_dir) / "emergent_tts" / "data") + if emergent_data_dir is None: + raise SystemExit("--emergent_data_dir is required (or set EMERGENT_TTS_DATA_BASE_PATH/NEMO_SKILLS_DATA_DIR)") + + run_scoring( + results_dir=args.results_dir, + benchmark=args.benchmark, + emergent_data_dir=emergent_data_dir, + judge_model=args.judge_model, + judger_base_url=args.judger_base_url, + num_threads=args.num_threads, + depths_to_evaluate=args.depths_to_evaluate, + categories_to_evaluate=args.categories_to_evaluate, + evaluate_function=args.evaluate_function, + strong_prompting=args.strong_prompting, + overwrite_converted=args.overwrite_converted, + ) + From 52b6599fe4eed9ec4bc45f4cd7d9daa676ba48b6 Mon Sep 17 00:00:00 2001 From: Valentin Mendelev Date: Tue, 3 Feb 2026 06:13:48 -0800 Subject: [PATCH 25/26] Fix Emergent scoring deps and paths Install google-genai for EmergentTTS-Eval, run scoring from the dataset base dir so relative paths resolve, and avoid shipping large local caches/data. Document EmergentTTS-Eval usage in nv_tts guide. Co-authored-by: Cursor --- .gitignore | 8 +++ .../emergent_tts/scripts/config/default.yaml | 4 +- .../emergent_tts/scripts/run_tts_eval.py | 27 +++++---- .../dataset/emergent_tts/scripts/score.py | 45 ++++++++------ nemo_skills/dataset/nv_tts/TTS_eval.md | 60 +++++++++++++++++++ 5 files changed, 115 insertions(+), 29 deletions(-) diff --git a/.gitignore b/.gitignore index ecb9012331..e2218786e5 100644 --- a/.gitignore +++ b/.gitignore @@ -31,6 +31,14 @@ build .venv *.lock +# Local caches / secrets (never ship to remote via rsync) +.ssh/ +.hf_cache/ +.nemo_run/ + +# Emergent dataset artifacts (large; stored in shared data_dir instead) +nemo_skills/dataset/emergent_tts/data/ + __pycache__ .ipynb_checkpoints diff --git a/nemo_skills/dataset/emergent_tts/scripts/config/default.yaml b/nemo_skills/dataset/emergent_tts/scripts/config/default.yaml index 5015e4151c..9ffb01781f 100644 --- a/nemo_skills/dataset/emergent_tts/scripts/config/default.yaml +++ b/nemo_skills/dataset/emergent_tts/scripts/config/default.yaml @@ -43,7 +43,9 @@ scoring: container: /lustre/fsw/llmservice_nemo_speechlm/users/vmendelev/containters/nemo-25.11.sqsh # Install missing Python deps at job start (runs inside the scoring container). # Keep this conservative: avoid upgrading core deps inside the base container. - installation_command: pip install editdistance whisper-normalizer json-repair tenacity + # EmergentTTS-Eval imports `from google import genai`, so ensure google-genai exists + # but install it without pulling/upgrading transitive deps (to avoid httpx/transformers churn). 
+ installation_command: pip install editdistance whisper-normalizer json-repair tenacity && pip install --no-deps google-genai # Path to EmergentTTS-Eval-public on the cluster (added to PYTHONPATH) scoring_code_path: /lustre/fsw/llmservice_nemo_speechlm/users/vmendelev/code/EmergentTTS-Eval-public diff --git a/nemo_skills/dataset/emergent_tts/scripts/run_tts_eval.py b/nemo_skills/dataset/emergent_tts/scripts/run_tts_eval.py index 19d08f3497..2b12fd87b6 100644 --- a/nemo_skills/dataset/emergent_tts/scripts/run_tts_eval.py +++ b/nemo_skills/dataset/emergent_tts/scripts/run_tts_eval.py @@ -10,6 +10,7 @@ import argparse import os +from pathlib import Path import yaml @@ -100,6 +101,7 @@ def main(): emergent_data_dir = scoring.get("emergent_data_dir", "") install_cmd = scoring.get("installation_command") scoring_container = scoring.get("container") or "nemo-skills" + emergent_data_base_dir = str(Path(emergent_data_dir).parent) if emergent_data_dir else "" # Required by Emergent's judge clients judger_api_key = ( @@ -115,16 +117,17 @@ def main(): benchmark = benchmark.strip() short_name = benchmark.split(".")[-1] score_cmd = ( - f"JUDGER_API_KEY={judger_api_key} " - f"PYTHONPATH={scoring_code_path}:$PYTHONPATH " - f"python -m nemo_skills.dataset.emergent_tts.scripts.score " - f"--results_dir {output_dir} " - f"--benchmark {benchmark} " - f"--emergent_data_dir {emergent_data_dir} " - f"--judge_model {scoring.get('judge_model', 'gcp/google/gemini-2.5-pro')} " - f"--judger_base_url {scoring.get('judger_base_url', 'https://inference-api.nvidia.com/v1/chat/completions')} " - f"--num_threads {int(scoring.get('num_threads', 8))} " - f"--evaluate_function {scoring.get('evaluate_function', 'win_rate')}" + (f"cd {emergent_data_base_dir} && " if emergent_data_base_dir else "") + + f"JUDGER_API_KEY={judger_api_key} " + + f"PYTHONPATH={scoring_code_path}:$PYTHONPATH " + + "python -m nemo_skills.dataset.emergent_tts.scripts.score " + + f"--results_dir {output_dir} " + + f"--benchmark {benchmark} " + + f"--emergent_data_dir {emergent_data_dir} " + + f"--judge_model {scoring.get('judge_model', 'gcp/google/gemini-2.5-pro')} " + + f"--judger_base_url {scoring.get('judger_base_url', 'https://inference-api.nvidia.com/v1/chat/completions')} " + + f"--num_threads {int(scoring.get('num_threads', 8))} " + + f"--evaluate_function {scoring.get('evaluate_function', 'win_rate')}" ) if scoring.get("strong_prompting"): score_cmd += " --strong_prompting" @@ -139,6 +142,9 @@ def main(): command=score_cmd, installation_command=install_cmd, run_after=run_after, + # Ensure we ship the current repo state for scoring jobs. + # (Otherwise nemo_run may reuse an older code snapshot and miss fixes.) + reuse_code=False, expname=f"{args.expname}_score_{short_name}", log_dir=f"{output_dir}/eval-logs", ) @@ -156,6 +162,7 @@ def main(): num_gpus=0, mount_paths=cfg["mount_paths"], command=agg_cmd, + reuse_code=False, expname=f"{args.expname}_agg", log_dir=f"{output_dir}/eval-logs", ) diff --git a/nemo_skills/dataset/emergent_tts/scripts/score.py b/nemo_skills/dataset/emergent_tts/scripts/score.py index ec5c77c58b..3ff0a0ca6b 100644 --- a/nemo_skills/dataset/emergent_tts/scripts/score.py +++ b/nemo_skills/dataset/emergent_tts/scripts/score.py @@ -93,24 +93,33 @@ def _run_emergent_scoring( # Tell Emergent code where to find `emergent_tts_eval_data.jsonl` and `wv_mos.ckpt`. 
os.environ["EMERGENT_TTS_DATA_BASE_PATH"] = str(emergent_data_base_path) - emergent_inference.eval_api_closed_model( - model_client=_NoopModelClient(), - accelerator=None, - depths_to_evaluate=depths_to_evaluate, - categories_to_evaluate=categories_to_evaluate, - seed=42, - output_dir=str(benchmark_dir), - num_samples=None, - baseline_audios_path=str(baseline_audios_path), - fetch_audios_from_path=str(fetch_audios_from_path), - judge_model=judge_model, - temperature=0.0, - evaluate_function=evaluate_function, - strong_prompting=strong_prompting, - judger_base_url=_normalize_openai_base_url(judger_base_url) if judger_base_url else None, - num_threads=num_threads, - model_name="nemo-skills-generated", - ) + # EmergentTTS-Eval expects paths like "data/emergent_tts_eval_data.jsonl" relative + # to its *data base directory* (repo root). We keep the dataset in a shared path: + # <...>/emergent_tts/data/{emergent_tts_eval_data.jsonl,wv_mos.ckpt,baseline_audios/} + # So we temporarily `chdir` into the directory that contains the "data/" folder. + prev_cwd = os.getcwd() + try: + os.chdir(str(emergent_data_base_path.parent)) + emergent_inference.eval_api_closed_model( + model_client=_NoopModelClient(), + accelerator=None, + depths_to_evaluate=depths_to_evaluate, + categories_to_evaluate=categories_to_evaluate, + seed=42, + output_dir=str(benchmark_dir), + num_samples=None, + baseline_audios_path=str(baseline_audios_path), + fetch_audios_from_path=str(fetch_audios_from_path), + judge_model=judge_model, + temperature=0.0, + evaluate_function=evaluate_function, + strong_prompting=strong_prompting, + judger_base_url=_normalize_openai_base_url(judger_base_url) if judger_base_url else None, + num_threads=num_threads, + model_name="nemo-skills-generated", + ) + finally: + os.chdir(prev_cwd) def run_scoring( diff --git a/nemo_skills/dataset/nv_tts/TTS_eval.md b/nemo_skills/dataset/nv_tts/TTS_eval.md index 73864defd7..32dfff6a20 100644 --- a/nemo_skills/dataset/nv_tts/TTS_eval.md +++ b/nemo_skills/dataset/nv_tts/TTS_eval.md @@ -49,6 +49,66 @@ This will prepare `test.jsonl` for each benchmark with pointers to the files on /lustre/fsw/llmservice_nemo_speechlm/users/vmendelev/tmp/data_dir ``` +### EmergentTTS-Eval (new test set) + +EmergentTTS-Eval does **not** rely on cluster-local pre-existing audio paths. Instead, it is prepared by downloading the dataset and writing the NeMo-Skills `test.jsonl` + Emergent artifacts into your shared `data_dir`. + +**Prerequisites:** +- Your environment/container must have Python deps required for dataset prep: `datasets`, `numpy`, `pydub`, `tqdm`. +- `prepare.py` downloads from HuggingFace + Zenodo, so it must run in an environment with network access. 
+ +**Prepare EmergentTTS-Eval under your shared data_dir:** + +```bash +cd /home/vmendelev/workspace/expressiveness/src/ns_eval && source .venv/bin/activate && \ +python nemo_skills/dataset/emergent_tts/prepare.py \ + --output_dir /emergent_tts +``` + +Optional flags: +- `--num_samples 10` (debug: only write first 10) +- `--overwrite` (re-download and regenerate outputs) + +This creates: +- `/emergent_tts/emergent/test.jsonl` (NeMo-Skills generation input) +- `/emergent_tts/data/emergent_tts_eval_data.jsonl` +- `/emergent_tts/data/baseline_audios/.wav` +- `/emergent_tts/data/wv_mos.ckpt` + +**Run Emergent generation/scoring:** + +- Use the example config: `nemo_skills/dataset/emergent_tts/scripts/config/default.yaml` +- Set in the config: + - `generation.data_dir: ` + - `scoring.emergent_data_dir: /emergent_tts/data` + - `scoring.scoring_code_path: ` + +```bash +cd /home/vmendelev/workspace/expressiveness/src/ns_eval && source .venv/bin/activate && \ +NEMO_SKILLS_DISABLE_UNCOMMITTED_CHANGES_CHECK=1 \ +python -m nemo_skills.dataset.emergent_tts.scripts.run_tts_eval \ + --config nemo_skills/dataset/emergent_tts/scripts/config/default.yaml \ + --stage all \ + --expname emergent_eval +``` + +**Required env vars (Emergent scoring):** + +```bash +export JUDGER_API_KEY= +``` + +**Verification flow (recommended):** +- **Generation-only smoke test (10 samples)**: + - set `partition: interactive`, `generation.num_chunks: 1` + - set `generation.extra_args: "++max_samples=10 ++server.server_type=vllm_multimodal"` + - run: `python -m nemo_skills.dataset.emergent_tts.scripts.run_tts_eval --config --stage generation` +- **Full run (~1645)**: + - restore `partition: batch` and your usual `generation.num_chunks` + - run: `python -m nemo_skills.dataset.emergent_tts.scripts.run_tts_eval --config --stage all` +- **Scoring-only**: + - run: `python -m nemo_skills.dataset.emergent_tts.scripts.run_tts_eval --config --stage scoring` + ### 4. Configuration Files Review the config file and ensure all required artifacts are in the specified locations: From 88bd09cde4d290599ee32667044a90fcd81e4ab6 Mon Sep 17 00:00:00 2001 From: Valentin Mendelev Date: Fri, 6 Feb 2026 02:23:50 -0800 Subject: [PATCH 26/26] Add emergent_tts README Document dataset preparation (HF_TOKEN) and evaluation workflow, including cloning and patching EmergentTTS-Eval for NVIDIA Inference API judging. Co-authored-by: Cursor --- nemo_skills/dataset/emergent_tts/README.md | 124 +++++++++++++++++++++ 1 file changed, 124 insertions(+) create mode 100644 nemo_skills/dataset/emergent_tts/README.md diff --git a/nemo_skills/dataset/emergent_tts/README.md b/nemo_skills/dataset/emergent_tts/README.md new file mode 100644 index 0000000000..140dbb5533 --- /dev/null +++ b/nemo_skills/dataset/emergent_tts/README.md @@ -0,0 +1,124 @@ +## EmergentTTS-Eval dataset (`emergent_tts`) + +This dataset integration lets you: + +- **Prepare** the EmergentTTS-Eval test set under a shared `data_dir` (download baseline audios + metadata + MOS model). +- **Generate** TTS outputs with NeMo-Skills (`ns eval` via `run_tts_eval.py`). +- **Score** the generated outputs with EmergentTTS-Eval (WER/MOS/win-rate, depending on config). 
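+
+At a glance (illustrative; exact paths, env vars and configs are detailed in the sections below):
+
+```bash
+# 1) Prepare data (needs HF_TOKEN and network access)
+python nemo_skills/dataset/emergent_tts/prepare.py --output_dir "<shared_data_dir>/emergent_tts"
+
+# 2) Generate + score (needs JUDGER_API_KEY for win-rate judging)
+python -m nemo_skills.dataset.emergent_tts.scripts.run_tts_eval \
+    --config nemo_skills/dataset/emergent_tts/scripts/config/default.yaml \
+    --stage all --expname emergent_eval
+```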
+ +### 1) Prepare the test set (requires `HF_TOKEN`) + +`prepare.py` downloads the dataset and writes all required artifacts into: + +- `/emergent_tts/emergent/test.jsonl` +- `/emergent_tts/data/emergent_tts_eval_data.jsonl` +- `/emergent_tts/data/baseline_audios/*.wav` +- `/emergent_tts/data/wv_mos.ckpt` + +Run it from your dev machine (or any environment with network access): + +```bash +cd /home/vmendelev/workspace/expressiveness/src/nemo-skills-tts-eval +. ./.venv/bin/activate + +export HF_TOKEN="" + +python nemo_skills/dataset/emergent_tts/prepare.py \ + --output_dir "/emergent_tts" +``` + +Optional flags: + +- `--num_samples 10`: write only the first 10 samples (smoke test). +- `--overwrite`: re-download / regenerate outputs. + +### 2) Configure evaluation + +Use the example configs in `nemo_skills/dataset/emergent_tts/scripts/config/`. + +In `scripts/config/default.yaml`, set: + +- `generation.data_dir: ` +- `scoring.emergent_data_dir: /emergent_tts/data` +- `scoring.scoring_code_path: /EmergentTTS-Eval-public` (on the cluster) + +### 3) Clone + patch EmergentTTS-Eval-public for NVIDIA Inference API judging + +On EOS (or wherever you run scoring), clone EmergentTTS-Eval: + +```bash +cd /lustre/fsw/llmservice_nemo_speechlm/users/vmendelev/code +git clone EmergentTTS-Eval-public +``` + +Then update Emergent’s judge client selection so that **Gemini models are called via NVIDIA’s OpenAI-compatible Inference API**. + +Target behavior: + +- **Model name** stays as: `gcp/google/gemini-2.5-pro` (or similar). +- **Base URL** is NVIDIA Inference API: `https://inference-api.nvidia.com/v1` +- **API key** comes from: `JUDGER_API_KEY` (or `NVIDIA_API_KEY`) + +Minimal patch checklist inside `EmergentTTS-Eval-public`: + +- In `api_clients.py` (or wherever the client is chosen), ensure `gcp/google/*` uses an **OpenAI-compatible** client (not the Google SDK client), e.g.: + - `OpenAI(base_url=, api_key=os.getenv("JUDGER_API_KEY"))` +- Thread `judger_base_url` through so calls use `https://inference-api.nvidia.com/v1` (not the full `/v1/chat/completions` endpoint). + +After patching, set these in `scripts/config/default.yaml`: + +- `scoring.judge_model: gcp/google/gemini-2.5-pro` +- `scoring.judger_base_url: https://inference-api.nvidia.com/v1/chat/completions` + +### 3) Run evaluation (generation + scoring) + +From your dev machine, submit jobs to EOS: + +```bash +cd /home/vmendelev/workspace/expressiveness/src/nemo-skills-tts-eval +. ./.venv/bin/activate +mkdir -p .nemo_run + +export NEMORUN_HOME="$PWD/.nemo_run" +export NEMO_SKILLS_CONFIG_DIR=/home/vmendelev/workspace/expressiveness/src/ns_eval/cluster_configs +export NEMO_SKILLS_DISABLE_UNCOMMITTED_CHANGES_CHECK=1 + +# Required for win-rate judging (NVIDIA Inference API key) +export JUDGER_API_KEY="" + +python -m nemo_skills.dataset.emergent_tts.scripts.run_tts_eval \ + --config nemo_skills/dataset/emergent_tts/scripts/config/default.yaml \ + --stage all \ + --expname emergent_eval +``` + +### 4) Smoke test (10 samples, interactive) + +```bash +cd /home/vmendelev/workspace/expressiveness/src/nemo-skills-tts-eval +. 
./.venv/bin/activate +mkdir -p .nemo_run + +export NEMORUN_HOME="$PWD/.nemo_run" +export NEMO_SKILLS_CONFIG_DIR=/home/vmendelev/workspace/expressiveness/src/ns_eval/cluster_configs +export NEMO_SKILLS_DISABLE_UNCOMMITTED_CHANGES_CHECK=1 + +python -m nemo_skills.dataset.emergent_tts.scripts.run_tts_eval \ + --config nemo_skills/dataset/emergent_tts/scripts/config/interactive_10.yaml \ + --stage generation \ + --expname emergent_smoke10 +``` + +### Outputs + +NeMo-Skills generation writes: + +- `/eval-results/emergent_tts.emergent/output.jsonl` +- `/eval-results/emergent_tts.emergent/audio/*.wav` (or equivalent) + +Emergent scoring writes (in the same benchmark folder): + +- `emergent-tts-eval_*_evaluation-predictions.jsonl` +- `emergent-tts-eval_*_evaluation-metrics.json` +- `metrics.json` (a NeMo-Skills-friendly copy of Emergent metrics) +
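+
+To sanity-check the copied metrics after scoring, something along these lines can be used (adjust the path to your `output_dir`):
+
+```bash
+python -m json.tool <output_dir>/eval-results/emergent_tts.emergent/metrics.json
+```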