Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
42 changes: 42 additions & 0 deletions nemo_skills/dataset/eval_kit/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# VLMEvalKit integration module.
# Benchmarks are referenced as eval_kit.<VLMEvalKit_dataset_name>, e.g. eval_kit.MMBench_DEV_EN
# The sub-benchmark name after eval_kit. is dynamically resolved and passed to VLMEvalKit.

# Generation task module the pipeline imports to run this benchmark family.
GENERATION_MODULE = "nemo_skills.inference.eval.eval_kit"
# Key into the metrics class map (mapped to EvalKitMetrics in map_metrics).
METRICS_TYPE = "eval_kit"
# No default extra CLI arguments; per-benchmark args are produced by
# get_extra_generation_args() below.
GENERATION_ARGS = ""
NUM_SAMPLES = 0  # VLMEvalKit inference is deterministic; no random seeds

# No JSONL input file; VLMEvalKit manages its own data via build_dataset()
SKIP_INPUT_FILE = True

# Note: SELF_CONTAINED_TASK is NOT set here because it depends on model_type.
# For mcore mode (Megatron in-process), the pipeline sets self_contained_task=True
# at runtime based on ++model_type=mcore in extra_arguments.
# For vllm mode, the standard NeMo Skills server/client flow is used.

def get_extra_generation_args(benchmark):
    """Return extra generation args for the given benchmark name.

    The part after the first dot is the VLMEvalKit dataset name
    (e.g. eval_kit.MMBench_DEV_EN -> ++vlm_dataset=MMBench_DEV_EN);
    a name without a dot yields no extra args.
    """
    _, dot, dataset_name = benchmark.partition(".")
    if not dot:
        return ""
    return f" ++vlm_dataset={dataset_name} "
18 changes: 17 additions & 1 deletion nemo_skills/dataset/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,21 @@ def _get_dataset_module_from_cluster(cluster_config, mounted_path):


def get_default_dataset_module(dataset, data_dir=None, cluster_config=None):
"""Return (dataset_module, data_path, is_on_cluster)."""
is_on_cluster = False

# For dotted names like eval_kit.MMBench_DEV_EN, import the parent package.
# The sub-benchmark part is handled by the module's get_extra_generation_args().
if dataset.startswith("eval_kit."):
dataset_module = importlib.import_module("nemo_skills.dataset.eval_kit")
if data_dir is None:
data_path = "/nemo_run/code/nemo_skills/dataset"
else:
data_path = data_dir
if cluster_config is not None and cluster_config["executor"] == "slurm":
is_on_cluster = True
return dataset_module, data_path, is_on_cluster

if data_dir is None:
data_path = "/nemo_run/code/nemo_skills/dataset"
dataset_module = importlib.import_module(f"nemo_skills.dataset.{dataset}")
Expand Down Expand Up @@ -121,9 +135,11 @@ def get_dataset_module(dataset, data_dir=None, cluster_config=None, extra_datase
    1. data_dir (or `nemo_skills.dataset` if None) folder
    2. extra_datasets parameter if defined
    3. `NEMO_SKILLS_EXTRA_DATASETS` environment variable

Returns (module, data_path, is_on_cluster).
"""
try:
dataset_module, data_path, is_on_cluster = get_default_dataset_module(dataset, data_dir, cluster_config)
return get_default_dataset_module(dataset, data_dir, cluster_config)
except ModuleNotFoundError:
try:
dataset = dataset.replace(".", "/")
Expand Down
8 changes: 6 additions & 2 deletions nemo_skills/evaluation/evaluator/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,10 @@
eval_livebench_coding,
eval_livecodebench_pro,
)
from nemo_skills.evaluation.evaluator.compute_eval import ComputeEvalEvaluator
try:
from nemo_skills.evaluation.evaluator.compute_eval import ComputeEvalEvaluator
except ImportError:
ComputeEvalEvaluator = None
Comment on lines +30 to +33
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟡 Minor

Silent import suppression will produce a confusing error when compute-eval is actually requested.

If the dependency is missing and a user requests the "compute-eval" evaluator, they'll see "Evaluator class not found for type: compute-eval" (from get_evaluator_class) instead of a clear message about the missing import. Consider deferring the error to the point of use so it's actionable.

Suggested approach
-try:
-    from nemo_skills.evaluation.evaluator.compute_eval import ComputeEvalEvaluator
-except ImportError:
-    ComputeEvalEvaluator = None
+_compute_eval_import_error = None
+try:
+    from nemo_skills.evaluation.evaluator.compute_eval import ComputeEvalEvaluator
+except ImportError as e:
+    ComputeEvalEvaluator = None
+    _compute_eval_import_error = e

Then in get_evaluator_class (or at registration lookup time), surface _compute_eval_import_error when eval_type == "compute-eval" and the class is None.

As per coding guidelines, "Do not catch exceptions when they are not normally expected to be raised; let code fail with clear errors instead of silently misbehaving."

📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
try:
from nemo_skills.evaluation.evaluator.compute_eval import ComputeEvalEvaluator
except ImportError:
ComputeEvalEvaluator = None
_compute_eval_import_error = None
try:
from nemo_skills.evaluation.evaluator.compute_eval import ComputeEvalEvaluator
except ImportError as e:
ComputeEvalEvaluator = None
_compute_eval_import_error = e
🤖 Prompt for AI Agents
In `@nemo_skills/evaluation/evaluator/__init__.py` around lines 30 - 33, The
current top-level try/except hides import errors for ComputeEvalEvaluator;
instead capture the ImportError into a module-level variable (e.g.
_compute_eval_import_error) and set ComputeEvalEvaluator = None, then in
get_evaluator_class (or the evaluator registration lookup) check if eval_type ==
"compute-eval" and if ComputeEvalEvaluator is None raise a clear ImportError
that includes _compute_eval_import_error; this defers the failure to the point
of use and gives an actionable message when someone requests the "compute-eval"
evaluator.

from nemo_skills.evaluation.evaluator.icpc import ICPCEvaluator
from nemo_skills.evaluation.evaluator.ifbench import eval_ifbench
from nemo_skills.evaluation.evaluator.ifeval import eval_if
Expand Down Expand Up @@ -71,8 +74,9 @@
"icpc": ICPCEvaluator,
"audio": AudioEvaluator,
"bird": BirdEvaluator,
"compute-eval": ComputeEvalEvaluator,
}
if ComputeEvalEvaluator is not None:
EVALUATOR_CLASS_MAP["compute-eval"] = ComputeEvalEvaluator

# Validation: Ensure no overlap between class and function maps
_class_types = set(EVALUATOR_CLASS_MAP.keys())
Expand Down
24 changes: 19 additions & 5 deletions nemo_skills/evaluation/evaluator/audio.py
Original file line number Diff line number Diff line change
Expand Up @@ -496,16 +496,23 @@ def evaluate_sample(sample: dict[str, Any], config: AudioEvaluatorConfig) -> dic
if config.strip_helpful_prefixes:
generation = strip_helpful_prefixes(generation)

if task_type in ["ASR", "ASR-PC", "ASR_LEADERBOARD", "AST", "Translation", "CER"] and not generation:
# Normalise AudioBench speech-translation task types (ST-EN-ZH → Translation)
_ASR_TYPES = {"ASR", "ASR-ZH", "ASR-PC", "ASR_LEADERBOARD"}
_TRANSLATION_TYPES = {"AST", "Translation"}
# AudioBench speech translation types: ST-{src}-{tgt}
if task_type.startswith("ST-"):
_TRANSLATION_TYPES.add(task_type)

if task_type in (_ASR_TYPES | _TRANSLATION_TYPES | {"CER"}) and not generation:
base = {
"is_correct": False,
"error": "missing_generation",
}
if task_type in ["AST", "Translation"]:
if task_type in _TRANSLATION_TYPES:
return {**base, "bleu": 0.0}
if task_type == "CER":
return {**base, "cer": 1.0}
# ASR / ASR-PC
# ASR / ASR-PC / ASR-ZH
return {**base, "wer": 1.0}

if task_type == "ASR-PC":
Expand All @@ -518,7 +525,7 @@ def evaluate_sample(sample: dict[str, Any], config: AudioEvaluatorConfig) -> dic
)
updates.update(metrics)

elif task_type == "ASR":
elif task_type in {"ASR", "ASR-ZH"}:
mode = config.normalization_mode if config.apply_whisper_normalization else "none"
metrics = evaluate_asr(expected_answer, generation, normalization_mode=mode)
updates.update(metrics)
Expand All @@ -530,7 +537,7 @@ def evaluate_sample(sample: dict[str, Any], config: AudioEvaluatorConfig) -> dic
metrics = evaluate_asr(expected_answer, generation, normalization_mode=mode)
updates.update(metrics)

elif task_type in ["AST", "Translation"]:
elif task_type in _TRANSLATION_TYPES:
metrics = evaluate_translation(expected_answer, generation)
updates.update(metrics)

Expand All @@ -547,6 +554,13 @@ def evaluate_sample(sample: dict[str, Any], config: AudioEvaluatorConfig) -> dic
metrics = evaluate_pc_rate(expected_answer, generation)
updates.update(metrics)

elif task_type == "MathQA":
# AudioBench MathQA: exact string match after normalization
gen_norm = generation.strip().lower()
ref_norm = expected_answer.strip().lower()
updates["is_correct"] = gen_norm == ref_norm
updates["predicted_answer"] = generation

else:
if "requires_judge" not in sample:
updates["requires_judge"] = True
Expand Down
2 changes: 1 addition & 1 deletion nemo_skills/evaluation/metrics/compute_metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ def __init__(
self.metric_type = metric_type
self.max_seq_len = max_seq_len
if self.metric_type is None:
benchmark_module, _, _ = get_dataset_module(
benchmark_module, *_ = get_dataset_module(
benchmark,
data_dir=data_dir,
cluster_config=cluster_config,
Expand Down
93 changes: 93 additions & 0 deletions nemo_skills/evaluation/metrics/eval_kit_metrics.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import json
from collections import defaultdict
from pathlib import Path

from nemo_skills.evaluation.metrics.base import BaseMetrics


class EvalKitMetrics(BaseMetrics):
    """Metrics for VLMEvalKit benchmarks.

    VLMEvalKit produces its own aggregate scores during evaluation, which
    EvalKitGenerationTask writes to eval_kit_metrics.json. Rather than scoring
    samples itself, this class only counts the JSONL entries it is fed through
    update() and reports the pre-computed aggregates read from that JSON file.

    Note: ComputeMetrics calls setup() only on the "_all_" calculator. When
    the data contains ``subset_for_metrics``, extra per-subset calculator
    instances are created but never set up, so the metrics-file path found by
    the "_all_" instance is mirrored into the class-level
    ``_shared_metrics_file`` as a fallback those instances can use.
    """

    # Fallback shared by every instance; see the class docstring for why.
    _shared_metrics_file: Path | None = None

    def __init__(self, **kwargs):
        super().__init__(compute_no_answer=False)
        # Instance-level path, populated by setup() when the file is found.
        self.eval_kit_metrics_file = None

    def setup(self, input_files):
        """Look for eval_kit_metrics.json next to the first input file."""
        if not input_files:
            return
        # input_files look like ['/path/to/eval-results/eval_kit.MMBench_DEV_EN/output.jsonl']
        candidate = Path(input_files[0]).parent / "eval_kit_metrics.json"
        if candidate.exists():
            self.eval_kit_metrics_file = candidate
            EvalKitMetrics._shared_metrics_file = candidate

    def update(self, predictions):
        """Tally the entry; no per-sample metrics are computed here."""
        self.total += 1

    def get_metrics(self):
        """Return the pre-computed VLMEvalKit aggregates in NeMo Skills format."""
        # Prefer the instance path; subset calculators that never saw setup()
        # fall back to the class-level shared path.
        source = self.eval_kit_metrics_file or EvalKitMetrics._shared_metrics_file
        raw = {}
        if source and source.exists():
            with open(source, "rt", encoding="utf-8") as handle:
                raw = json.load(handle)

        aggregated = {"num_entries": self.total}
        # Flatten one level of nesting: numeric leaves of a nested dict
        # (e.g. per-category scores) become "<key>_<sub_key>" entries;
        # top-level numeric values are kept under their own key.
        for name, value in raw.items():
            if isinstance(value, dict):
                for inner_name, inner_value in value.items():
                    if isinstance(inner_value, (int, float)):
                        aggregated[f"{name}_{inner_name}"] = inner_value
            elif isinstance(value, (int, float)):
                aggregated[name] = value

        return {"greedy": aggregated}

    def metrics_to_print(self):
        return None

    def evaluations_to_print(self):
        return ["greedy"]
2 changes: 2 additions & 0 deletions nemo_skills/evaluation/metrics/map_metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
SciCodeMetrics,
SweBenchMetrics,
)
from nemo_skills.evaluation.metrics.eval_kit_metrics import EvalKitMetrics
from nemo_skills.evaluation.metrics.gradingbench_metrics import GradingBenchMetrics
from nemo_skills.evaluation.metrics.hleaa_metrics import HLEAAMetrics
from nemo_skills.evaluation.metrics.icpc_metrics import ICPCMetrics
Expand Down Expand Up @@ -84,6 +85,7 @@
"omniscience": OmniMetrics,
"compute-eval": ComputeEvalMetrics,
"gradingbench": GradingBenchMetrics,
"eval_kit": EvalKitMetrics,
}


Expand Down
3 changes: 2 additions & 1 deletion nemo_skills/evaluation/metrics/translation_metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@
from collections import defaultdict

import numpy as np
from sacrebleu import corpus_bleu

from nemo_skills.evaluation.metrics.base import BaseMetrics, as_float

Expand All @@ -35,6 +34,8 @@ class TranslationMetrics(BaseMetrics):
# TODO: add support for other translation metrics, such as MetricX

def get_metrics(self):
from sacrebleu import corpus_bleu

metrics_dict = {}
for key in self.translation_dict:
src_lang, tgt_lang = key.split("->")
Expand Down
Loading
Loading