Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
42 changes: 42 additions & 0 deletions nemo_skills/dataset/eval_kit/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# VLMEvalKit integration module.
# Benchmarks are referenced as eval_kit.<VLMEvalKit_dataset_name>, e.g. eval_kit.MMBench_DEV_EN
# The sub-benchmark name after eval_kit. is dynamically resolved and passed to VLMEvalKit.

# Generation task module the pipeline imports to run this benchmark family.
GENERATION_MODULE = "nemo_skills.inference.eval.eval_kit"
# Key into the metrics class map (mapped to EvalKitMetrics in map_metrics).
METRICS_TYPE = "eval_kit"
# No default extra CLI arguments; per-benchmark args are produced by
# get_extra_generation_args() below.
GENERATION_ARGS = ""
NUM_SAMPLES = 0  # VLMEvalKit inference is deterministic; no random seeds

# No JSONL input file; VLMEvalKit manages its own data via build_dataset()
SKIP_INPUT_FILE = True

# Note: SELF_CONTAINED_TASK is NOT set here because it depends on model_type.
# For mcore mode (Megatron in-process), the pipeline sets self_contained_task=True
# at runtime based on ++model_type=mcore in extra_arguments.
# For vllm mode, the standard NeMo Skills server/client flow is used.

def get_extra_generation_args(benchmark):
    """Return extra generation args for the given benchmark name.

    The part after the first dot is the VLMEvalKit dataset name
    (e.g. eval_kit.MMBench_DEV_EN -> ++vlm_dataset=MMBench_DEV_EN);
    a name without a dot yields no extra args.
    """
    _, dot, dataset_name = benchmark.partition(".")
    if not dot:
        return ""
    return f" ++vlm_dataset={dataset_name} "
18 changes: 17 additions & 1 deletion nemo_skills/dataset/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,21 @@ def _get_dataset_module_from_cluster(cluster_config, mounted_path):


def get_default_dataset_module(dataset, data_dir=None, cluster_config=None):
"""Return (dataset_module, data_path, is_on_cluster)."""
is_on_cluster = False

# For dotted names like eval_kit.MMBench_DEV_EN, import the parent package.
# The sub-benchmark part is handled by the module's get_extra_generation_args().
if dataset.startswith("eval_kit."):
dataset_module = importlib.import_module("nemo_skills.dataset.eval_kit")
if data_dir is None:
data_path = "/nemo_run/code/nemo_skills/dataset"
else:
data_path = data_dir
if cluster_config is not None and cluster_config["executor"] == "slurm":
is_on_cluster = True
return dataset_module, data_path, is_on_cluster

if data_dir is None:
data_path = "/nemo_run/code/nemo_skills/dataset"
dataset_module = importlib.import_module(f"nemo_skills.dataset.{dataset}")
Expand Down Expand Up @@ -121,9 +135,11 @@ def get_dataset_module(dataset, data_dir=None, cluster_config=None, extra_datase
    1. data_dir (or `nemo_skills.dataset` if None) folder
    2. extra_datasets parameter if defined
    3. `NEMO_SKILLS_EXTRA_DATASETS` environment variable

Returns (module, data_path, is_on_cluster).
"""
try:
dataset_module, data_path, is_on_cluster = get_default_dataset_module(dataset, data_dir, cluster_config)
return get_default_dataset_module(dataset, data_dir, cluster_config)
except ModuleNotFoundError:
try:
dataset = dataset.replace(".", "/")
Expand Down
8 changes: 6 additions & 2 deletions nemo_skills/evaluation/evaluator/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,10 @@
eval_livebench_coding,
eval_livecodebench_pro,
)
from nemo_skills.evaluation.evaluator.compute_eval import ComputeEvalEvaluator
try:
from nemo_skills.evaluation.evaluator.compute_eval import ComputeEvalEvaluator
except ImportError:
ComputeEvalEvaluator = None
Comment on lines +30 to +33
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟡 Minor

Silent import suppression will produce a confusing error when compute-eval is actually requested.

If the dependency is missing and a user requests the "compute-eval" evaluator, they'll see "Evaluator class not found for type: compute-eval" (from get_evaluator_class) instead of a clear message about the missing import. Consider deferring the error to the point of use so it's actionable.

Suggested approach
-try:
-    from nemo_skills.evaluation.evaluator.compute_eval import ComputeEvalEvaluator
-except ImportError:
-    ComputeEvalEvaluator = None
+_compute_eval_import_error = None
+try:
+    from nemo_skills.evaluation.evaluator.compute_eval import ComputeEvalEvaluator
+except ImportError as e:
+    ComputeEvalEvaluator = None
+    _compute_eval_import_error = e

Then in get_evaluator_class (or at registration lookup time), surface _compute_eval_import_error when eval_type == "compute-eval" and the class is None.

As per coding guidelines, "Do not catch exceptions when they are not normally expected to be raised; let code fail with clear errors instead of silently misbehaving."

📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
try:
from nemo_skills.evaluation.evaluator.compute_eval import ComputeEvalEvaluator
except ImportError:
ComputeEvalEvaluator = None
_compute_eval_import_error = None
try:
from nemo_skills.evaluation.evaluator.compute_eval import ComputeEvalEvaluator
except ImportError as e:
ComputeEvalEvaluator = None
_compute_eval_import_error = e
🤖 Prompt for AI Agents
In `@nemo_skills/evaluation/evaluator/__init__.py` around lines 30 - 33, The
current top-level try/except hides import errors for ComputeEvalEvaluator;
instead capture the ImportError into a module-level variable (e.g.
_compute_eval_import_error) and set ComputeEvalEvaluator = None, then in
get_evaluator_class (or the evaluator registration lookup) check if eval_type ==
"compute-eval" and if ComputeEvalEvaluator is None raise a clear ImportError
that includes _compute_eval_import_error; this defers the failure to the point
of use and gives an actionable message when someone requests the "compute-eval"
evaluator.

from nemo_skills.evaluation.evaluator.icpc import ICPCEvaluator
from nemo_skills.evaluation.evaluator.ifbench import eval_ifbench
from nemo_skills.evaluation.evaluator.ifeval import eval_if
Expand Down Expand Up @@ -71,8 +74,9 @@
"icpc": ICPCEvaluator,
"audio": AudioEvaluator,
"bird": BirdEvaluator,
"compute-eval": ComputeEvalEvaluator,
}
if ComputeEvalEvaluator is not None:
EVALUATOR_CLASS_MAP["compute-eval"] = ComputeEvalEvaluator

# Validation: Ensure no overlap between class and function maps
_class_types = set(EVALUATOR_CLASS_MAP.keys())
Expand Down
24 changes: 19 additions & 5 deletions nemo_skills/evaluation/evaluator/audio.py
Original file line number Diff line number Diff line change
Expand Up @@ -496,16 +496,23 @@ def evaluate_sample(sample: dict[str, Any], config: AudioEvaluatorConfig) -> dic
if config.strip_helpful_prefixes:
generation = strip_helpful_prefixes(generation)

if task_type in ["ASR", "ASR-PC", "ASR_LEADERBOARD", "AST", "Translation", "CER"] and not generation:
# Normalise AudioBench speech-translation task types (ST-EN-ZH → Translation)
_ASR_TYPES = {"ASR", "ASR-ZH", "ASR-PC", "ASR_LEADERBOARD"}
_TRANSLATION_TYPES = {"AST", "Translation"}
# AudioBench speech translation types: ST-{src}-{tgt}
if task_type.startswith("ST-"):
_TRANSLATION_TYPES.add(task_type)

if task_type in (_ASR_TYPES | _TRANSLATION_TYPES | {"CER"}) and not generation:
base = {
"is_correct": False,
"error": "missing_generation",
}
if task_type in ["AST", "Translation"]:
if task_type in _TRANSLATION_TYPES:
return {**base, "bleu": 0.0}
if task_type == "CER":
return {**base, "cer": 1.0}
# ASR / ASR-PC
# ASR / ASR-PC / ASR-ZH
return {**base, "wer": 1.0}

if task_type == "ASR-PC":
Expand All @@ -518,7 +525,7 @@ def evaluate_sample(sample: dict[str, Any], config: AudioEvaluatorConfig) -> dic
)
updates.update(metrics)

elif task_type == "ASR":
elif task_type in {"ASR", "ASR-ZH"}:
mode = config.normalization_mode if config.apply_whisper_normalization else "none"
metrics = evaluate_asr(expected_answer, generation, normalization_mode=mode)
updates.update(metrics)
Expand All @@ -530,7 +537,7 @@ def evaluate_sample(sample: dict[str, Any], config: AudioEvaluatorConfig) -> dic
metrics = evaluate_asr(expected_answer, generation, normalization_mode=mode)
updates.update(metrics)

elif task_type in ["AST", "Translation"]:
elif task_type in _TRANSLATION_TYPES:
metrics = evaluate_translation(expected_answer, generation)
updates.update(metrics)

Expand All @@ -547,6 +554,13 @@ def evaluate_sample(sample: dict[str, Any], config: AudioEvaluatorConfig) -> dic
metrics = evaluate_pc_rate(expected_answer, generation)
updates.update(metrics)

elif task_type == "MathQA":
# AudioBench MathQA: exact string match after normalization
gen_norm = generation.strip().lower()
ref_norm = expected_answer.strip().lower()
updates["is_correct"] = gen_norm == ref_norm
updates["predicted_answer"] = generation

else:
if "requires_judge" not in sample:
updates["requires_judge"] = True
Expand Down
2 changes: 1 addition & 1 deletion nemo_skills/evaluation/metrics/compute_metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ def __init__(
self.metric_type = metric_type
self.max_seq_len = max_seq_len
if self.metric_type is None:
benchmark_module, _, _ = get_dataset_module(
benchmark_module, *_ = get_dataset_module(
benchmark,
data_dir=data_dir,
cluster_config=cluster_config,
Expand Down
93 changes: 93 additions & 0 deletions nemo_skills/evaluation/metrics/eval_kit_metrics.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import json
from collections import defaultdict
from pathlib import Path

from nemo_skills.evaluation.metrics.base import BaseMetrics


class EvalKitMetrics(BaseMetrics):
    """Metrics for VLMEvalKit benchmarks.

    VLMEvalKit produces its own aggregate scores during evaluation, which
    EvalKitGenerationTask writes to eval_kit_metrics.json. Rather than scoring
    samples itself, this class only counts the JSONL entries it is fed through
    update() and reports the pre-computed aggregates read from that JSON file.

    Note: ComputeMetrics calls setup() only on the "_all_" calculator. When
    the data contains ``subset_for_metrics``, extra per-subset calculator
    instances are created but never set up, so the metrics-file path found by
    the "_all_" instance is mirrored into the class-level
    ``_shared_metrics_file`` as a fallback those instances can use.
    """

    # Fallback shared by every instance; see the class docstring for why.
    _shared_metrics_file: Path | None = None

    def __init__(self, **kwargs):
        super().__init__(compute_no_answer=False)
        # Instance-level path, populated by setup() when the file is found.
        self.eval_kit_metrics_file = None

    def setup(self, input_files):
        """Look for eval_kit_metrics.json next to the first input file."""
        if not input_files:
            return
        # input_files look like ['/path/to/eval-results/eval_kit.MMBench_DEV_EN/output.jsonl']
        candidate = Path(input_files[0]).parent / "eval_kit_metrics.json"
        if candidate.exists():
            self.eval_kit_metrics_file = candidate
            EvalKitMetrics._shared_metrics_file = candidate

    def update(self, predictions):
        """Tally the entry; no per-sample metrics are computed here."""
        self.total += 1

    def get_metrics(self):
        """Return the pre-computed VLMEvalKit aggregates in NeMo Skills format."""
        # Prefer the instance path; subset calculators that never saw setup()
        # fall back to the class-level shared path.
        source = self.eval_kit_metrics_file or EvalKitMetrics._shared_metrics_file
        raw = {}
        if source and source.exists():
            with open(source, "rt", encoding="utf-8") as handle:
                raw = json.load(handle)

        aggregated = {"num_entries": self.total}
        # Flatten one level of nesting: numeric leaves of a nested dict
        # (e.g. per-category scores) become "<key>_<sub_key>" entries;
        # top-level numeric values are kept under their own key.
        for name, value in raw.items():
            if isinstance(value, dict):
                for inner_name, inner_value in value.items():
                    if isinstance(inner_value, (int, float)):
                        aggregated[f"{name}_{inner_name}"] = inner_value
            elif isinstance(value, (int, float)):
                aggregated[name] = value

        return {"greedy": aggregated}

    def metrics_to_print(self):
        return None

    def evaluations_to_print(self):
        return ["greedy"]
2 changes: 2 additions & 0 deletions nemo_skills/evaluation/metrics/map_metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
SciCodeMetrics,
SweBenchMetrics,
)
from nemo_skills.evaluation.metrics.eval_kit_metrics import EvalKitMetrics
from nemo_skills.evaluation.metrics.gradingbench_metrics import GradingBenchMetrics
from nemo_skills.evaluation.metrics.hleaa_metrics import HLEAAMetrics
from nemo_skills.evaluation.metrics.icpc_metrics import ICPCMetrics
Expand Down Expand Up @@ -84,6 +85,7 @@
"omniscience": OmniMetrics,
"compute-eval": ComputeEvalMetrics,
"gradingbench": GradingBenchMetrics,
"eval_kit": EvalKitMetrics,
}


Expand Down
3 changes: 2 additions & 1 deletion nemo_skills/evaluation/metrics/translation_metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@
from collections import defaultdict

import numpy as np
from sacrebleu import corpus_bleu

from nemo_skills.evaluation.metrics.base import BaseMetrics, as_float

Expand All @@ -35,6 +34,8 @@ class TranslationMetrics(BaseMetrics):
# TODO: add support for other translation metrics, such as MetricX

def get_metrics(self):
from sacrebleu import corpus_bleu

metrics_dict = {}
for key in self.translation_dict:
src_lang, tgt_lang = key.split("->")
Expand Down
Loading
Loading