From f1b8d407da6340f31dfcbbe855b2130f709b679d Mon Sep 17 00:00:00 2001 From: Nathan Habib Date: Thu, 25 Sep 2025 09:29:49 +0000 Subject: [PATCH 1/2] fix --- examples/model_configs/vllm_model_config.yaml | 4 +- examples/test_tasks.txt | 3 + src/lighteval/main_tasks.py | 6 +- src/lighteval/metrics/metrics_corpus.py | 2 +- src/lighteval/metrics/metrics_sample.py | 2 +- .../tasks/extended/ifbench/instructions.py | 2 +- .../tasks/extended/tiny_benchmarks/main.py | 61 +++++++++---------- 7 files changed, 42 insertions(+), 38 deletions(-) diff --git a/examples/model_configs/vllm_model_config.yaml b/examples/model_configs/vllm_model_config.yaml index 74f0afb92..5dc1d11f4 100644 --- a/examples/model_configs/vllm_model_config.yaml +++ b/examples/model_configs/vllm_model_config.yaml @@ -1,6 +1,6 @@ model_parameters: - model_name: "HuggingFaceTB/SmolLM2-1.7B-Instruct" - revision: "57aa3c6599c53705406c648e7acca7e11dc45ea3" + model_name: "Qwen/Qwen3-4B-Thinking-2507" + revision: "main" dtype: "float16" tensor_parallel_size: 1 data_parallel_size: 1 diff --git a/examples/test_tasks.txt b/examples/test_tasks.txt index 12c8662a9..6e2acd828 100644 --- a/examples/test_tasks.txt +++ b/examples/test_tasks.txt @@ -25,3 +25,6 @@ lighteval|bigbench:temporal_sequences|3 lighteval|bigbench:tracking_shuffled_objects_five_objects|3 lighteval|bigbench:tracking_shuffled_objects_seven_objects|3 test|gsm8k|0 +lighteval|aime25|0 +extended|ifeval|0 +extended|lcb:codegeneration_v4|0 diff --git a/src/lighteval/main_tasks.py b/src/lighteval/main_tasks.py index 62f1129f4..eccfd8678 100644 --- a/src/lighteval/main_tasks.py +++ b/src/lighteval/main_tasks.py @@ -46,7 +46,9 @@ def inspect( from lighteval.tasks.registry import Registry - registry = Registry(custom_tasks=custom_tasks, load_community=True, load_extended=True, load_multilingual=True) + registry = Registry( + tasks=tasks, custom_tasks=custom_tasks, load_community=True, load_extended=True, load_multilingual=True + ) # Loading task task_dict = 
registry.load_tasks() @@ -54,7 +56,7 @@ def inspect( print("-" * 10, name, "-" * 10) if show_config: print("-" * 10, "CONFIG") - task.cfg.print() + task.config.print() for ix, sample in enumerate(task.eval_docs()[: int(num_samples)]): if ix == 0: print("-" * 10, "SAMPLES") diff --git a/src/lighteval/metrics/metrics_corpus.py b/src/lighteval/metrics/metrics_corpus.py index 54b7f9fc6..92c2c574a 100644 --- a/src/lighteval/metrics/metrics_corpus.py +++ b/src/lighteval/metrics/metrics_corpus.py @@ -47,7 +47,7 @@ class CorpusLevelComputation(ABC): @abstractmethod - def compute_corpus(self): + def compute_corpus(self, items): raise NotImplementedError def __str__(self): diff --git a/src/lighteval/metrics/metrics_sample.py b/src/lighteval/metrics/metrics_sample.py index 25b4f68ff..dd59731fe 100644 --- a/src/lighteval/metrics/metrics_sample.py +++ b/src/lighteval/metrics/metrics_sample.py @@ -63,7 +63,7 @@ class SampleLevelComputation(ABC): @abstractmethod - def compute(self, model_response: ModelResponse, doc: Doc, **kwargs): + def compute(self, doc: Doc, model_response: ModelResponse, **kwargs): raise NotImplementedError def __str__(self): diff --git a/src/lighteval/tasks/extended/ifbench/instructions.py b/src/lighteval/tasks/extended/ifbench/instructions.py index 0c4f0a9a0..bdfd3b379 100644 --- a/src/lighteval/tasks/extended/ifbench/instructions.py +++ b/src/lighteval/tasks/extended/ifbench/instructions.py @@ -788,7 +788,7 @@ def check_following(self, value): """Checks if the response only includes words with prime length.""" value = value.translate(str.maketrans("", "", string.punctuation)) words = value.split() - primes = set(2, 3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37, 41, 43, 47, 53, 59, 61, 67, 71, 73, 79, 83, 89, 97) + primes = {2, 3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37, 41, 43, 47, 53, 59, 61, 67, 71, 73, 79, 83, 89, 97} for word in words: if len(word) not in primes: return False diff --git a/src/lighteval/tasks/extended/tiny_benchmarks/main.py 
b/src/lighteval/tasks/extended/tiny_benchmarks/main.py index 44e05d0cc..b91eb4e00 100644 --- a/src/lighteval/tasks/extended/tiny_benchmarks/main.py +++ b/src/lighteval/tasks/extended/tiny_benchmarks/main.py @@ -32,16 +32,16 @@ import numpy as np import requests -from aenum import extend_enum from scipy.optimize import minimize import lighteval.tasks.default_prompts as prompt -from lighteval.metrics.metrics import CorpusLevelMetricGrouping, Metrics +from lighteval.metrics.metrics import CorpusLevelMetricGrouping from lighteval.metrics.metrics_corpus import CorpusLevelComputation from lighteval.metrics.metrics_sample import ExactMatches, LoglikelihoodAcc, SampleLevelComputation from lighteval.metrics.normalizations import gsm8k_normalizer +from lighteval.models.model_output import ModelResponse from lighteval.tasks.lighteval_task import LightevalTaskConfig -from lighteval.tasks.requests import SamplingMethod +from lighteval.tasks.requests import Doc, SamplingMethod # Utility functions @@ -101,18 +101,18 @@ def download(self): with open(path_dld, "wb") as file: file.write(response.content) - def compute(self, **args): + def compute(self, doc: Doc, model_response: ModelResponse, **kwargs) -> float: if self.task == "gsm8k": res = ExactMatches( strip_strings=True, normalize_pred=gsm8k_normalizer, normalize_gold=gsm8k_normalizer - ).compute(**args) + ).compute(doc, model_response, **kwargs) return dict.fromkeys(self.METRICS, res) else: - res = LoglikelihoodAcc().compute(**args) + res = LoglikelihoodAcc().compute(doc, model_response, **kwargs) return dict.fromkeys(self.METRICS, res) - def compute_corpus(self, y_input): - if len(y_input) == self.num_samples and self.estimates is not None: + def compute_corpus(self, items): + if len(items) == self.num_samples and self.estimates is not None: return self.estimates[self.task] # We load the weights for the relevant examples @@ -149,7 +149,7 @@ def compute_corpus(self, y_input): # Creating vector y and estimating theta y = 
np.zeros(N) for i, j in enumerate(seen_examples): - y[j] = y_input[i] + y[j] = items[i] # Getting estimates theta = fit_theta(y, seen_examples, A, B) @@ -175,7 +175,7 @@ def compute_corpus(self, y_input): estimates[scenario]["pirt"] = IRTp estimates[scenario]["gpirt"] = IRTpp - self.num_samples = len(y_input) + self.num_samples = len(items) self.estimates = estimates return estimates[self.task] @@ -238,6 +238,25 @@ def compute_corpus(self, y_input): # }, ] +metrics = {} + +for task_param in task_params: + name = task_param["name"] + if name == "gsm8k": + category = SamplingMethod.GENERATIVE + else: + category = SamplingMethod.LOGPROBS + + metrics[f"tinybench_metric_{name}"] = ( + CorpusLevelMetricGrouping( + metric_name=TinyCorpusAggregator.METRICS, + higher_is_better=dict.fromkeys(TinyCorpusAggregator.METRICS, True), + sample_level_fn=TinyCorpusAggregator(name), + category=category, + corpus_level_fn=TinyCorpusAggregator(name), + ), + ) + TASKS_TABLE = [] for task in task_params: name = task["name"] @@ -256,28 +275,8 @@ def compute_corpus(self, y_input): evaluation_splits=task["evaluation_split"], few_shots_split=None, few_shots_select="random_sampling", - metrics=[f"tinybench_metric_{name}"], + metrics=metrics[f"tinybench_metric_{name}"], generation_size=generation_size, stop_sequence=stop_sequence, ) TASKS_TABLE.append(task) - -# CUSTOM METRIC -for task_param in task_params: - name = task_param["name"] - if name == "gsm8k": - category = SamplingMethod.GENERATIVE - else: - category = SamplingMethod.LOGPROBS - - extend_enum( - Metrics, - f"tinybench_metric_{name}", - CorpusLevelMetricGrouping( - metric_name=TinyCorpusAggregator.METRICS, - higher_is_better=dict.fromkeys(TinyCorpusAggregator.METRICS, True), - sample_level_fn=TinyCorpusAggregator(name), - category=category, - corpus_level_fn=TinyCorpusAggregator(name), - ), - ) From 1921c8b19c3e1f5eea8958ed1a6965e6c94b1bb3 Mon Sep 17 00:00:00 2001 From: Nathan Habib Date: Thu, 25 Sep 2025 09:33:38 +0000 Subject: 
[PATCH 2/2] revert unneeded changes --- examples/model_configs/vllm_model_config.yaml | 4 ++-- examples/test_tasks.txt | 3 --- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/examples/model_configs/vllm_model_config.yaml b/examples/model_configs/vllm_model_config.yaml index 5dc1d11f4..74f0afb92 100644 --- a/examples/model_configs/vllm_model_config.yaml +++ b/examples/model_configs/vllm_model_config.yaml @@ -1,6 +1,6 @@ model_parameters: - model_name: "Qwen/Qwen3-4B-Thinking-2507" - revision: "main" + model_name: "HuggingFaceTB/SmolLM2-1.7B-Instruct" + revision: "57aa3c6599c53705406c648e7acca7e11dc45ea3" dtype: "float16" tensor_parallel_size: 1 data_parallel_size: 1 diff --git a/examples/test_tasks.txt b/examples/test_tasks.txt index 6e2acd828..12c8662a9 100644 --- a/examples/test_tasks.txt +++ b/examples/test_tasks.txt @@ -25,6 +25,3 @@ lighteval|bigbench:temporal_sequences|3 lighteval|bigbench:tracking_shuffled_objects_five_objects|3 lighteval|bigbench:tracking_shuffled_objects_seven_objects|3 test|gsm8k|0 -lighteval|aime25|0 -extended|ifeval|0 -extended|lcb:codegeneration_v4|0