From f1b8d407da6340f31dfcbbe855b2130f709b679d Mon Sep 17 00:00:00 2001 From: Nathan Habib Date: Thu, 25 Sep 2025 09:29:49 +0000 Subject: [PATCH 1/2] fix --- examples/model_configs/vllm_model_config.yaml | 4 +- examples/test_tasks.txt | 3 + src/lighteval/main_tasks.py | 6 +- src/lighteval/metrics/metrics_corpus.py | 2 +- src/lighteval/metrics/metrics_sample.py | 2 +- .../tasks/extended/ifbench/instructions.py | 2 +- .../tasks/extended/tiny_benchmarks/main.py | 61 +++++++++---------- 7 files changed, 42 insertions(+), 38 deletions(-) diff --git a/examples/model_configs/vllm_model_config.yaml b/examples/model_configs/vllm_model_config.yaml index 74f0afb92..5dc1d11f4 100644 --- a/examples/model_configs/vllm_model_config.yaml +++ b/examples/model_configs/vllm_model_config.yaml @@ -1,6 +1,6 @@ model_parameters: - model_name: "HuggingFaceTB/SmolLM2-1.7B-Instruct" - revision: "57aa3c6599c53705406c648e7acca7e11dc45ea3" + model_name: "Qwen/Qwen3-4B-Thinking-2507" + revision: "main" dtype: "float16" tensor_parallel_size: 1 data_parallel_size: 1 diff --git a/examples/test_tasks.txt b/examples/test_tasks.txt index 12c8662a9..6e2acd828 100644 --- a/examples/test_tasks.txt +++ b/examples/test_tasks.txt @@ -25,3 +25,6 @@ lighteval|bigbench:temporal_sequences|3 lighteval|bigbench:tracking_shuffled_objects_five_objects|3 lighteval|bigbench:tracking_shuffled_objects_seven_objects|3 test|gsm8k|0 +lighteval|aime25|0 +extended|ifeval|0 +extended|lcb:codegeneration_v4|0 diff --git a/src/lighteval/main_tasks.py b/src/lighteval/main_tasks.py index 62f1129f4..eccfd8678 100644 --- a/src/lighteval/main_tasks.py +++ b/src/lighteval/main_tasks.py @@ -46,7 +46,9 @@ def inspect( from lighteval.tasks.registry import Registry - registry = Registry(custom_tasks=custom_tasks, load_community=True, load_extended=True, load_multilingual=True) + registry = Registry( + tasks=tasks, custom_tasks=custom_tasks, load_community=True, load_extended=True, load_multilingual=True + ) # Loading task task_dict = 
registry.load_tasks() @@ -54,7 +56,7 @@ def inspect( print("-" * 10, name, "-" * 10) if show_config: print("-" * 10, "CONFIG") - task.cfg.print() + task.config.print() for ix, sample in enumerate(task.eval_docs()[: int(num_samples)]): if ix == 0: print("-" * 10, "SAMPLES") diff --git a/src/lighteval/metrics/metrics_corpus.py b/src/lighteval/metrics/metrics_corpus.py index 54b7f9fc6..92c2c574a 100644 --- a/src/lighteval/metrics/metrics_corpus.py +++ b/src/lighteval/metrics/metrics_corpus.py @@ -47,7 +47,7 @@ class CorpusLevelComputation(ABC): @abstractmethod - def compute_corpus(self): + def compute_corpus(self, items): raise NotImplementedError def __str__(self): diff --git a/src/lighteval/metrics/metrics_sample.py b/src/lighteval/metrics/metrics_sample.py index 25b4f68ff..dd59731fe 100644 --- a/src/lighteval/metrics/metrics_sample.py +++ b/src/lighteval/metrics/metrics_sample.py @@ -63,7 +63,7 @@ class SampleLevelComputation(ABC): @abstractmethod - def compute(self, model_response: ModelResponse, doc: Doc, **kwargs): + def compute(self, doc: Doc, model_response: ModelResponse, **kwargs): raise NotImplementedError def __str__(self): diff --git a/src/lighteval/tasks/extended/ifbench/instructions.py b/src/lighteval/tasks/extended/ifbench/instructions.py index 0c4f0a9a0..bdfd3b379 100644 --- a/src/lighteval/tasks/extended/ifbench/instructions.py +++ b/src/lighteval/tasks/extended/ifbench/instructions.py @@ -788,7 +788,7 @@ def check_following(self, value): """Checks if the response only includes words with prime length.""" value = value.translate(str.maketrans("", "", string.punctuation)) words = value.split() - primes = set(2, 3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37, 41, 43, 47, 53, 59, 61, 67, 71, 73, 79, 83, 89, 97) + primes = {2, 3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37, 41, 43, 47, 53, 59, 61, 67, 71, 73, 79, 83, 89, 97} for word in words: if len(word) not in primes: return False diff --git a/src/lighteval/tasks/extended/tiny_benchmarks/main.py 
b/src/lighteval/tasks/extended/tiny_benchmarks/main.py index 44e05d0cc..b91eb4e00 100644 --- a/src/lighteval/tasks/extended/tiny_benchmarks/main.py +++ b/src/lighteval/tasks/extended/tiny_benchmarks/main.py @@ -32,16 +32,16 @@ import numpy as np import requests -from aenum import extend_enum from scipy.optimize import minimize import lighteval.tasks.default_prompts as prompt -from lighteval.metrics.metrics import CorpusLevelMetricGrouping, Metrics +from lighteval.metrics.metrics import CorpusLevelMetricGrouping from lighteval.metrics.metrics_corpus import CorpusLevelComputation from lighteval.metrics.metrics_sample import ExactMatches, LoglikelihoodAcc, SampleLevelComputation from lighteval.metrics.normalizations import gsm8k_normalizer +from lighteval.models.model_output import ModelResponse from lighteval.tasks.lighteval_task import LightevalTaskConfig -from lighteval.tasks.requests import SamplingMethod +from lighteval.tasks.requests import Doc, SamplingMethod # Utility functions @@ -101,18 +101,18 @@ def download(self): with open(path_dld, "wb") as file: file.write(response.content) - def compute(self, **args): + def compute(self, doc: Doc, model_response: ModelResponse, **kwargs) -> float: if self.task == "gsm8k": res = ExactMatches( strip_strings=True, normalize_pred=gsm8k_normalizer, normalize_gold=gsm8k_normalizer - ).compute(**args) + ).compute(doc, model_response, **kwargs) return dict.fromkeys(self.METRICS, res) else: - res = LoglikelihoodAcc().compute(**args) + res = LoglikelihoodAcc().compute(doc, model_response, **kwargs) return dict.fromkeys(self.METRICS, res) - def compute_corpus(self, y_input): - if len(y_input) == self.num_samples and self.estimates is not None: + def compute_corpus(self, items): + if len(items) == self.num_samples and self.estimates is not None: return self.estimates[self.task] # We load the weights for the relevant examples @@ -149,7 +149,7 @@ def compute_corpus(self, y_input): # Creating vector y and estimating theta y = 
np.zeros(N) for i, j in enumerate(seen_examples): - y[j] = y_input[i] + y[j] = items[i] # Getting estimates theta = fit_theta(y, seen_examples, A, B) @@ -175,7 +175,7 @@ def compute_corpus(self, y_input): estimates[scenario]["pirt"] = IRTp estimates[scenario]["gpirt"] = IRTpp - self.num_samples = len(y_input) + self.num_samples = len(items) self.estimates = estimates return estimates[self.task] @@ -238,6 +238,25 @@ def compute_corpus(self, y_input): # }, ] +metrics = {} + +for task_param in task_params: + name = task_param["name"] + if name == "gsm8k": + category = SamplingMethod.GENERATIVE + else: + category = SamplingMethod.LOGPROBS + + metrics[f"tinybench_metric_{name}"] = ( + CorpusLevelMetricGrouping( + metric_name=TinyCorpusAggregator.METRICS, + higher_is_better=dict.fromkeys(TinyCorpusAggregator.METRICS, True), + sample_level_fn=TinyCorpusAggregator(name), + category=category, + corpus_level_fn=TinyCorpusAggregator(name), + ), + ) + TASKS_TABLE = [] for task in task_params: name = task["name"] @@ -256,28 +275,8 @@ def compute_corpus(self, y_input): evaluation_splits=task["evaluation_split"], few_shots_split=None, few_shots_select="random_sampling", - metrics=[f"tinybench_metric_{name}"], + metrics=metrics[f"tinybench_metric_{name}"], generation_size=generation_size, stop_sequence=stop_sequence, ) TASKS_TABLE.append(task) - -# CUSTOM METRIC -for task_param in task_params: - name = task_param["name"] - if name == "gsm8k": - category = SamplingMethod.GENERATIVE - else: - category = SamplingMethod.LOGPROBS - - extend_enum( - Metrics, - f"tinybench_metric_{name}", - CorpusLevelMetricGrouping( - metric_name=TinyCorpusAggregator.METRICS, - higher_is_better=dict.fromkeys(TinyCorpusAggregator.METRICS, True), - sample_level_fn=TinyCorpusAggregator(name), - category=category, - corpus_level_fn=TinyCorpusAggregator(name), - ), - ) From 1921c8b19c3e1f5eea8958ed1a6965e6c94b1bb3 Mon Sep 17 00:00:00 2001 From: Nathan Habib Date: Thu, 25 Sep 2025 09:33:38 +0000 Subject: 
[PATCH 2/2] revert unneeded changes --- examples/model_configs/vllm_model_config.yaml | 4 ++-- examples/test_tasks.txt | 3 --- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/examples/model_configs/vllm_model_config.yaml b/examples/model_configs/vllm_model_config.yaml index 5dc1d11f4..74f0afb92 100644 --- a/examples/model_configs/vllm_model_config.yaml +++ b/examples/model_configs/vllm_model_config.yaml @@ -1,6 +1,6 @@ model_parameters: - model_name: "Qwen/Qwen3-4B-Thinking-2507" - revision: "main" + model_name: "HuggingFaceTB/SmolLM2-1.7B-Instruct" + revision: "57aa3c6599c53705406c648e7acca7e11dc45ea3" dtype: "float16" tensor_parallel_size: 1 data_parallel_size: 1 diff --git a/examples/test_tasks.txt b/examples/test_tasks.txt index 6e2acd828..12c8662a9 100644 --- a/examples/test_tasks.txt +++ b/examples/test_tasks.txt @@ -25,6 +25,3 @@ lighteval|bigbench:temporal_sequences|3 lighteval|bigbench:tracking_shuffled_objects_five_objects|3 lighteval|bigbench:tracking_shuffled_objects_seven_objects|3 test|gsm8k|0 -lighteval|aime25|0 -extended|ifeval|0 -extended|lcb:codegeneration_v4|0