From e15db12cf0df45e268777fc14e971031b3c037d7 Mon Sep 17 00:00:00 2001 From: "clementine@huggingface.co" Date: Sat, 4 Oct 2025 06:35:17 +0000 Subject: [PATCH 1/8] option1 --- src/lighteval/utils/cache_management.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/src/lighteval/utils/cache_management.py b/src/lighteval/utils/cache_management.py index 2059d2843..3e8c0a08a 100644 --- a/src/lighteval/utils/cache_management.py +++ b/src/lighteval/utils/cache_management.py @@ -92,6 +92,8 @@ def __init__(self, model_config: ModelConfig): self.registry = None self.existing_indices = self._load_cached_indices() + # Caching the task_hashes to avoid grabbing the registry all the time + self._task_hashes = {} def _init_registry(self, registry: Registry): self.registry = registry @@ -163,10 +165,15 @@ def _get_task_hash(self, full_task_name: str) -> str: "The task registry was not provided to the cache config. We can't test if the current task has the same hash as the saved tasks." ) return "NO_HASH" - task_suite, task_name, _ = full_task_name.split("|") - task_configs: list[LightevalTaskConfig] = sorted(self.registry.task_to_configs[f"{task_suite}|{task_name}"]) - config_str = "|".join([task_config.__str__(lite=True) for task_config in task_configs]) - return hashlib.sha256(config_str.encode()).hexdigest()[:16] + if full_task_name not in self._task_hashes: + task_suite, task_name, _ = full_task_name.split("|") + task_configs: list[LightevalTaskConfig] = sorted( + self.registry.task_to_configs[f"{task_suite}|{task_name}"] + ) + config_str = "|".join([task_config.__str__(lite=True) for task_config in task_configs]) + task_hash = hashlib.sha256(config_str.encode()).hexdigest()[:16] + self._task_hashes[full_task_name] = task_hash + return self._task_hashes[full_task_name] def get_cache_path(self, task_id: TaskID) -> Path: """Get the file path for a specific task's cache file. From 9d5bd304e0a763d4654aab60947524154fa6e917 Mon Sep 17 00:00:00 2001 From: "clementine@huggingface.co" Date: Sat, 4 Oct 2025 06:45:25 +0000 Subject: [PATCH 2/8] also debugging the judge --- src/lighteval/metrics/metrics_sample.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/src/lighteval/metrics/metrics_sample.py b/src/lighteval/metrics/metrics_sample.py index 25b4f68ff..f8e7696e1 100644 --- a/src/lighteval/metrics/metrics_sample.py +++ b/src/lighteval/metrics/metrics_sample.py @@ -1003,7 +1003,7 @@ def __init__( backend_options=backend_options, ) - def compute(self, responses: list[ModelResponse], docs: list[Doc], **kwargs) -> list: + def compute(self, response: list[ModelResponse], doc: list[Doc], **kwargs) -> list: raise NotImplementedError("This method should be implemented in the subclass.") @@ -1017,12 +1017,14 @@ def __init__(self): short_judge_name="gpt4o", ) - def compute(self, responses: list[ModelResponse], docs: list[Doc], **kwargs) -> list: + def compute(self, response: list[ModelResponse], doc: list[Doc], **kwargs) -> list: """Compute the score of a generative task using a llm as a judge. The generative task can be multiturn with 2 turns max, in that case, we return scores for turn 1 and 2. Also returns user_prompt and judgement which are ignored later by the aggregator. 
""" + docs = as_list(doc) + responses = as_list(response) questions = [formatted_doc.query for formatted_doc in docs] options = [formatted_doc.choices for formatted_doc in docs] golds = [formatted_doc.get_golds()[0] for formatted_doc in docs] @@ -1044,7 +1046,7 @@ def compute(self, responses: list[ModelResponse], docs: list[Doc], **kwargs) -> class JudgeLLMMTBench(JudgeLLM): - def compute(self, model_response: list[ModelResponse], docs: list[Doc], **kwargs): + def compute(self, model_response: list[ModelResponse], doc: list[Doc], **kwargs): """Compute the score of a generative task using a llm as a judge. The generative task can be multiturn with 2 turns max, in that case, we return scores for turn 1 and 2. Also returns user_prompt and judgement @@ -1052,6 +1054,8 @@ def compute(self, model_response: list[ModelResponse], docs: list[Doc], **kwargs """ import json + docs = as_list(doc) + # If we are evaluating a multiturn task, we need to have specific field in the formatted doc questions = [doc.specific["multi_turn_queries"] for doc in docs] golds = [doc.specific.get("reference", None) for doc in docs] @@ -1076,12 +1080,15 @@ def compute(self, model_response: list[ModelResponse], docs: list[Doc], **kwargs class JudgeLLMMixEval(JudgeLLM): - def compute(self, model_responses: list[ModelResponse], docs: list[Doc], **kwargs): + def compute(self, model_response: list[ModelResponse], doc: list[Doc], **kwargs): """Compute the score of a generative task using a llm as a judge. The generative task can be multiturn with 2 turns max, in that case, we return scores for turn 1 and 2. Also returns user_prompt and judgement which are ignored later by the aggregator. """ + docs = as_list(doc) + model_responses = as_list(model_response) + questions = [doc.specific["question"] for doc in docs] options = [doc.choices for doc in docs] golds = [doc.get_golds()[0] for doc in docs] From 213eda86f324b4135aad48cff9bfca55217e5c2e Mon Sep 17 00:00:00 2001 From: "clementine@huggingface.co" Date: Sat, 4 Oct 2025 07:09:55 +0000 Subject: [PATCH 3/8] also debugging the judge --- src/lighteval/metrics/metrics_sample.py | 17 +++++++---------- src/lighteval/tasks/extended/mix_eval/main.py | 4 ++++ 2 files changed, 11 insertions(+), 10 deletions(-) diff --git a/src/lighteval/metrics/metrics_sample.py b/src/lighteval/metrics/metrics_sample.py index f8e7696e1..36aa09c49 100644 --- a/src/lighteval/metrics/metrics_sample.py +++ b/src/lighteval/metrics/metrics_sample.py @@ -1003,7 +1003,8 @@ def __init__( backend_options=backend_options, ) - def compute(self, response: list[ModelResponse], doc: list[Doc], **kwargs) -> list: + def compute(self, **kwargs) -> list: + # When deriving: Use model_responses/docs for batched eval, model_response/doc for non batched eval raise NotImplementedError("This method should be implemented in the subclass.") @@ -1017,14 +1018,12 @@ def __init__(self): short_judge_name="gpt4o", ) - def compute(self, response: list[ModelResponse], doc: list[Doc], **kwargs) -> list: + def compute(self, responses: list[ModelResponse], docs: list[Doc], **kwargs) -> list: """Compute the score of a generative task using a llm as a judge. The generative task can be multiturn with 2 turns max, in that case, we return scores for turn 1 and 2. Also returns user_prompt and judgement which are ignored later by the aggregator. 
""" - docs = as_list(doc) - responses = as_list(response) questions = [formatted_doc.query for formatted_doc in docs] options = [formatted_doc.choices for formatted_doc in docs] golds = [formatted_doc.get_golds()[0] for formatted_doc in docs] @@ -1054,12 +1053,13 @@ def compute(self, model_response: list[ModelResponse], doc: list[Doc], **kwargs) """ import json + model_responses = as_list(model_response) docs = as_list(doc) # If we are evaluating a multiturn task, we need to have specific field in the formatted doc questions = [doc.specific["multi_turn_queries"] for doc in docs] golds = [doc.specific.get("reference", None) for doc in docs] - predictions = [response.text[0] for response in model_response] + predictions = [response.text[0] for response in model_responses] query_context_1 = {"query": questions[0], "context": ""} query_context_2 = {"query": questions[1], "context": predictions[0]} @@ -1080,19 +1080,16 @@ def compute(self, model_response: list[ModelResponse], doc: list[Doc], **kwargs) class JudgeLLMMixEval(JudgeLLM): - def compute(self, model_response: list[ModelResponse], doc: list[Doc], **kwargs): + def compute(self, responses: list[ModelResponse], docs: list[Doc], **kwargs): """Compute the score of a generative task using a llm as a judge. The generative task can be multiturn with 2 turns max, in that case, we return scores for turn 1 and 2. Also returns user_prompt and judgement which are ignored later by the aggregator. """ - docs = as_list(doc) - model_responses = as_list(model_response) - questions = [doc.specific["question"] for doc in docs] options = [doc.choices for doc in docs] golds = [doc.get_golds()[0] for doc in docs] - predictions = [response.text[0] for response in model_responses] + predictions = [response.text[0] for response in responses] scores, messages, judgements = self.judge.evaluate_answer_batch(questions, predictions, options, golds) diff --git a/src/lighteval/tasks/extended/mix_eval/main.py b/src/lighteval/tasks/extended/mix_eval/main.py index 2d9b7569a..e57faa1bd 100644 --- a/src/lighteval/tasks/extended/mix_eval/main.py +++ b/src/lighteval/tasks/extended/mix_eval/main.py @@ -115,6 +115,7 @@ def process_judge_response_freeform_gpt(x): corpus_level_fn={ "judge_score_flow": np.mean, }, + batched_compute=True, ) llm_judge_mixeval_multichoice_gpt_judge = SampleLevelMetricGrouping( @@ -131,6 +132,7 @@ def process_judge_response_freeform_gpt(x): corpus_level_fn={ "judge_score_gpt-3.5": np.mean, }, + batched_compute=True, ) @@ -152,6 +154,7 @@ def mean_dv_5(x): corpus_level_fn={ "judge_score_flow": mean_dv_5, }, + batched_compute=True, ) llm_judge_mixeval_freeform_gpt_judge = SampleLevelMetricGrouping( @@ -168,6 +171,7 @@ def mean_dv_5(x): corpus_level_fn={ "judge_score_gpt-3.5": np.mean, }, + batched_compute=True, ) From fa3ef6293a87878e7b7974279f6065166f8752df Mon Sep 17 00:00:00 2001 From: "clementine@huggingface.co" Date: Sat, 4 Oct 2025 07:19:27 +0000 Subject: [PATCH 4/8] debug --- docs/source/adding-a-new-metric.mdx | 4 ++-- src/lighteval/metrics/metrics_sample.py | 10 +++++----- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/docs/source/adding-a-new-metric.mdx b/docs/source/adding-a-new-metric.mdx index 90b7256a4..9bf02b4f5 100644 --- a/docs/source/adding-a-new-metric.mdx +++ b/docs/source/adding-a-new-metric.mdx @@ -58,7 +58,7 @@ boolean. 
```python def custom_metric(doc: Doc, model_response: ModelResponse) -> bool: - response = model_response.text[0] + response = model_response.final_text[0] return response == doc.choices[doc.gold_index] ``` @@ -68,7 +68,7 @@ If you want to return multiple metrics per sample, you need to return a dictiona ```python def custom_metric(doc: Doc, model_response: ModelResponse) -> dict: - response = model_response.text[0] + response = model_response.final_text[0] return {"accuracy": response == doc.choices[doc.gold_index], "other_metric": 0.5} ``` diff --git a/src/lighteval/metrics/metrics_sample.py b/src/lighteval/metrics/metrics_sample.py index 36aa09c49..083686c4b 100644 --- a/src/lighteval/metrics/metrics_sample.py +++ b/src/lighteval/metrics/metrics_sample.py @@ -1027,7 +1027,7 @@ def compute(self, responses: list[ModelResponse], docs: list[Doc], **kwargs) -> questions = [formatted_doc.query for formatted_doc in docs] options = [formatted_doc.choices for formatted_doc in docs] golds = [formatted_doc.get_golds()[0] for formatted_doc in docs] - predictions = [response.text[0] for response in responses] + predictions = [response.final_text[0] for response in responses] scores, messages, judgements = self.judge.evaluate_answer_batch(questions, predictions, options, golds) @@ -1059,7 +1059,7 @@ def compute(self, model_response: list[ModelResponse], doc: list[Doc], **kwargs) # If we are evaluating a multiturn task, we need to have specific field in the formatted doc questions = [doc.specific["multi_turn_queries"] for doc in docs] golds = [doc.specific.get("reference", None) for doc in docs] - predictions = [response.text[0] for response in model_responses] + predictions = [response.final_text[0] for response in model_responses] query_context_1 = {"query": questions[0], "context": ""} query_context_2 = {"query": questions[1], "context": predictions[0]} @@ -1089,7 +1089,7 @@ def compute(self, responses: list[ModelResponse], docs: list[Doc], **kwargs): questions = [doc.specific["question"] for doc in docs] options = [doc.choices for doc in docs] golds = [doc.get_golds()[0] for doc in docs] - predictions = [response.text[0] for response in responses] + predictions = [response.final_text[0] for response in responses] scores, messages, judgements = self.judge.evaluate_answer_batch(questions, predictions, options, golds) @@ -1098,8 +1098,8 @@ def compute(self, responses: list[ModelResponse], docs: list[Doc], **kwargs): metrics.append( { f"judge_score_{self.short_judge_name}": scores[i], - f"user_prompt_{self.short_judge_name}": messages[i], - f"judgement_{self.short_judge_name}": judgements[i], + # f"user_prompt_{self.short_judge_name}": messages[i], + # f"judgement_{self.short_judge_name}": judgements[i], } ) From 2ae58aafda5e1082cadcdcd1dc21b26a2d5ff4a7 Mon Sep 17 00:00:00 2001 From: "clementine@huggingface.co" Date: Sat, 4 Oct 2025 14:47:34 +0000 Subject: [PATCH 5/8] eval tracker fix 1 --- src/lighteval/logging/evaluation_tracker.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/lighteval/logging/evaluation_tracker.py b/src/lighteval/logging/evaluation_tracker.py index aed32d2f1..976b21c86 100644 --- a/src/lighteval/logging/evaluation_tracker.py +++ b/src/lighteval/logging/evaluation_tracker.py @@ -63,12 +63,15 @@ class EnhancedJSONEncoder(json.JSONEncoder): Notably manages the json encoding of dataclasses. 
""" - def default(self, o): + def default(self, o): # noqa : C901 if is_dataclass(o): try: return asdict(o) # type: ignore except Exception: - return str(o) + try: + return o.__dict__ + except Exception: + return str(o) if callable(o): if hasattr(o, "__name__"): return o.__name__ From 257ff3548dc9f5082797a08ec5f9819404842f8e Mon Sep 17 00:00:00 2001 From: "clementine@huggingface.co" Date: Sat, 4 Oct 2025 14:48:08 +0000 Subject: [PATCH 6/8] likely fix for the GSM+ issue --- src/lighteval/tasks/lighteval_task.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/lighteval/tasks/lighteval_task.py b/src/lighteval/tasks/lighteval_task.py index c54afb5fe..9cde473a8 100644 --- a/src/lighteval/tasks/lighteval_task.py +++ b/src/lighteval/tasks/lighteval_task.py @@ -295,6 +295,8 @@ def _get_docs_from_split(self, splits: list[str], few_shots=False) -> list[Doc]: # Some tasks require to know which is the current item index in order to apply a different prompt template item["__index"] = ix doc = self.formatter(item, self.name) + if doc is None or doc == []: + continue doc.id = str(ix) # Transfer task-level generation parameters to the document From a83d84abbb6d2a2ed9a607481c52c64713e33d82 Mon Sep 17 00:00:00 2001 From: "clementine@huggingface.co" Date: Sat, 4 Oct 2025 15:03:43 +0000 Subject: [PATCH 7/8] stringify model judge + change max_length to what's actually passed instead of setting a bunch of overwrites --- src/lighteval/metrics/utils/llm_as_judge.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/src/lighteval/metrics/utils/llm_as_judge.py b/src/lighteval/metrics/utils/llm_as_judge.py index 7e1b775c9..2a4566df1 100644 --- a/src/lighteval/metrics/utils/llm_as_judge.py +++ b/src/lighteval/metrics/utils/llm_as_judge.py @@ -97,7 +97,7 @@ def __init__( judge_backend: Literal["litellm", "openai", "transformers", "tgi", "vllm", "inference-providers"], url: str | None = None, api_key: str | None = None, - max_tokens: int = 512, + max_tokens: int | None = None, response_format: BaseModel = None, hf_provider: Optional[ Literal[ @@ -172,7 +172,7 @@ def __lazy_load_client(self): # noqa: C901 self.sampling_params = SamplingParams(temperature=0.8, top_p=0.95, max_tokens=self.max_tokens) self.tokenizer = get_tokenizer(self.model, tokenizer_mode="auto") - self.pipe = LLM(model=self.model, max_model_len=2048, gpu_memory_utilization=0.5, dtype="float16") + self.pipe = LLM(model=self.model, gpu_memory_utilization=0.5, dtype="float16") return self.__call_vllm case "transformers": @@ -300,7 +300,7 @@ def __call_vllm(self, prompt): outputs = [output.outputs[0].text for output in output] return outputs - def __call_litellm(self, prompts): + def __call_litellm(self, prompts): # noqa: C901 import litellm if self.backend_options.caching: @@ -324,10 +324,11 @@ def __call_api(prompt): kwargs = { "model": self.model, "messages": prompt, - "max_tokens": max_new_tokens, "n": 1, "caching": True, } + if max_new_tokens is not None: + kwargs["max_tokens"] = (max_new_tokens,) response = litellm.completion(**kwargs) text = response.choices[0].message.content @@ -412,7 +413,7 @@ def __call_api(self, prompt): model=self.model, messages=as_list(prompt), response_format=self.response_format, - max_tokens=4096, + max_tokens=self.max_tokens, temperature=0.0, n=1, ) @@ -425,7 +426,7 @@ def __call_api(self, prompt): model=self.model, messages=as_list(prompt), response_format=self.response_format, - max_tokens=512, + max_tokens=self.max_tokens, n=1, ) text = response.choices[0].message.content @@ 
@@ -438,3 +439,6 @@ def __call_api(self, prompt):
                 time.sleep(self.API_RETRY_SLEEP)
         raise Exception("Failed to get response from the API")
+
+    def __str__(self) -> str:
+        return f"Model: {self.model}, Judge Backend: {self.backend}, URL: {self.url}"

From 32bb552d1571940aac530802209984915f80559f Mon Sep 17 00:00:00 2001
From: "clementine@huggingface.co"
Date: Sat, 4 Oct 2025 15:15:11 +0000
Subject: [PATCH 8/8] more memory for flow judge

---
 src/lighteval/metrics/utils/llm_as_judge.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/lighteval/metrics/utils/llm_as_judge.py b/src/lighteval/metrics/utils/llm_as_judge.py
index 2a4566df1..e30ec0449 100644
--- a/src/lighteval/metrics/utils/llm_as_judge.py
+++ b/src/lighteval/metrics/utils/llm_as_judge.py
@@ -172,7 +172,7 @@ def __lazy_load_client(self):  # noqa: C901
                 self.sampling_params = SamplingParams(temperature=0.8, top_p=0.95, max_tokens=self.max_tokens)
                 self.tokenizer = get_tokenizer(self.model, tokenizer_mode="auto")
-                self.pipe = LLM(model=self.model, gpu_memory_utilization=0.5, dtype="float16")
+                self.pipe = LLM(model=self.model, gpu_memory_utilization=0.8, dtype="float16")
                 return self.__call_vllm

             case "transformers":