From e15db12cf0df45e268777fc14e971031b3c037d7 Mon Sep 17 00:00:00 2001 From: "clementine@huggingface.co" Date: Sat, 4 Oct 2025 06:35:17 +0000 Subject: [PATCH 1/8] option1 --- src/lighteval/utils/cache_management.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/src/lighteval/utils/cache_management.py b/src/lighteval/utils/cache_management.py index 2059d2843..3e8c0a08a 100644 --- a/src/lighteval/utils/cache_management.py +++ b/src/lighteval/utils/cache_management.py @@ -92,6 +92,8 @@ def __init__(self, model_config: ModelConfig): self.registry = None self.existing_indices = self._load_cached_indices() + # Caching the task_hashes to avoid grabbing the registry all the time + self._task_hashes = {} def _init_registry(self, registry: Registry): self.registry = registry @@ -163,10 +165,15 @@ def _get_task_hash(self, full_task_name: str) -> str: "The task registry was not provided to the cache config. We can't test if the current task has the same hash as the saved tasks." ) return "NO_HASH" - task_suite, task_name, _ = full_task_name.split("|") - task_configs: list[LightevalTaskConfig] = sorted(self.registry.task_to_configs[f"{task_suite}|{task_name}"]) - config_str = "|".join([task_config.__str__(lite=True) for task_config in task_configs]) - return hashlib.sha256(config_str.encode()).hexdigest()[:16] + if full_task_name not in self._task_hashes: + task_suite, task_name, _ = full_task_name.split("|") + task_configs: list[LightevalTaskConfig] = sorted( + self.registry.task_to_configs[f"{task_suite}|{task_name}"] + ) + config_str = "|".join([task_config.__str__(lite=True) for task_config in task_configs]) + task_hash = hashlib.sha256(config_str.encode()).hexdigest()[:16] + self._task_hashes[full_task_name] = task_hash + return self._task_hashes[full_task_name] def get_cache_path(self, task_id: TaskID) -> Path: """Get the file path for a specific task's cache file. From 9d5bd304e0a763d4654aab60947524154fa6e917 Mon Sep 17 00:00:00 2001 From: "clementine@huggingface.co" Date: Sat, 4 Oct 2025 06:45:25 +0000 Subject: [PATCH 2/8] also debugging the judge --- src/lighteval/metrics/metrics_sample.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/src/lighteval/metrics/metrics_sample.py b/src/lighteval/metrics/metrics_sample.py index 25b4f68ff..f8e7696e1 100644 --- a/src/lighteval/metrics/metrics_sample.py +++ b/src/lighteval/metrics/metrics_sample.py @@ -1003,7 +1003,7 @@ def __init__( backend_options=backend_options, ) - def compute(self, responses: list[ModelResponse], docs: list[Doc], **kwargs) -> list: + def compute(self, response: list[ModelResponse], doc: list[Doc], **kwargs) -> list: raise NotImplementedError("This method should be implemented in the subclass.") @@ -1017,12 +1017,14 @@ def __init__(self): short_judge_name="gpt4o", ) - def compute(self, responses: list[ModelResponse], docs: list[Doc], **kwargs) -> list: + def compute(self, response: list[ModelResponse], doc: list[Doc], **kwargs) -> list: """Compute the score of a generative task using a llm as a judge. The generative task can be multiturn with 2 turns max, in that case, we return scores for turn 1 and 2. Also returns user_prompt and judgement which are ignored later by the aggregator. 
""" + docs = as_list(doc) + responses = as_list(response) questions = [formatted_doc.query for formatted_doc in docs] options = [formatted_doc.choices for formatted_doc in docs] golds = [formatted_doc.get_golds()[0] for formatted_doc in docs] @@ -1044,7 +1046,7 @@ def compute(self, responses: list[ModelResponse], docs: list[Doc], **kwargs) -> class JudgeLLMMTBench(JudgeLLM): - def compute(self, model_response: list[ModelResponse], docs: list[Doc], **kwargs): + def compute(self, model_response: list[ModelResponse], doc: list[Doc], **kwargs): """Compute the score of a generative task using a llm as a judge. The generative task can be multiturn with 2 turns max, in that case, we return scores for turn 1 and 2. Also returns user_prompt and judgement @@ -1052,6 +1054,8 @@ def compute(self, model_response: list[ModelResponse], docs: list[Doc], **kwargs """ import json + docs = as_list(doc) + # If we are evaluating a multiturn task, we need to have specific field in the formatted doc questions = [doc.specific["multi_turn_queries"] for doc in docs] golds = [doc.specific.get("reference", None) for doc in docs] @@ -1076,12 +1080,15 @@ def compute(self, model_response: list[ModelResponse], docs: list[Doc], **kwargs class JudgeLLMMixEval(JudgeLLM): - def compute(self, model_responses: list[ModelResponse], docs: list[Doc], **kwargs): + def compute(self, model_response: list[ModelResponse], doc: list[Doc], **kwargs): """Compute the score of a generative task using a llm as a judge. The generative task can be multiturn with 2 turns max, in that case, we return scores for turn 1 and 2. Also returns user_prompt and judgement which are ignored later by the aggregator. """ + docs = as_list(doc) + model_responses = as_list(model_response) + questions = [doc.specific["question"] for doc in docs] options = [doc.choices for doc in docs] golds = [doc.get_golds()[0] for doc in docs] From 213eda86f324b4135aad48cff9bfca55217e5c2e Mon Sep 17 00:00:00 2001 From: "clementine@huggingface.co" Date: Sat, 4 Oct 2025 07:09:55 +0000 Subject: [PATCH 3/8] also debugging the judge --- src/lighteval/metrics/metrics_sample.py | 17 +++++++---------- src/lighteval/tasks/extended/mix_eval/main.py | 4 ++++ 2 files changed, 11 insertions(+), 10 deletions(-) diff --git a/src/lighteval/metrics/metrics_sample.py b/src/lighteval/metrics/metrics_sample.py index f8e7696e1..36aa09c49 100644 --- a/src/lighteval/metrics/metrics_sample.py +++ b/src/lighteval/metrics/metrics_sample.py @@ -1003,7 +1003,8 @@ def __init__( backend_options=backend_options, ) - def compute(self, response: list[ModelResponse], doc: list[Doc], **kwargs) -> list: + def compute(self, **kwargs) -> list: + # When deriving: Use model_responses/docs for batched eval, model_response/doc for non batched eval raise NotImplementedError("This method should be implemented in the subclass.") @@ -1017,14 +1018,12 @@ def __init__(self): short_judge_name="gpt4o", ) - def compute(self, response: list[ModelResponse], doc: list[Doc], **kwargs) -> list: + def compute(self, responses: list[ModelResponse], docs: list[Doc], **kwargs) -> list: """Compute the score of a generative task using a llm as a judge. The generative task can be multiturn with 2 turns max, in that case, we return scores for turn 1 and 2. Also returns user_prompt and judgement which are ignored later by the aggregator. 
""" - docs = as_list(doc) - responses = as_list(response) questions = [formatted_doc.query for formatted_doc in docs] options = [formatted_doc.choices for formatted_doc in docs] golds = [formatted_doc.get_golds()[0] for formatted_doc in docs] @@ -1054,12 +1053,13 @@ def compute(self, model_response: list[ModelResponse], doc: list[Doc], **kwargs) """ import json + model_responses = as_list(model_response) docs = as_list(doc) # If we are evaluating a multiturn task, we need to have specific field in the formatted doc questions = [doc.specific["multi_turn_queries"] for doc in docs] golds = [doc.specific.get("reference", None) for doc in docs] - predictions = [response.text[0] for response in model_response] + predictions = [response.text[0] for response in model_responses] query_context_1 = {"query": questions[0], "context": ""} query_context_2 = {"query": questions[1], "context": predictions[0]} @@ -1080,19 +1080,16 @@ def compute(self, model_response: list[ModelResponse], doc: list[Doc], **kwargs) class JudgeLLMMixEval(JudgeLLM): - def compute(self, model_response: list[ModelResponse], doc: list[Doc], **kwargs): + def compute(self, responses: list[ModelResponse], docs: list[Doc], **kwargs): """Compute the score of a generative task using a llm as a judge. The generative task can be multiturn with 2 turns max, in that case, we return scores for turn 1 and 2. Also returns user_prompt and judgement which are ignored later by the aggregator. """ - docs = as_list(doc) - model_responses = as_list(model_response) - questions = [doc.specific["question"] for doc in docs] options = [doc.choices for doc in docs] golds = [doc.get_golds()[0] for doc in docs] - predictions = [response.text[0] for response in model_responses] + predictions = [response.text[0] for response in responses] scores, messages, judgements = self.judge.evaluate_answer_batch(questions, predictions, options, golds) diff --git a/src/lighteval/tasks/extended/mix_eval/main.py b/src/lighteval/tasks/extended/mix_eval/main.py index 2d9b7569a..e57faa1bd 100644 --- a/src/lighteval/tasks/extended/mix_eval/main.py +++ b/src/lighteval/tasks/extended/mix_eval/main.py @@ -115,6 +115,7 @@ def process_judge_response_freeform_gpt(x): corpus_level_fn={ "judge_score_flow": np.mean, }, + batched_compute=True, ) llm_judge_mixeval_multichoice_gpt_judge = SampleLevelMetricGrouping( @@ -131,6 +132,7 @@ def process_judge_response_freeform_gpt(x): corpus_level_fn={ "judge_score_gpt-3.5": np.mean, }, + batched_compute=True, ) @@ -152,6 +154,7 @@ def mean_dv_5(x): corpus_level_fn={ "judge_score_flow": mean_dv_5, }, + batched_compute=True, ) llm_judge_mixeval_freeform_gpt_judge = SampleLevelMetricGrouping( @@ -168,6 +171,7 @@ def mean_dv_5(x): corpus_level_fn={ "judge_score_gpt-3.5": np.mean, }, + batched_compute=True, ) From fa3ef6293a87878e7b7974279f6065166f8752df Mon Sep 17 00:00:00 2001 From: "clementine@huggingface.co" Date: Sat, 4 Oct 2025 07:19:27 +0000 Subject: [PATCH 4/8] debug --- docs/source/adding-a-new-metric.mdx | 4 ++-- src/lighteval/metrics/metrics_sample.py | 10 +++++----- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/docs/source/adding-a-new-metric.mdx b/docs/source/adding-a-new-metric.mdx index 90b7256a4..9bf02b4f5 100644 --- a/docs/source/adding-a-new-metric.mdx +++ b/docs/source/adding-a-new-metric.mdx @@ -58,7 +58,7 @@ boolean. 
```python def custom_metric(doc: Doc, model_response: ModelResponse) -> bool: - response = model_response.text[0] + response = model_response.final_text[0] return response == doc.choices[doc.gold_index] ``` @@ -68,7 +68,7 @@ If you want to return multiple metrics per sample, you need to return a dictiona ```python def custom_metric(doc: Doc, model_response: ModelResponse) -> dict: - response = model_response.text[0] + response = model_response.final_text[0] return {"accuracy": response == doc.choices[doc.gold_index], "other_metric": 0.5} ``` diff --git a/src/lighteval/metrics/metrics_sample.py b/src/lighteval/metrics/metrics_sample.py index 36aa09c49..083686c4b 100644 --- a/src/lighteval/metrics/metrics_sample.py +++ b/src/lighteval/metrics/metrics_sample.py @@ -1027,7 +1027,7 @@ def compute(self, responses: list[ModelResponse], docs: list[Doc], **kwargs) -> questions = [formatted_doc.query for formatted_doc in docs] options = [formatted_doc.choices for formatted_doc in docs] golds = [formatted_doc.get_golds()[0] for formatted_doc in docs] - predictions = [response.text[0] for response in responses] + predictions = [response.final_text[0] for response in responses] scores, messages, judgements = self.judge.evaluate_answer_batch(questions, predictions, options, golds) @@ -1059,7 +1059,7 @@ def compute(self, model_response: list[ModelResponse], doc: list[Doc], **kwargs) # If we are evaluating a multiturn task, we need to have specific field in the formatted doc questions = [doc.specific["multi_turn_queries"] for doc in docs] golds = [doc.specific.get("reference", None) for doc in docs] - predictions = [response.text[0] for response in model_responses] + predictions = [response.final_text[0] for response in model_responses] query_context_1 = {"query": questions[0], "context": ""} query_context_2 = {"query": questions[1], "context": predictions[0]} @@ -1089,7 +1089,7 @@ def compute(self, responses: list[ModelResponse], docs: list[Doc], **kwargs): questions = [doc.specific["question"] for doc in docs] options = [doc.choices for doc in docs] golds = [doc.get_golds()[0] for doc in docs] - predictions = [response.text[0] for response in responses] + predictions = [response.final_text[0] for response in responses] scores, messages, judgements = self.judge.evaluate_answer_batch(questions, predictions, options, golds) @@ -1098,8 +1098,8 @@ def compute(self, responses: list[ModelResponse], docs: list[Doc], **kwargs): metrics.append( { f"judge_score_{self.short_judge_name}": scores[i], - f"user_prompt_{self.short_judge_name}": messages[i], - f"judgement_{self.short_judge_name}": judgements[i], + # f"user_prompt_{self.short_judge_name}": messages[i], + # f"judgement_{self.short_judge_name}": judgements[i], } ) From 2ae58aafda5e1082cadcdcd1dc21b26a2d5ff4a7 Mon Sep 17 00:00:00 2001 From: "clementine@huggingface.co" Date: Sat, 4 Oct 2025 14:47:34 +0000 Subject: [PATCH 5/8] eval tracker fix 1 --- src/lighteval/logging/evaluation_tracker.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/lighteval/logging/evaluation_tracker.py b/src/lighteval/logging/evaluation_tracker.py index aed32d2f1..976b21c86 100644 --- a/src/lighteval/logging/evaluation_tracker.py +++ b/src/lighteval/logging/evaluation_tracker.py @@ -63,12 +63,15 @@ class EnhancedJSONEncoder(json.JSONEncoder): Notably manages the json encoding of dataclasses. 
""" - def default(self, o): + def default(self, o): # noqa : C901 if is_dataclass(o): try: return asdict(o) # type: ignore except Exception: - return str(o) + try: + return o.__dict__ + except Exception: + return str(o) if callable(o): if hasattr(o, "__name__"): return o.__name__ From 257ff3548dc9f5082797a08ec5f9819404842f8e Mon Sep 17 00:00:00 2001 From: "clementine@huggingface.co" Date: Sat, 4 Oct 2025 14:48:08 +0000 Subject: [PATCH 6/8] likely fix for the GSM+ issue --- src/lighteval/tasks/lighteval_task.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/lighteval/tasks/lighteval_task.py b/src/lighteval/tasks/lighteval_task.py index c54afb5fe..9cde473a8 100644 --- a/src/lighteval/tasks/lighteval_task.py +++ b/src/lighteval/tasks/lighteval_task.py @@ -295,6 +295,8 @@ def _get_docs_from_split(self, splits: list[str], few_shots=False) -> list[Doc]: # Some tasks require to know which is the current item index in order to apply a different prompt template item["__index"] = ix doc = self.formatter(item, self.name) + if doc is None or doc == []: + continue doc.id = str(ix) # Transfer task-level generation parameters to the document From a83d84abbb6d2a2ed9a607481c52c64713e33d82 Mon Sep 17 00:00:00 2001 From: "clementine@huggingface.co" Date: Sat, 4 Oct 2025 15:03:43 +0000 Subject: [PATCH 7/8] stringify model judge + change max_length to what's actually passed instead of setting a bunch of overwrites --- src/lighteval/metrics/utils/llm_as_judge.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/src/lighteval/metrics/utils/llm_as_judge.py b/src/lighteval/metrics/utils/llm_as_judge.py index 7e1b775c9..2a4566df1 100644 --- a/src/lighteval/metrics/utils/llm_as_judge.py +++ b/src/lighteval/metrics/utils/llm_as_judge.py @@ -97,7 +97,7 @@ def __init__( judge_backend: Literal["litellm", "openai", "transformers", "tgi", "vllm", "inference-providers"], url: str | None = None, api_key: str | None = None, - max_tokens: int = 512, + max_tokens: int | None = None, response_format: BaseModel = None, hf_provider: Optional[ Literal[ @@ -172,7 +172,7 @@ def __lazy_load_client(self): # noqa: C901 self.sampling_params = SamplingParams(temperature=0.8, top_p=0.95, max_tokens=self.max_tokens) self.tokenizer = get_tokenizer(self.model, tokenizer_mode="auto") - self.pipe = LLM(model=self.model, max_model_len=2048, gpu_memory_utilization=0.5, dtype="float16") + self.pipe = LLM(model=self.model, gpu_memory_utilization=0.5, dtype="float16") return self.__call_vllm case "transformers": @@ -300,7 +300,7 @@ def __call_vllm(self, prompt): outputs = [output.outputs[0].text for output in output] return outputs - def __call_litellm(self, prompts): + def __call_litellm(self, prompts): # noqa: C901 import litellm if self.backend_options.caching: @@ -324,10 +324,11 @@ def __call_api(prompt): kwargs = { "model": self.model, "messages": prompt, - "max_tokens": max_new_tokens, "n": 1, "caching": True, } + if max_new_tokens is not None: + kwargs["max_tokens"] = (max_new_tokens,) response = litellm.completion(**kwargs) text = response.choices[0].message.content @@ -412,7 +413,7 @@ def __call_api(self, prompt): model=self.model, messages=as_list(prompt), response_format=self.response_format, - max_tokens=4096, + max_tokens=self.max_tokens, temperature=0.0, n=1, ) @@ -425,7 +426,7 @@ def __call_api(self, prompt): model=self.model, messages=as_list(prompt), response_format=self.response_format, - max_tokens=512, + max_tokens=self.max_tokens, n=1, ) text = response.choices[0].message.content @@ 
@@ -438,3 +439,6 @@ def __call_api(self, prompt):
                 time.sleep(self.API_RETRY_SLEEP)
         raise Exception("Failed to get response from the API")
+
+    def __str__(self) -> str:
+        return f"Model: {self.model}, Judge Backend: {self.backend}, URL: {self.url}"

From 32bb552d1571940aac530802209984915f80559f Mon Sep 17 00:00:00 2001
From: "clementine@huggingface.co"
Date: Sat, 4 Oct 2025 15:15:11 +0000
Subject: [PATCH 8/8] more memory for flow judge

---
 src/lighteval/metrics/utils/llm_as_judge.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/lighteval/metrics/utils/llm_as_judge.py b/src/lighteval/metrics/utils/llm_as_judge.py
index 2a4566df1..e30ec0449 100644
--- a/src/lighteval/metrics/utils/llm_as_judge.py
+++ b/src/lighteval/metrics/utils/llm_as_judge.py
@@ -172,7 +172,7 @@ def __lazy_load_client(self):  # noqa: C901
                 self.sampling_params = SamplingParams(temperature=0.8, top_p=0.95, max_tokens=self.max_tokens)
                 self.tokenizer = get_tokenizer(self.model, tokenizer_mode="auto")
-                self.pipe = LLM(model=self.model, gpu_memory_utilization=0.5, dtype="float16")
+                self.pipe = LLM(model=self.model, gpu_memory_utilization=0.8, dtype="float16")
                 return self.__call_vllm

             case "transformers":