diff --git a/community_tasks/chart_qa_evals.py b/community_tasks/chart_qa_evals.py
new file mode 100644
index 000000000..ebe968551
--- /dev/null
+++ b/community_tasks/chart_qa_evals.py
@@ -0,0 +1,97 @@
+# MIT License
+
+# Copyright (c) 2024 The HuggingFace Team
+
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+# ruff: noqa: F405, F403, F401
+"""
+Task to evaluate VLMs on HuggingFaceM4/ChartQA.
+
+Example usage:
+lighteval accelerate "model_name=google/gemma-3-4b-it" "community|chart_qa:human|0" --custom-tasks community_tasks/chart_qa_evals.py --vision-model
+"""
+
+import numpy as np
+
+from lighteval.metrics.dynamic_metrics import MultilingualExtractiveMatchMetric
+from lighteval.metrics.metrics import SampleLevelMetric
+from lighteval.metrics.utils.extractive_match_utils import (
+    ExprExtractionConfig,
+    LatexExtractionConfig,
+)
+from lighteval.metrics.utils.metric_utils import SamplingMethod
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+from lighteval.tasks.requests import Doc
+from lighteval.utils.language import Language
+
+
+def prompt_fn(line, task_name: str = None):
+    return Doc(
+        task_name=task_name,
+        query="Answer the following question. The last line of your response should be of the following format: 'Answer: $ANSWER' (without quotes) where $ANSWER is the answer to the question.\n\n"
+        + line["query"],
+        gold_index=0,
+        choices=[line["label"]],
+        images=[line["image"]],
+    )
+
+
+extraction_targets = [ExprExtractionConfig(), LatexExtractionConfig()]
+metric = SampleLevelMetric(
+    metric_name="extractive_match",
+    sample_level_fn=MultilingualExtractiveMatchMetric(
+        language=Language.ENGLISH,
+        gold_extraction_target=extraction_targets,
+        pred_extraction_target=extraction_targets,
+        precision=4,
+    ),
+    category=SamplingMethod.GENERATIVE,
+    corpus_level_fn=np.mean,
+    higher_is_better=True,
+)
+
+task = LightevalTaskConfig(
+    name="chart_qa",
+    prompt_function=prompt_fn,
+    suite=["community"],
+    hf_repo="HuggingFaceM4/ChartQA",
+    hf_subset="default",
+    hf_avail_splits=["train", "val", "test"],
+    evaluation_splits=["test"],
+    few_shots_split=None,
+    few_shots_select=None,
+    metrics=[metric],
+)
+
+human_task = LightevalTaskConfig(
+    name="chart_qa:human",
+    prompt_function=prompt_fn,
+    suite=["community"],
+    hf_repo="HuggingFaceM4/ChartQA",
+    hf_subset="default",
+    hf_avail_splits=["train", "val", "test"],
+    evaluation_splits=["test"],
+    hf_filter=lambda line: line["human_or_machine"] == 0,
+    few_shots_split=None,
+    few_shots_select=None,
+    metrics=[metric],
+)
+
+TASKS_TABLE = [task, human_task]
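
For quick local verification, the snippet below (not part of the patch above) is a minimal sketch of how prompt_fn could be exercised on a hand-built ChartQA-style row; the row values and the placeholder image are assumptions for illustration, not real dataset content, and it assumes it is run from the lighteval repository root with Pillow installed.

# Illustrative sanity check for prompt_fn; not part of the diff.
# The fake row below is a stand-in for a real HuggingFaceM4/ChartQA example.
from PIL import Image

from community_tasks.chart_qa_evals import prompt_fn

fake_row = {
    "query": "What is the highest value shown in the chart?",  # hypothetical question
    "label": "42",                                              # hypothetical gold answer
    "image": Image.new("RGB", (64, 64)),                        # placeholder chart image
}
doc = prompt_fn(fake_row, task_name="community|chart_qa|0")
print(doc.query)    # answer-format instruction followed by the question
print(doc.choices)  # ['42'] -> the gold answer consumed by extractive_match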