# Review notes for this patch (commentary only; the patch text below is unchanged).
#
# File 1: ais_bench/benchmark/configs/summarizers/groups/mmlu_pro.py (hunk -3,3 +3,22)
#   * Adds `_mmlu_pro_all`, built with the same expression the existing
#     'mmlu_pro' group uses for its 'subsets'
#     ('mmlu_pro_' + category name with spaces replaced by underscores).
#   * Adds `_mmlu_pro_weights`, a dict with 14 hard-coded integer entries keyed by
#     subset name; the values sum to 12032.
#     NOTE(review): these look like per-category question counts - confirm
#     against the dataset before relying on the weighted score.
#   * Appends a second summary group named 'mmlu_pro-weighted' that carries the
#     same subsets plus a 'weights' mapping.
#     NOTE(review): the weight keys are literals while `_mmlu_pro_all` is derived
#     from `categories`; verify that every derived subset name has a weight.
#
# File 2: ais_bench/benchmark/openicl/icl_inferencer/icl_bfcl_v3_inferencer.py
#   * Hunk -7,6 +7,8 adds `from bfcl_eval.utils import make_json_serializable`
#     to the import section.
#     NOTE(review): this makes importing the module depend on `bfcl_eval` being
#     importable - confirm that is acceptable for every consumer of this module.
#   * Hunk -600,6 +602,7 (context: `_inference_multi_turn`) reassigns
#     `finial_output.inference_log` to `make_json_serializable(...)` of itself,
#     ahead of the `self.output_handler.report_cache_info(...)` call.
#     NOTE(review): the original indentation is not recoverable from this
#     collapsed text, so it cannot be told here whether the new assignment is
#     inside the `if all_model_response:` body or after it - check the real patch.
#
# NOTE(review): the line breaks of this patch have been collapsed, so it will not
# apply as-is; regenerate it from the repository rather than hand-repairing it.
diff --git a/ais_bench/benchmark/configs/summarizers/groups/mmlu_pro.py b/ais_bench/benchmark/configs/summarizers/groups/mmlu_pro.py index ba2d9fc7..b7832e89 100644 --- a/ais_bench/benchmark/configs/summarizers/groups/mmlu_pro.py +++ b/ais_bench/benchmark/configs/summarizers/groups/mmlu_pro.py @@ -3,3 +3,22 @@ mmlu_pro_summary_groups = [ {'name': 'mmlu_pro', 'subsets': ['mmlu_pro_' + c.replace(' ', '_') for c in categories]}, ] + +_mmlu_pro_all = ['mmlu_pro_' + c.replace(' ', '_') for c in categories] +_mmlu_pro_weights = { + 'mmlu_pro_math': 1351, + 'mmlu_pro_physics': 1299, + 'mmlu_pro_chemistry': 1132, + 'mmlu_pro_law': 1101, + 'mmlu_pro_engineering': 969, + 'mmlu_pro_other': 924, + 'mmlu_pro_economics': 844, + 'mmlu_pro_health': 818, + 'mmlu_pro_psychology': 798, + 'mmlu_pro_business': 789, + 'mmlu_pro_biology': 717, + 'mmlu_pro_philosophy': 499, + 'mmlu_pro_computer_science': 410, + 'mmlu_pro_history': 381, +} +mmlu_pro_summary_groups.append({'name': 'mmlu_pro-weighted', 'subsets': _mmlu_pro_all, 'weights': _mmlu_pro_weights}) diff --git a/ais_bench/benchmark/openicl/icl_inferencer/icl_bfcl_v3_inferencer.py b/ais_bench/benchmark/openicl/icl_inferencer/icl_bfcl_v3_inferencer.py index 19f822d6..b7dee555 100644 --- a/ais_bench/benchmark/openicl/icl_inferencer/icl_bfcl_v3_inferencer.py +++ b/ais_bench/benchmark/openicl/icl_inferencer/icl_bfcl_v3_inferencer.py @@ -7,6 +7,8 @@ from aiohttp import ClientSession from multiprocessing import BoundedSemaphore +from bfcl_eval.utils import make_json_serializable + from ais_bench.benchmark.openicl.icl_retriever import BaseRetriever from ais_bench.benchmark.registry import MODELS from ais_bench.benchmark.utils.prompt import PromptList @@ -600,6 +602,7 @@ async def _inference_multi_turn(self, data: dict, finial_output: FunctionCallOut await self.status_counter.case_finish() if all_model_response: finial_output.tool_calls = all_model_response + finial_output.inference_log = make_json_serializable(finial_output.inference_log) 
await self.output_handler.report_cache_info( index, prompt_list, finial_output, data_abbr )