diff --git a/eval/chat_benchmarks/AIME24/eval_instruct.py b/eval/chat_benchmarks/AIME24/eval_instruct.py index 7cbe5701..2a96b88c 100644 --- a/eval/chat_benchmarks/AIME24/eval_instruct.py +++ b/eval/chat_benchmarks/AIME24/eval_instruct.py @@ -27,7 +27,7 @@ def __init__( data_file: str = "eval/chat_benchmarks/AIME24/data/aime24.json", debug: bool = False, seed: List[int] = [0, 1234, 1234, 1234], - max_tokens: Optional[int] = 32768, + max_tokens: int = 32768, logger: Optional[logging.Logger] = None, system_instruction: Optional[str] = None, ): @@ -44,9 +44,7 @@ def __init__( super().__init__(logger=logger, system_instruction=system_instruction) self.data_file = data_file self.debug = debug - self.max_new_tokens = ( - max_tokens if max_tokens is not None else 32768 - ) # set higher to avoid truncation for reasoning models + self.max_new_tokens = max_tokens self.seed = seed self.n_repeat = 10 diff --git a/eval/chat_benchmarks/AIME25/eval_instruct.py b/eval/chat_benchmarks/AIME25/eval_instruct.py index 08d339bd..9bfeaf02 100644 --- a/eval/chat_benchmarks/AIME25/eval_instruct.py +++ b/eval/chat_benchmarks/AIME25/eval_instruct.py @@ -43,7 +43,7 @@ def __init__( super().__init__(logger=logger, system_instruction=system_instruction) self.data_file = data_file self.debug = debug - self.max_new_tokens = max_tokens if max_tokens is not None else 32768 + self.max_new_tokens = max_tokens self.seed = seed self.n_repeat = 10 diff --git a/eval/chat_benchmarks/AIW/eval_instruct.py b/eval/chat_benchmarks/AIW/eval_instruct.py index 96d8b04d..c3b54c86 100644 --- a/eval/chat_benchmarks/AIW/eval_instruct.py +++ b/eval/chat_benchmarks/AIW/eval_instruct.py @@ -23,7 +23,7 @@ def __init__( data_file: str = "eval/chat_benchmarks/AIW/data/aiw_data.json", debug: bool = False, seed: List[int] = [0, 1234, 1234, 1234], - max_tokens: Optional[int] = 32768, + max_tokens: int = 32768, logger: Optional[logging.Logger] = None, system_instruction: Optional[str] = None, n_trials: int = 100, # Run 100 
trials @@ -41,7 +41,7 @@ def __init__( super().__init__(logger=logger, system_instruction=system_instruction) self.data_file = data_file self.debug = debug - self.max_new_tokens = max_tokens if max_tokens is not None else 32768 + self.max_new_tokens = max_tokens self.seed = seed self.n_trials = n_trials diff --git a/eval/chat_benchmarks/AMC23/eval_instruct.py b/eval/chat_benchmarks/AMC23/eval_instruct.py index 24f88e21..094c77b7 100644 --- a/eval/chat_benchmarks/AMC23/eval_instruct.py +++ b/eval/chat_benchmarks/AMC23/eval_instruct.py @@ -29,7 +29,7 @@ def __init__( data_file: str = "eval/chat_benchmarks/AMC23/data/amc23.json", debug: bool = False, seed: List[int] = [0, 1234, 1234, 1234], - max_tokens: Optional[int] = 32768, + max_tokens: int = 32768, logger: Optional[logging.Logger] = None, system_instruction: Optional[str] = None, ): @@ -47,7 +47,7 @@ def __init__( self.data_file = data_file self.debug = debug self.seed = seed - self.max_new_tokens = max_tokens if max_tokens is not None else 32768 + self.max_new_tokens = max_tokens self.n_repeat = 10 def generate_responses(self, model: LM) -> Dict[str, Any]: diff --git a/eval/chat_benchmarks/BigCodeBench/eval_instruct.py b/eval/chat_benchmarks/BigCodeBench/eval_instruct.py index 8b0b58f4..5b56647d 100644 --- a/eval/chat_benchmarks/BigCodeBench/eval_instruct.py +++ b/eval/chat_benchmarks/BigCodeBench/eval_instruct.py @@ -70,7 +70,7 @@ def __init__( self, language: str = "python", data_dir: str = BIGCODEBENCH_PATH, - max_tokens: Optional[int] = 1280, + max_tokens: int = 1280, num_workers: int = 32, timeout: float = 120, debug: bool = False, @@ -98,7 +98,7 @@ def __init__( self.language = language os.makedirs(data_dir, exist_ok=True) self.data_dir = data_dir - self.max_tokens = max_tokens if max_tokens is not None else 1280 + self.max_tokens = max_tokens self.num_workers = num_workers self.timeout = timeout self.debug = debug diff --git a/eval/chat_benchmarks/CodeElo/eval_instruct.py 
b/eval/chat_benchmarks/CodeElo/eval_instruct.py index a561e836..eae33254 100644 --- a/eval/chat_benchmarks/CodeElo/eval_instruct.py +++ b/eval/chat_benchmarks/CodeElo/eval_instruct.py @@ -48,7 +48,7 @@ def __init__( self, debug: bool = False, seed: List[int] = [0, 1234, 1234, 1234], - max_tokens: Optional[int] = None, + max_tokens: int = 32768, logger: Optional[logging.Logger] = None, system_instruction: Optional[str] = None, ): @@ -63,9 +63,7 @@ def __init__( """ super().__init__(logger=logger, system_instruction=system_instruction) self.debug = debug - self.max_new_tokens = ( - max_tokens if max_tokens is not None else 32768 - ) # set higher to avoid truncation for reasoning models + self.max_new_tokens = max_tokens self.seed = seed self.n_repeat = 3 self.filter_interaction_questions = True diff --git a/eval/chat_benchmarks/CodeForces/eval_instruct.py b/eval/chat_benchmarks/CodeForces/eval_instruct.py index 30243392..08ef6cd9 100644 --- a/eval/chat_benchmarks/CodeForces/eval_instruct.py +++ b/eval/chat_benchmarks/CodeForces/eval_instruct.py @@ -47,7 +47,7 @@ def __init__( self, debug: bool = False, seed: List[int] = [0, 1234, 1234, 1234], - max_tokens: Optional[int] = None, + max_tokens: int = 32768, logger: Optional[logging.Logger] = None, system_instruction: Optional[str] = None, ): @@ -62,8 +62,8 @@ def __init__( """ super().__init__(logger=logger, system_instruction=system_instruction) self.debug = debug - self.max_new_tokens = max_tokens if max_tokens is not None else 32768 + self.max_new_tokens = max_tokens self.seed = seed self.n_repeat = 3 self.filter_interaction_questions = True diff --git a/eval/chat_benchmarks/CruxEval/eval_instruct.py b/eval/chat_benchmarks/CruxEval/eval_instruct.py index 7580e254..4049df1f 100644 --- a/eval/chat_benchmarks/CruxEval/eval_instruct.py +++ b/eval/chat_benchmarks/CruxEval/eval_instruct.py @@ -132,7 +132,7 @@ class CruxEvalBenchmark(BaseBenchmark): def __init__( self, data_dir: str = CruxEval_PATH, - max_tokens: 
Optional[int] = 2048, + max_tokens: int = 2048, num_workers: int = 32, timeout: float = 120, debug: bool = False, @@ -155,7 +155,7 @@ def __init__( self.language = "python" os.makedirs(data_dir, exist_ok=True) self.data_dir = data_dir - self.max_tokens = max_tokens if max_tokens is not None else 2048 + self.max_tokens = max_tokens self.num_workers = num_workers self.timeout = timeout self.debug = debug diff --git a/eval/chat_benchmarks/GPQADiamond/eval_instruct.py b/eval/chat_benchmarks/GPQADiamond/eval_instruct.py index 46288c69..e49cbd37 100644 --- a/eval/chat_benchmarks/GPQADiamond/eval_instruct.py +++ b/eval/chat_benchmarks/GPQADiamond/eval_instruct.py @@ -35,7 +35,7 @@ def __init__( self, debug: bool = False, seed: List[int] = [0, 1234, 1234, 1234], - max_tokens: Optional[int] = 32768, + max_tokens: int = 32768, logger: Optional[logging.Logger] = None, system_instruction: Optional[str] = None, ): @@ -51,7 +51,7 @@ def __init__( self.dataset_name = "Idavidrein/gpqa" self.debug = debug self.seed = seed - self.max_new_tokens = max_tokens if max_tokens is not None else 32768 + self.max_new_tokens = max_tokens self.n_repeat = 3 def generate_responses(self, model: LM) -> Dict[str, Any]: diff --git a/eval/chat_benchmarks/HLE/eval_instruct.py b/eval/chat_benchmarks/HLE/eval_instruct.py index 8df8d9a6..275668ab 100644 --- a/eval/chat_benchmarks/HLE/eval_instruct.py +++ b/eval/chat_benchmarks/HLE/eval_instruct.py @@ -63,7 +63,7 @@ def __init__( self, debug: bool = False, seed: List[int] = [0, 1234, 1234, 1234], - max_tokens: Optional[int] = None, + max_tokens: int = 32768, logger: Optional[logging.Logger] = None, system_instruction: Optional[str] = None, ): @@ -77,9 +77,7 @@ def __init__( """ super().__init__(logger=logger, system_instruction=system_instruction) self.debug = debug - self.max_new_tokens = ( - max_tokens if max_tokens is not None else 32768 - ) # set higher to avoid truncation for reasoning models + self.max_new_tokens = max_tokens self.seed = seed 
self.n_repeat = 3 diff --git a/eval/chat_benchmarks/HMMT/eval_instruct.py b/eval/chat_benchmarks/HMMT/eval_instruct.py index 32b46dfb..22f76801 100644 --- a/eval/chat_benchmarks/HMMT/eval_instruct.py +++ b/eval/chat_benchmarks/HMMT/eval_instruct.py @@ -29,7 +29,7 @@ def __init__( self, dataset_name: str = "MathArena/hmmt_feb_2025", debug: bool = False, - max_tokens: Optional[int] = None, + max_tokens: int = 32768, seed: List[int] = [0, 1234, 1234, 1234], logger: Optional[logging.Logger] = None, system_instruction: Optional[str] = None, @@ -47,9 +47,7 @@ def __init__( super().__init__(logger=logger, system_instruction=system_instruction) self.dataset_name = dataset_name self.debug = debug - self.max_new_tokens = ( - max_tokens if max_tokens is not None else 32768 - ) # set higher to avoid truncation for reasoning models + self.max_new_tokens = max_tokens self.seed = seed self.n_repeat = 10 diff --git a/eval/chat_benchmarks/HumanEval/eval_instruct.py b/eval/chat_benchmarks/HumanEval/eval_instruct.py index fb5c54e9..9b54653f 100644 --- a/eval/chat_benchmarks/HumanEval/eval_instruct.py +++ b/eval/chat_benchmarks/HumanEval/eval_instruct.py @@ -22,7 +22,7 @@ def __init__( self, languages: List[str] = ["python", "sh"], data_dir: str = "eval/chat_benchmarks/HumanEval/data", - max_tokens: Optional[int] = 1024, + max_tokens: int = 1024, num_workers: int = 8, timeout: float = 3.0, debug: bool = False, @@ -45,7 +45,7 @@ def __init__( super().__init__(logger=logger, system_instruction=system_instruction) self.languages = languages self.data_dir = data_dir - self.max_tokens = max_tokens if max_tokens is not None else 1024 + self.max_tokens = max_tokens self.num_workers = num_workers self.timeout = timeout self.debug = debug diff --git a/eval/chat_benchmarks/HumanEvalPlus/eval_instruct.py b/eval/chat_benchmarks/HumanEvalPlus/eval_instruct.py index f63fd7b2..d12b31a7 100644 --- a/eval/chat_benchmarks/HumanEvalPlus/eval_instruct.py +++ 
b/eval/chat_benchmarks/HumanEvalPlus/eval_instruct.py @@ -22,7 +22,7 @@ def __init__( self, languages: List[str] = ["python"], data_dir: str = "eval/chat_benchmarks/HumanEvalPlus/data", - max_tokens: Optional[int] = 1024, + max_tokens: int = 1024, num_workers: int = 8, timeout: float = 3.0, debug: bool = False, @@ -45,7 +45,7 @@ def __init__( super().__init__(logger=logger, system_instruction=system_instruction) self.languages = languages self.data_dir = data_dir - self.max_tokens = max_tokens if max_tokens is not None else 1024 + self.max_tokens = max_tokens self.num_workers = num_workers self.timeout = timeout self.debug = debug diff --git a/eval/chat_benchmarks/IFEval/eval_instruct.py b/eval/chat_benchmarks/IFEval/eval_instruct.py index 79be036f..907a8019 100644 --- a/eval/chat_benchmarks/IFEval/eval_instruct.py +++ b/eval/chat_benchmarks/IFEval/eval_instruct.py @@ -18,7 +18,7 @@ def __init__( start_idx: int = 10, end_idx: int = 510, debug: bool = False, - max_tokens: Optional[int] = 512, + max_tokens: int = 512, logger: Optional[logging.Logger] = None, system_instruction: Optional[str] = None, ): @@ -37,7 +37,7 @@ def __init__( """ super().__init__(logger=logger, system_instruction=system_instruction) self.data_dir = data_dir - self.max_tokens = max_tokens if max_tokens is not None else 512 + self.max_tokens = max_tokens self.num_examples = num_examples self.start_idx = start_idx self.end_idx = end_idx diff --git a/eval/chat_benchmarks/JEEBench/eval_instruct.py b/eval/chat_benchmarks/JEEBench/eval_instruct.py index 5ba53541..05881a19 100644 --- a/eval/chat_benchmarks/JEEBench/eval_instruct.py +++ b/eval/chat_benchmarks/JEEBench/eval_instruct.py @@ -78,7 +78,7 @@ def __init__( self, debug: bool = False, seed: List[int] = [0, 1234, 1234, 1234], - max_tokens: Optional[int] = 32768, + max_tokens: int = 32768, logger: Optional[logging.Logger] = None, system_instruction: Optional[str] = None, ): @@ -92,9 +92,7 @@ def __init__( """ super().__init__(logger=logger, 
system_instruction=system_instruction) self.debug = debug - self.max_new_tokens = ( - max_tokens if max_tokens is not None else 32768 - ) # set higher to avoid truncation for reasoning models + self.max_new_tokens = max_tokens self.seed = seed self.n_repeat = 3 diff --git a/eval/chat_benchmarks/LiveBench/eval_instruct.py b/eval/chat_benchmarks/LiveBench/eval_instruct.py index 3e4e0339..760b4a2b 100644 --- a/eval/chat_benchmarks/LiveBench/eval_instruct.py +++ b/eval/chat_benchmarks/LiveBench/eval_instruct.py @@ -47,7 +47,7 @@ def __init__( release_date: str = "2024-08-31", remove_existing_file: bool = True, seed: List[int] = [0, 1234, 1234, 1234], - max_tokens: Optional[int] = 4096, + max_tokens: int = 4096, logger: Optional[logging.Logger] = None, system_instruction: Optional[str] = None, ): @@ -72,7 +72,7 @@ def __init__( self.release_date = "2024-06-24" self.num_workers = 1 else: - self.max_tokens = max_tokens if max_tokens is not None else 4096 + self.max_tokens = max_tokens self.temperature = temperature self.num_choices = num_choices self.all_release_dates = ["2024-07-26", "2024-06-24", "2024-08-31", "2024-11-25"] diff --git a/eval/chat_benchmarks/LiveCodeBench/eval_instruct.py b/eval/chat_benchmarks/LiveCodeBench/eval_instruct.py index 5c36e5bf..19dcec42 100644 --- a/eval/chat_benchmarks/LiveCodeBench/eval_instruct.py +++ b/eval/chat_benchmarks/LiveCodeBench/eval_instruct.py @@ -51,7 +51,7 @@ def __init__( self, debug: bool = False, seed: List[int] = [0, 1234, 1234, 1234], - max_tokens: Optional[int] = 32768, + max_tokens: int = 32768, logger: Optional[logging.Logger] = None, system_instruction: Optional[str] = None, ): @@ -66,7 +66,7 @@ def __init__( """ super().__init__(logger=logger, system_instruction=system_instruction) self.debug = debug - self.max_new_tokens = max_tokens or 32768 + self.max_new_tokens = max_tokens self.seed = seed self.n_repeat = 6 diff --git a/eval/chat_benchmarks/LiveCodeBenchv5/eval_instruct.py 
b/eval/chat_benchmarks/LiveCodeBenchv5/eval_instruct.py index 9ae5ee56..e1cc5c75 100644 --- a/eval/chat_benchmarks/LiveCodeBenchv5/eval_instruct.py +++ b/eval/chat_benchmarks/LiveCodeBenchv5/eval_instruct.py @@ -47,7 +47,7 @@ def __init__( self, debug: bool = False, seed: List[int] = [0, 1234, 1234, 1234], - max_tokens: Optional[int] = None, + max_tokens: int = 32768, logger: Optional[logging.Logger] = None, system_instruction: Optional[str] = None, ): @@ -62,9 +62,7 @@ def __init__( """ super().__init__(logger=logger, system_instruction=system_instruction) self.debug = debug - self.max_new_tokens = ( - max_tokens if max_tokens is not None else 32768 - ) # set higher to avoid truncation for reasoning models + self.max_new_tokens = max_tokens self.seed = seed self.n_repeat = 3 diff --git a/eval/chat_benchmarks/MATH500/eval_instruct.py b/eval/chat_benchmarks/MATH500/eval_instruct.py index f082ce0f..eec964f0 100644 --- a/eval/chat_benchmarks/MATH500/eval_instruct.py +++ b/eval/chat_benchmarks/MATH500/eval_instruct.py @@ -27,7 +27,7 @@ def __init__( data_file: str = "eval/chat_benchmarks/MATH500/data/math500.jsonl", debug: bool = False, seed: List[int] = [0, 1234, 1234, 1234], - max_tokens: Optional[int] = 32768, + max_tokens: int = 32768, logger: Optional[logging.Logger] = None, system_instruction: Optional[str] = None, ): @@ -45,9 +45,7 @@ def __init__( self.data_file = data_file self.debug = debug self.seed = seed - self.max_new_tokens = ( - max_tokens if max_tokens is not None else 32768 - ) # set higher to avoid truncation for reasoning models + self.max_new_tokens = max_tokens def generate_responses(self, model: LM) -> Dict[str, Any]: """ diff --git a/eval/chat_benchmarks/MBPP/eval_instruct.py b/eval/chat_benchmarks/MBPP/eval_instruct.py index 6f559a68..0fcb8a3b 100644 --- a/eval/chat_benchmarks/MBPP/eval_instruct.py +++ b/eval/chat_benchmarks/MBPP/eval_instruct.py @@ -26,7 +26,7 @@ def __init__( start_idx: int = 10, end_idx: int = 510, debug: bool = False, - 
max_tokens: Optional[int] = 512, + max_tokens: int = 512, logger: Optional[logging.Logger] = None, system_instruction: Optional[str] = None, ): @@ -45,7 +45,7 @@ def __init__( """ super().__init__(logger=logger, system_instruction=system_instruction) self.data_dir = data_dir - self.max_tokens = max_tokens or 512 + self.max_tokens = max_tokens self.num_examples = num_examples self.start_idx = start_idx self.end_idx = end_idx diff --git a/eval/chat_benchmarks/MBPPPlus/eval_instruct.py b/eval/chat_benchmarks/MBPPPlus/eval_instruct.py index d56d689b..6094bc17 100644 --- a/eval/chat_benchmarks/MBPPPlus/eval_instruct.py +++ b/eval/chat_benchmarks/MBPPPlus/eval_instruct.py @@ -25,7 +25,7 @@ def __init__( num_workers: int = 8, timeout: float = 3.0, debug: bool = False, - max_tokens: Optional[int] = 1024, + max_tokens: int = 1024, logger: Optional[logging.Logger] = None, system_instruction: Optional[str] = None, ): @@ -43,7 +43,7 @@ def __init__( """ super().__init__(logger=logger, system_instruction=system_instruction) self.data_dir = data_dir - self.max_tokens = max_tokens if max_tokens is not None else 1024 + self.max_tokens = max_tokens self.num_workers = num_workers self.timeout = timeout self.debug = debug diff --git a/eval/chat_benchmarks/MTBench/eval_instruct.py b/eval/chat_benchmarks/MTBench/eval_instruct.py index eddb24ca..38e9b13e 100644 --- a/eval/chat_benchmarks/MTBench/eval_instruct.py +++ b/eval/chat_benchmarks/MTBench/eval_instruct.py @@ -71,7 +71,7 @@ def __init__( config: Optional[MTBenchConfig] = None, debug: bool = False, annotator_model: str = "gpt-4o-mini-2024-07-18", - max_tokens: Optional[int] = 1024, + max_tokens: int = 1024, logger: Optional[logging.Logger] = None, system_instruction: Optional[str] = None, ): @@ -87,13 +87,13 @@ def __init__( """ super().__init__(logger=logger, system_instruction=system_instruction) self.base_path = Path(base_path) - if getattr(self, "config", None) is None: - self.config = MTBenchConfig( - 
judge_model=annotator_model, - ) - else: - self.config = config - self.config.max_new_token = max_tokens if max_tokens is not None else 1024 + if annotator_model == "auto": + annotator_model = "gpt-4" + if config: + print(f"Warning: Overwriting config.judge_model = {annotator_model} ") + config.judge_model = annotator_model + self.config = config or MTBenchConfig(judge_model=annotator_model) + self.config.max_new_token = max_tokens self.debug = debug # Setup paths diff --git a/eval/chat_benchmarks/MTBench/fastchat/protocol/api_protocol.py b/eval/chat_benchmarks/MTBench/fastchat/protocol/api_protocol.py index 2dc99449..589e78cc 100644 --- a/eval/chat_benchmarks/MTBench/fastchat/protocol/api_protocol.py +++ b/eval/chat_benchmarks/MTBench/fastchat/protocol/api_protocol.py @@ -55,7 +55,7 @@ class APIChatCompletionRequest(BaseModel): top_p: Optional[float] = 1.0 top_k: Optional[int] = -1 n: Optional[int] = 1 - max_tokens: Optional[int] = None + max_tokens: int = 1024 stop: Optional[Union[str, List[str]]] = None stream: Optional[bool] = False user: Optional[str] = None @@ -129,7 +129,7 @@ class CompletionRequest(BaseModel): suffix: Optional[str] = None temperature: Optional[float] = 0.7 n: Optional[int] = 1 - max_tokens: Optional[int] = 16 + max_tokens: int = 16 stop: Optional[Union[str, List[str]]] = None stream: Optional[bool] = False top_p: Optional[float] = 1.0 diff --git a/eval/chat_benchmarks/MTBench/fastchat/protocol/openai_api_protocol.py b/eval/chat_benchmarks/MTBench/fastchat/protocol/openai_api_protocol.py index bb50a5ef..38713ed5 100644 --- a/eval/chat_benchmarks/MTBench/fastchat/protocol/openai_api_protocol.py +++ b/eval/chat_benchmarks/MTBench/fastchat/protocol/openai_api_protocol.py @@ -66,7 +66,7 @@ class ChatCompletionRequest(BaseModel): top_p: Optional[float] = 1.0 top_k: Optional[int] = -1 n: Optional[int] = 1 - max_tokens: Optional[int] = None + max_tokens: int = 1024 stop: Optional[Union[str, List[str]]] = None stream: Optional[bool] = False 
presence_penalty: Optional[float] = 0.0 @@ -154,7 +154,7 @@ class CompletionRequest(BaseModel): suffix: Optional[str] = None temperature: Optional[float] = 0.7 n: Optional[int] = 1 - max_tokens: Optional[int] = 16 + max_tokens: int = 16 stop: Optional[Union[str, List[str]]] = None stream: Optional[bool] = False top_p: Optional[float] = 1.0 diff --git a/eval/chat_benchmarks/MTBench/fastchat/serve/api_provider.py b/eval/chat_benchmarks/MTBench/fastchat/serve/api_provider.py index ebf7f25a..07184521 100644 --- a/eval/chat_benchmarks/MTBench/fastchat/serve/api_provider.py +++ b/eval/chat_benchmarks/MTBench/fastchat/serve/api_provider.py @@ -957,7 +957,7 @@ def cohere_api_stream_iter( messages: list, temperature: Optional[float] = None, # The SDK or API handles None for all parameters following top_p: Optional[float] = None, - max_new_tokens: Optional[int] = None, + max_new_tokens: int = 1024, api_key: Optional[str] = None, # default is env var CO_API_KEY api_base: Optional[str] = None, ): @@ -1084,7 +1084,7 @@ def reka_api_stream_iter( messages: list, temperature: Optional[float] = None, # The SDK or API handles None for all parameters following top_p: Optional[float] = None, - max_new_tokens: Optional[int] = None, + max_new_tokens: int = 1024, api_key: Optional[str] = None, # default is env var CO_API_KEY api_base: Optional[str] = None, ): diff --git a/eval/chat_benchmarks/MTBench/fastchat/serve/openai_api_server.py b/eval/chat_benchmarks/MTBench/fastchat/serve/openai_api_server.py index 86e63cd1..9b5e58d5 100644 --- a/eval/chat_benchmarks/MTBench/fastchat/serve/openai_api_server.py +++ b/eval/chat_benchmarks/MTBench/fastchat/serve/openai_api_server.py @@ -266,7 +266,7 @@ async def get_gen_params( top_k: Optional[int], presence_penalty: Optional[float], frequency_penalty: Optional[float], - max_tokens: Optional[int], + max_tokens: int, echo: Optional[bool], logprobs: Optional[int] = None, stop: Optional[Union[str, List[str]]], diff --git 
a/eval/chat_benchmarks/WildBench/eval_instruct.py b/eval/chat_benchmarks/WildBench/eval_instruct.py index 60c8384e..cfe73a36 100644 --- a/eval/chat_benchmarks/WildBench/eval_instruct.py +++ b/eval/chat_benchmarks/WildBench/eval_instruct.py @@ -76,7 +76,7 @@ def __init__( config: Optional[WildBenchConfig] = None, annotator_model: str = "gpt-4o-mini-2024-07-18", debug: bool = False, - max_tokens: Optional[int] = 1024, + max_tokens: int = 1024, logger: Optional[logging.Logger] = None, system_instruction: Optional[str] = None, ): @@ -96,7 +96,7 @@ def __init__( config.model = annotator_model self.config = config or WildBenchConfig(model=annotator_model) self.debug = debug - self.max_new_tokens = max_tokens if max_tokens is not None else 1024 + self.max_new_tokens = max_tokens # Task category mapping self.task_group_mapping = { diff --git a/eval/chat_benchmarks/alpaca_eval/src/alpaca_eval/decoders/cohere.py b/eval/chat_benchmarks/alpaca_eval/src/alpaca_eval/decoders/cohere.py index c26c822d..dfe60aac 100644 --- a/eval/chat_benchmarks/alpaca_eval/src/alpaca_eval/decoders/cohere.py +++ b/eval/chat_benchmarks/alpaca_eval/src/alpaca_eval/decoders/cohere.py @@ -73,7 +73,7 @@ def cohere_completions( def _cohere_completion_helper( prompt: str, cohere_api_keys: Optional[Sequence[str]] = (constants.COHERE_API_KEY,), - max_tokens: Optional[int] = 1000, + max_tokens: int = 1000, temperature: Optional[float] = 0.7, max_tries=5, **kwargs, diff --git a/eval/task.py b/eval/task.py index 42e77824..70962115 100644 --- a/eval/task.py +++ b/eval/task.py @@ -234,13 +234,20 @@ def _register_benchmark(self, name: str, benchmark_class: Type[BaseBenchmark]): valid_kwargs = {} # Only pass kwargs that the benchmark's __init__ accepts + # Filter out None values to let benchmarks use their default values for param_name, param in init_params.items(): if param_name in self.benchmark_kwargs: - valid_kwargs[param_name] = self.benchmark_kwargs[param_name] - self.logger.debug(f"Passing {param_name} to 
{name} benchmark") - - # Ensure system_instruction is passed if available - if "system_instruction" in self.benchmark_kwargs: + value = self.benchmark_kwargs[param_name] + # Only pass the argument if it's not None, so benchmarks can use defaults + if value is not None: + valid_kwargs[param_name] = value + self.logger.debug(f"Passing {param_name}={value} to {name} benchmark") + + # Ensure system_instruction is passed if available and not None + if ( + "system_instruction" in self.benchmark_kwargs + and self.benchmark_kwargs["system_instruction"] is not None + ): valid_kwargs["system_instruction"] = self.benchmark_kwargs["system_instruction"] instance = benchmark_class(**valid_kwargs)