diff --git a/eval/chat_benchmarks/AIME24/eval_instruct.py b/eval/chat_benchmarks/AIME24/eval_instruct.py
index 7cbe5701..2a96b88c 100644
--- a/eval/chat_benchmarks/AIME24/eval_instruct.py
+++ b/eval/chat_benchmarks/AIME24/eval_instruct.py
@@ -27,7 +27,7 @@ def __init__(
         data_file: str = "eval/chat_benchmarks/AIME24/data/aime24.json",
         debug: bool = False,
         seed: List[int] = [0, 1234, 1234, 1234],
-        max_tokens: Optional[int] = 32768,
+        max_tokens: int = 32768,
         logger: Optional[logging.Logger] = None,
         system_instruction: Optional[str] = None,
     ):
@@ -44,9 +44,7 @@ def __init__(
         super().__init__(logger=logger, system_instruction=system_instruction)
         self.data_file = data_file
         self.debug = debug
-        self.max_new_tokens = (
-            max_tokens if max_tokens is not None else 32768
-        )  # set higher to avoid truncation for reasoning models
+        self.max_new_tokens = max_tokens
         self.seed = seed
         self.n_repeat = 10
 
diff --git a/eval/chat_benchmarks/AIME25/eval_instruct.py b/eval/chat_benchmarks/AIME25/eval_instruct.py
index 08d339bd..9bfeaf02 100644
--- a/eval/chat_benchmarks/AIME25/eval_instruct.py
+++ b/eval/chat_benchmarks/AIME25/eval_instruct.py
@@ -43,7 +43,7 @@ def __init__(
         super().__init__(logger=logger, system_instruction=system_instruction)
         self.data_file = data_file
         self.debug = debug
-        self.max_new_tokens = max_tokens if max_tokens is not None else 32768
+        self.max_new_tokens = max_tokens
         self.seed = seed
         self.n_repeat = 10
 
diff --git a/eval/chat_benchmarks/AIW/eval_instruct.py b/eval/chat_benchmarks/AIW/eval_instruct.py
index 96d8b04d..c3b54c86 100644
--- a/eval/chat_benchmarks/AIW/eval_instruct.py
+++ b/eval/chat_benchmarks/AIW/eval_instruct.py
@@ -23,7 +23,7 @@ def __init__(
         data_file: str = "eval/chat_benchmarks/AIW/data/aiw_data.json",
         debug: bool = False,
         seed: List[int] = [0, 1234, 1234, 1234],
-        max_tokens: Optional[int] = 32768,
+        max_tokens: int = 32768,
         logger: Optional[logging.Logger] = None,
         system_instruction: Optional[str] = None,
         n_trials: int = 100,  # Run 100 trials
@@ -41,7 +41,7 @@ def __init__(
         super().__init__(logger=logger, system_instruction=system_instruction)
         self.data_file = data_file
         self.debug = debug
-        self.max_new_tokens = max_tokens if max_tokens is not None else 32768
+        self.max_new_tokens = max_tokens
         self.seed = seed
         self.n_trials = n_trials
 
diff --git a/eval/chat_benchmarks/AMC23/eval_instruct.py b/eval/chat_benchmarks/AMC23/eval_instruct.py
index 24f88e21..094c77b7 100644
--- a/eval/chat_benchmarks/AMC23/eval_instruct.py
+++ b/eval/chat_benchmarks/AMC23/eval_instruct.py
@@ -29,7 +29,7 @@ def __init__(
         data_file: str = "eval/chat_benchmarks/AMC23/data/amc23.json",
         debug: bool = False,
         seed: List[int] = [0, 1234, 1234, 1234],
-        max_tokens: Optional[int] = 32768,
+        max_tokens: int = 32768,
         logger: Optional[logging.Logger] = None,
         system_instruction: Optional[str] = None,
     ):
@@ -47,7 +47,7 @@ def __init__(
         self.data_file = data_file
         self.debug = debug
         self.seed = seed
-        self.max_new_tokens = max_tokens if max_tokens is not None else 32768
+        self.max_new_tokens = max_tokens
         self.n_repeat = 10
 
     def generate_responses(self, model: LM) -> Dict[str, Any]:
diff --git a/eval/chat_benchmarks/BigCodeBench/eval_instruct.py b/eval/chat_benchmarks/BigCodeBench/eval_instruct.py
index 8b0b58f4..5b56647d 100644
--- a/eval/chat_benchmarks/BigCodeBench/eval_instruct.py
+++ b/eval/chat_benchmarks/BigCodeBench/eval_instruct.py
@@ -70,7 +70,7 @@ def __init__(
         self,
         language: str = "python",
         data_dir: str = BIGCODEBENCH_PATH,
-        max_tokens: Optional[int] = 1280,
+        max_tokens: int = 1280,
         num_workers: int = 32,
         timeout: float = 120,
         debug: bool = False,
@@ -98,7 +98,7 @@ def __init__(
         self.language = language
         os.makedirs(data_dir, exist_ok=True)
         self.data_dir = data_dir
-        self.max_tokens = max_tokens if max_tokens is not None else 1280
+        self.max_tokens = max_tokens
         self.num_workers = num_workers
         self.timeout = timeout
         self.debug = debug
diff --git a/eval/chat_benchmarks/CodeElo/eval_instruct.py b/eval/chat_benchmarks/CodeElo/eval_instruct.py
index a561e836..eae33254 100644
--- a/eval/chat_benchmarks/CodeElo/eval_instruct.py
+++ b/eval/chat_benchmarks/CodeElo/eval_instruct.py
@@ -48,7 +48,7 @@ def __init__(
         self,
         debug: bool = False,
         seed: List[int] = [0, 1234, 1234, 1234],
-        max_tokens: Optional[int] = None,
+        max_tokens: int = 32768,
         logger: Optional[logging.Logger] = None,
         system_instruction: Optional[str] = None,
     ):
@@ -63,9 +63,7 @@ def __init__(
         """
         super().__init__(logger=logger, system_instruction=system_instruction)
         self.debug = debug
-        self.max_new_tokens = (
-            max_tokens if max_tokens is not None else 32768
-        )  # set higher to avoid truncation for reasoning models
+        self.max_new_tokens = max_tokens
         self.seed = seed
         self.n_repeat = 3
         self.filter_interaction_questions = True
diff --git a/eval/chat_benchmarks/CodeForces/eval_instruct.py b/eval/chat_benchmarks/CodeForces/eval_instruct.py
index 30243392..08ef6cd9 100644
--- a/eval/chat_benchmarks/CodeForces/eval_instruct.py
+++ b/eval/chat_benchmarks/CodeForces/eval_instruct.py
@@ -47,7 +47,7 @@ def __init__(
         self,
         debug: bool = False,
         seed: List[int] = [0, 1234, 1234, 1234],
-        max_tokens: Optional[int] = None,
+        max_tokens: int = 32768,
         logger: Optional[logging.Logger] = None,
         system_instruction: Optional[str] = None,
     ):
@@ -62,8 +62,8 @@ def __init__(
         """
         super().__init__(logger=logger, system_instruction=system_instruction)
         self.debug = debug
-        self.max_new_tokens = max_tokens if max_tokens is not None else 32768
+        self.max_new_tokens = max_tokens
         self.seed = seed
         self.n_repeat = 3
         self.filter_interaction_questions = True
 
diff --git a/eval/chat_benchmarks/CruxEval/eval_instruct.py b/eval/chat_benchmarks/CruxEval/eval_instruct.py
index 7580e254..4049df1f 100644
--- a/eval/chat_benchmarks/CruxEval/eval_instruct.py
+++ b/eval/chat_benchmarks/CruxEval/eval_instruct.py
@@ -132,7 +132,7 @@ class CruxEvalBenchmark(BaseBenchmark):
     def __init__(
         self,
         data_dir: str = CruxEval_PATH,
-        max_tokens: Optional[int] = 2048,
+        max_tokens: int = 2048,
         num_workers: int = 32,
         timeout: float = 120,
         debug: bool = False,
@@ -155,7 +155,7 @@ def __init__(
         self.language = "python"
         os.makedirs(data_dir, exist_ok=True)
         self.data_dir = data_dir
-        self.max_tokens = max_tokens if max_tokens is not None else 2048
+        self.max_tokens = max_tokens
         self.num_workers = num_workers
         self.timeout = timeout
         self.debug = debug
diff --git a/eval/chat_benchmarks/GPQADiamond/eval_instruct.py b/eval/chat_benchmarks/GPQADiamond/eval_instruct.py
index 46288c69..e49cbd37 100644
--- a/eval/chat_benchmarks/GPQADiamond/eval_instruct.py
+++ b/eval/chat_benchmarks/GPQADiamond/eval_instruct.py
@@ -35,7 +35,7 @@ def __init__(
         self,
         debug: bool = False,
         seed: List[int] = [0, 1234, 1234, 1234],
-        max_tokens: Optional[int] = 32768,
+        max_tokens: int = 32768,
         logger: Optional[logging.Logger] = None,
         system_instruction: Optional[str] = None,
     ):
@@ -51,7 +51,7 @@ def __init__(
         self.dataset_name = "Idavidrein/gpqa"
         self.debug = debug
         self.seed = seed
-        self.max_new_tokens = max_tokens if max_tokens is not None else 32768
+        self.max_new_tokens = max_tokens
         self.n_repeat = 3
 
     def generate_responses(self, model: LM) -> Dict[str, Any]:
diff --git a/eval/chat_benchmarks/HLE/eval_instruct.py b/eval/chat_benchmarks/HLE/eval_instruct.py
index 8df8d9a6..275668ab 100644
--- a/eval/chat_benchmarks/HLE/eval_instruct.py
+++ b/eval/chat_benchmarks/HLE/eval_instruct.py
@@ -63,7 +63,7 @@ def __init__(
         self,
         debug: bool = False,
         seed: List[int] = [0, 1234, 1234, 1234],
-        max_tokens: Optional[int] = None,
+        max_tokens: int = 32768,
         logger: Optional[logging.Logger] = None,
         system_instruction: Optional[str] = None,
     ):
@@ -77,9 +77,7 @@ def __init__(
         """
         super().__init__(logger=logger, system_instruction=system_instruction)
         self.debug = debug
-        self.max_new_tokens = (
-            max_tokens if max_tokens is not None else 32768
-        )  # set higher to avoid truncation for reasoning models
+        self.max_new_tokens = max_tokens
         self.seed = seed
         self.n_repeat = 3
 
diff --git a/eval/chat_benchmarks/HMMT/eval_instruct.py b/eval/chat_benchmarks/HMMT/eval_instruct.py
index 32b46dfb..22f76801 100644
--- a/eval/chat_benchmarks/HMMT/eval_instruct.py
+++ b/eval/chat_benchmarks/HMMT/eval_instruct.py
@@ -29,7 +29,7 @@ def __init__(
         self,
         dataset_name: str = "MathArena/hmmt_feb_2025",
         debug: bool = False,
-        max_tokens: Optional[int] = None,
+        max_tokens: int = 32768,
         seed: List[int] = [0, 1234, 1234, 1234],
         logger: Optional[logging.Logger] = None,
         system_instruction: Optional[str] = None,
@@ -47,9 +47,7 @@ def __init__(
         super().__init__(logger=logger, system_instruction=system_instruction)
         self.dataset_name = dataset_name
         self.debug = debug
-        self.max_new_tokens = (
-            max_tokens if max_tokens is not None else 32768
-        )  # set higher to avoid truncation for reasoning models
+        self.max_new_tokens = max_tokens
         self.seed = seed
         self.n_repeat = 10
 
diff --git a/eval/chat_benchmarks/HumanEval/eval_instruct.py b/eval/chat_benchmarks/HumanEval/eval_instruct.py
index fb5c54e9..9b54653f 100644
--- a/eval/chat_benchmarks/HumanEval/eval_instruct.py
+++ b/eval/chat_benchmarks/HumanEval/eval_instruct.py
@@ -22,7 +22,7 @@ def __init__(
         self,
         languages: List[str] = ["python", "sh"],
         data_dir: str = "eval/chat_benchmarks/HumanEval/data",
-        max_tokens: Optional[int] = 1024,
+        max_tokens: int = 1024,
         num_workers: int = 8,
         timeout: float = 3.0,
         debug: bool = False,
@@ -45,7 +45,7 @@ def __init__(
         super().__init__(logger=logger, system_instruction=system_instruction)
         self.languages = languages
         self.data_dir = data_dir
-        self.max_tokens = max_tokens if max_tokens is not None else 1024
+        self.max_tokens = max_tokens
         self.num_workers = num_workers
         self.timeout = timeout
         self.debug = debug
diff --git a/eval/chat_benchmarks/HumanEvalPlus/eval_instruct.py b/eval/chat_benchmarks/HumanEvalPlus/eval_instruct.py
index f63fd7b2..d12b31a7 100644
--- a/eval/chat_benchmarks/HumanEvalPlus/eval_instruct.py
+++ b/eval/chat_benchmarks/HumanEvalPlus/eval_instruct.py
@@ -22,7 +22,7 @@ def __init__(
         self,
         languages: List[str] = ["python"],
         data_dir: str = "eval/chat_benchmarks/HumanEvalPlus/data",
-        max_tokens: Optional[int] = 1024,
+        max_tokens: int = 1024,
         num_workers: int = 8,
         timeout: float = 3.0,
         debug: bool = False,
@@ -45,7 +45,7 @@ def __init__(
         super().__init__(logger=logger, system_instruction=system_instruction)
         self.languages = languages
         self.data_dir = data_dir
-        self.max_tokens = max_tokens if max_tokens is not None else 1024
+        self.max_tokens = max_tokens
         self.num_workers = num_workers
         self.timeout = timeout
         self.debug = debug
diff --git a/eval/chat_benchmarks/IFEval/eval_instruct.py b/eval/chat_benchmarks/IFEval/eval_instruct.py
index 79be036f..907a8019 100644
--- a/eval/chat_benchmarks/IFEval/eval_instruct.py
+++ b/eval/chat_benchmarks/IFEval/eval_instruct.py
@@ -18,7 +18,7 @@ def __init__(
         start_idx: int = 10,
         end_idx: int = 510,
         debug: bool = False,
-        max_tokens: Optional[int] = 512,
+        max_tokens: int = 512,
         logger: Optional[logging.Logger] = None,
         system_instruction: Optional[str] = None,
     ):
@@ -37,7 +37,7 @@ def __init__(
         """
         super().__init__(logger=logger, system_instruction=system_instruction)
         self.data_dir = data_dir
-        self.max_tokens = max_tokens if max_tokens is not None else 512
+        self.max_tokens = max_tokens
         self.num_examples = num_examples
         self.start_idx = start_idx
         self.end_idx = end_idx
diff --git a/eval/chat_benchmarks/JEEBench/eval_instruct.py b/eval/chat_benchmarks/JEEBench/eval_instruct.py
index 5ba53541..05881a19 100644
--- a/eval/chat_benchmarks/JEEBench/eval_instruct.py
+++ b/eval/chat_benchmarks/JEEBench/eval_instruct.py
@@ -78,7 +78,7 @@ def __init__(
         self,
         debug: bool = False,
         seed: List[int] = [0, 1234, 1234, 1234],
-        max_tokens: Optional[int] = 32768,
+        max_tokens: int = 32768,
         logger: Optional[logging.Logger] = None,
         system_instruction: Optional[str] = None,
     ):
@@ -92,9 +92,7 @@ def __init__(
         """
         super().__init__(logger=logger, system_instruction=system_instruction)
         self.debug = debug
-        self.max_new_tokens = (
-            max_tokens if max_tokens is not None else 32768
-        )  # set higher to avoid truncation for reasoning models
+        self.max_new_tokens = max_tokens
         self.seed = seed
         self.n_repeat = 3
 
diff --git a/eval/chat_benchmarks/LiveBench/eval_instruct.py b/eval/chat_benchmarks/LiveBench/eval_instruct.py
index 3e4e0339..760b4a2b 100644
--- a/eval/chat_benchmarks/LiveBench/eval_instruct.py
+++ b/eval/chat_benchmarks/LiveBench/eval_instruct.py
@@ -47,7 +47,7 @@ def __init__(
         release_date: str = "2024-08-31",
         remove_existing_file: bool = True,
         seed: List[int] = [0, 1234, 1234, 1234],
-        max_tokens: Optional[int] = 4096,
+        max_tokens: int = 4096,
         logger: Optional[logging.Logger] = None,
         system_instruction: Optional[str] = None,
     ):
@@ -72,7 +72,7 @@ def __init__(
             self.release_date = "2024-06-24"
             self.num_workers = 1
         else:
-            self.max_tokens = max_tokens if max_tokens is not None else 4096
+            self.max_tokens = max_tokens
         self.temperature = temperature
         self.num_choices = num_choices
         self.all_release_dates = ["2024-07-26", "2024-06-24", "2024-08-31", "2024-11-25"]
diff --git a/eval/chat_benchmarks/LiveCodeBench/eval_instruct.py b/eval/chat_benchmarks/LiveCodeBench/eval_instruct.py
index 5c36e5bf..19dcec42 100644
--- a/eval/chat_benchmarks/LiveCodeBench/eval_instruct.py
+++ b/eval/chat_benchmarks/LiveCodeBench/eval_instruct.py
@@ -51,7 +51,7 @@ def __init__(
         self,
         debug: bool = False,
         seed: List[int] = [0, 1234, 1234, 1234],
-        max_tokens: Optional[int] = 32768,
+        max_tokens: int = 32768,
         logger: Optional[logging.Logger] = None,
         system_instruction: Optional[str] = None,
     ):
@@ -66,7 +66,7 @@ def __init__(
         """
         super().__init__(logger=logger, system_instruction=system_instruction)
         self.debug = debug
-        self.max_new_tokens = max_tokens or 32768
+        self.max_new_tokens = max_tokens
         self.seed = seed
         self.n_repeat = 6
 
diff --git a/eval/chat_benchmarks/LiveCodeBenchv5/eval_instruct.py b/eval/chat_benchmarks/LiveCodeBenchv5/eval_instruct.py
index 9ae5ee56..e1cc5c75 100644
--- a/eval/chat_benchmarks/LiveCodeBenchv5/eval_instruct.py
+++ b/eval/chat_benchmarks/LiveCodeBenchv5/eval_instruct.py
@@ -47,7 +47,7 @@ def __init__(
         self,
         debug: bool = False,
         seed: List[int] = [0, 1234, 1234, 1234],
-        max_tokens: Optional[int] = None,
+        max_tokens: int = 32768,
         logger: Optional[logging.Logger] = None,
         system_instruction: Optional[str] = None,
     ):
@@ -62,9 +62,7 @@ def __init__(
         """
         super().__init__(logger=logger, system_instruction=system_instruction)
         self.debug = debug
-        self.max_new_tokens = (
-            max_tokens if max_tokens is not None else 32768
-        )  # set higher to avoid truncation for reasoning models
+        self.max_new_tokens = max_tokens
         self.seed = seed
         self.n_repeat = 3
 
diff --git a/eval/chat_benchmarks/MATH500/eval_instruct.py b/eval/chat_benchmarks/MATH500/eval_instruct.py
index f082ce0f..eec964f0 100644
--- a/eval/chat_benchmarks/MATH500/eval_instruct.py
+++ b/eval/chat_benchmarks/MATH500/eval_instruct.py
@@ -27,7 +27,7 @@ def __init__(
         data_file: str = "eval/chat_benchmarks/MATH500/data/math500.jsonl",
         debug: bool = False,
         seed: List[int] = [0, 1234, 1234, 1234],
-        max_tokens: Optional[int] = 32768,
+        max_tokens: int = 32768,
         logger: Optional[logging.Logger] = None,
         system_instruction: Optional[str] = None,
     ):
@@ -45,9 +45,7 @@ def __init__(
         self.data_file = data_file
         self.debug = debug
         self.seed = seed
-        self.max_new_tokens = (
-            max_tokens if max_tokens is not None else 32768
-        )  # set higher to avoid truncation for reasoning models
+        self.max_new_tokens = max_tokens
 
     def generate_responses(self, model: LM) -> Dict[str, Any]:
         """
diff --git a/eval/chat_benchmarks/MBPP/eval_instruct.py b/eval/chat_benchmarks/MBPP/eval_instruct.py
index 6f559a68..0fcb8a3b 100644
--- a/eval/chat_benchmarks/MBPP/eval_instruct.py
+++ b/eval/chat_benchmarks/MBPP/eval_instruct.py
@@ -26,7 +26,7 @@ def __init__(
         start_idx: int = 10,
         end_idx: int = 510,
         debug: bool = False,
-        max_tokens: Optional[int] = 512,
+        max_tokens: int = 512,
         logger: Optional[logging.Logger] = None,
         system_instruction: Optional[str] = None,
     ):
@@ -45,7 +45,7 @@ def __init__(
         """
         super().__init__(logger=logger, system_instruction=system_instruction)
         self.data_dir = data_dir
-        self.max_tokens = max_tokens or 512
+        self.max_tokens = max_tokens
         self.num_examples = num_examples
         self.start_idx = start_idx
         self.end_idx = end_idx
diff --git a/eval/chat_benchmarks/MBPPPlus/eval_instruct.py b/eval/chat_benchmarks/MBPPPlus/eval_instruct.py
index d56d689b..6094bc17 100644
--- a/eval/chat_benchmarks/MBPPPlus/eval_instruct.py
+++ b/eval/chat_benchmarks/MBPPPlus/eval_instruct.py
@@ -25,7 +25,7 @@ def __init__(
         num_workers: int = 8,
         timeout: float = 3.0,
         debug: bool = False,
-        max_tokens: Optional[int] = 1024,
+        max_tokens: int = 1024,
         logger: Optional[logging.Logger] = None,
         system_instruction: Optional[str] = None,
     ):
@@ -43,7 +43,7 @@ def __init__(
         """
         super().__init__(logger=logger, system_instruction=system_instruction)
         self.data_dir = data_dir
-        self.max_tokens = max_tokens if max_tokens is not None else 1024
+        self.max_tokens = max_tokens
         self.num_workers = num_workers
         self.timeout = timeout
         self.debug = debug
diff --git a/eval/chat_benchmarks/MTBench/eval_instruct.py b/eval/chat_benchmarks/MTBench/eval_instruct.py
index eddb24ca..38e9b13e 100644
--- a/eval/chat_benchmarks/MTBench/eval_instruct.py
+++ b/eval/chat_benchmarks/MTBench/eval_instruct.py
@@ -71,7 +71,7 @@ def __init__(
         config: Optional[MTBenchConfig] = None,
         debug: bool = False,
         annotator_model: str = "gpt-4o-mini-2024-07-18",
-        max_tokens: Optional[int] = 1024,
+        max_tokens: int = 1024,
         logger: Optional[logging.Logger] = None,
         system_instruction: Optional[str] = None,
     ):
@@ -87,13 +87,13 @@ def __init__(
         """
         super().__init__(logger=logger, system_instruction=system_instruction)
         self.base_path = Path(base_path)
-        if getattr(self, "config", None) is None:
-            self.config = MTBenchConfig(
-                judge_model=annotator_model,
-            )
-        else:
-            self.config = config
-        self.config.max_new_token = max_tokens if max_tokens is not None else 1024
+        if annotator_model == "auto":
+            annotator_model = "gpt-4"
+        if config:
+            print(f"Warning: Overwriting config.judge_model = {annotator_model} ")
+            config.judge_model = annotator_model
+        self.config = config or MTBenchConfig(judge_model=annotator_model)
+        self.config.max_new_token = max_tokens
         self.debug = debug
 
         # Setup paths
diff --git a/eval/chat_benchmarks/MTBench/fastchat/protocol/api_protocol.py b/eval/chat_benchmarks/MTBench/fastchat/protocol/api_protocol.py
index 2dc99449..589e78cc 100644
--- a/eval/chat_benchmarks/MTBench/fastchat/protocol/api_protocol.py
+++ b/eval/chat_benchmarks/MTBench/fastchat/protocol/api_protocol.py
@@ -55,7 +55,7 @@ class APIChatCompletionRequest(BaseModel):
     top_p: Optional[float] = 1.0
     top_k: Optional[int] = -1
     n: Optional[int] = 1
-    max_tokens: Optional[int] = None
+    max_tokens: int = 1024
     stop: Optional[Union[str, List[str]]] = None
     stream: Optional[bool] = False
     user: Optional[str] = None
@@ -129,7 +129,7 @@ class CompletionRequest(BaseModel):
     suffix: Optional[str] = None
     temperature: Optional[float] = 0.7
     n: Optional[int] = 1
-    max_tokens: Optional[int] = 16
+    max_tokens: int = 16
     stop: Optional[Union[str, List[str]]] = None
     stream: Optional[bool] = False
     top_p: Optional[float] = 1.0
diff --git a/eval/chat_benchmarks/MTBench/fastchat/protocol/openai_api_protocol.py b/eval/chat_benchmarks/MTBench/fastchat/protocol/openai_api_protocol.py
index bb50a5ef..38713ed5 100644
--- a/eval/chat_benchmarks/MTBench/fastchat/protocol/openai_api_protocol.py
+++ b/eval/chat_benchmarks/MTBench/fastchat/protocol/openai_api_protocol.py
@@ -66,7 +66,7 @@ class ChatCompletionRequest(BaseModel):
     top_p: Optional[float] = 1.0
     top_k: Optional[int] = -1
     n: Optional[int] = 1
-    max_tokens: Optional[int] = None
+    max_tokens: int = 1024
     stop: Optional[Union[str, List[str]]] = None
     stream: Optional[bool] = False
     presence_penalty: Optional[float] = 0.0
@@ -154,7 +154,7 @@ class CompletionRequest(BaseModel):
     suffix: Optional[str] = None
     temperature: Optional[float] = 0.7
     n: Optional[int] = 1
-    max_tokens: Optional[int] = 16
+    max_tokens: int = 16
     stop: Optional[Union[str, List[str]]] = None
     stream: Optional[bool] = False
     top_p: Optional[float] = 1.0
diff --git a/eval/chat_benchmarks/MTBench/fastchat/serve/api_provider.py b/eval/chat_benchmarks/MTBench/fastchat/serve/api_provider.py
index ebf7f25a..07184521 100644
--- a/eval/chat_benchmarks/MTBench/fastchat/serve/api_provider.py
+++ b/eval/chat_benchmarks/MTBench/fastchat/serve/api_provider.py
@@ -957,7 +957,7 @@ def cohere_api_stream_iter(
     messages: list,
     temperature: Optional[float] = None,  # The SDK or API handles None for all parameters following
     top_p: Optional[float] = None,
-    max_new_tokens: Optional[int] = None,
+    max_new_tokens: int = 1024,
     api_key: Optional[str] = None,  # default is env var CO_API_KEY
     api_base: Optional[str] = None,
 ):
@@ -1084,7 +1084,7 @@ def reka_api_stream_iter(
     messages: list,
     temperature: Optional[float] = None,  # The SDK or API handles None for all parameters following
     top_p: Optional[float] = None,
-    max_new_tokens: Optional[int] = None,
+    max_new_tokens: int = 1024,
     api_key: Optional[str] = None,  # default is env var CO_API_KEY
     api_base: Optional[str] = None,
 ):
diff --git a/eval/chat_benchmarks/MTBench/fastchat/serve/openai_api_server.py b/eval/chat_benchmarks/MTBench/fastchat/serve/openai_api_server.py
index 86e63cd1..9b5e58d5 100644
--- a/eval/chat_benchmarks/MTBench/fastchat/serve/openai_api_server.py
+++ b/eval/chat_benchmarks/MTBench/fastchat/serve/openai_api_server.py
@@ -266,7 +266,7 @@ async def get_gen_params(
     top_k: Optional[int],
     presence_penalty: Optional[float],
     frequency_penalty: Optional[float],
-    max_tokens: Optional[int],
+    max_tokens: int,
     echo: Optional[bool],
     logprobs: Optional[int] = None,
     stop: Optional[Union[str, List[str]]],
diff --git a/eval/chat_benchmarks/WildBench/eval_instruct.py b/eval/chat_benchmarks/WildBench/eval_instruct.py
index 60c8384e..cfe73a36 100644
--- a/eval/chat_benchmarks/WildBench/eval_instruct.py
+++ b/eval/chat_benchmarks/WildBench/eval_instruct.py
@@ -76,7 +76,7 @@ def __init__(
         config: Optional[WildBenchConfig] = None,
         annotator_model: str = "gpt-4o-mini-2024-07-18",
         debug: bool = False,
-        max_tokens: Optional[int] = 1024,
+        max_tokens: int = 1024,
         logger: Optional[logging.Logger] = None,
         system_instruction: Optional[str] = None,
     ):
@@ -96,7 +96,7 @@ def __init__(
             config.model = annotator_model
         self.config = config or WildBenchConfig(model=annotator_model)
         self.debug = debug
-        self.max_new_tokens = max_tokens if max_tokens is not None else 1024
+        self.max_new_tokens = max_tokens
 
         # Task category mapping
         self.task_group_mapping = {
diff --git a/eval/chat_benchmarks/alpaca_eval/src/alpaca_eval/decoders/cohere.py b/eval/chat_benchmarks/alpaca_eval/src/alpaca_eval/decoders/cohere.py
index c26c822d..dfe60aac 100644
--- a/eval/chat_benchmarks/alpaca_eval/src/alpaca_eval/decoders/cohere.py
+++ b/eval/chat_benchmarks/alpaca_eval/src/alpaca_eval/decoders/cohere.py
@@ -73,7 +73,7 @@ def cohere_completions(
 def _cohere_completion_helper(
     prompt: str,
     cohere_api_keys: Optional[Sequence[str]] = (constants.COHERE_API_KEY,),
-    max_tokens: Optional[int] = 1000,
+    max_tokens: int = 1000,
     temperature: Optional[float] = 0.7,
     max_tries=5,
     **kwargs,
diff --git a/eval/task.py b/eval/task.py
index 42e77824..70962115 100644
--- a/eval/task.py
+++ b/eval/task.py
@@ -234,13 +234,20 @@ def _register_benchmark(self, name: str, benchmark_class: Type[BaseBenchmark]):
             valid_kwargs = {}
 
             # Only pass kwargs that the benchmark's __init__ accepts
+            # Filter out None values to let benchmarks use their default values
             for param_name, param in init_params.items():
                 if param_name in self.benchmark_kwargs:
-                    valid_kwargs[param_name] = self.benchmark_kwargs[param_name]
-                    self.logger.debug(f"Passing {param_name} to {name} benchmark")
-
-            # Ensure system_instruction is passed if available
-            if "system_instruction" in self.benchmark_kwargs:
+                    value = self.benchmark_kwargs[param_name]
+                    # Only pass the argument if it's not None, so benchmarks can use defaults
+                    if value is not None:
+                        valid_kwargs[param_name] = value
+                        self.logger.debug(f"Passing {param_name}={value} to {name} benchmark")
+
+            # Ensure system_instruction is passed if available and not None
+            if (
+                "system_instruction" in self.benchmark_kwargs
+                and self.benchmark_kwargs["system_instruction"] is not None
+            ):
                 valid_kwargs["system_instruction"] = self.benchmark_kwargs["system_instruction"]
 
             instance = benchmark_class(**valid_kwargs)