Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 2 additions & 4 deletions eval/chat_benchmarks/AIME24/eval_instruct.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ def __init__(
data_file: str = "eval/chat_benchmarks/AIME24/data/aime24.json",
debug: bool = False,
seed: List[int] = [0, 1234, 1234, 1234],
max_tokens: Optional[int] = 32768,
max_tokens: int = 32768,
logger: Optional[logging.Logger] = None,
system_instruction: Optional[str] = None,
):
Expand All @@ -44,9 +44,7 @@ def __init__(
super().__init__(logger=logger, system_instruction=system_instruction)
self.data_file = data_file
self.debug = debug
self.max_new_tokens = (
max_tokens if max_tokens is not None else 32768
) # set higher to avoid truncation for reasoning models
self.max_new_tokens = max_tokens
self.seed = seed
self.n_repeat = 10

Expand Down
2 changes: 1 addition & 1 deletion eval/chat_benchmarks/AIME25/eval_instruct.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ def __init__(
super().__init__(logger=logger, system_instruction=system_instruction)
self.data_file = data_file
self.debug = debug
self.max_new_tokens = max_tokens if max_tokens is not None else 32768
self.max_new_tokens = max_tokens
self.seed = seed
self.n_repeat = 10

Expand Down
4 changes: 2 additions & 2 deletions eval/chat_benchmarks/AIW/eval_instruct.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ def __init__(
data_file: str = "eval/chat_benchmarks/AIW/data/aiw_data.json",
debug: bool = False,
seed: List[int] = [0, 1234, 1234, 1234],
max_tokens: Optional[int] = 32768,
max_tokens: int = 32768,
logger: Optional[logging.Logger] = None,
system_instruction: Optional[str] = None,
n_trials: int = 100, # Run 100 trials
Expand All @@ -41,7 +41,7 @@ def __init__(
super().__init__(logger=logger, system_instruction=system_instruction)
self.data_file = data_file
self.debug = debug
self.max_new_tokens = max_tokens if max_tokens is not None else 32768
self.max_new_tokens = max_tokens
self.seed = seed
self.n_trials = n_trials

Expand Down
4 changes: 2 additions & 2 deletions eval/chat_benchmarks/AMC23/eval_instruct.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ def __init__(
data_file: str = "eval/chat_benchmarks/AMC23/data/amc23.json",
debug: bool = False,
seed: List[int] = [0, 1234, 1234, 1234],
max_tokens: Optional[int] = 32768,
max_tokens: int = 32768,
logger: Optional[logging.Logger] = None,
system_instruction: Optional[str] = None,
):
Expand All @@ -47,7 +47,7 @@ def __init__(
self.data_file = data_file
self.debug = debug
self.seed = seed
self.max_new_tokens = max_tokens if max_tokens is not None else 32768
self.max_new_tokens = max_tokens
self.n_repeat = 10

def generate_responses(self, model: LM) -> Dict[str, Any]:
Expand Down
4 changes: 2 additions & 2 deletions eval/chat_benchmarks/BigCodeBench/eval_instruct.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ def __init__(
self,
language: str = "python",
data_dir: str = BIGCODEBENCH_PATH,
max_tokens: Optional[int] = 1280,
max_tokens: int = 1280,
num_workers: int = 32,
timeout: float = 120,
debug: bool = False,
Expand Down Expand Up @@ -98,7 +98,7 @@ def __init__(
self.language = language
os.makedirs(data_dir, exist_ok=True)
self.data_dir = data_dir
self.max_tokens = max_tokens if max_tokens is not None else 1280
self.max_tokens = max_tokens
self.num_workers = num_workers
self.timeout = timeout
self.debug = debug
Expand Down
6 changes: 2 additions & 4 deletions eval/chat_benchmarks/CodeElo/eval_instruct.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ def __init__(
self,
debug: bool = False,
seed: List[int] = [0, 1234, 1234, 1234],
max_tokens: Optional[int] = None,
max_tokens: int = 32768,
logger: Optional[logging.Logger] = None,
system_instruction: Optional[str] = None,
):
Expand All @@ -63,9 +63,7 @@ def __init__(
"""
super().__init__(logger=logger, system_instruction=system_instruction)
self.debug = debug
self.max_new_tokens = (
max_tokens if max_tokens is not None else 32768
) # set higher to avoid truncation for reasoning models
self.max_new_tokens = max_tokens
self.seed = seed
self.n_repeat = 3
self.filter_interaction_questions = True
Expand Down
5 changes: 2 additions & 3 deletions eval/chat_benchmarks/CodeForces/eval_instruct.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ def __init__(
self,
debug: bool = False,
seed: List[int] = [0, 1234, 1234, 1234],
max_tokens: Optional[int] = None,
max_tokens: int = 32768,
logger: Optional[logging.Logger] = None,
system_instruction: Optional[str] = None,
):
Expand All @@ -62,8 +62,7 @@ def __init__(
"""
super().__init__(logger=logger, system_instruction=system_instruction)
self.debug = debug
self.max_new_tokens = max_tokens if max_tokens is not None else 32768
self.seed = seed
self.max_new_tokens = max_tokens
self.n_repeat = 3
self.filter_interaction_questions = True

Expand Down
4 changes: 2 additions & 2 deletions eval/chat_benchmarks/CruxEval/eval_instruct.py
Original file line number Diff line number Diff line change
Expand Up @@ -132,7 +132,7 @@ class CruxEvalBenchmark(BaseBenchmark):
def __init__(
self,
data_dir: str = CruxEval_PATH,
max_tokens: Optional[int] = 2048,
max_tokens: int = 2048,
num_workers: int = 32,
timeout: float = 120,
debug: bool = False,
Expand All @@ -155,7 +155,7 @@ def __init__(
self.language = "python"
os.makedirs(data_dir, exist_ok=True)
self.data_dir = data_dir
self.max_tokens = max_tokens if max_tokens is not None else 2048
self.max_tokens = max_tokens
self.num_workers = num_workers
self.timeout = timeout
self.debug = debug
Expand Down
4 changes: 2 additions & 2 deletions eval/chat_benchmarks/GPQADiamond/eval_instruct.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ def __init__(
self,
debug: bool = False,
seed: List[int] = [0, 1234, 1234, 1234],
max_tokens: Optional[int] = 32768,
max_tokens: int = 32768,
logger: Optional[logging.Logger] = None,
system_instruction: Optional[str] = None,
):
Expand All @@ -51,7 +51,7 @@ def __init__(
self.dataset_name = "Idavidrein/gpqa"
self.debug = debug
self.seed = seed
self.max_new_tokens = max_tokens if max_tokens is not None else 32768
self.max_new_tokens = max_tokens
self.n_repeat = 3

def generate_responses(self, model: LM) -> Dict[str, Any]:
Expand Down
6 changes: 2 additions & 4 deletions eval/chat_benchmarks/HLE/eval_instruct.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ def __init__(
self,
debug: bool = False,
seed: List[int] = [0, 1234, 1234, 1234],
max_tokens: Optional[int] = None,
max_tokens: int = 32768,
logger: Optional[logging.Logger] = None,
system_instruction: Optional[str] = None,
):
Expand All @@ -77,9 +77,7 @@ def __init__(
"""
super().__init__(logger=logger, system_instruction=system_instruction)
self.debug = debug
self.max_new_tokens = (
max_tokens if max_tokens is not None else 32768
) # set higher to avoid truncation for reasoning models
self.max_new_tokens = max_tokens
self.seed = seed
self.n_repeat = 3

Expand Down
6 changes: 2 additions & 4 deletions eval/chat_benchmarks/HMMT/eval_instruct.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ def __init__(
self,
dataset_name: str = "MathArena/hmmt_feb_2025",
debug: bool = False,
max_tokens: Optional[int] = None,
max_tokens: int = 32768,
seed: List[int] = [0, 1234, 1234, 1234],
logger: Optional[logging.Logger] = None,
system_instruction: Optional[str] = None,
Expand All @@ -47,9 +47,7 @@ def __init__(
super().__init__(logger=logger, system_instruction=system_instruction)
self.dataset_name = dataset_name
self.debug = debug
self.max_new_tokens = (
max_tokens if max_tokens is not None else 32768
) # set higher to avoid truncation for reasoning models
self.max_new_tokens = max_tokens
self.seed = seed
self.n_repeat = 10

Expand Down
4 changes: 2 additions & 2 deletions eval/chat_benchmarks/HumanEval/eval_instruct.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ def __init__(
self,
languages: List[str] = ["python", "sh"],
data_dir: str = "eval/chat_benchmarks/HumanEval/data",
max_tokens: Optional[int] = 1024,
max_tokens: int = 1024,
num_workers: int = 8,
timeout: float = 3.0,
debug: bool = False,
Expand All @@ -45,7 +45,7 @@ def __init__(
super().__init__(logger=logger, system_instruction=system_instruction)
self.languages = languages
self.data_dir = data_dir
self.max_tokens = max_tokens if max_tokens is not None else 1024
self.max_tokens = max_tokens
self.num_workers = num_workers
self.timeout = timeout
self.debug = debug
Expand Down
4 changes: 2 additions & 2 deletions eval/chat_benchmarks/HumanEvalPlus/eval_instruct.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ def __init__(
self,
languages: List[str] = ["python"],
data_dir: str = "eval/chat_benchmarks/HumanEvalPlus/data",
max_tokens: Optional[int] = 1024,
max_tokens: int = 1024,
num_workers: int = 8,
timeout: float = 3.0,
debug: bool = False,
Expand All @@ -45,7 +45,7 @@ def __init__(
super().__init__(logger=logger, system_instruction=system_instruction)
self.languages = languages
self.data_dir = data_dir
self.max_tokens = max_tokens if max_tokens is not None else 1024
self.max_tokens = max_tokens
self.num_workers = num_workers
self.timeout = timeout
self.debug = debug
Expand Down
4 changes: 2 additions & 2 deletions eval/chat_benchmarks/IFEval/eval_instruct.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ def __init__(
start_idx: int = 10,
end_idx: int = 510,
debug: bool = False,
max_tokens: Optional[int] = 512,
max_tokens: int = 512,
logger: Optional[logging.Logger] = None,
system_instruction: Optional[str] = None,
):
Expand All @@ -37,7 +37,7 @@ def __init__(
"""
super().__init__(logger=logger, system_instruction=system_instruction)
self.data_dir = data_dir
self.max_tokens = max_tokens if max_tokens is not None else 512
self.max_tokens = max_tokens
self.num_examples = num_examples
self.start_idx = start_idx
self.end_idx = end_idx
Expand Down
6 changes: 2 additions & 4 deletions eval/chat_benchmarks/JEEBench/eval_instruct.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@ def __init__(
self,
debug: bool = False,
seed: List[int] = [0, 1234, 1234, 1234],
max_tokens: Optional[int] = 32768,
max_tokens: int = 32768,
logger: Optional[logging.Logger] = None,
system_instruction: Optional[str] = None,
):
Expand All @@ -92,9 +92,7 @@ def __init__(
"""
super().__init__(logger=logger, system_instruction=system_instruction)
self.debug = debug
self.max_new_tokens = (
max_tokens if max_tokens is not None else 32768
) # set higher to avoid truncation for reasoning models
self.max_new_tokens = max_tokens
self.seed = seed
self.n_repeat = 3

Expand Down
4 changes: 2 additions & 2 deletions eval/chat_benchmarks/LiveBench/eval_instruct.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ def __init__(
release_date: str = "2024-08-31",
remove_existing_file: bool = True,
seed: List[int] = [0, 1234, 1234, 1234],
max_tokens: Optional[int] = 4096,
max_tokens: int = 4096,
logger: Optional[logging.Logger] = None,
system_instruction: Optional[str] = None,
):
Expand All @@ -72,7 +72,7 @@ def __init__(
self.release_date = "2024-06-24"
self.num_workers = 1
else:
self.max_tokens = max_tokens if max_tokens is not None else 4096
self.max_tokens = max_tokens
self.temperature = temperature
self.num_choices = num_choices
self.all_release_dates = ["2024-07-26", "2024-06-24", "2024-08-31", "2024-11-25"]
Expand Down
4 changes: 2 additions & 2 deletions eval/chat_benchmarks/LiveCodeBench/eval_instruct.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ def __init__(
self,
debug: bool = False,
seed: List[int] = [0, 1234, 1234, 1234],
max_tokens: Optional[int] = 32768,
max_tokens: int = 32768,
logger: Optional[logging.Logger] = None,
system_instruction: Optional[str] = None,
):
Expand All @@ -66,7 +66,7 @@ def __init__(
"""
super().__init__(logger=logger, system_instruction=system_instruction)
self.debug = debug
self.max_new_tokens = max_tokens or 32768
self.max_new_tokens = max_tokens
self.seed = seed
self.n_repeat = 6

Expand Down
6 changes: 2 additions & 4 deletions eval/chat_benchmarks/LiveCodeBenchv5/eval_instruct.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ def __init__(
self,
debug: bool = False,
seed: List[int] = [0, 1234, 1234, 1234],
max_tokens: Optional[int] = None,
max_tokens: int = 32768,
logger: Optional[logging.Logger] = None,
system_instruction: Optional[str] = None,
):
Expand All @@ -62,9 +62,7 @@ def __init__(
"""
super().__init__(logger=logger, system_instruction=system_instruction)
self.debug = debug
self.max_new_tokens = (
max_tokens if max_tokens is not None else 32768
) # set higher to avoid truncation for reasoning models
self.max_new_tokens = max_tokens
self.seed = seed
self.n_repeat = 3

Expand Down
6 changes: 2 additions & 4 deletions eval/chat_benchmarks/MATH500/eval_instruct.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ def __init__(
data_file: str = "eval/chat_benchmarks/MATH500/data/math500.jsonl",
debug: bool = False,
seed: List[int] = [0, 1234, 1234, 1234],
max_tokens: Optional[int] = 32768,
max_tokens: int = 32768,
logger: Optional[logging.Logger] = None,
system_instruction: Optional[str] = None,
):
Expand All @@ -45,9 +45,7 @@ def __init__(
self.data_file = data_file
self.debug = debug
self.seed = seed
self.max_new_tokens = (
max_tokens if max_tokens is not None else 32768
) # set higher to avoid truncation for reasoning models
self.max_new_tokens = max_tokens

def generate_responses(self, model: LM) -> Dict[str, Any]:
"""
Expand Down
4 changes: 2 additions & 2 deletions eval/chat_benchmarks/MBPP/eval_instruct.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ def __init__(
start_idx: int = 10,
end_idx: int = 510,
debug: bool = False,
max_tokens: Optional[int] = 512,
max_tokens: int = 512,
logger: Optional[logging.Logger] = None,
system_instruction: Optional[str] = None,
):
Expand All @@ -45,7 +45,7 @@ def __init__(
"""
super().__init__(logger=logger, system_instruction=system_instruction)
self.data_dir = data_dir
self.max_tokens = max_tokens or 512
self.max_tokens = max_tokens
self.num_examples = num_examples
self.start_idx = start_idx
self.end_idx = end_idx
Expand Down
4 changes: 2 additions & 2 deletions eval/chat_benchmarks/MBPPPlus/eval_instruct.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ def __init__(
num_workers: int = 8,
timeout: float = 3.0,
debug: bool = False,
max_tokens: Optional[int] = 1024,
max_tokens: int = 1024,
logger: Optional[logging.Logger] = None,
system_instruction: Optional[str] = None,
):
Expand All @@ -43,7 +43,7 @@ def __init__(
"""
super().__init__(logger=logger, system_instruction=system_instruction)
self.data_dir = data_dir
self.max_tokens = max_tokens if max_tokens is not None else 1024
self.max_tokens = max_tokens
self.num_workers = num_workers
self.timeout = timeout
self.debug = debug
Expand Down
Loading