From 9afb37d0be60bfc35e152485453796d9eb2fb739 Mon Sep 17 00:00:00 2001
From: richardzhuang0412
Date: Sun, 9 Nov 2025 15:46:50 -0800
Subject: [PATCH] MATH500 avg_3

---
 CLAUDE.md                                     | 320 ++++++++++++++++++
 eval/chat_benchmarks/MATH500/eval_instruct.py |  82 +++--
 2 files changed, 381 insertions(+), 21 deletions(-)
 create mode 100644 CLAUDE.md

diff --git a/CLAUDE.md b/CLAUDE.md
new file mode 100644
index 00000000..53855d23
--- /dev/null
+++ b/CLAUDE.md
@@ -0,0 +1,320 @@
# CLAUDE.md

This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository.

## Overview

Evalchemy is a unified evaluation toolkit for post-trained language models. It builds on LM-Eval-Harness and provides a consistent interface for running 34+ benchmarks across instruction-following, code generation, and math reasoning domains, with support for HuggingFace, vLLM, and OpenAI backends as well as 100+ API models via Curator.

## Development Setup

### Installation
```bash
conda create --name evalchemy python=3.10
conda activate evalchemy
git clone git@github.com:mlfoundations/evalchemy.git
cd evalchemy
pip install -e .  # Install in editable mode
pip install -e eval/chat_benchmarks/alpaca_eval  # Special dependency
make install  # Runs pip install -e ".[dev]" and pre-commit install
huggingface-cli login  # For gated models/datasets
```

**Notes for HPC systems:**
- You may need to modify `pyproject.toml` to use absolute paths for the fschat dependency
- See `eval/distributed/SETUP_*.md` for cluster-specific setup (Capella, Leonardo, TACC, Jureca, etc.)

### Common Development Commands

**Linting and formatting:**
```bash
black --line-length=120 eval/  # Format code
flake8 eval/                   # Run linter
```

**Running tests:**
```bash
pytest                                       # Run all tests in repository
pytest eval/chat_benchmarks/HumanEval/ -v    # Test specific benchmark
pytest eval/chat_benchmarks/MTBench/tests/   # MTBench-specific tests
```

Tests are distributed across benchmark directories (not a centralized `tests/` folder):
- `eval/chat_benchmarks/*/` directories contain benchmark-specific test files
- MTBench has a dedicated `tests/` directory with comprehensive coverage
- Use the `--verbose` or `-v` flag for detailed test output

### Running Evaluations

**Basic evaluation:**
```bash
python -m eval.eval \
    --model hf \
    --tasks HumanEval,mmlu \
    --model_args "pretrained=mistralai/Mistral-7B-Instruct-v0.3" \
    --batch_size 2 \
    --output_path logs
```

**Using config files** (recommended for standard benchmarks):
```bash
python -m eval.eval \
    --model hf \
    --model_args "pretrained=mistralai/Mistral-7B-Instruct-v0.3" \
    --output_path logs \
    --config configs/light_gpt4omini0718.yaml
```

**With different model backends:**
```bash
# vLLM (high-performance inference)
python -m eval.eval --model vllm --tasks alpaca_eval \
    --model_args "pretrained=meta-llama/Meta-Llama-3-8B-Instruct" \
    --batch_size 16 --output_path logs

# OpenAI API
python -m eval.eval --model openai-chat-completions --tasks alpaca_eval \
    --model_args "model=gpt-4o-mini-2024-07-18,num_concurrent=32" \
    --output_path logs

# Curator (100+ API models via LiteLLM)
python -m eval.eval --model curator --tasks AIME24,MATH500 \
    --model_name "gemini/gemini-2.0-flash-thinking-exp-01-21" \
    --apply_chat_template False --output_path logs
```

**Distributed evaluation (HPC):**
```bash
python eval/distributed/launch.py \
--model_name open-thoughts/OpenThinker-7B \ + --tasks AIME24,AIME25,AMC23,MATH500 \ + --num_shards 8 \ + --watchdog +``` + +**With database logging:** +```bash +python -m eval.eval \ + --model hf \ + --tasks MTBench,alpaca_eval \ + --model_args 'pretrained=meta-llama/Meta-Llama-3-8B-Instruct' \ + --batch_size 2 \ + --output_path logs \ + --use_database \ + --model_name "My Model Name" \ + --creation_location "Lab Name" \ + --created_by "Researcher Name" +``` + +**Debug mode** (test on small subset): +```bash +python -m eval.eval \ + --model hf \ + --tasks HumanEval \ + --model_args "pretrained=gpt2" \ + --debug +``` + +### Viewing and Processing Results + +```bash +# View raw results +jq '.results' logs/ModelName/results_timestamp.json + +# Extract specific metrics +jq '.results | to_entries[] | {task: .key, metrics: .value}' logs/ModelName/results_timestamp.json + +# Convert results to CSV for analysis +python create_csv_helper.py logs/ModelName/results_timestamp.json +``` + +## Architecture + +### Core Components + +**1. eval/eval.py** (646 lines) - Main evaluation orchestrator +- Extends LM-Eval-Harness parser with custom arguments: `--database`, `--annotator_model`, `--config`, `--debug`, `--max_tokens` +- Coordinates between pretrain tasks (from lm-eval) and instruction tasks (custom benchmarks) +- Handles result logging to files and optional PostgreSQL database +- Uses `DCEvaluationTracker` for tracking evaluation metadata +- Custom JSON serialization (`handle_non_serializable_extended()`) for large SymPy objects (>15,000 bits) + +**2. eval/task.py** (374 lines) - Task management system +- **BaseBenchmark**: Abstract base class for all custom benchmarks with two required methods: + - `generate_responses(model)`: Creates Instance objects and runs model.generate_until() + - `evaluate_responses(results)`: Computes metrics from generated outputs + - `_normalize_model_args(model_args)`: Handles API differences (HF, vLLM, OpenAI, Curator) + - `_prepare_messages(messages, model)`: Applies chat templates and system instructions + - `compute(model, instances)`: Batch inference wrapper supporting distributed evaluation +- **TaskManager**: Dynamically loads benchmarks from `eval/chat_benchmarks/`: + - Scans for directories with `eval_instruct.py` + - Dynamically imports and instantiates benchmark classes + - Merges with LM-Eval-Harness pretrain tasks + - Tracks benchmarks requiring annotator models + +**3. eval/chat_benchmarks/** - 34+ custom benchmark implementations +- Each benchmark is a subdirectory with `eval_instruct.py` containing a Benchmark class +- Uses the Instance API to batch generate model outputs efficiently +- Common pattern: load questions → create Instances with prompts → `compute(model, instances)` → evaluate +- Categories: + - **Instruction Following**: MTBench, WildBench, AlpacaEval, IFEval, RepoBench, MixEval, LiveBench, MBPPPlus + - **Code Generation**: HumanEval, HumanEvalPlus, MBPP, BigCodeBench, MultiPLE, CRUXEval, LiveCodeBench, CodeForces, CodeElo + - **Math Reasoning**: AIME24, AIME25, AMC23, MATH500, HMMT, JEEBench, GPQADiamond, Alice in Wonderland + - **Specialized**: ZeroEval (requires HF approval), SWEbench, MMLUPro, AIW + +**4. eval/eval_tracker.py** (419 lines) - Results tracking +- `DCEvaluationTracker`: Persistent result storage and metadata collection +- Logs to JSON files by default +- Optional PostgreSQL integration (enable with `--use_database`) +- Collects: git hash, model config, seeds, tokenizer details, hardware specs, timing + +**5. 
database/** - PostgreSQL schema (SQLAlchemy models) +- `models.py`: Schema definitions (Dataset, Model, EvalResult, EvalSetting) +- `config.py`: Connection configuration (uses env vars: DB_PASSWORD, DB_HOST, DB_PORT, DB_NAME, DB_USER) +- `utils.py`: Database utility functions +- Enable with `--use_database` flag and environment variables + +**6. eval/distributed/** - HPC cluster evaluation infrastructure +- `launch.py`: Submits SLURM array jobs for data-parallel evaluation +- `process_shard.py`: Worker script processing one data shard per GPU +- Cluster auto-detection via hostname (Capella, Leonardo, TACC, Jureca) +- Workflow: Dataset creation → HF Hub upload → SLURM job submission → Result collection +- Results saved as parquet, uploadable to HuggingFace + +### Key Design Patterns + +**Instance-based batching:** +Benchmarks create lists of Instance objects which are processed in batches by the model: +```python +from lm_eval.api.instance import Instance + +instances = [ + Instance( + "generate_until", + example, + (templated_prompt, {"max_new_tokens": 1024, "do_sample": False}), + idx + ) +] +outputs = self.compute(model, instances) +``` + +**Chat template handling:** +The `_prepare_messages()` method in BaseBenchmark handles system instructions and applies chat templates: +```python +messages = [{"role": "user", "content": prompt}] +templated = self._prepare_messages(messages, model) +``` + +**Model arguments normalization:** +`_normalize_model_args()` in BaseBenchmark handles differences between HuggingFace, vLLM, and OpenAI APIs: +- **HuggingFace**: Uses `max_new_tokens`, no seed support +- **vLLM**: Uses `max_gen_toks`, supports seed +- **OpenAI**: Uses `max_tokens`, caps gpt-4o at 16384 tokens +- **Curator**: Wrapper for 100+ models via LiteLLM + +**Dynamic benchmark discovery:** +TaskManager uses runtime introspection to load benchmarks with no central registry: +- Easy to add new benchmarks (create directory + `eval_instruct.py`) +- Benchmarks are self-contained with data + +### Database Integration + +Optional PostgreSQL integration for leaderboard and experiment tracking: +- Enable with `--use_database` flag when running evaluations +- Configure via environment variables: `DB_PASSWORD`, `DB_HOST`, `DB_PORT`, `DB_NAME`, `DB_USER` +- Schema tables: `models`, `evalresults`, `evalsettings`, `datasets` +- Results linked to model UUIDs for tracking experiments over time +- Use `--model_id` or `--model_name` to update existing entries +- See `database/README.md` for schema details + +### Distributed Evaluation + +For large-scale evaluations across HPC clusters: +- **Location**: `eval/distributed/` +- **Main scripts**: `launch.py` (submit SLURM jobs), `process_shard.py` (worker script) +- **Workflow**: Create dataset → Upload to HF Hub → Submit array job → Collect results → Compute metrics +- **Cluster auto-detection**: Reads hostname to detect cluster (Capella, Leonardo, TACC, Jureca) +- **Output**: Results saved as parquet files, uploadable to HuggingFace +- **Setup guides**: `eval/distributed/SETUP_*.md` for each supported cluster + +### Config Files + +YAML configurations in `configs/` directory enable standardized evaluation runs: +- **light_gpt4omini0718.yaml**: Lightweight benchmarks with GPT-4o Mini judge +- **full_gpt4omini0718.yaml**: Full benchmark suite +- **reasoning.yaml / reasoning_lite.yaml**: Math and reasoning tasks +- **single_task/**: Individual task configurations +- Config format: + ```yaml + annotator_model: gpt-4o-mini-2024-07-18 + tasks: + - task_name: 
alpaca_eval + batch_size: 32 + - task_name: WildBench + batch_size: 8 + max_tokens: 1024 + ``` +- Configs override command-line `--batch_size`, `--tasks`, `--annotator_model`, `--max_tokens` + +## Adding New Benchmarks + +**Standard benchmark structure:** +1. Create directory in `eval/chat_benchmarks/YourBenchmark/` +2. Create `eval_instruct.py` with a class inheriting from `BaseBenchmark`: + ```python + from eval.task import BaseBenchmark + from lm_eval.api.instance import Instance + from typing import Dict, Any + + class YourBenchmarkBenchmark(BaseBenchmark): + def generate_responses(self, model) -> Dict[str, Any]: + # Load dataset and create instances + instances = [ + Instance("generate_until", example, (prompt, stop_seq), idx) + for idx, example in enumerate(dataset) + ] + # Run batch inference + results = self.compute(model, instances) + return {"outputs": results, "references": references} + + def evaluate_responses(self, results: Dict[str, Any]) -> Dict[str, float]: + # Compute metrics from results + outputs = results["outputs"] + references = results["references"] + # Calculate accuracy, F1, etc. + return {"accuracy": score} + ``` +3. Add benchmark class name to `__init__.py` if needed (TaskManager auto-discovers via `eval_instruct.py`) +4. Add tested metrics to `reproduced_benchmarks.md` +5. Add benchmark entry to `README.md` + +**Key patterns for benchmark implementation:** +- Use `_prepare_messages()` for chat template handling +- Use `_normalize_model_args()` for API-agnostic model arguments +- Use `self.compute(model, instances)` for batch inference (supports distributed evaluation) +- Store raw outputs and references in results dict for downstream analysis +- Return flat dict of metric names to float values + +## Important Notes + +### Environment & Authentication +- Set `OPENAI_API_KEY` for LLM judge models (e.g., gpt-4o-mini) +- Run `huggingface-cli login` for accessing gated models/datasets +- May need `ANTHROPIC_API_KEY` for Claude models, `MISTRAL_API_KEY` for Mistral, etc. 
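
For example, a typical shell setup might look like the following (a minimal sketch; the key names match the list above, and the values are placeholders for your own credentials):

```bash
export OPENAI_API_KEY="sk-..."   # LLM judge models (e.g., gpt-4o-mini)
export ANTHROPIC_API_KEY="..."   # only if evaluating Claude models
export MISTRAL_API_KEY="..."     # only if evaluating Mistral API models
huggingface-cli login            # authenticate for gated models/datasets
```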
+ +### Special Dependencies & Requirements +- **BigCodeBench**: Requires Docker for safe code execution +- **ZeroEval**: Requires HuggingFace dataset access approval +- **MTBench**: Installed as git subtree from `eval/chat_benchmarks/MTBench` +- **alpaca_eval**: Installed as git subtree, requires separate pip install + +### Metadata & Results +- All evaluation results include extensive metadata: model config, seeds, git hash, hardware specs (GPU, CUDA), timing +- Custom JSON serialization handles large SymPy objects (>15,000 bits) via `handle_non_serializable_extended()` +- Results organized as `logs/ModelName/results_timestamp.json` + +### Model Compatibility +- **Supported model backends**: HuggingFace (hf), vLLM, OpenAI, Curator (LiteLLM wrapper for 100+ models) +- **Chat template support**: Essential for instruction-following benchmarks; auto-applied via `_prepare_messages()` +- **Seed handling**: Different APIs handle seeds differently; `_normalize_model_args()` abstracts this diff --git a/eval/chat_benchmarks/MATH500/eval_instruct.py b/eval/chat_benchmarks/MATH500/eval_instruct.py index eec964f0..d260d16f 100644 --- a/eval/chat_benchmarks/MATH500/eval_instruct.py +++ b/eval/chat_benchmarks/MATH500/eval_instruct.py @@ -2,6 +2,7 @@ import logging from typing import Any, Dict, List, Optional +import numpy as np import lm_eval.models from lm_eval.api.instance import Instance from lm_eval.api.model import LM @@ -30,6 +31,7 @@ def __init__( max_tokens: int = 32768, logger: Optional[logging.Logger] = None, system_instruction: Optional[str] = None, + n_repeat: int = 3, ): """ Initialize MATH500 benchmark. @@ -40,12 +42,14 @@ def __init__( seed: Random seed for reproducibility. Default is [0, 1234, 1234, 1234] for lm-eval-harness. logger: Optional logger instance system_instruction: Optional system instruction for the model + n_repeat: Number of times to repeat the evaluation. Default is 3. 
""" super().__init__(logger=logger, system_instruction=system_instruction) self.data_file = data_file self.debug = debug self.seed = seed self.max_new_tokens = max_tokens + self.n_repeat = n_repeat def generate_responses(self, model: LM) -> Dict[str, Any]: """ @@ -61,22 +65,26 @@ def generate_responses(self, model: LM) -> Dict[str, Any]: examples = self.load_questions() # Prepare instances for model - all_instances = [] + all_outputs = [] if isinstance(model, lm_eval.models.huggingface.HFLM): model_name = model.pretrained elif isinstance(model, lm_eval.models.openai_completions.OpenAIChatCompletion): model_name = str(f"openai/{model.model}") else: model_name = model.model_args["model"] - for idx, example in enumerate(examples): - messages = [ - {"role": "user", "content": PROMPT.format(problem=example["problem"])}, - ] - templated_messages = self._prepare_messages(messages, model) + for i in range(self.n_repeat): + all_instances = [] + seed = [s + i for s in self.seed] - all_instances.append( - Instance( + for idx, example in enumerate(examples): + messages = [ + {"role": "user", "content": PROMPT.format(problem=example["problem"])}, + ] + + templated_messages = self._prepare_messages(messages, model) + + instance = Instance( "generate_until", example, ( @@ -85,24 +93,33 @@ def generate_responses(self, model: LM) -> Dict[str, Any]: "do_sample": False, "max_new_tokens": self.max_new_tokens, "temperature": 0.7, - "seed": self.seed, + "seed": seed, }, ), idx, ) - ) - # Generate model responses - self.logger.info("Generating responses for MATH500...") - outputs = self.compute(model, all_instances) + # Add repetition information to instance metadata + instance.repeat_idx = i + instance.metadata = { + "problem_id": str(example["id"]) if "id" in example else str(idx), + "expected_answer": str(example["answer"]), + } + + all_instances.append(instance) + + # Generate model responses + self.logger.info(f"Generating responses for MATH500 (repetition {i+1}/{self.n_repeat})...") + outputs = self.compute(model, all_instances) + all_outputs.append(outputs) # Return None early for non-primary ranks if model.rank != 0: return None - for example, output in zip(examples, outputs): - example["model_output"] = output - example["model_answer"] = self.extract_answer(output) + for example, outputs in zip(examples, zip(*all_outputs)): + example["model_outputs"] = list(outputs) + example["model_answers"] = [self.extract_answer(o) for o in outputs] return {"examples": examples} @@ -114,14 +131,37 @@ def evaluate_responses(self, results: Dict[str, Any]) -> Dict[str, float]: return None examples = results["examples"] - total = len(examples) - solved = sum(is_equiv(str(example["answer"]), example["model_answer"]) for example in examples) + num_questions = len(examples) + + # Calculate accuracy for each repetition + all_results = [] + for i in range(self.n_repeat): + solved = sum( + [is_equiv(str(example["answer"]), str(example["model_answers"][i])) for example in examples] + ) + all_results.append( + { + "repetition": i + 1, + "num_total": num_questions, + "num_solved": solved, + "accuracy": solved / num_questions, + } + ) + + # Calculate overall statistics + solved_avg = np.mean([result["num_solved"] for result in all_results]) + accuracy_avg = np.mean([result["accuracy"] for result in all_results]) + accuracy_std = np.std([result["accuracy"] for result in all_results]) + accuracy_std_err = np.std([result["accuracy"] for result in all_results]) / np.sqrt(self.n_repeat) results.update( { - "num_total": total, - 
"num_solved": solved, - "accuracy": solved / total, + "num_total": num_questions, + "solved_avg": solved_avg, + "run_stats": all_results, + "accuracy_avg": accuracy_avg, + "accuracy_std_err": accuracy_std_err, + "num_repeat": self.n_repeat, } )