diff --git a/benchmarks/cli.py b/benchmarks/cli.py
index d5fca94..11e3023 100644
--- a/benchmarks/cli.py
+++ b/benchmarks/cli.py
@@ -3,21 +3,22 @@
 CLI for running RLM benchmarks.
 
 Usage:
-    # Run benchmarks
-    python -m benchmarks.cli run --benchmark oolong --num-samples 10
+    # Run benchmarks with model spec (backend:model format)
+    python -m benchmarks.cli run --benchmark niah --models openai:gpt-4o
+    python -m benchmarks.cli run --benchmark niah --models openai:gpt-4o anthropic:claude-sonnet-4-20250514
+
+    # Run with methods comparison
     python -m benchmarks.cli run --benchmark niah --methods rlm direct
 
     # Query stored results
     python -m benchmarks.cli history --benchmark oolong-toy_dnd --limit 10
     python -m benchmarks.cli compare --benchmark niah-100k --group-by method
-
-    # Legacy (no subcommand defaults to 'run')
-    python -m benchmarks.cli --benchmark oolong --num-samples 10
 """
 
 import argparse
 import json
 import sys
+from dataclasses import dataclass
 from datetime import datetime
 
 from benchmarks.base import BenchmarkResult
@@ -28,6 +29,37 @@ from benchmarks.tasks.oolong import OolongBenchmark, OolongPairsBenchmark
 
 
+@dataclass
+class ModelSpec:
+    """Parsed model specification."""
+
+    backend: str
+    model: str
+
+    def __str__(self) -> str:
+        return f"{self.backend}:{self.model}"
+
+
+def parse_model_spec(spec: str) -> ModelSpec:
+    """Parse a backend:model specification string.
+
+    Formats supported:
+    - "backend:model" -> ModelSpec(backend, model)
+    - "model" -> ModelSpec("openai", model)  # default backend
+
+    Examples:
+    - "openai:gpt-4o" -> ModelSpec("openai", "gpt-4o")
+    - "anthropic:claude-sonnet-4-20250514" -> ModelSpec("anthropic", "claude-sonnet-4-20250514")
+    - "gpt-4o" -> ModelSpec("openai", "gpt-4o")
+    """
+    if ":" in spec:
+        parts = spec.split(":", 1)
+        return ModelSpec(backend=parts[0], model=parts[1])
+    else:
+        # Default to openai backend
+        return ModelSpec(backend="openai", model=spec)
+
+
 def get_benchmark(args: argparse.Namespace):
     """Instantiate benchmark from CLI arguments."""
     name = args.benchmark.lower()
@@ -278,17 +310,25 @@ def main():
         default=None,
         help="Output file path for results (JSON)",
     )
+    run_parser.add_argument(
+        "--models",
+        nargs="+",
+        default=None,
+        help="Model specs as backend:model (e.g., openai:gpt-4o anthropic:claude-sonnet-4-20250514). "
+        "Can specify multiple for comparison. Default: openai:gpt-4o-mini",
+    )
+    # Legacy args for backwards compatibility
     run_parser.add_argument(
         "--backend",
         type=str,
-        default="openai",
-        help="LLM backend (default: openai)",
+        default=None,
+        help="(Legacy) LLM backend. Prefer --models backend:model syntax.",
     )
     run_parser.add_argument(
         "--model",
         type=str,
-        default="gpt-5-mini",
-        help="Model name (default: gpt-5-mini)",
+        default=None,
+        help="(Legacy) Model name. Prefer --models backend:model syntax.",
     )
     run_parser.add_argument(
         "--environment",
@@ -389,18 +429,24 @@ def main():
     return 1
 
 
+def get_model_specs(args: argparse.Namespace) -> list[ModelSpec]:
+    """Get model specifications from args, handling legacy and new formats."""
+    if args.models:
+        # New format: --models backend:model [backend:model ...]
+        return [parse_model_spec(spec) for spec in args.models]
+    elif args.backend or args.model:
+        # Legacy format: --backend X --model Y
+        backend = args.backend or "openai"
+        model = args.model or "gpt-4o-mini"
+        return [ModelSpec(backend=backend, model=model)]
+    else:
+        # Default
+        return [ModelSpec(backend="openai", model="gpt-4o-mini")]
+
+
 def cmd_run(args: argparse.Namespace) -> int:
     """Run benchmarks (main command)."""
-    runner = BenchmarkRunner(
-        backend=args.backend,
-        model=args.model,
-        environment=args.environment,
-        max_iterations=args.max_iterations,
-        verbose=args.verbose,
-        log_dir=args.log_dir,
-        max_workers=args.max_workers,
-        progress=args.progress,
-    )
+    model_specs = get_model_specs(args)
 
     if args.benchmark == "all":
         benchmarks = get_all_benchmarks(args)
@@ -408,43 +454,56 @@ def cmd_run(args: argparse.Namespace) -> int:
         benchmarks = [get_benchmark(args)]
 
     all_results = {}
+    store = ResultsStore(args.results_dir)
 
-    for benchmark in benchmarks:
-        print(f"\n{'=' * 60}")
-        print(f"Running: {benchmark.name}")
-        print(f"Description: {benchmark.description}")
-        print(f"{'=' * 60}")
-
-        results = compare_methods(
-            benchmark=benchmark,
-            runner=runner,
-            methods=args.methods,
-            num_samples=args.num_samples,
-            seed=args.seed,
+    for model_spec in model_specs:
+        print(f"\n{'=' * 70}")
+        print(f"Model: {model_spec}")
+        print(f"{'=' * 70}")
+
+        runner = BenchmarkRunner(
+            backend=model_spec.backend,
+            model=model_spec.model,
+            environment=args.environment,
+            max_iterations=args.max_iterations,
+            verbose=args.verbose,
+            log_dir=args.log_dir,
+            max_workers=args.max_workers,
+            progress=args.progress,
         )
 
-        for method, result in results.items():
-            key = f"{benchmark.name}/{method}"
-            all_results[key] = result
+        for benchmark in benchmarks:
+            print(f"\nRunning: {benchmark.name}")
+            print(f"Description: {benchmark.description}")
+
+            results = compare_methods(
+                benchmark=benchmark,
+                runner=runner,
+                methods=args.methods,
+                num_samples=args.num_samples,
+                seed=args.seed,
+            )
+
+            for method, result in results.items():
+                key = f"{model_spec}/{benchmark.name}/{method}"
+                all_results[key] = result
+
+                # Save to results store
+                config = ExperimentConfig(
+                    backend=model_spec.backend,
+                    model=model_spec.model,
+                    environment=args.environment,
+                    max_iterations=args.max_iterations,
+                    num_samples=args.num_samples,
+                    seed=args.seed,
+                    method=method,
+                )
+                exp_id = store.save(result, config)
+                print(f"  Saved as experiment {exp_id}")
 
     # Print summary
     print_summary(all_results)
 
-    # Save to results store
-    store = ResultsStore(args.results_dir)
-    config = ExperimentConfig(
-        backend=args.backend,
-        model=args.model,
-        environment=args.environment,
-        max_iterations=args.max_iterations,
-        num_samples=args.num_samples,
-        seed=args.seed,
-    )
-
-    for key, result in all_results.items():
-        exp_id = store.save(result, config)
-        print(f"Saved {key} as experiment {exp_id}")
-
     if args.output:
         save_results(all_results, args.output)
 
diff --git a/rlm/core/lm_handler.py b/rlm/core/lm_handler.py
index 2ce7d89..bf8c4c1 100644
--- a/rlm/core/lm_handler.py
+++ b/rlm/core/lm_handler.py
@@ -39,9 +39,16 @@ def handle(self):
             socket_send(self.connection, response.to_dict())
 
+        except (BrokenPipeError, ConnectionResetError):
+            # Client disconnected - this is expected, silently ignore
+            pass
         except Exception as e:
-            response = LMResponse.error_response(str(e))
-            socket_send(self.connection, response.to_dict())
+            try:
+                response = LMResponse.error_response(str(e))
+                socket_send(self.connection, response.to_dict())
+            except (BrokenPipeError, ConnectionResetError):
+                # Client disconnected while we tried to send error
+                pass
 
     def _handle_single(self, request: LMRequest, handler: "LMHandler") -> LMResponse:
         """Handle a single prompt request."""
@@ -98,6 +105,18 @@ class ThreadingLMServer(ThreadingTCPServer):
     daemon_threads = True
     allow_reuse_address = True
 
+    def handle_error(self, request, client_address):
+        """Suppress expected errors when clients disconnect."""
+        import sys
+
+        exc_type, exc_value, _ = sys.exc_info()
+        # BrokenPipeError and ConnectionResetError are expected when
+        # subprocess environments disconnect after getting their response
+        if exc_type in (BrokenPipeError, ConnectionResetError):
+            return  # Silently ignore
+        # For other errors, use default behavior (print to stderr)
+        super().handle_error(request, client_address)
+
 
 class LMHandler:
     """
diff --git a/tests/benchmarks/test_benchmarks.py b/tests/benchmarks/test_benchmarks.py
index 75d9653..de7b201 100644
--- a/tests/benchmarks/test_benchmarks.py
+++ b/tests/benchmarks/test_benchmarks.py
@@ -446,6 +446,37 @@ def test_simple_progress_mode(self, capsys):
         assert "Acc:" in captured.out
 
 
+class TestModelSpec:
+    """Tests for model specification parsing."""
+
+    def test_parse_model_spec_with_backend(self):
+        """Test parsing backend:model format."""
+        from benchmarks.cli import parse_model_spec
+
+        spec = parse_model_spec("openai:gpt-4o")
+        assert spec.backend == "openai"
+        assert spec.model == "gpt-4o"
+
+        spec = parse_model_spec("anthropic:claude-sonnet-4-20250514")
+        assert spec.backend == "anthropic"
+        assert spec.model == "claude-sonnet-4-20250514"
+
+    def test_parse_model_spec_model_only(self):
+        """Test parsing model-only format defaults to openai."""
+        from benchmarks.cli import parse_model_spec
+
+        spec = parse_model_spec("gpt-4o")
+        assert spec.backend == "openai"
+        assert spec.model == "gpt-4o"
+
+    def test_model_spec_str(self):
+        """Test ModelSpec string representation."""
+        from benchmarks.cli import ModelSpec
+
+        spec = ModelSpec(backend="anthropic", model="claude-sonnet-4-20250514")
+        assert str(spec) == "anthropic:claude-sonnet-4-20250514"
+
+
 class TestBenchmarkIntegration:
     """Integration tests for benchmark framework."""
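
Reviewer note (not part of the patch): a minimal sketch of how the new spec parsing and the --models precedence are expected to behave, assuming benchmarks.cli exposes ModelSpec, parse_model_spec, and get_model_specs exactly as added above. The argparse.Namespace objects below are hypothetical stand-ins for parsed CLI arguments.

import argparse

from benchmarks.cli import ModelSpec, get_model_specs, parse_model_spec

# "backend:model" splits on the first colon; a bare model name defaults to openai.
assert parse_model_spec("anthropic:claude-sonnet-4-20250514") == ModelSpec(
    backend="anthropic", model="claude-sonnet-4-20250514"
)
assert parse_model_spec("gpt-4o") == ModelSpec(backend="openai", model="gpt-4o")

# --models takes precedence over the legacy flags; the legacy path falls back to
# openai / gpt-4o-mini for whichever half is missing.
new_style = argparse.Namespace(models=["openai:gpt-4o"], backend=None, model=None)
legacy = argparse.Namespace(models=None, backend="anthropic", model=None)
assert get_model_specs(new_style) == [ModelSpec(backend="openai", model="gpt-4o")]
assert get_model_specs(legacy) == [ModelSpec(backend="anthropic", model="gpt-4o-mini")]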