157 changes: 108 additions & 49 deletions benchmarks/cli.py
@@ -3,21 +3,22 @@
CLI for running RLM benchmarks.

Usage:
# Run benchmarks
python -m benchmarks.cli run --benchmark oolong --num-samples 10
# Run benchmarks with model spec (backend:model format)
python -m benchmarks.cli run --benchmark niah --models openai:gpt-4o
python -m benchmarks.cli run --benchmark niah --models openai:gpt-4o anthropic:claude-sonnet-4-20250514

# Run with methods comparison
python -m benchmarks.cli run --benchmark niah --methods rlm direct

# Query stored results
python -m benchmarks.cli history --benchmark oolong-toy_dnd --limit 10
python -m benchmarks.cli compare --benchmark niah-100k --group-by method

# Legacy (no subcommand defaults to 'run')
python -m benchmarks.cli --benchmark oolong --num-samples 10
"""

import argparse
import json
import sys
from dataclasses import dataclass
from datetime import datetime

from benchmarks.base import BenchmarkResult
@@ -28,6 +29,37 @@
from benchmarks.tasks.oolong import OolongBenchmark, OolongPairsBenchmark


@dataclass
class ModelSpec:
"""Parsed model specification."""

backend: str
model: str

def __str__(self) -> str:
return f"{self.backend}:{self.model}"


def parse_model_spec(spec: str) -> ModelSpec:
"""Parse a backend:model specification string.

Formats supported:
- "backend:model" -> ModelSpec(backend, model)
- "model" -> ModelSpec("openai", model) # default backend

Examples:
- "openai:gpt-4o" -> ModelSpec("openai", "gpt-4o")
- "anthropic:claude-sonnet-4-20250514" -> ModelSpec("anthropic", "claude-sonnet-4-20250514")
- "gpt-4o" -> ModelSpec("openai", "gpt-4o")
"""
if ":" in spec:
parts = spec.split(":", 1)
return ModelSpec(backend=parts[0], model=parts[1])
else:
# Default to openai backend
return ModelSpec(backend="openai", model=spec)
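
A quick illustration (not part of the diff) of parse_model_spec, including the edge case where the model name itself contains a colon; because the spec is split with split(":", 1), only the first colon separates backend from model. The "custom:host:8080/llama-3" spec is an invented example:

from benchmarks.cli import parse_model_spec

parse_model_spec("openai:gpt-4o")             # ModelSpec(backend="openai", model="gpt-4o")
parse_model_spec("gpt-4o")                    # ModelSpec(backend="openai", model="gpt-4o")
parse_model_spec("custom:host:8080/llama-3")  # ModelSpec(backend="custom", model="host:8080/llama-3")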


def get_benchmark(args: argparse.Namespace):
"""Instantiate benchmark from CLI arguments."""
name = args.benchmark.lower()
@@ -278,17 +310,25 @@ def main():
default=None,
help="Output file path for results (JSON)",
)
run_parser.add_argument(
"--models",
nargs="+",
default=None,
help="Model specs as backend:model (e.g., openai:gpt-4o anthropic:claude-sonnet-4-20250514). "
"Can specify multiple for comparison. Default: openai/gpt-4o-mini",
)
# Legacy args for backwards compatibility
run_parser.add_argument(
"--backend",
type=str,
default="openai",
help="LLM backend (default: openai)",
default=None,
help="(Legacy) LLM backend. Prefer --models backend:model syntax.",
)
run_parser.add_argument(
"--model",
type=str,
default="gpt-5-mini",
help="Model name (default: gpt-5-mini)",
default=None,
help="(Legacy) Model name. Prefer --models backend:model syntax.",
)
run_parser.add_argument(
"--environment",
@@ -389,62 +429,81 @@ def main():
return 1


def get_model_specs(args: argparse.Namespace) -> list[ModelSpec]:
"""Get model specifications from args, handling legacy and new formats."""
if args.models:
# New format: --models backend:model [backend:model ...]
return [parse_model_spec(spec) for spec in args.models]
elif args.backend or args.model:
# Legacy format: --backend X --model Y
backend = args.backend or "openai"
model = args.model or "gpt-4o-mini"
return [ModelSpec(backend=backend, model=model)]
else:
# Default
return [ModelSpec(backend="openai", model="gpt-4o-mini")]
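
For reference, a sketch (not part of the diff) of how the three argument styles resolve under get_model_specs as written above:

# --models openai:gpt-4o anthropic:claude-sonnet-4-20250514
#     -> [ModelSpec("openai", "gpt-4o"), ModelSpec("anthropic", "claude-sonnet-4-20250514")]
# --backend anthropic --model claude-sonnet-4-20250514    (legacy; consulted only when --models is absent)
#     -> [ModelSpec("anthropic", "claude-sonnet-4-20250514")]
# no model flags at all
#     -> [ModelSpec("openai", "gpt-4o-mini")]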


def cmd_run(args: argparse.Namespace) -> int:
"""Run benchmarks (main command)."""
runner = BenchmarkRunner(
backend=args.backend,
model=args.model,
environment=args.environment,
max_iterations=args.max_iterations,
verbose=args.verbose,
log_dir=args.log_dir,
max_workers=args.max_workers,
progress=args.progress,
)
model_specs = get_model_specs(args)

if args.benchmark == "all":
benchmarks = get_all_benchmarks(args)
else:
benchmarks = [get_benchmark(args)]

all_results = {}
store = ResultsStore(args.results_dir)

for benchmark in benchmarks:
print(f"\n{'=' * 60}")
print(f"Running: {benchmark.name}")
print(f"Description: {benchmark.description}")
print(f"{'=' * 60}")

results = compare_methods(
benchmark=benchmark,
runner=runner,
methods=args.methods,
num_samples=args.num_samples,
seed=args.seed,
for model_spec in model_specs:
print(f"\n{'=' * 70}")
print(f"Model: {model_spec}")
print(f"{'=' * 70}")

runner = BenchmarkRunner(
backend=model_spec.backend,
model=model_spec.model,
environment=args.environment,
max_iterations=args.max_iterations,
verbose=args.verbose,
log_dir=args.log_dir,
max_workers=args.max_workers,
progress=args.progress,
)

for method, result in results.items():
key = f"{benchmark.name}/{method}"
all_results[key] = result
for benchmark in benchmarks:
print(f"\nRunning: {benchmark.name}")
print(f"Description: {benchmark.description}")

results = compare_methods(
benchmark=benchmark,
runner=runner,
methods=args.methods,
num_samples=args.num_samples,
seed=args.seed,
)

for method, result in results.items():
key = f"{model_spec}/{benchmark.name}/{method}"
all_results[key] = result

# Save to results store
config = ExperimentConfig(
backend=model_spec.backend,
model=model_spec.model,
environment=args.environment,
max_iterations=args.max_iterations,
num_samples=args.num_samples,
seed=args.seed,
method=method,
)
exp_id = store.save(result, config)
print(f" Saved as experiment {exp_id}")

# Print summary
print_summary(all_results)

# Save to results store
store = ResultsStore(args.results_dir)
config = ExperimentConfig(
backend=args.backend,
model=args.model,
environment=args.environment,
max_iterations=args.max_iterations,
num_samples=args.num_samples,
seed=args.seed,
)

for key, result in all_results.items():
exp_id = store.save(result, config)
print(f"Saved {key} as experiment {exp_id}")

if args.output:
save_results(all_results, args.output)
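
As a reading aid (not from the PR), the keys produced by the nested model/benchmark/method loops above combine all three parts; the benchmark and method names below are illustrative:

# all_results keys (and saved experiments), e.g.:
#   "openai:gpt-4o/niah-100k/rlm"
#   "openai:gpt-4o/niah-100k/direct"
#   "anthropic:claude-sonnet-4-20250514/niah-100k/rlm"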

23 changes: 21 additions & 2 deletions rlm/core/lm_handler.py
@@ -39,9 +39,16 @@ def handle(self):

socket_send(self.connection, response.to_dict())

except (BrokenPipeError, ConnectionResetError):
# Client disconnected - this is expected, silently ignore
pass
except Exception as e:
response = LMResponse.error_response(str(e))
socket_send(self.connection, response.to_dict())
try:
response = LMResponse.error_response(str(e))
socket_send(self.connection, response.to_dict())
except (BrokenPipeError, ConnectionResetError):
# Client disconnected while we tried to send error
pass

def _handle_single(self, request: LMRequest, handler: "LMHandler") -> LMResponse:
"""Handle a single prompt request."""
@@ -98,6 +105,18 @@ class ThreadingLMServer(ThreadingTCPServer):
daemon_threads = True
allow_reuse_address = True

def handle_error(self, request, client_address):
"""Suppress expected errors when clients disconnect."""
import sys

exc_type, exc_value, _ = sys.exc_info()
# BrokenPipeError and ConnectionResetError are expected when
# subprocess environments disconnect after getting their response
if exc_type in (BrokenPipeError, ConnectionResetError):
return # Silently ignore
# For other errors, use default behavior (print to stderr)
super().handle_error(request, client_address)
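
A minimal, self-contained sketch (not from the PR) of the same suppression pattern on a plain socketserver.ThreadingTCPServer; EchoHandler and QuietServer are invented names for illustration:

import socketserver
import sys


class EchoHandler(socketserver.StreamRequestHandler):
    def handle(self):
        data = self.rfile.readline()
        # Raises BrokenPipeError if the client has already closed its socket.
        self.wfile.write(data)


class QuietServer(socketserver.ThreadingTCPServer):
    daemon_threads = True
    allow_reuse_address = True

    def handle_error(self, request, client_address):
        # socketserver invokes handle_error from inside its own `except` block,
        # so sys.exc_info() still reports the active exception here.
        exc_type, _, _ = sys.exc_info()
        if exc_type in (BrokenPipeError, ConnectionResetError):
            return  # expected: the client hung up early
        super().handle_error(request, client_address)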


class LMHandler:
"""
31 changes: 31 additions & 0 deletions tests/benchmarks/test_benchmarks.py
@@ -446,6 +446,37 @@ def test_simple_progress_mode(self, capsys):
assert "Acc:" in captured.out


class TestModelSpec:
"""Tests for model specification parsing."""

def test_parse_model_spec_with_backend(self):
"""Test parsing backend:model format."""
from benchmarks.cli import parse_model_spec

spec = parse_model_spec("openai:gpt-4o")
assert spec.backend == "openai"
assert spec.model == "gpt-4o"

spec = parse_model_spec("anthropic:claude-sonnet-4-20250514")
assert spec.backend == "anthropic"
assert spec.model == "claude-sonnet-4-20250514"

def test_parse_model_spec_model_only(self):
"""Test parsing model-only format defaults to openai."""
from benchmarks.cli import parse_model_spec

spec = parse_model_spec("gpt-4o")
assert spec.backend == "openai"
assert spec.model == "gpt-4o"

def test_model_spec_str(self):
"""Test ModelSpec string representation."""
from benchmarks.cli import ModelSpec

spec = ModelSpec(backend="anthropic", model="claude-sonnet-4-20250514")
assert str(spec) == "anthropic:claude-sonnet-4-20250514"


class TestBenchmarkIntegration:
"""Integration tests for benchmark framework."""
