157 changes: 108 additions & 49 deletions benchmarks/cli.py
@@ -3,21 +3,22 @@
CLI for running RLM benchmarks.

Usage:
# Run benchmarks
python -m benchmarks.cli run --benchmark oolong --num-samples 10
# Run benchmarks with model spec (backend:model format)
python -m benchmarks.cli run --benchmark niah --models openai:gpt-4o
python -m benchmarks.cli run --benchmark niah --models openai:gpt-4o anthropic:claude-sonnet-4-20250514

# Run with methods comparison
python -m benchmarks.cli run --benchmark niah --methods rlm direct

# Query stored results
python -m benchmarks.cli history --benchmark oolong-toy_dnd --limit 10
python -m benchmarks.cli compare --benchmark niah-100k --group-by method

# Legacy (no subcommand defaults to 'run')
python -m benchmarks.cli --benchmark oolong --num-samples 10
"""

import argparse
import json
import sys
from dataclasses import dataclass
from datetime import datetime

from benchmarks.base import BenchmarkResult
@@ -28,6 +29,37 @@
from benchmarks.tasks.oolong import OolongBenchmark, OolongPairsBenchmark


@dataclass
class ModelSpec:
"""Parsed model specification."""

backend: str
model: str

def __str__(self) -> str:
return f"{self.backend}:{self.model}"


def parse_model_spec(spec: str) -> ModelSpec:
"""Parse a backend:model specification string.

Formats supported:
- "backend:model" -> ModelSpec(backend, model)
- "model" -> ModelSpec("openai", model) # default backend

Examples:
- "openai:gpt-4o" -> ModelSpec("openai", "gpt-4o")
- "anthropic:claude-sonnet-4-20250514" -> ModelSpec("anthropic", "claude-sonnet-4-20250514")
- "gpt-4o" -> ModelSpec("openai", "gpt-4o")
"""
if ":" in spec:
parts = spec.split(":", 1)
return ModelSpec(backend=parts[0], model=parts[1])
else:
# Default to openai backend
return ModelSpec(backend="openai", model=spec)
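
A quick illustration (not part of the diff) of parse_model_spec, including the edge case where the model name itself contains a colon; because the spec is split with split(":", 1), only the first colon separates backend from model. The "custom:host:8080/llama-3" spec is an invented example:

from benchmarks.cli import parse_model_spec

parse_model_spec("openai:gpt-4o")             # ModelSpec(backend="openai", model="gpt-4o")
parse_model_spec("gpt-4o")                    # ModelSpec(backend="openai", model="gpt-4o")
parse_model_spec("custom:host:8080/llama-3")  # ModelSpec(backend="custom", model="host:8080/llama-3")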


def get_benchmark(args: argparse.Namespace):
"""Instantiate benchmark from CLI arguments."""
name = args.benchmark.lower()
@@ -278,17 +310,25 @@ def main():
default=None,
help="Output file path for results (JSON)",
)
run_parser.add_argument(
"--models",
nargs="+",
default=None,
help="Model specs as backend:model (e.g., openai:gpt-4o anthropic:claude-sonnet-4-20250514). "
"Can specify multiple for comparison. Default: openai/gpt-4o-mini",
)
# Legacy args for backwards compatibility
run_parser.add_argument(
"--backend",
type=str,
default="openai",
help="LLM backend (default: openai)",
default=None,
help="(Legacy) LLM backend. Prefer --models backend:model syntax.",
)
run_parser.add_argument(
"--model",
type=str,
default="gpt-5-mini",
help="Model name (default: gpt-5-mini)",
default=None,
help="(Legacy) Model name. Prefer --models backend:model syntax.",
)
run_parser.add_argument(
"--environment",
@@ -389,62 +429,81 @@ def main():
return 1


def get_model_specs(args: argparse.Namespace) -> list[ModelSpec]:
"""Get model specifications from args, handling legacy and new formats."""
if args.models:
# New format: --models backend:model [backend:model ...]
return [parse_model_spec(spec) for spec in args.models]
elif args.backend or args.model:
# Legacy format: --backend X --model Y
backend = args.backend or "openai"
model = args.model or "gpt-4o-mini"
return [ModelSpec(backend=backend, model=model)]
else:
# Default
return [ModelSpec(backend="openai", model="gpt-4o-mini")]
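
For reference, a sketch (not part of the diff) of how the three argument styles resolve under get_model_specs as written above:

# --models openai:gpt-4o anthropic:claude-sonnet-4-20250514
#     -> [ModelSpec("openai", "gpt-4o"), ModelSpec("anthropic", "claude-sonnet-4-20250514")]
# --backend anthropic --model claude-sonnet-4-20250514    (legacy; consulted only when --models is absent)
#     -> [ModelSpec("anthropic", "claude-sonnet-4-20250514")]
# no model flags at all
#     -> [ModelSpec("openai", "gpt-4o-mini")]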


def cmd_run(args: argparse.Namespace) -> int:
"""Run benchmarks (main command)."""
runner = BenchmarkRunner(
backend=args.backend,
model=args.model,
environment=args.environment,
max_iterations=args.max_iterations,
verbose=args.verbose,
log_dir=args.log_dir,
max_workers=args.max_workers,
progress=args.progress,
)
model_specs = get_model_specs(args)

if args.benchmark == "all":
benchmarks = get_all_benchmarks(args)
else:
benchmarks = [get_benchmark(args)]

all_results = {}
store = ResultsStore(args.results_dir)

for benchmark in benchmarks:
print(f"\n{'=' * 60}")
print(f"Running: {benchmark.name}")
print(f"Description: {benchmark.description}")
print(f"{'=' * 60}")

results = compare_methods(
benchmark=benchmark,
runner=runner,
methods=args.methods,
num_samples=args.num_samples,
seed=args.seed,
for model_spec in model_specs:
print(f"\n{'=' * 70}")
print(f"Model: {model_spec}")
print(f"{'=' * 70}")

runner = BenchmarkRunner(
backend=model_spec.backend,
model=model_spec.model,
environment=args.environment,
max_iterations=args.max_iterations,
verbose=args.verbose,
log_dir=args.log_dir,
max_workers=args.max_workers,
progress=args.progress,
)

for method, result in results.items():
key = f"{benchmark.name}/{method}"
all_results[key] = result
for benchmark in benchmarks:
print(f"\nRunning: {benchmark.name}")
print(f"Description: {benchmark.description}")

results = compare_methods(
benchmark=benchmark,
runner=runner,
methods=args.methods,
num_samples=args.num_samples,
seed=args.seed,
)

for method, result in results.items():
key = f"{model_spec}/{benchmark.name}/{method}"
all_results[key] = result

# Save to results store
config = ExperimentConfig(
backend=model_spec.backend,
model=model_spec.model,
environment=args.environment,
max_iterations=args.max_iterations,
num_samples=args.num_samples,
seed=args.seed,
method=method,
)
exp_id = store.save(result, config)
print(f" Saved as experiment {exp_id}")

# Print summary
print_summary(all_results)

# Save to results store
store = ResultsStore(args.results_dir)
config = ExperimentConfig(
backend=args.backend,
model=args.model,
environment=args.environment,
max_iterations=args.max_iterations,
num_samples=args.num_samples,
seed=args.seed,
)

for key, result in all_results.items():
exp_id = store.save(result, config)
print(f"Saved {key} as experiment {exp_id}")

if args.output:
save_results(all_results, args.output)
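
As a reading aid (not from the PR), the keys produced by the nested model/benchmark/method loops above combine all three parts; the benchmark and method names below are illustrative:

# all_results keys (and saved experiments), e.g.:
#   "openai:gpt-4o/niah-100k/rlm"
#   "openai:gpt-4o/niah-100k/direct"
#   "anthropic:claude-sonnet-4-20250514/niah-100k/rlm"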

23 changes: 21 additions & 2 deletions rlm/core/lm_handler.py
@@ -39,9 +39,16 @@ def handle(self):

socket_send(self.connection, response.to_dict())

except (BrokenPipeError, ConnectionResetError):
# Client disconnected - this is expected, silently ignore
pass
except Exception as e:
response = LMResponse.error_response(str(e))
socket_send(self.connection, response.to_dict())
try:
response = LMResponse.error_response(str(e))
socket_send(self.connection, response.to_dict())
except (BrokenPipeError, ConnectionResetError):
# Client disconnected while we tried to send error
pass

def _handle_single(self, request: LMRequest, handler: "LMHandler") -> LMResponse:
"""Handle a single prompt request."""
@@ -98,6 +105,18 @@ class ThreadingLMServer(ThreadingTCPServer):
daemon_threads = True
allow_reuse_address = True

def handle_error(self, request, client_address):
"""Suppress expected errors when clients disconnect."""
import sys

exc_type, exc_value, _ = sys.exc_info()
# BrokenPipeError and ConnectionResetError are expected when
# subprocess environments disconnect after getting their response
if exc_type in (BrokenPipeError, ConnectionResetError):
return # Silently ignore
# For other errors, use default behavior (print to stderr)
super().handle_error(request, client_address)
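
A minimal, self-contained sketch (not from the PR) of the same suppression pattern on a plain socketserver.ThreadingTCPServer; EchoHandler and QuietServer are invented names for illustration:

import socketserver
import sys


class EchoHandler(socketserver.StreamRequestHandler):
    def handle(self):
        data = self.rfile.readline()
        # Raises BrokenPipeError if the client has already closed its socket.
        self.wfile.write(data)


class QuietServer(socketserver.ThreadingTCPServer):
    daemon_threads = True
    allow_reuse_address = True

    def handle_error(self, request, client_address):
        # socketserver invokes handle_error from inside its own `except` block,
        # so sys.exc_info() still reports the active exception here.
        exc_type, _, _ = sys.exc_info()
        if exc_type in (BrokenPipeError, ConnectionResetError):
            return  # expected: the client hung up early
        super().handle_error(request, client_address)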


class LMHandler:
"""
31 changes: 31 additions & 0 deletions tests/benchmarks/test_benchmarks.py
@@ -446,6 +446,37 @@ def test_simple_progress_mode(self, capsys):
assert "Acc:" in captured.out


class TestModelSpec:
"""Tests for model specification parsing."""

def test_parse_model_spec_with_backend(self):
"""Test parsing backend:model format."""
from benchmarks.cli import parse_model_spec

spec = parse_model_spec("openai:gpt-4o")
assert spec.backend == "openai"
assert spec.model == "gpt-4o"

spec = parse_model_spec("anthropic:claude-sonnet-4-20250514")
assert spec.backend == "anthropic"
assert spec.model == "claude-sonnet-4-20250514"

def test_parse_model_spec_model_only(self):
"""Test parsing model-only format defaults to openai."""
from benchmarks.cli import parse_model_spec

spec = parse_model_spec("gpt-4o")
assert spec.backend == "openai"
assert spec.model == "gpt-4o"

def test_model_spec_str(self):
"""Test ModelSpec string representation."""
from benchmarks.cli import ModelSpec

spec = ModelSpec(backend="anthropic", model="claude-sonnet-4-20250514")
assert str(spec) == "anthropic:claude-sonnet-4-20250514"


class TestBenchmarkIntegration:
"""Integration tests for benchmark framework."""
