From 8ca1b7055e55ec2212b22152ebc103d8bb6062a4 Mon Sep 17 00:00:00 2001
From: Claude
Date: Fri, 16 Jan 2026 21:53:43 +0000
Subject: [PATCH 1/2] fix: suppress RLM verbose output during benchmark runs
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Always set RLM verbose=False in benchmark runner to prevent RLM's
configuration box output from interleaving with the progress bar.

The runner's -v flag now only controls per-sample result output:

    [✓] Sample niah-0000: {'correct': 1.0, 'f1': 1.0}

This keeps benchmark output clean while still allowing detailed
per-sample visibility when requested.
---
 benchmarks/runner.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/benchmarks/runner.py b/benchmarks/runner.py
index 5b837ab..a7b9700 100644
--- a/benchmarks/runner.py
+++ b/benchmarks/runner.py
@@ -432,6 +432,9 @@ def _inference_rlm(self, sample: BenchmarkSample) -> tuple[str, dict[str, Any]]:
         if self.config.log_dir:
             logger = RLMLogger(log_dir=self.config.log_dir)
 
+        # Note: We always set RLM verbose=False to avoid its output interleaving
+        # with the benchmark progress bar. The runner's verbose flag controls
+        # per-sample result output instead.
         rlm = RLM(
             backend=self.config.backend,
             backend_kwargs=self.config.backend_kwargs,
@@ -439,7 +442,7 @@ def _inference_rlm(self, sample: BenchmarkSample) -> tuple[str, dict[str, Any]]:
             environment_kwargs=self.config.environment_kwargs,
             max_iterations=self.config.max_iterations,
             logger=logger,
-            verbose=self.config.verbose,
+            verbose=False,
         )
 
         result = rlm.completion(prompt=sample.context, root_prompt=sample.question)

From 84a05bd24ac4cb225b2672824c4c32f291921d41 Mon Sep 17 00:00:00 2001
From: Claude
Date: Fri, 16 Jan 2026 22:12:09 +0000
Subject: [PATCH 2/2] test: add model propagation tests to verify configuration flow

Adds TestModelPropagation test class with 4 tests:

- test_runner_config_model_name: verifies model_name in backend_kwargs
- test_runner_config_preserves_extra_kwargs: verifies extra kwargs preserved
- test_model_spec_to_runner: end-to-end model spec parsing test
- test_rlm_receives_correct_config: mocked test verifying RLM receives correct backend_kwargs

These tests confirm that the backend:model CLI syntax correctly propagates
model configuration through BenchmarkRunner to RLM initialization.
---
 tests/benchmarks/test_benchmarks.py | 87 +++++++++++++++++++++++++++++
 1 file changed, 87 insertions(+)

diff --git a/tests/benchmarks/test_benchmarks.py b/tests/benchmarks/test_benchmarks.py
index de7b201..908765d 100644
--- a/tests/benchmarks/test_benchmarks.py
+++ b/tests/benchmarks/test_benchmarks.py
@@ -477,6 +477,93 @@ def test_model_spec_str(self):
         assert str(spec) == "anthropic:claude-sonnet-4-20250514"
 
 
+class TestModelPropagation:
+    """Tests that model configuration is correctly passed through the system."""
+
+    def test_runner_config_model_name(self):
+        """Test that BenchmarkRunner correctly sets model_name in backend_kwargs."""
+        from benchmarks.runner import BenchmarkRunner
+
+        runner = BenchmarkRunner(
+            backend="openai",
+            model="my-test-model",
+            progress="none",
+        )
+
+        # Verify the config has the correct model_name in backend_kwargs
+        assert runner.config.backend == "openai"
+        assert runner.config.model == "my-test-model"
+        assert runner.config.backend_kwargs.get("model_name") == "my-test-model"
+
+    def test_runner_config_preserves_extra_kwargs(self):
+        """Test that extra backend_kwargs are preserved."""
+        from benchmarks.runner import BenchmarkRunner
+
+        runner = BenchmarkRunner(
+            backend="openai",
+            model="my-model",
+            progress="none",
+            backend_kwargs={"extra_param": "extra_value", "api_key": "test-key"},
+        )
+
+        # model_name from model param should be present
+        assert runner.config.backend_kwargs.get("model_name") == "my-model"
+        # Extra kwargs should be preserved
+        assert runner.config.backend_kwargs.get("extra_param") == "extra_value"
+        assert runner.config.backend_kwargs.get("api_key") == "test-key"
+
+    def test_model_spec_to_runner(self):
+        """Test end-to-end from model spec parsing to runner config."""
+        from benchmarks.cli import parse_model_spec
+        from benchmarks.runner import BenchmarkRunner
+
+        spec = parse_model_spec("anthropic:claude-3-opus")
+        runner = BenchmarkRunner(
+            backend=spec.backend,
+            model=spec.model,
+            progress="none",
+        )
+
+        assert runner.config.backend == "anthropic"
+        assert runner.config.model == "claude-3-opus"
+        assert runner.config.backend_kwargs.get("model_name") == "claude-3-opus"
+
+    def test_rlm_receives_correct_config(self):
+        """Test that RLM is initialized with correct backend_kwargs."""
+        from unittest.mock import MagicMock, patch
+
+        from benchmarks.runner import BenchmarkRunner
+        from benchmarks.tasks.niah import NIAHBenchmark
+
+        runner = BenchmarkRunner(
+            backend="openai",
+            model="gpt-4o-test",
+            progress="none",
+        )
+
+        # Mock RLM to capture the initialization arguments
+        captured_kwargs = {}
+
+        class MockRLM:
+            def __init__(self, **kwargs):
+                captured_kwargs.update(kwargs)
+                # Raise to exit early without actually running RLM
+                raise RuntimeError("Mock exit - RLM init captured")
+
+        # RLM is imported inside _inference_rlm, so patch at rlm module level
+        with patch("rlm.RLM", MockRLM):
+            benchmark = NIAHBenchmark(context_length=1000)
+            try:
+                runner.run(benchmark, method="rlm", num_samples=1, seed=42)
+            except RuntimeError as e:
+                if "Mock exit" not in str(e):
+                    raise
+
+        # Verify the backend_kwargs passed to RLM
+        assert captured_kwargs.get("backend") == "openai"
+        assert captured_kwargs.get("backend_kwargs") == {"model_name": "gpt-4o-test"}
+
+
 class TestBenchmarkIntegration:
     """Integration tests for benchmark framework."""
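
Note for reviewers: the new tests should be runnable in isolation via pytest's
node-id selection, assuming the repository's standard pytest setup (the file
path follows the diff above):

    pytest tests/benchmarks/test_benchmarks.py::TestModelPropagation -v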