5 changes: 4 additions & 1 deletion benchmarks/runner.py
@@ -432,14 +432,17 @@ def _inference_rlm(self, sample: BenchmarkSample) -> tuple[str, dict[str, Any]]:
         if self.config.log_dir:
             logger = RLMLogger(log_dir=self.config.log_dir)
 
+        # Note: We always set RLM verbose=False to avoid its output interleaving
+        # with the benchmark progress bar. The runner's verbose flag controls
+        # per-sample result output instead.
         rlm = RLM(
             backend=self.config.backend,
             backend_kwargs=self.config.backend_kwargs,
             environment=self.config.environment,
             environment_kwargs=self.config.environment_kwargs,
             max_iterations=self.config.max_iterations,
             logger=logger,
-            verbose=self.config.verbose,
+            verbose=False,
         )
 
         result = rlm.completion(prompt=sample.context, root_prompt=sample.question)
87 changes: 87 additions & 0 deletions tests/benchmarks/test_benchmarks.py
@@ -477,6 +477,93 @@ def test_model_spec_str(self):
         assert str(spec) == "anthropic:claude-sonnet-4-20250514"
 
 
+class TestModelPropagation:
+    """Tests that model configuration is correctly passed through the system."""
+
+    def test_runner_config_model_name(self):
+        """Test that BenchmarkRunner correctly sets model_name in backend_kwargs."""
+        from benchmarks.runner import BenchmarkRunner
+
+        runner = BenchmarkRunner(
+            backend="openai",
+            model="my-test-model",
+            progress="none",
+        )
+
+        # Verify the config has the correct model_name in backend_kwargs
+        assert runner.config.backend == "openai"
+        assert runner.config.model == "my-test-model"
+        assert runner.config.backend_kwargs.get("model_name") == "my-test-model"
+
+    def test_runner_config_preserves_extra_kwargs(self):
+        """Test that extra backend_kwargs are preserved."""
+        from benchmarks.runner import BenchmarkRunner
+
+        runner = BenchmarkRunner(
+            backend="openai",
+            model="my-model",
+            progress="none",
+            backend_kwargs={"extra_param": "extra_value", "api_key": "test-key"},
+        )
+
+        # model_name from model param should be present
+        assert runner.config.backend_kwargs.get("model_name") == "my-model"
+        # Extra kwargs should be preserved
+        assert runner.config.backend_kwargs.get("extra_param") == "extra_value"
+        assert runner.config.backend_kwargs.get("api_key") == "test-key"
+
+    def test_model_spec_to_runner(self):
+        """Test end-to-end from model spec parsing to runner config."""
+        from benchmarks.cli import parse_model_spec
+        from benchmarks.runner import BenchmarkRunner
+
+        spec = parse_model_spec("anthropic:claude-3-opus")
+        runner = BenchmarkRunner(
+            backend=spec.backend,
+            model=spec.model,
+            progress="none",
+        )
+
+        assert runner.config.backend == "anthropic"
+        assert runner.config.model == "claude-3-opus"
+        assert runner.config.backend_kwargs.get("model_name") == "claude-3-opus"
+
+    def test_rlm_receives_correct_config(self):
+        """Test that RLM is initialized with correct backend_kwargs."""
+        from unittest.mock import MagicMock, patch
+
+        from benchmarks.runner import BenchmarkRunner
+        from benchmarks.tasks.niah import NIAHBenchmark
+
+        runner = BenchmarkRunner(
+            backend="openai",
+            model="gpt-4o-test",
+            progress="none",
+        )
+
+        # Mock RLM to capture the initialization arguments
+        captured_kwargs = {}
+
+        class MockRLM:
+            def __init__(self, **kwargs):
+                captured_kwargs.update(kwargs)
+                # Raise to exit early without actually running RLM
+                raise RuntimeError("Mock exit - RLM init captured")
+
+        # RLM is imported inside _inference_rlm, so patch at rlm module level
+        with patch("rlm.RLM", MockRLM):
+            benchmark = NIAHBenchmark(context_length=1000)
+            try:
+                runner.run(benchmark, method="rlm", num_samples=1, seed=42)
+            except RuntimeError as e:
+                if "Mock exit" not in str(e):
+                    raise
+
+        # Verify the backend_kwargs passed to RLM
+        assert captured_kwargs.get("backend") == "openai"
+        assert captured_kwargs.get("backend_kwargs") == {"model_name": "gpt-4o-test"}
+
+
 class TestBenchmarkIntegration:
     """Integration tests for benchmark framework."""
 
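For reference, a minimal sketch of the behaviour these tests pin down, assuming the benchmarks package is importable and that parse_model_spec and BenchmarkRunner behave as exercised above; the model string is illustrative, not taken from the diff:

# Hypothetical usage sketch, not part of the diff above.
from benchmarks.cli import parse_model_spec
from benchmarks.runner import BenchmarkRunner

spec = parse_model_spec("openai:gpt-4o")
runner = BenchmarkRunner(backend=spec.backend, model=spec.model, progress="none")

# The model name is forwarded into backend_kwargs, which is what
# _inference_rlm hands to RLM as backend_kwargs.
assert runner.config.backend_kwargs["model_name"] == "gpt-4o"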