From 5108867c35ddfbf9224eb20f4344a1096d2d4f2e Mon Sep 17 00:00:00 2001 From: Grey Newell Date: Wed, 25 Feb 2026 16:30:59 -0500 Subject: [PATCH 01/14] feat: add SWE-bench Pro benchmark and preflight check system Add multi-language benchmark support (Python, Go, TypeScript, JavaScript) with 731 instances from ScaleAI/SWE-bench_Pro, plus a preflight validation system that verifies golden patches pass all tests before agent evaluation. Key changes: - SWEBenchProBenchmark class with DockerHub image support and language-specific test runners (Go: go test, TS/JS: npx jest) - Preflight check system (mcpbr preflight CLI command) that validates golden patches in Docker environments - Docker image override support (_image_override, _workdir_override) for non-GHCR registries - Entrypoint override for images with /bin/bash entrypoint - Editable reinstall after patching for SWE-bench Pro Python images - Case-insensitive test list field access (fail_to_pass/FAIL_TO_PASS) Co-Authored-By: Claude Opus 4.6 --- CHANGELOG.md | 13 + src/mcpbr/benchmark_preflight.py | 271 +++++++++++++++++ src/mcpbr/benchmarks/__init__.py | 6 + src/mcpbr/benchmarks/swebench_pro.py | 428 +++++++++++++++++++++++++++ src/mcpbr/cli.py | 160 ++++++++++ src/mcpbr/config.py | 1 + src/mcpbr/docker_env.py | 82 ++++- src/mcpbr/evaluation.py | 38 ++- src/mcpbr/swebench_test_specs.py | 21 ++ tests/test_benchmark_preflight.py | 340 +++++++++++++++++++++ tests/test_swebench_pro.py | 383 ++++++++++++++++++++++++ uv.lock | 378 +---------------------- 12 files changed, 1732 insertions(+), 389 deletions(-) create mode 100644 src/mcpbr/benchmark_preflight.py create mode 100644 src/mcpbr/benchmarks/swebench_pro.py create mode 100644 tests/test_benchmark_preflight.py create mode 100644 tests/test_swebench_pro.py diff --git a/CHANGELOG.md b/CHANGELOG.md index e642c0b..54bf643 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,19 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## 
[Unreleased] +### Added + +- **SWE-bench Pro benchmark**: Multi-language benchmark support (Python, Go, TypeScript, JavaScript) with 731 instances across 11 repositories + - DockerHub-hosted pre-built images via `dockerhub_tag` field + - Language-aware test runners (Go `go test`, TS/JS `npx jest`; Python delegates to the existing SWE-bench evaluation) + - Filter by language or repository substring with `--filter-category` +- **Preflight check command**: `mcpbr preflight` validates that golden patches pass all tests before evaluation + - Concurrent validation with configurable parallelism (`--max-concurrent`) + - Fail-fast mode (`--fail-fast`) for quick CI checks + - Per-instance and aggregate reporting with language breakdown +- **Case-insensitive test list field access**: `get_test_list_field()` helper supports both SWE-bench (`FAIL_TO_PASS`) and SWE-bench Pro (`fail_to_pass`) conventions +- **Docker image override support**: `_image_override` task field allows benchmarks to specify custom Docker images + ## [0.14.0] - 2026-02-13 ### Added diff --git a/src/mcpbr/benchmark_preflight.py b/src/mcpbr/benchmark_preflight.py new file mode 100644 index 0000000..e95efe1 --- /dev/null +++ b/src/mcpbr/benchmark_preflight.py @@ -0,0 +1,271 @@ +"""Preflight validation for benchmarks. + +Validates that golden patches pass all tests in Docker environments before +running agent evaluations. This catches environment/configuration issues +early, ensuring evaluation infrastructure works correctly. 
+""" + +import asyncio +import logging +from dataclasses import dataclass, field +from typing import Any + +from .docker_env import DockerEnvironmentManager, TaskEnvironment +from .evaluation import ( + _apply_test_patch, + apply_patch, + get_test_list_field, + parse_test_list, + run_tests, +) + +logger = logging.getLogger(__name__) + + +@dataclass +class PreflightResult: + """Result of a single preflight instance check.""" + + instance_id: str + status: str # "passed", "failed", "error" + fail_to_pass_passed: int = 0 + fail_to_pass_total: int = 0 + pass_to_pass_passed: int = 0 + pass_to_pass_total: int = 0 + error: str | None = None + language: str = "unknown" + + +@dataclass +class PreflightReport: + """Aggregate preflight validation report.""" + + total: int = 0 + passed: int = 0 + failed: int = 0 + errors: int = 0 + results: list[PreflightResult] = field(default_factory=list) + + @property + def success_rate(self) -> float: + """Calculate success rate as a percentage.""" + if self.total == 0: + return 0.0 + return (self.passed / self.total) * 100.0 + + +async def _check_single_instance( + benchmark: Any, + task: dict[str, Any], + docker_manager: DockerEnvironmentManager, + timeout: int = 300, +) -> PreflightResult: + """Validate a single benchmark instance by applying the golden patch. + + Args: + benchmark: Benchmark instance with create_environment method. + task: Task dictionary with patch, test_patch, fail_to_pass, pass_to_pass. + docker_manager: Docker environment manager. + timeout: Timeout per test in seconds. + + Returns: + PreflightResult for this instance. 
+ """ + instance_id = task.get("instance_id", "unknown") + language = task.get("repo_language", "python").lower() + env: TaskEnvironment | None = None + + try: + # Create Docker environment (skip Claude CLI install — not needed for preflight) + preflight_task = dict(task) + preflight_task["_skip_cli_install"] = True + env = await benchmark.create_environment(preflight_task, docker_manager) + + # Determine eval workdir: SWE-bench Pro images use /app (indicated by + # dockerhub_tag), standard SWE-bench uses /testbed. + eval_workdir: str | None + if env.uses_prebuilt: + if task.get("dockerhub_tag"): + eval_workdir = "/app" + else: + eval_workdir = "/testbed" + else: + eval_workdir = None + + # Apply golden patch + golden_patch = task.get("patch", "") + if not golden_patch: + return PreflightResult( + instance_id=instance_id, + status="error", + error="No golden patch found in task", + language=language, + ) + + applied, error = await apply_patch(env, golden_patch, workdir=eval_workdir) + if not applied: + return PreflightResult( + instance_id=instance_id, + status="failed", + error=f"Golden patch failed to apply: {error}", + language=language, + ) + + # Apply test patch + test_patch = task.get("test_patch", "") + if test_patch: + await _apply_test_patch(env, test_patch, workdir=eval_workdir) + + # Reinstall package in editable mode so patched code is used. + # SWE-bench Pro images install the package into site-packages; + # without this step, tests would import the old (unpatched) code. + if eval_workdir and language == "python": + await env.exec_command( + "pip install -e . 
-q 2>/dev/null || true", + timeout=120, + workdir=eval_workdir, + ) + + # Parse test lists (handle both uppercase and lowercase field names) + fail_to_pass_str = get_test_list_field(task, "fail_to_pass") + pass_to_pass_str = get_test_list_field(task, "pass_to_pass") + fail_to_pass_tests = parse_test_list(fail_to_pass_str) + pass_to_pass_tests = parse_test_list(pass_to_pass_str) + + # SWE-bench Pro images don't use conda, so skip conda activation + # even though uses_prebuilt is True (it only means "image was pulled") + uses_conda = env.uses_prebuilt and not task.get("dockerhub_tag") + + # Run fail_to_pass tests (all must PASS with golden patch) + ftp_results = await run_tests( + env, + fail_to_pass_tests, + timeout=timeout, + uses_prebuilt=uses_conda, + workdir=eval_workdir, + repo=task.get("repo"), + ) + + # Run pass_to_pass tests (all must still PASS) + ptp_results = await run_tests( + env, + pass_to_pass_tests[:10], + timeout=timeout, + uses_prebuilt=uses_conda, + workdir=eval_workdir, + repo=task.get("repo"), + ) + + # Determine status + all_ftp_pass = ftp_results.passed == ftp_results.total and ftp_results.total > 0 + all_ptp_pass = ptp_results.passed == ptp_results.total + + if all_ftp_pass and all_ptp_pass: + status = "passed" + error_msg = None + else: + status = "failed" + parts = [] + if not all_ftp_pass: + parts.append(f"fail_to_pass: {ftp_results.passed}/{ftp_results.total} passed") + if not all_ptp_pass: + parts.append(f"pass_to_pass: {ptp_results.passed}/{ptp_results.total} passed") + error_msg = "; ".join(parts) + + return PreflightResult( + instance_id=instance_id, + status=status, + fail_to_pass_passed=ftp_results.passed, + fail_to_pass_total=ftp_results.total, + pass_to_pass_passed=ptp_results.passed, + pass_to_pass_total=ptp_results.total, + error=error_msg, + language=language, + ) + + except Exception as e: + logger.exception(f"Preflight error for {instance_id}") + return PreflightResult( + instance_id=instance_id, + status="error", + 
error=str(e), + language=language, + ) + + finally: + if env is not None: + try: + await env.cleanup() + except Exception: + logger.warning(f"Failed to clean up container for {instance_id}") + + +async def run_benchmark_preflight( + benchmark: Any, + tasks: list[dict[str, Any]], + docker_manager: DockerEnvironmentManager, + max_concurrent: int = 4, + timeout: int = 300, + fail_fast: bool = False, +) -> PreflightReport: + """Run preflight validation on benchmark tasks. + + Applies golden patches and verifies all tests pass for each instance. + + Args: + benchmark: Benchmark instance. + tasks: List of task dictionaries to validate. + docker_manager: Docker environment manager. + max_concurrent: Maximum concurrent validations. + timeout: Timeout per test in seconds. + fail_fast: Stop on first failure. + + Returns: + PreflightReport with aggregate results. + """ + report = PreflightReport(total=len(tasks)) + semaphore = asyncio.Semaphore(max_concurrent) + + async def _check_with_semaphore(task: dict[str, Any]) -> PreflightResult: + async with semaphore: + return await _check_single_instance(benchmark, task, docker_manager, timeout) + + if fail_fast: + # Sequential execution with early exit + for task in tasks: + result = await _check_with_semaphore(task) + report.results.append(result) + if result.status == "passed": + report.passed += 1 + elif result.status == "failed": + report.failed += 1 + break + else: + report.errors += 1 + break + else: + # Concurrent execution + coros = [_check_with_semaphore(task) for task in tasks] + results = await asyncio.gather(*coros, return_exceptions=True) + + for r in results: + if isinstance(r, BaseException): + report.errors += 1 + report.results.append( + PreflightResult( + instance_id="unknown", + status="error", + error=str(r), + ) + ) + else: + preflight_result: PreflightResult = r + report.results.append(preflight_result) + if preflight_result.status == "passed": + report.passed += 1 + elif preflight_result.status == "failed": + 
report.failed += 1 + else: + report.errors += 1 + + return report diff --git a/src/mcpbr/benchmarks/__init__.py b/src/mcpbr/benchmarks/__init__.py index 419ceb9..99f48f6 100644 --- a/src/mcpbr/benchmarks/__init__.py +++ b/src/mcpbr/benchmarks/__init__.py @@ -29,6 +29,7 @@ from .mmmu import MMMUBenchmark from .repoqa import RepoQABenchmark from .swebench import SWEBenchmark +from .swebench_pro import SWEBenchProBenchmark from .terminalbench import TerminalBenchBenchmark from .toolbench import ToolBenchBenchmark from .truthfulqa import TruthfulQABenchmark @@ -63,6 +64,7 @@ "MLAgentBenchBenchmark", "MMMUBenchmark", "RepoQABenchmark", + "SWEBenchProBenchmark", "SWEBenchmark", "TerminalBenchBenchmark", "ToolBenchBenchmark", @@ -106,6 +108,7 @@ "mmmu": MMMUBenchmark, "longbench": LongBenchBenchmark, "adversarial": AdversarialBenchmark, + "swe-bench-pro": SWEBenchProBenchmark, } @@ -137,6 +140,9 @@ def create_benchmark(name: str, **kwargs: Any) -> Benchmark: if name in swebench_datasets: kwargs["dataset"] = swebench_datasets[name] + if name == "swe-bench-pro": + kwargs["dataset"] = "ScaleAI/SWE-bench_Pro" + return benchmark_class(**kwargs) diff --git a/src/mcpbr/benchmarks/swebench_pro.py b/src/mcpbr/benchmarks/swebench_pro.py new file mode 100644 index 0000000..96e692b --- /dev/null +++ b/src/mcpbr/benchmarks/swebench_pro.py @@ -0,0 +1,428 @@ +"""SWE-bench Pro benchmark implementation. + +SWE-bench Pro is a multi-language benchmark with 731 instances across 11 repos +in Python, Go, TypeScript, and JavaScript. Average solutions span 107.4 lines +across 4.1 files. Top models achieve ~23% resolution (vs 70%+ on Verified). 
+ +Key differences from SWE-bench: +- Docker images from DockerHub (dockerhub_tag field) instead of GHCR +- Multi-language test runners (Python, Go, TypeScript, JavaScript) +- Lowercase field names (fail_to_pass instead of FAIL_TO_PASS) +- Language metadata per task (repo_language field) +""" + +from typing import Any + +from datasets import load_dataset + +from ..docker_env import DockerEnvironmentManager, TaskEnvironment +from ..evaluation import ( + EvaluationResult, + evaluate_patch, + get_test_list_field, + parse_test_list, + run_tests, +) +from .base import BenchmarkTask + +# Supported languages in SWE-bench Pro +PRO_LANGUAGES = {"python", "go", "typescript", "javascript"} + +# DockerHub registry prefix for SWE-bench Pro pre-built images +SWEBENCH_PRO_IMAGE_PREFIX = "jefzda/sweap-images" + + +class SWEBenchProBenchmark: + """SWE-bench Pro benchmark implementation. + + Multi-language benchmark for evaluating coding agents on real-world + software engineering tasks across Python, Go, TypeScript, and JavaScript. + """ + + name = "swe-bench-pro" + + def __init__(self, dataset: str = "ScaleAI/SWE-bench_Pro"): + """Initialize SWE-bench Pro benchmark. + + Args: + dataset: HuggingFace dataset identifier. + """ + self.dataset = dataset + + def load_tasks( + self, + sample_size: int | None = None, + task_ids: list[str] | None = None, + level: int | None = None, + filter_difficulty: list[str] | None = None, + filter_category: list[str] | None = None, + filter_tags: list[str] | None = None, + ) -> list[dict[str, Any]]: + """Load tasks from SWE-bench Pro dataset. + + Args: + sample_size: Maximum number of tasks to load (None for all). + task_ids: Specific task IDs to load (None for all). + level: Unused for SWE-bench Pro. + filter_difficulty: Unused for SWE-bench Pro. + filter_category: Filter by language name (e.g., "python", "go") + or repository substring (e.g., "django", "gin-gonic"). + filter_tags: Unused for SWE-bench Pro. 
+ + Returns: + List of SWE-bench Pro task dictionaries. + """ + dataset = load_dataset(self.dataset, split="test") + + # Optimization: early truncation when no filtering is needed + needs_full_scan = bool(task_ids) or bool(filter_category) + if not needs_full_scan and sample_size is not None and len(dataset) > sample_size: + dataset = dataset.select(range(sample_size)) + + if task_ids: + task_id_set = set(task_ids) + tasks = [item for item in dataset if item["instance_id"] in task_id_set] + else: + tasks = list(dataset) + + if filter_category: + filtered = [] + for task in tasks: + repo = task.get("repo", "") + language = task.get("repo_language", "").lower() + for category in filter_category: + cat_lower = category.lower() + # If the category is a known language, match by language only + if cat_lower in PRO_LANGUAGES: + if cat_lower == language: + filtered.append(task) + break + elif cat_lower in repo.lower(): + # Otherwise, match by repo substring + filtered.append(task) + break + tasks = filtered + + if sample_size is not None and len(tasks) > sample_size: + tasks = tasks[:sample_size] + + return tasks + + def normalize_task(self, task: dict[str, Any]) -> BenchmarkTask: + """Convert SWE-bench Pro task to normalized format. + + Handles both lowercase (SWE-bench Pro) and uppercase (SWE-bench) + field names for test lists. + + Args: + task: SWE-bench Pro task dictionary. + + Returns: + Normalized BenchmarkTask. 
+ """ + return BenchmarkTask( + task_id=task["instance_id"], + problem_statement=task["problem_statement"], + repo=task["repo"], + commit=task["base_commit"], + metadata={ + "fail_to_pass": get_test_list_field(task, "fail_to_pass"), + "pass_to_pass": get_test_list_field(task, "pass_to_pass"), + "test_patch": task.get("test_patch", ""), + "repo_language": task.get("repo_language", "unknown"), + }, + ) + + async def create_environment( + self, + task: dict[str, Any], + docker_manager: DockerEnvironmentManager, + ) -> TaskEnvironment: + """Create environment for SWE-bench Pro task. + + Injects the DockerHub image override so DockerEnvironmentManager + pulls from DockerHub instead of GHCR. + + Args: + task: SWE-bench Pro task dictionary. + docker_manager: Docker environment manager. + + Returns: + TaskEnvironment for the task. + """ + # Inject image override for DockerHub-hosted images + # The dockerhub_tag field is the tag portion; prepend the registry prefix + # SWE-bench Pro images use /app as workdir (not /testbed) + task_copy = dict(task) + dockerhub_tag = task.get("dockerhub_tag") + if dockerhub_tag: + task_copy["_image_override"] = f"{SWEBENCH_PRO_IMAGE_PREFIX}:{dockerhub_tag}" + task_copy["_workdir_override"] = "/app" + + return await docker_manager.create_environment(task_copy) + + async def evaluate( + self, + env: TaskEnvironment, + task: dict[str, Any], + solution: str, + ) -> dict[str, Any]: + """Evaluate a patch for SWE-bench Pro task. + + For Python tasks, delegates to the existing evaluate_patch(). + For Go/TypeScript/JavaScript, uses language-specific test runners. + + Args: + env: Task environment. + task: SWE-bench Pro task dictionary. + solution: Unified diff patch to evaluate. + + Returns: + Dictionary with evaluation results including 'resolved' boolean. 
+ """ + language = task.get("repo_language", "python").lower() + + if language == "python": + # Delegate Python evaluation to existing logic + eval_result: EvaluationResult = await evaluate_patch(env, task, solution) + return self._eval_result_to_dict(eval_result) + + # For non-Python languages, use language-specific evaluation + return await self._evaluate_multilang(env, task, solution, language) + + async def _evaluate_multilang( + self, + env: TaskEnvironment, + task: dict[str, Any], + patch: str, + language: str, + ) -> dict[str, Any]: + """Evaluate a patch using language-specific test runners. + + Args: + env: Task environment. + task: SWE-bench Pro task dictionary. + patch: Unified diff patch to evaluate. + language: Programming language (go, typescript, javascript). + + Returns: + Dictionary with evaluation results. + """ + from ..evaluation import _apply_test_patch, apply_patch + + # SWE-bench Pro images use /app as their working directory + eval_workdir = "/app" if env.uses_prebuilt else None + + applied, error = await apply_patch(env, patch, workdir=eval_workdir) + if not applied: + return {"resolved": False, "patch_applied": False, "eval_error": error} + + test_patch = task.get("test_patch", "") + if test_patch: + await _apply_test_patch(env, test_patch, workdir=eval_workdir) + + # Reinstall package so patched code is active (SWE-bench Pro images + # install into site-packages, not editable mode) + if eval_workdir and language == "python": + await env.exec_command( + "pip install -e . 
-q 2>/dev/null || true", + timeout=120, + workdir=eval_workdir, + ) + + fail_to_pass_str = get_test_list_field(task, "fail_to_pass") + pass_to_pass_str = get_test_list_field(task, "pass_to_pass") + fail_to_pass_tests = parse_test_list(fail_to_pass_str) + pass_to_pass_tests = parse_test_list(pass_to_pass_str) + + fail_to_pass_results = await self._run_lang_tests( + env, fail_to_pass_tests, language, workdir=eval_workdir + ) + pass_to_pass_results = await self._run_lang_tests( + env, pass_to_pass_tests[:10], language, workdir=eval_workdir + ) + + resolved = ( + fail_to_pass_results.passed == fail_to_pass_results.total + and fail_to_pass_results.total > 0 + and pass_to_pass_results.passed == pass_to_pass_results.total + ) + + result: dict[str, Any] = {"resolved": resolved, "patch_applied": True} + if fail_to_pass_results: + result["fail_to_pass"] = { + "passed": fail_to_pass_results.passed, + "total": fail_to_pass_results.total, + } + if pass_to_pass_results: + result["pass_to_pass"] = { + "passed": pass_to_pass_results.passed, + "total": pass_to_pass_results.total, + } + return result + + async def _run_lang_tests( + self, + env: TaskEnvironment, + tests: list[str], + language: str, + workdir: str | None = None, + timeout: int = 120, + ) -> Any: + """Run tests using language-specific commands. + + Args: + env: Task environment. + tests: List of test identifiers. + language: Programming language. + workdir: Working directory. + timeout: Timeout per test in seconds. + + Returns: + TestResults instance. 
+ """ + if language == "python": + return await run_tests( + env, tests, timeout=timeout, uses_prebuilt=env.uses_prebuilt, workdir=workdir + ) + + # For non-Python, build language-specific commands and run + from ..evaluation import TestResults + + if not tests: + return TestResults(passed=0, total=0, details=[]) + + results = [] + passed = 0 + + for test in tests: + test_cmd = _build_pro_test_command(test, language, env.uses_prebuilt) + try: + exit_code, stdout, stderr = await env.exec_command( + test_cmd, timeout=timeout, workdir=workdir + ) + test_passed = exit_code == 0 + if test_passed: + passed += 1 + results.append( + { + "test": test, + "passed": test_passed, + "exit_code": exit_code, + "output": stdout[:1000] if stdout else "", + "error": stderr[:1000] if stderr else "", + } + ) + except TimeoutError: + results.append( + { + "test": test, + "passed": False, + "exit_code": -1, + "output": "", + "error": "Test timed out", + } + ) + + return TestResults(passed=passed, total=len(tests), details=results) + + def _eval_result_to_dict(self, eval_result: EvaluationResult) -> dict[str, Any]: + """Convert EvaluationResult to dictionary format.""" + result: dict[str, Any] = { + "resolved": eval_result.resolved, + "patch_applied": eval_result.patch_applied, + } + if eval_result.fail_to_pass: + result["fail_to_pass"] = { + "passed": eval_result.fail_to_pass.passed, + "total": eval_result.fail_to_pass.total, + } + if eval_result.pass_to_pass: + result["pass_to_pass"] = { + "passed": eval_result.pass_to_pass.passed, + "total": eval_result.pass_to_pass.total, + } + if eval_result.error: + result["eval_error"] = eval_result.error + return result + + def get_prebuilt_image(self, task: dict[str, Any]) -> str | None: + """Get pre-built Docker image name for the task. + + SWE-bench Pro uses DockerHub images specified in the dockerhub_tag field. + + Args: + task: SWE-bench Pro task dictionary. + + Returns: + Full DockerHub image name, or None if not available. 
+ """ + tag = task.get("dockerhub_tag") + if tag: + return f"{SWEBENCH_PRO_IMAGE_PREFIX}:{tag}" + return None + + def get_prompt_template(self) -> str: + """Get SWE-bench Pro prompt template. + + Returns: + Prompt template for fixing bugs across multiple languages. + """ + return ( + "Fix the following bug in this repository:\n\n" + "{problem_statement}\n\n" + "IMPORTANT CONSTRAINTS:\n" + "- Only modify the minimum files necessary to fix the bug\n" + "- Do NOT create new test files\n" + "- Do NOT create documentation files\n" + "- Do NOT create reproduction scripts\n" + "- Focus solely on the fix in existing source files\n" + "- This may be a Python, Go, TypeScript, or JavaScript project" + ) + + def get_default_sandbox_level(self) -> str | None: + """Get default sandbox level for SWE-bench Pro.""" + return None + + +def _build_pro_test_command(test: str, language: str, uses_prebuilt: bool = False) -> str: + """Build a language-specific test command for SWE-bench Pro. + + Args: + test: Test identifier. + language: Programming language (python, go, typescript, javascript). + uses_prebuilt: Whether a pre-built image is being used. + + Returns: + Shell command string to run the test. + """ + if language == "python": + from ..evaluation import _build_test_command + + return _build_test_command(test, uses_prebuilt) + + if uses_prebuilt: + activate = "source /opt/miniconda3/etc/profile.d/conda.sh && conda activate testbed && " + else: + activate = "" + + if language == "go": + # Go test identifiers can be package paths or test function names + if "/" in test or test.startswith("."): + # Package path: go test -v ./path/to/package + return f"{activate}go test -v -count=1 {test} 2>&1" + else: + # Test function name: go test -v -run TestName ./... + return f"{activate}go test -v -count=1 -run '{test}' ./... 
2>&1" + + if language in ("typescript", "javascript"): + # Jest-style test identifiers + if "/" in test or test.endswith((".ts", ".js", ".tsx", ".jsx")): + # File path + return f"{activate}npx jest {test} --verbose --no-cache 2>&1" + else: + # Test name pattern + return f"{activate}npx jest -t '{test}' --verbose --no-cache 2>&1" + + # Fallback: try running as-is + return f"{activate}{test} 2>&1" diff --git a/src/mcpbr/cli.py b/src/mcpbr/cli.py index df18461..a41cdc9 100644 --- a/src/mcpbr/cli.py +++ b/src/mcpbr/cli.py @@ -1510,6 +1510,11 @@ def benchmarks() -> None: "2,294", "Bug fixing (complete benchmark, research)", ) + table.add_row( + "swe-bench-pro", + "731", + "Multi-language bug fixing (Python, Go, TS, JS — harder)", + ) # Other benchmarks table.add_row( "cybergym", @@ -1531,6 +1536,161 @@ def benchmarks() -> None: console.print("[dim] mcpbr run -c config.yaml -b mcptoolbench[/dim]") +@main.command(context_settings={"help_option_names": ["-h", "--help"]}) +@click.option( + "-c", + "--config", + "config_path", + type=click.Path(exists=True), + help="Path to configuration YAML file (used for Docker settings).", +) +@click.option( + "-b", + "--benchmark", + "benchmark_name", + type=click.Choice(list(VALID_BENCHMARKS)), + default="swe-bench-pro", + help="Benchmark to validate (default: swe-bench-pro).", +) +@click.option( + "-n", + "--sample", + "sample_size", + type=int, + default=None, + help="Number of instances to validate (default: all).", +) +@click.option( + "--task", + "task_ids", + multiple=True, + help="Specific task ID(s) to validate.", +) +@click.option( + "--max-concurrent", + type=int, + default=4, + help="Maximum concurrent validations (default: 4).", +) +@click.option( + "--fail-fast", + is_flag=True, + help="Stop on first failure.", +) +@click.option( + "--filter-category", + multiple=True, + help="Filter by language or repo substring.", +) +@click.option( + "--timeout", + type=int, + default=300, + help="Timeout per test in seconds (default: 
300).", +) +def preflight( + config_path: str | None, + benchmark_name: str, + sample_size: int | None, + task_ids: tuple[str, ...], + max_concurrent: int, + fail_fast: bool, + filter_category: tuple[str, ...], + timeout: int, +) -> None: + """Validate golden patches pass all tests before evaluation. + + Runs the benchmark's golden (reference) patches against Docker + environments and verifies all tests pass. Use this to catch + environment or configuration issues before running agent evaluations. + + \b + Examples: + mcpbr preflight -b swe-bench-pro -n 5 # Check 5 instances + mcpbr preflight --fail-fast # Stop on first failure + mcpbr preflight --filter-category python -n 10 # Check 10 Python instances + mcpbr preflight --task django__django-16046 # Check specific instance + """ + from .benchmark_preflight import run_benchmark_preflight + from .benchmarks import create_benchmark + from .docker_env import DockerEnvironmentManager + + benchmark = create_benchmark(benchmark_name) + + # Load tasks + task_id_list = list(task_ids) if task_ids else None + category_list = list(filter_category) if filter_category else None + + console.print(f"[bold]Preflight Check: {benchmark_name}[/bold]\n") + dataset_name = getattr(benchmark, "dataset", benchmark_name) + console.print(f"Loading tasks from {dataset_name}...") + + tasks = benchmark.load_tasks( + sample_size=sample_size, + task_ids=task_id_list, + filter_category=category_list, + ) + + if not tasks: + console.print("[yellow]No tasks found matching the criteria.[/yellow]") + return + + console.print(f"Validating {len(tasks)} instance(s)...\n") + + # Create Docker manager + docker_manager = DockerEnvironmentManager(use_prebuilt=True) + + try: + report = asyncio.run( + run_benchmark_preflight( + benchmark=benchmark, + tasks=tasks, + docker_manager=docker_manager, + max_concurrent=max_concurrent, + timeout=timeout, + fail_fast=fail_fast, + ) + ) + finally: + with contextlib.suppress(Exception): + 
docker_manager.cleanup_all_sync() + + # Display results + result_table = Table() + result_table.add_column("Instance", style="cyan") + result_table.add_column("Language") + result_table.add_column("Status") + result_table.add_column("FTP (pass/total)") + result_table.add_column("PTP (pass/total)") + result_table.add_column("Error") + + for r in report.results: + status_style = { + "passed": "[green]PASS[/green]", + "failed": "[red]FAIL[/red]", + "error": "[yellow]ERROR[/yellow]", + }.get(r.status, r.status) + + result_table.add_row( + r.instance_id, + r.language, + status_style, + f"{r.fail_to_pass_passed}/{r.fail_to_pass_total}", + f"{r.pass_to_pass_passed}/{r.pass_to_pass_total}", + r.error or "", + ) + + console.print(result_table) + console.print( + f"\n[bold]Summary:[/bold] {report.passed}/{report.total} passed " + f"({report.success_rate:.1f}%), " + f"{report.failed} failed, {report.errors} errors" + ) + + if report.failed > 0 or report.errors > 0: + sys.exit(1) + + @main.group(context_settings={"help_option_names": ["-h", "--help"]}) def config() -> None: """Configuration file management commands. diff --git a/src/mcpbr/config.py b/src/mcpbr/config.py index 132d3e9..aa03647 100644 --- a/src/mcpbr/config.py +++ b/src/mcpbr/config.py @@ -47,6 +47,7 @@ "longbench", "adversarial", "codegraph", + "swe-bench-pro", ) VALID_INFRASTRUCTURE_MODES = ("local", "azure", "aws", "gcp", "kubernetes", "cloudflare") diff --git a/src/mcpbr/docker_env.py b/src/mcpbr/docker_env.py index 2f0863a..a7b2b4a 100644 --- a/src/mcpbr/docker_env.py +++ b/src/mcpbr/docker_env.py @@ -445,6 +445,30 @@ def _pull() -> str | None: loop = asyncio.get_event_loop() return await loop.run_in_executor(None, _pull) + async def _try_pull_image(self, image_name: str) -> str | None: + """Try to pull a Docker image by its full name. + + Used for explicit image overrides (e.g., SWE-bench Pro DockerHub images). + + Args: + image_name: Full Docker image name (e.g., "dockerhub_user/image:tag"). 
+ + Returns: + Image name if successful, None if not available. + """ + + def _pull() -> str | None: + try: + self.client.images.pull(image_name, platform="linux/amd64") + return image_name + except docker.errors.ImageNotFound: + return None + except docker.errors.APIError: + return None + + loop = asyncio.get_event_loop() + return await loop.run_in_executor(None, _pull) + async def _ensure_fallback_image(self) -> None: """Ensure the fallback Docker image is built.""" if self._fallback_image_built: @@ -557,9 +581,16 @@ async def create_environment( uses_prebuilt = False if self.use_prebuilt: - image_name = await self._try_pull_prebuilt(instance_id) - if image_name: - uses_prebuilt = True + # Check for explicit image override (e.g., SWE-bench Pro DockerHub images) + image_override = task.get("_image_override") + if image_override: + image_name = await self._try_pull_image(image_override) + if image_name: + uses_prebuilt = True + if not image_name: + image_name = await self._try_pull_prebuilt(instance_id) + if image_name: + uses_prebuilt = True if not image_name: await self._ensure_fallback_image() @@ -574,7 +605,18 @@ async def create_environment( unique_suffix = uuid.uuid4().hex[:6] container_name = f"mcpbr-{self._session_id}-{instance_id}-{unique_suffix}" - container_workdir = "/testbed" if uses_prebuilt else "/workspace" + # SWE-bench Pro images use /app, standard SWE-bench uses /testbed + workdir_override = task.get("_workdir_override") + if workdir_override: + container_workdir = workdir_override + elif uses_prebuilt: + container_workdir = "/testbed" + else: + container_workdir = "/workspace" + + # Some pre-built images set an entrypoint (e.g., /bin/bash) that + # conflicts with our "tail -f /dev/null" keep-alive command. 
+ has_entrypoint_override = bool(task.get("_image_override")) def _create_container() -> Container: max_retries = 3 @@ -599,9 +641,20 @@ def _create_container() -> Container: # Default network mode; sandbox may override network_mode = sandbox_kwargs.pop("network_mode", "bridge") + # Override entrypoint for images that set one (e.g., + # SWE-bench Pro's /bin/bash entrypoint conflicts with + # our "tail -f /dev/null" keep-alive command). + entrypoint_kwargs: dict = {} + if has_entrypoint_override: + entrypoint_kwargs["entrypoint"] = [ + "/bin/sh", + "-c", + "tail -f /dev/null", + ] + container = self.client.containers.run( image_name, - command="tail -f /dev/null", + command="tail -f /dev/null" if not has_entrypoint_override else None, name=container_name, detach=True, platform="linux/amd64" if uses_prebuilt else None, @@ -617,6 +670,7 @@ def _create_container() -> Container: MCPBR_SESSION_LABEL: self._session_id, MCPBR_TIMESTAMP_LABEL: self._session_timestamp, }, + **entrypoint_kwargs, **sandbox_kwargs, ) return container @@ -716,8 +770,10 @@ def _create_container() -> Container: if uses_prebuilt: await self._copy_repo_to_workspace(env) # Install Claude CLI for running agent inside container - await self._install_claude_cli(env) - env.claude_cli_installed = True + # (skip when running preflight checks or evaluation-only workflows) + if not task.get("_skip_cli_install"): + await self._install_claude_cli(env) + env.claude_cli_installed = True else: await self._setup_repo(env, repo, base_commit) @@ -742,7 +798,7 @@ async def _check_workspace_file_count(self, env: TaskEnvironment) -> int: return 0 async def _copy_repo_to_workspace(self, env: TaskEnvironment) -> None: - """Copy repo from pre-built image /testbed to /workspace for agent access. + """Copy repo from pre-built image source dir to /workspace for agent access. Under high concurrency the Docker filesystem copy can silently produce an empty workspace. 
This method retries with a sync and, if necessary, @@ -751,9 +807,13 @@ async def _copy_repo_to_workspace(self, env: TaskEnvironment) -> None: Args: env: Task environment with pre-built image. """ + # The source directory is the container's working directory (e.g., + # /testbed for standard SWE-bench, /app for SWE-bench Pro). + source_dir = env.workdir if env.workdir != "/workspace" else "/testbed" + # --- Phase 1: initial copy + verify --- exit_code, _stdout, stderr = await env.exec_command( - "cp -r /testbed/. /workspace/", + f"cp -r {source_dir}/. /workspace/", timeout=120, ) if exit_code != 0: @@ -799,11 +859,11 @@ async def _copy_repo_to_workspace(self, env: TaskEnvironment) -> None: # --- Phase 3: full copy retry --- logger.warning( - "Workspace still empty after sync retry — re-copying from /testbed " + f"Workspace still empty after sync retry — re-copying from {source_dir} " f"(instance={env.instance_id})" ) exit_code, _, stderr = await env.exec_command( - "cp -r /testbed/. /workspace/", + f"cp -r {source_dir}/. /workspace/", timeout=120, ) if exit_code != 0: diff --git a/src/mcpbr/evaluation.py b/src/mcpbr/evaluation.py index bfb2614..d7861fb 100644 --- a/src/mcpbr/evaluation.py +++ b/src/mcpbr/evaluation.py @@ -1,6 +1,7 @@ """Evaluation logic for applying patches and running tests.""" import ast +import contextlib import json from dataclasses import dataclass from typing import Any @@ -28,6 +29,31 @@ class EvaluationResult: error: str | None = None +def get_test_list_field(task: dict[str, Any], field_name: str) -> str: + """Get a test list field from a task, checking both lowercase and uppercase names. + + SWE-bench uses FAIL_TO_PASS/PASS_TO_PASS while SWE-bench Pro uses + fail_to_pass/pass_to_pass. This helper provides backward-compatible access. + + Args: + task: Task dictionary. + field_name: Field name in lowercase (e.g., "fail_to_pass"). + + Returns: + Field value as string, or "[]" if not found. 
+ """ + # Try lowercase first (SWE-bench Pro convention) + value = task.get(field_name) + if value is not None: + return str(value) + # Fall back to uppercase (SWE-bench convention) + upper_name = field_name.upper() + value = task.get(upper_name) + if value is not None: + return str(value) + return "[]" + + def parse_test_list(test_str: str) -> list[str]: """Parse test list from SWE-bench format (JSON string or Python literal). @@ -263,7 +289,7 @@ def _build_test_command(test: str, uses_prebuilt: bool = False, repo: str | None test_module = ".".join(test.split(".")[:2]) # Extract test_utils.tests return f"{activate}cd /testbed/tests && ./runtests.py {test_module}" elif "::" in test or test.endswith(".py"): - return f"{activate}python -m pytest {test} -xvs 2>&1" + return f"{activate}python -m pytest '{test}' -xvs 2>&1" else: return f"{activate}python -m pytest -k '{test}' -xvs 2>&1" @@ -375,6 +401,16 @@ async def evaluate_patch( patch_applied=True, error="Docker exec timed out during dependency installation", ) + elif task.get("dockerhub_tag") and task.get("repo_language", "python").lower() == "python": + # SWE-bench Pro images install packages into site-packages (not + # editable). After patching we must reinstall so the new code is + # importable. + with contextlib.suppress(TimeoutError): + await env.exec_command( + "pip install -e . -q 2>/dev/null || true", + timeout=120, + workdir=eval_workdir, + ) repo = task.get("repo") diff --git a/src/mcpbr/swebench_test_specs.py b/src/mcpbr/swebench_test_specs.py index 9c1bdde..0afeb08 100644 --- a/src/mcpbr/swebench_test_specs.py +++ b/src/mcpbr/swebench_test_specs.py @@ -31,3 +31,24 @@ def get_repo_test_command(repo: str) -> str | None: Returns None if repo uses standard pytest (handled by existing logic). 
""" return REPO_TO_TEST_CMD.get(repo) + + +# Language → default test command for SWE-bench Pro multi-language support +LANGUAGE_TO_TEST_CMD: dict[str, str] = { + "python": TEST_PYTEST, + "go": "go test -v -count=1", + "typescript": "npx jest --verbose --no-cache", + "javascript": "npx jest --verbose --no-cache", +} + + +def get_language_test_command(language: str) -> str | None: + """Look up the default test command for a programming language. + + Args: + language: Programming language name (lowercase). + + Returns: + Default test command string, or None if language is not recognized. + """ + return LANGUAGE_TO_TEST_CMD.get(language.lower()) diff --git a/tests/test_benchmark_preflight.py b/tests/test_benchmark_preflight.py new file mode 100644 index 0000000..d95e586 --- /dev/null +++ b/tests/test_benchmark_preflight.py @@ -0,0 +1,340 @@ +"""Tests for benchmark preflight validation system.""" + +from unittest.mock import AsyncMock, MagicMock, patch + +import pytest + +from mcpbr.benchmark_preflight import ( + PreflightReport, + PreflightResult, + _check_single_instance, + run_benchmark_preflight, +) + + +class TestPreflightResult: + """Tests for PreflightResult data structure.""" + + def test_basic_construction(self) -> None: + result = PreflightResult( + instance_id="test-123", + status="passed", + fail_to_pass_passed=3, + fail_to_pass_total=3, + pass_to_pass_passed=5, + pass_to_pass_total=5, + language="python", + ) + assert result.instance_id == "test-123" + assert result.status == "passed" + assert result.error is None + + def test_failed_result(self) -> None: + result = PreflightResult( + instance_id="test-456", + status="failed", + fail_to_pass_passed=1, + fail_to_pass_total=3, + error="fail_to_pass: 1/3 passed", + language="go", + ) + assert result.status == "failed" + assert result.error is not None + + def test_error_result(self) -> None: + result = PreflightResult( + instance_id="test-789", + status="error", + error="Docker connection failed", + ) + assert 
result.status == "error" + assert result.language == "unknown" + + +class TestPreflightReport: + """Tests for PreflightReport aggregate results.""" + + def test_all_passed(self) -> None: + report = PreflightReport(total=3, passed=3, failed=0, errors=0) + assert report.success_rate == 100.0 + + def test_empty_report(self) -> None: + report = PreflightReport(total=0, passed=0, failed=0, errors=0) + assert report.success_rate == 0.0 + + def test_partial_success(self) -> None: + report = PreflightReport(total=10, passed=7, failed=2, errors=1) + assert report.success_rate == 70.0 + + def test_all_failed(self) -> None: + report = PreflightReport(total=5, passed=0, failed=5, errors=0) + assert report.success_rate == 0.0 + + def test_default_results_list(self) -> None: + report = PreflightReport() + assert report.results == [] + + +class TestCheckSingleInstance: + """Tests for single instance preflight check.""" + + @pytest.mark.asyncio + async def test_successful_check(self) -> None: + mock_env = MagicMock() + mock_env.uses_prebuilt = True + mock_env.cleanup = AsyncMock() + mock_env.exec_command = AsyncMock(return_value=(0, "", "")) + + mock_benchmark = MagicMock() + mock_benchmark.create_environment = AsyncMock(return_value=mock_env) + + task = { + "instance_id": "django__django-16046", + "repo": "django/django", + "repo_language": "python", + "patch": "diff --git a/fix.py", + "test_patch": "", + "fail_to_pass": '["test_one"]', + "pass_to_pass": '["test_two"]', + } + + mock_docker = MagicMock() + + with ( + patch("mcpbr.benchmark_preflight.apply_patch", new_callable=AsyncMock) as mock_apply, + patch("mcpbr.benchmark_preflight.run_tests", new_callable=AsyncMock) as mock_tests, + ): + mock_apply.return_value = (True, "") + # fail_to_pass: 1/1 passed, pass_to_pass: 1/1 passed + mock_tests.side_effect = [ + MagicMock(passed=1, total=1), + MagicMock(passed=1, total=1), + ] + + result = await _check_single_instance(mock_benchmark, task, mock_docker) + + assert result.status == 
"passed" + assert result.instance_id == "django__django-16046" + assert result.language == "python" + mock_env.cleanup.assert_called_once() + + @pytest.mark.asyncio + async def test_patch_apply_failure(self) -> None: + mock_env = MagicMock() + mock_env.uses_prebuilt = True + mock_env.cleanup = AsyncMock() + + mock_benchmark = MagicMock() + mock_benchmark.create_environment = AsyncMock(return_value=mock_env) + + task = { + "instance_id": "test-fail", + "repo": "org/repo", + "repo_language": "go", + "patch": "bad patch", + "fail_to_pass": '["test"]', + "pass_to_pass": "[]", + } + + mock_docker = MagicMock() + + with patch("mcpbr.benchmark_preflight.apply_patch", new_callable=AsyncMock) as mock_apply: + mock_apply.return_value = (False, "Patch does not apply") + + result = await _check_single_instance(mock_benchmark, task, mock_docker) + + assert result.status == "failed" + assert "Golden patch failed to apply" in (result.error or "") + + @pytest.mark.asyncio + async def test_no_golden_patch(self) -> None: + mock_env = MagicMock() + mock_env.uses_prebuilt = True + mock_env.cleanup = AsyncMock() + + mock_benchmark = MagicMock() + mock_benchmark.create_environment = AsyncMock(return_value=mock_env) + + task = { + "instance_id": "no-patch", + "repo": "org/repo", + "repo_language": "python", + "patch": "", + "fail_to_pass": '["test"]', + "pass_to_pass": "[]", + } + + mock_docker = MagicMock() + result = await _check_single_instance(mock_benchmark, task, mock_docker) + + assert result.status == "error" + assert "No golden patch" in (result.error or "") + + @pytest.mark.asyncio + async def test_exception_handling(self) -> None: + mock_benchmark = MagicMock() + mock_benchmark.create_environment = AsyncMock( + side_effect=RuntimeError("Docker not available") + ) + + task = { + "instance_id": "error-task", + "repo": "org/repo", + "patch": "diff", + } + + mock_docker = MagicMock() + result = await _check_single_instance(mock_benchmark, task, mock_docker) + + assert 
result.status == "error" + assert "Docker not available" in (result.error or "") + + +class TestRunBenchmarkPreflight: + """Tests for the main preflight runner.""" + + @pytest.mark.asyncio + async def test_concurrent_execution(self) -> None: + mock_benchmark = MagicMock() + mock_env = MagicMock() + mock_env.uses_prebuilt = True + mock_env.cleanup = AsyncMock() + mock_env.exec_command = AsyncMock(return_value=(0, "", "")) + mock_benchmark.create_environment = AsyncMock(return_value=mock_env) + + tasks = [ + { + "instance_id": f"task-{i}", + "repo": "org/repo", + "repo_language": "python", + "patch": "diff --git", + "test_patch": "", + "fail_to_pass": '["test"]', + "pass_to_pass": "[]", + } + for i in range(3) + ] + + mock_docker = MagicMock() + + with ( + patch("mcpbr.benchmark_preflight.apply_patch", new_callable=AsyncMock) as mock_apply, + patch("mcpbr.benchmark_preflight.run_tests", new_callable=AsyncMock) as mock_tests, + ): + mock_apply.return_value = (True, "") + mock_tests.return_value = MagicMock(passed=1, total=1) + + report = await run_benchmark_preflight( + benchmark=mock_benchmark, + tasks=tasks, + docker_manager=mock_docker, + max_concurrent=2, + ) + + assert report.total == 3 + assert report.passed == 3 + assert report.failed == 0 + assert report.success_rate == 100.0 + + @pytest.mark.asyncio + async def test_fail_fast(self) -> None: + call_count = 0 + + async def mock_check( + benchmark: object, task: dict, docker: object, timeout: int = 300 + ) -> PreflightResult: + nonlocal call_count + call_count += 1 + if call_count == 2: + return PreflightResult( + instance_id=task["instance_id"], + status="failed", + error="Test failure", + language="python", + ) + return PreflightResult( + instance_id=task["instance_id"], + status="passed", + language="python", + ) + + tasks = [{"instance_id": f"task-{i}", "repo": "r", "patch": "d"} for i in range(5)] + + mock_docker = MagicMock() + mock_benchmark = MagicMock() + + with patch( + 
"mcpbr.benchmark_preflight._check_single_instance", + side_effect=mock_check, + ): + report = await run_benchmark_preflight( + benchmark=mock_benchmark, + tasks=tasks, + docker_manager=mock_docker, + fail_fast=True, + ) + + # Should stop after the failure (task 2) + assert report.total == 5 + assert report.passed == 1 + assert report.failed == 1 + assert len(report.results) == 2 + + @pytest.mark.asyncio + async def test_error_handling_in_gather(self) -> None: + mock_benchmark = MagicMock() + + async def failing_create(*args: object, **kwargs: object) -> None: + raise RuntimeError("Docker error") + + mock_benchmark.create_environment = AsyncMock(side_effect=failing_create) + + tasks = [ + { + "instance_id": "err-task", + "repo": "org/repo", + "patch": "diff", + "fail_to_pass": '["test"]', + "pass_to_pass": "[]", + } + ] + + mock_docker = MagicMock() + + report = await run_benchmark_preflight( + benchmark=mock_benchmark, + tasks=tasks, + docker_manager=mock_docker, + ) + + assert report.total == 1 + assert report.errors == 1 + assert report.success_rate == 0.0 + + +class TestGetTestListField: + """Tests for the get_test_list_field helper.""" + + def test_lowercase_field(self) -> None: + from mcpbr.evaluation import get_test_list_field + + task = {"fail_to_pass": '["test_a"]'} + assert get_test_list_field(task, "fail_to_pass") == '["test_a"]' + + def test_uppercase_field(self) -> None: + from mcpbr.evaluation import get_test_list_field + + task = {"FAIL_TO_PASS": '["test_b"]'} + assert get_test_list_field(task, "fail_to_pass") == '["test_b"]' + + def test_lowercase_preferred(self) -> None: + from mcpbr.evaluation import get_test_list_field + + task = {"fail_to_pass": '["lower"]', "FAIL_TO_PASS": '["upper"]'} + assert get_test_list_field(task, "fail_to_pass") == '["lower"]' + + def test_missing_field(self) -> None: + from mcpbr.evaluation import get_test_list_field + + task = {"something_else": "value"} + assert get_test_list_field(task, "fail_to_pass") == "[]" diff 
--git a/tests/test_swebench_pro.py b/tests/test_swebench_pro.py new file mode 100644 index 0000000..fde8b8e --- /dev/null +++ b/tests/test_swebench_pro.py @@ -0,0 +1,383 @@ +"""Tests for SWE-bench Pro benchmark implementation.""" + +from unittest.mock import MagicMock, patch + +from mcpbr.benchmarks.swebench_pro import ( + PRO_LANGUAGES, + SWEBENCH_PRO_IMAGE_PREFIX, + SWEBenchProBenchmark, + _build_pro_test_command, +) + + +class TestSWEBenchProInit: + """Tests for SWEBenchProBenchmark initialization.""" + + def test_default_dataset(self) -> None: + benchmark = SWEBenchProBenchmark() + assert benchmark.dataset == "ScaleAI/SWE-bench_Pro" + + def test_custom_dataset(self) -> None: + benchmark = SWEBenchProBenchmark(dataset="custom/dataset") + assert benchmark.dataset == "custom/dataset" + + def test_name(self) -> None: + benchmark = SWEBenchProBenchmark() + assert benchmark.name == "swe-bench-pro" + + +class TestSWEBenchProNormalizeTask: + """Tests for task normalization.""" + + def test_normalize_basic_task(self) -> None: + task = { + "instance_id": "django__django-16046", + "problem_statement": "Fix the bug", + "repo": "django/django", + "base_commit": "abc123", + "fail_to_pass": '["test_one"]', + "pass_to_pass": '["test_two"]', + "test_patch": "diff --git a/test.py", + "repo_language": "python", + } + benchmark = SWEBenchProBenchmark() + bt = benchmark.normalize_task(task) + assert bt.task_id == "django__django-16046" + assert bt.problem_statement == "Fix the bug" + assert bt.repo == "django/django" + assert bt.commit == "abc123" + assert bt.metadata["repo_language"] == "python" + + def test_normalize_with_uppercase_fields(self) -> None: + """Test that uppercase FAIL_TO_PASS/PASS_TO_PASS are handled.""" + task = { + "instance_id": "test-123", + "problem_statement": "desc", + "repo": "org/repo", + "base_commit": "def456", + "FAIL_TO_PASS": '["test_a"]', + "PASS_TO_PASS": '["test_b"]', + } + benchmark = SWEBenchProBenchmark() + bt = benchmark.normalize_task(task) + 
assert bt.task_id == "test-123" + assert bt.metadata["fail_to_pass"] == '["test_a"]' + assert bt.metadata["pass_to_pass"] == '["test_b"]' + + def test_normalize_missing_language(self) -> None: + task = { + "instance_id": "test-456", + "problem_statement": "desc", + "repo": "org/repo", + "base_commit": "ghi789", + } + benchmark = SWEBenchProBenchmark() + bt = benchmark.normalize_task(task) + assert bt.metadata["repo_language"] == "unknown" + + def test_normalize_go_task(self) -> None: + task = { + "instance_id": "gin-gonic__gin-3890", + "problem_statement": "Fix routing", + "repo": "gin-gonic/gin", + "base_commit": "jkl012", + "fail_to_pass": '["TestRoute"]', + "pass_to_pass": "[]", + "repo_language": "go", + } + benchmark = SWEBenchProBenchmark() + bt = benchmark.normalize_task(task) + assert bt.metadata["repo_language"] == "go" + + +class TestBuildProTestCommand: + """Tests for language-specific test command building.""" + + def test_python_delegates(self) -> None: + """Python should delegate to existing _build_test_command.""" + cmd = _build_pro_test_command("tests/test_foo.py::test_bar", "python") + assert "pytest" in cmd or "test_foo" in cmd + + def test_go_package_path(self) -> None: + cmd = _build_pro_test_command("./pkg/router", "go") + assert "go test" in cmd + assert "./pkg/router" in cmd + assert "-v" in cmd + + def test_go_function_name(self) -> None: + cmd = _build_pro_test_command("TestRouteMatching", "go") + assert "go test" in cmd + assert "-run" in cmd + assert "TestRouteMatching" in cmd + + def test_typescript_file(self) -> None: + cmd = _build_pro_test_command("src/__tests__/parser.test.ts", "typescript") + assert "npx jest" in cmd + assert "parser.test.ts" in cmd + + def test_typescript_pattern(self) -> None: + cmd = _build_pro_test_command("should parse tokens", "typescript") + assert "npx jest" in cmd + assert "-t" in cmd + + def test_javascript_file(self) -> None: + cmd = _build_pro_test_command("test/index.test.js", "javascript") + assert 
"npx jest" in cmd + assert "index.test.js" in cmd + + def test_javascript_pattern(self) -> None: + cmd = _build_pro_test_command("handles edge case", "javascript") + assert "npx jest" in cmd + assert "-t" in cmd + + def test_prebuilt_conda_activation(self) -> None: + cmd = _build_pro_test_command("TestFoo", "go", uses_prebuilt=True) + assert "conda activate testbed" in cmd + + def test_unknown_language_fallback(self) -> None: + cmd = _build_pro_test_command("test_something", "rust") + assert "test_something" in cmd + + +class TestSWEBenchProDockerImage: + """Tests for pre-built image lookup.""" + + def test_get_prebuilt_image_with_tag(self) -> None: + task = {"dockerhub_tag": "django.django-django__django-abc123"} + benchmark = SWEBenchProBenchmark() + expected = f"{SWEBENCH_PRO_IMAGE_PREFIX}:django.django-django__django-abc123" + assert benchmark.get_prebuilt_image(task) == expected + + def test_get_prebuilt_image_missing(self) -> None: + task = {"instance_id": "test-123"} + benchmark = SWEBenchProBenchmark() + assert benchmark.get_prebuilt_image(task) is None + + +class TestSWEBenchProPromptTemplate: + """Tests for prompt template.""" + + def test_has_placeholder(self) -> None: + benchmark = SWEBenchProBenchmark() + template = benchmark.get_prompt_template() + assert "{problem_statement}" in template + + def test_mentions_multiple_languages(self) -> None: + benchmark = SWEBenchProBenchmark() + template = benchmark.get_prompt_template() + assert "Go" in template + assert "TypeScript" in template + assert "JavaScript" in template + + +class TestSWEBenchProFilterCategory: + """Tests for category filtering in load_tasks.""" + + @patch("mcpbr.benchmarks.swebench_pro.load_dataset") + def test_filter_by_language(self, mock_load: MagicMock) -> None: + mock_dataset = [ + { + "instance_id": "t1", + "repo": "django/django", + "repo_language": "python", + "problem_statement": "p", + "base_commit": "c", + }, + { + "instance_id": "t2", + "repo": "gin-gonic/gin", + 
"repo_language": "go", + "problem_statement": "p", + "base_commit": "c", + }, + { + "instance_id": "t3", + "repo": "vercel/next.js", + "repo_language": "typescript", + "problem_statement": "p", + "base_commit": "c", + }, + ] + mock_load.return_value = MagicMock( + __iter__=lambda self: iter(mock_dataset), + __len__=lambda self: len(mock_dataset), + ) + + benchmark = SWEBenchProBenchmark() + tasks = benchmark.load_tasks(filter_category=["go"]) + assert len(tasks) == 1 + assert tasks[0]["instance_id"] == "t2" + + @patch("mcpbr.benchmarks.swebench_pro.load_dataset") + def test_filter_by_repo_substring(self, mock_load: MagicMock) -> None: + mock_dataset = [ + { + "instance_id": "t1", + "repo": "django/django", + "repo_language": "python", + "problem_statement": "p", + "base_commit": "c", + }, + { + "instance_id": "t2", + "repo": "gin-gonic/gin", + "repo_language": "go", + "problem_statement": "p", + "base_commit": "c", + }, + ] + mock_load.return_value = MagicMock( + __iter__=lambda self: iter(mock_dataset), + __len__=lambda self: len(mock_dataset), + ) + + benchmark = SWEBenchProBenchmark() + tasks = benchmark.load_tasks(filter_category=["django"]) + assert len(tasks) == 1 + assert tasks[0]["instance_id"] == "t1" + + def test_pro_languages_set(self) -> None: + assert {"python", "go", "typescript", "javascript"} == PRO_LANGUAGES + + +class TestSWEBenchProLoadTasks: + """Tests for task loading.""" + + @patch("mcpbr.benchmarks.swebench_pro.load_dataset") + def test_sample_size(self, mock_load: MagicMock) -> None: + mock_dataset = [ + { + "instance_id": f"t{i}", + "repo": "r", + "problem_statement": "p", + "base_commit": "c", + } + for i in range(10) + ] + mock_ds = MagicMock() + mock_ds.__iter__ = lambda self: iter(mock_dataset) + mock_ds.__len__ = lambda self: len(mock_dataset) + mock_ds.select = MagicMock(return_value=mock_dataset[:3]) + mock_load.return_value = mock_ds + + benchmark = SWEBenchProBenchmark() + tasks = benchmark.load_tasks(sample_size=3) + assert 
len(tasks) == 3 + + @patch("mcpbr.benchmarks.swebench_pro.load_dataset") + def test_task_ids(self, mock_load: MagicMock) -> None: + mock_dataset = [ + { + "instance_id": f"t{i}", + "repo": "r", + "problem_statement": "p", + "base_commit": "c", + } + for i in range(5) + ] + mock_load.return_value = MagicMock( + __iter__=lambda self: iter(mock_dataset), + __len__=lambda self: len(mock_dataset), + ) + + benchmark = SWEBenchProBenchmark() + tasks = benchmark.load_tasks(task_ids=["t1", "t3"]) + assert len(tasks) == 2 + ids = {t["instance_id"] for t in tasks} + assert ids == {"t1", "t3"} + + @patch("mcpbr.benchmarks.swebench_pro.load_dataset") + def test_combined_filters(self, mock_load: MagicMock) -> None: + mock_dataset = [ + { + "instance_id": "t1", + "repo": "django/django", + "repo_language": "python", + "problem_statement": "p", + "base_commit": "c", + }, + { + "instance_id": "t2", + "repo": "gin-gonic/gin", + "repo_language": "go", + "problem_statement": "p", + "base_commit": "c", + }, + { + "instance_id": "t3", + "repo": "vercel/next.js", + "repo_language": "typescript", + "problem_statement": "p", + "base_commit": "c", + }, + ] + mock_load.return_value = MagicMock( + __iter__=lambda self: iter(mock_dataset), + __len__=lambda self: len(mock_dataset), + ) + + benchmark = SWEBenchProBenchmark() + tasks = benchmark.load_tasks( + task_ids=["t1", "t2"], + filter_category=["python"], + ) + assert len(tasks) == 1 + assert tasks[0]["instance_id"] == "t1" + + +class TestSWEBenchProSandboxLevel: + """Tests for sandbox level.""" + + def test_default_sandbox_level(self) -> None: + benchmark = SWEBenchProBenchmark() + assert benchmark.get_default_sandbox_level() is None + + +class TestSWEBenchProRegistry: + """Tests for benchmark registry integration.""" + + def test_create_swebench_pro(self) -> None: + from mcpbr.benchmarks import create_benchmark + + benchmark = create_benchmark("swe-bench-pro") + assert isinstance(benchmark, SWEBenchProBenchmark) + assert benchmark.dataset 
== "ScaleAI/SWE-bench_Pro" + + def test_listed_in_registry(self) -> None: + from mcpbr.benchmarks import list_benchmarks + + assert "swe-bench-pro" in list_benchmarks() + + +class TestSWEBenchProEvalResultToDict: + """Tests for _eval_result_to_dict helper.""" + + def test_basic_conversion(self) -> None: + from mcpbr.evaluation import EvaluationResult, TestResults + + benchmark = SWEBenchProBenchmark() + result = EvaluationResult( + resolved=True, + patch_applied=True, + fail_to_pass=TestResults(passed=2, total=2, details=[]), + pass_to_pass=TestResults(passed=5, total=5, details=[]), + ) + d = benchmark._eval_result_to_dict(result) + assert d["resolved"] is True + assert d["patch_applied"] is True + assert d["fail_to_pass"]["passed"] == 2 + assert d["pass_to_pass"]["passed"] == 5 + + def test_with_error(self) -> None: + from mcpbr.evaluation import EvaluationResult + + benchmark = SWEBenchProBenchmark() + result = EvaluationResult( + resolved=False, + patch_applied=False, + error="Patch failed", + ) + d = benchmark._eval_result_to_dict(result) + assert d["resolved"] is False + assert d["eval_error"] == "Patch failed" + assert "fail_to_pass" not in d diff --git a/uv.lock b/uv.lock index ef2aaf8..bfeb7e8 100644 --- a/uv.lock +++ b/uv.lock @@ -190,29 +190,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/3a/2a/7cc015f5b9f5db42b7d48157e23356022889fc354a2813c15934b7cb5c0e/attrs-25.4.0-py3-none-any.whl", hash = "sha256:adcf7e2a1fb3b36ac48d97835bb6d8ade15b8dcce26aba8bf1d14847b57a3373", size = 67615, upload-time = "2025-10-06T13:54:43.17Z" }, ] -[[package]] -name = "babel" -version = "2.17.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/7d/6b/d52e42361e1aa00709585ecc30b3f9684b3ab62530771402248b1b1d6240/babel-2.17.0.tar.gz", hash = "sha256:0c54cffb19f690cdcc52a3b50bcbf71e07a808d1c80d549f2459b9d2cf0afb9d", size = 9951852, upload-time = "2025-02-01T15:17:41.026Z" } -wheels = [ - { url = 
"https://files.pythonhosted.org/packages/b7/b8/3fe70c75fe32afc4bb507f75563d39bc5642255d1d94f1f23604725780bf/babel-2.17.0-py3-none-any.whl", hash = "sha256:4d0b53093fdfb4b21c92b5213dba5a1b23885afa8383709427046b21c366e5f2", size = 10182537, upload-time = "2025-02-01T15:17:37.39Z" }, -] - -[[package]] -name = "backrefs" -version = "6.1" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/86/e3/bb3a439d5cb255c4774724810ad8073830fac9c9dee123555820c1bcc806/backrefs-6.1.tar.gz", hash = "sha256:3bba1749aafe1db9b915f00e0dd166cba613b6f788ffd63060ac3485dc9be231", size = 7011962, upload-time = "2025-11-15T14:52:08.323Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/3b/ee/c216d52f58ea75b5e1841022bbae24438b19834a29b163cb32aa3a2a7c6e/backrefs-6.1-py310-none-any.whl", hash = "sha256:2a2ccb96302337ce61ee4717ceacfbf26ba4efb1d55af86564b8bbaeda39cac1", size = 381059, upload-time = "2025-11-15T14:51:59.758Z" }, - { url = "https://files.pythonhosted.org/packages/e6/9a/8da246d988ded941da96c7ed945d63e94a445637eaad985a0ed88787cb89/backrefs-6.1-py311-none-any.whl", hash = "sha256:e82bba3875ee4430f4de4b6db19429a27275d95a5f3773c57e9e18abc23fd2b7", size = 392854, upload-time = "2025-11-15T14:52:01.194Z" }, - { url = "https://files.pythonhosted.org/packages/37/c9/fd117a6f9300c62bbc33bc337fd2b3c6bfe28b6e9701de336b52d7a797ad/backrefs-6.1-py312-none-any.whl", hash = "sha256:c64698c8d2269343d88947c0735cb4b78745bd3ba590e10313fbf3f78c34da5a", size = 398770, upload-time = "2025-11-15T14:52:02.584Z" }, - { url = "https://files.pythonhosted.org/packages/eb/95/7118e935b0b0bd3f94dfec2d852fd4e4f4f9757bdb49850519acd245cd3a/backrefs-6.1-py313-none-any.whl", hash = "sha256:4c9d3dc1e2e558965202c012304f33d4e0e477e1c103663fd2c3cc9bb18b0d05", size = 400726, upload-time = "2025-11-15T14:52:04.093Z" }, - { url = 
"https://files.pythonhosted.org/packages/1d/72/6296bad135bfafd3254ae3648cd152980a424bd6fed64a101af00cc7ba31/backrefs-6.1-py314-none-any.whl", hash = "sha256:13eafbc9ccd5222e9c1f0bec563e6d2a6d21514962f11e7fc79872fd56cbc853", size = 412584, upload-time = "2025-11-15T14:52:05.233Z" }, - { url = "https://files.pythonhosted.org/packages/02/e3/a4fa1946722c4c7b063cc25043a12d9ce9b4323777f89643be74cef2993c/backrefs-6.1-py39-none-any.whl", hash = "sha256:a9e99b8a4867852cad177a6430e31b0f6e495d65f8c6c134b68c14c3c95bf4b0", size = 381058, upload-time = "2025-11-15T14:52:06.698Z" }, -] - [[package]] name = "bcrypt" version = "5.0.0" @@ -524,12 +501,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/bc/58/6b3d24e6b9bc474a2dcdee65dfd1f008867015408a271562e4b690561a4d/cryptography-46.0.5-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:8456928655f856c6e1533ff59d5be76578a7157224dbd9ce6872f25055ab9ab7", size = 3407605, upload-time = "2026-02-10T19:18:29.233Z" }, ] -[[package]] -name = "csscompressor" -version = "0.9.5" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/f1/2a/8c3ac3d8bc94e6de8d7ae270bb5bc437b210bb9d6d9e46630c98f4abd20c/csscompressor-0.9.5.tar.gz", hash = "sha256:afa22badbcf3120a4f392e4d22f9fff485c044a1feda4a950ecc5eba9dd31a05", size = 237808, upload-time = "2017-11-26T21:13:08.238Z" } - [[package]] name = "datasets" version = "4.5.0" @@ -733,18 +704,6 @@ http = [ { name = "aiohttp" }, ] -[[package]] -name = "ghp-import" -version = "2.1.0" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "python-dateutil" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/d9/29/d40217cbe2f6b1359e00c6c307bb3fc876ba74068cbab3dde77f03ca0dc4/ghp-import-2.1.0.tar.gz", hash = "sha256:9c535c4c61193c2df8871222567d7fd7e5014d835f97dc7b7439069e2413d343", size = 10943, upload-time = "2022-05-02T15:47:16.11Z" } -wheels = [ - { url = 
"https://files.pythonhosted.org/packages/f7/ec/67fbef5d497f86283db54c22eec6f6140243aae73265799baaaa19cd17fb/ghp_import-2.1.0-py3-none-any.whl", hash = "sha256:8337dd7b50877f163d4c0289bc1f1c7f127550241988d568c1db512c4324a619", size = 11034, upload-time = "2022-05-02T15:47:14.552Z" }, -] - [[package]] name = "gitdb" version = "4.0.12" @@ -879,18 +838,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/c4/ab/09169d5a4612a5f92490806649ac8d41e3ec9129c636754575b3553f4ea4/googleapis_common_protos-1.72.0-py3-none-any.whl", hash = "sha256:4299c5a82d5ae1a9702ada957347726b167f9f8d1fc352477702a1e851ff4038", size = 297515, upload-time = "2025-11-06T18:29:13.14Z" }, ] -[[package]] -name = "griffe" -version = "1.15.0" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "colorama" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/0d/0c/3a471b6e31951dce2360477420d0a8d1e00dea6cf33b70f3e8c3ab6e28e1/griffe-1.15.0.tar.gz", hash = "sha256:7726e3afd6f298fbc3696e67958803e7ac843c1cfe59734b6251a40cdbfb5eea", size = 424112, upload-time = "2025-11-10T15:03:15.52Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/9c/83/3b1d03d36f224edded98e9affd0467630fc09d766c0e56fb1498cbb04a9b/griffe-1.15.0-py3-none-any.whl", hash = "sha256:6f6762661949411031f5fcda9593f586e6ce8340f0ba88921a0f2ef7a81eb9a3", size = 150705, upload-time = "2025-11-10T15:03:13.549Z" }, -] - [[package]] name = "grpcio" version = "1.78.0" @@ -994,14 +941,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/cb/44/870d44b30e1dcfb6a65932e3e1506c103a8a5aea9103c337e7a53180322c/hf_xet-1.2.0-cp37-abi3-win_amd64.whl", hash = "sha256:e6584a52253f72c9f52f9e549d5895ca7a471608495c4ecaa6cc73dba2b24d69", size = 2905735, upload-time = "2025-10-24T19:04:35.928Z" }, ] -[[package]] -name = "htmlmin2" -version = "0.1.13" -source = { registry = "https://pypi.org/simple" } -wheels = [ - { url = 
"https://files.pythonhosted.org/packages/be/31/a76f4bfa885f93b8167cb4c85cf32b54d1f64384d0b897d45bc6d19b7b45/htmlmin2-0.1.13-py3-none-any.whl", hash = "sha256:75609f2a42e64f7ce57dbff28a39890363bde9e7e5885db633317efbdf8c79a2", size = 34486, upload-time = "2023-03-14T21:28:30.388Z" }, -] - [[package]] name = "httpcore" version = "1.0.9" @@ -1108,18 +1047,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/32/4b/b99e37f88336009971405cbb7630610322ed6fbfa31e1d7ab3fbf3049a2d/invoke-2.2.1-py3-none-any.whl", hash = "sha256:2413bc441b376e5cd3f55bb5d364f973ad8bdd7bf87e53c79de3c11bf3feecc8", size = 160287, upload-time = "2025-10-11T00:36:33.703Z" }, ] -[[package]] -name = "jinja2" -version = "3.1.6" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "markupsafe" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/df/bf/f7da0350254c0ed7c72f3e33cef02e048281fec7ecec5f032d4aac52226b/jinja2-3.1.6.tar.gz", hash = "sha256:0137fb05990d35f1275a587e9aee6d56da821fc83491a0fb838183be43f66d6d", size = 245115, upload-time = "2025-03-05T20:05:02.478Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/62/a1/3d680cbfd5f4b8f15abc1d571870c5fc3e594bb582bc3b64ea099db13e56/jinja2-3.1.6-py3-none-any.whl", hash = "sha256:85ece4451f492d0c13c5dd7c13a64681a86afae63a5f347908daf103ce6d2f67", size = 134899, upload-time = "2025-03-05T20:05:00.369Z" }, -] - [[package]] name = "jiter" version = "0.12.0" @@ -1205,12 +1132,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/2f/9c/6753e6522b8d0ef07d3a3d239426669e984fb0eba15a315cdbc1253904e4/jiter-0.12.0-graalpy312-graalpy250_312_native-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c24e864cb30ab82311c6425655b0cdab0a98c5d973b065c66a3f020740c2324c", size = 346110, upload-time = "2025-11-09T20:49:21.817Z" }, ] -[[package]] -name = "jsmin" -version = "3.0.1" -source = { registry = "https://pypi.org/simple" } -sdist = { url = 
"https://files.pythonhosted.org/packages/5e/73/e01e4c5e11ad0494f4407a3f623ad4d87714909f50b17a06ed121034ff6e/jsmin-3.0.1.tar.gz", hash = "sha256:c0959a121ef94542e807a674142606f7e90214a2b3d1eb17300244bbb5cc2bfc", size = 13925, upload-time = "2022-01-16T20:35:59.13Z" } - [[package]] name = "jsonschema" version = "4.26.0" @@ -1311,15 +1232,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/94/d1/433b3c06e78f23486fe4fdd19bc134657eb30997d2054b0dbf52bbf3382e/librt-0.8.0-cp314-cp314t-win_arm64.whl", hash = "sha256:92249938ab744a5890580d3cb2b22042f0dce71cdaa7c1369823df62bedf7cbc", size = 48753, upload-time = "2026-02-12T14:53:38.539Z" }, ] -[[package]] -name = "markdown" -version = "3.10.1" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/b7/b1/af95bcae8549f1f3fd70faacb29075826a0d689a27f232e8cee315efa053/markdown-3.10.1.tar.gz", hash = "sha256:1c19c10bd5c14ac948c53d0d762a04e2fa35a6d58a6b7b1e6bfcbe6fefc0001a", size = 365402, upload-time = "2026-01-21T18:09:28.206Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/59/1b/6ef961f543593969d25b2afe57a3564200280528caa9bd1082eecdd7b3bc/markdown-3.10.1-py3-none-any.whl", hash = "sha256:867d788939fe33e4b736426f5b9f651ad0c0ae0ecf89df0ca5d1176c70812fe3", size = 107684, upload-time = "2026-01-21T18:09:27.203Z" }, -] - [[package]] name = "markdown-it-py" version = "4.0.0" @@ -1332,80 +1244,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/94/54/e7d793b573f298e1c9013b8c4dade17d481164aa517d1d7148619c2cedbf/markdown_it_py-4.0.0-py3-none-any.whl", hash = "sha256:87327c59b172c5011896038353a81343b6754500a08cd7a4973bb48c6d578147", size = 87321, upload-time = "2025-08-11T12:57:51.923Z" }, ] -[[package]] -name = "markupsafe" -version = "3.0.3" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/7e/99/7690b6d4034fffd95959cbe0c02de8deb3098cc577c67bb6a24fe5d7caa7/markupsafe-3.0.3.tar.gz", 
hash = "sha256:722695808f4b6457b320fdc131280796bdceb04ab50fe1795cd540799ebe1698", size = 80313, upload-time = "2025-09-27T18:37:40.426Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/08/db/fefacb2136439fc8dd20e797950e749aa1f4997ed584c62cfb8ef7c2be0e/markupsafe-3.0.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:1cc7ea17a6824959616c525620e387f6dd30fec8cb44f649e31712db02123dad", size = 11631, upload-time = "2025-09-27T18:36:18.185Z" }, - { url = "https://files.pythonhosted.org/packages/e1/2e/5898933336b61975ce9dc04decbc0a7f2fee78c30353c5efba7f2d6ff27a/markupsafe-3.0.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:4bd4cd07944443f5a265608cc6aab442e4f74dff8088b0dfc8238647b8f6ae9a", size = 12058, upload-time = "2025-09-27T18:36:19.444Z" }, - { url = "https://files.pythonhosted.org/packages/1d/09/adf2df3699d87d1d8184038df46a9c80d78c0148492323f4693df54e17bb/markupsafe-3.0.3-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6b5420a1d9450023228968e7e6a9ce57f65d148ab56d2313fcd589eee96a7a50", size = 24287, upload-time = "2025-09-27T18:36:20.768Z" }, - { url = "https://files.pythonhosted.org/packages/30/ac/0273f6fcb5f42e314c6d8cd99effae6a5354604d461b8d392b5ec9530a54/markupsafe-3.0.3-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:0bf2a864d67e76e5c9a34dc26ec616a66b9888e25e7b9460e1c76d3293bd9dbf", size = 22940, upload-time = "2025-09-27T18:36:22.249Z" }, - { url = "https://files.pythonhosted.org/packages/19/ae/31c1be199ef767124c042c6c3e904da327a2f7f0cd63a0337e1eca2967a8/markupsafe-3.0.3-cp311-cp311-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:bc51efed119bc9cfdf792cdeaa4d67e8f6fcccab66ed4bfdd6bde3e59bfcbb2f", size = 21887, upload-time = "2025-09-27T18:36:23.535Z" }, - { url = 
"https://files.pythonhosted.org/packages/b2/76/7edcab99d5349a4532a459e1fe64f0b0467a3365056ae550d3bcf3f79e1e/markupsafe-3.0.3-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:068f375c472b3e7acbe2d5318dea141359e6900156b5b2ba06a30b169086b91a", size = 23692, upload-time = "2025-09-27T18:36:24.823Z" }, - { url = "https://files.pythonhosted.org/packages/a4/28/6e74cdd26d7514849143d69f0bf2399f929c37dc2b31e6829fd2045b2765/markupsafe-3.0.3-cp311-cp311-musllinux_1_2_riscv64.whl", hash = "sha256:7be7b61bb172e1ed687f1754f8e7484f1c8019780f6f6b0786e76bb01c2ae115", size = 21471, upload-time = "2025-09-27T18:36:25.95Z" }, - { url = "https://files.pythonhosted.org/packages/62/7e/a145f36a5c2945673e590850a6f8014318d5577ed7e5920a4b3448e0865d/markupsafe-3.0.3-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:f9e130248f4462aaa8e2552d547f36ddadbeaa573879158d721bbd33dfe4743a", size = 22923, upload-time = "2025-09-27T18:36:27.109Z" }, - { url = "https://files.pythonhosted.org/packages/0f/62/d9c46a7f5c9adbeeeda52f5b8d802e1094e9717705a645efc71b0913a0a8/markupsafe-3.0.3-cp311-cp311-win32.whl", hash = "sha256:0db14f5dafddbb6d9208827849fad01f1a2609380add406671a26386cdf15a19", size = 14572, upload-time = "2025-09-27T18:36:28.045Z" }, - { url = "https://files.pythonhosted.org/packages/83/8a/4414c03d3f891739326e1783338e48fb49781cc915b2e0ee052aa490d586/markupsafe-3.0.3-cp311-cp311-win_amd64.whl", hash = "sha256:de8a88e63464af587c950061a5e6a67d3632e36df62b986892331d4620a35c01", size = 15077, upload-time = "2025-09-27T18:36:29.025Z" }, - { url = "https://files.pythonhosted.org/packages/35/73/893072b42e6862f319b5207adc9ae06070f095b358655f077f69a35601f0/markupsafe-3.0.3-cp311-cp311-win_arm64.whl", hash = "sha256:3b562dd9e9ea93f13d53989d23a7e775fdfd1066c33494ff43f5418bc8c58a5c", size = 13876, upload-time = "2025-09-27T18:36:29.954Z" }, - { url = 
"https://files.pythonhosted.org/packages/5a/72/147da192e38635ada20e0a2e1a51cf8823d2119ce8883f7053879c2199b5/markupsafe-3.0.3-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:d53197da72cc091b024dd97249dfc7794d6a56530370992a5e1a08983ad9230e", size = 11615, upload-time = "2025-09-27T18:36:30.854Z" }, - { url = "https://files.pythonhosted.org/packages/9a/81/7e4e08678a1f98521201c3079f77db69fb552acd56067661f8c2f534a718/markupsafe-3.0.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:1872df69a4de6aead3491198eaf13810b565bdbeec3ae2dc8780f14458ec73ce", size = 12020, upload-time = "2025-09-27T18:36:31.971Z" }, - { url = "https://files.pythonhosted.org/packages/1e/2c/799f4742efc39633a1b54a92eec4082e4f815314869865d876824c257c1e/markupsafe-3.0.3-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:3a7e8ae81ae39e62a41ec302f972ba6ae23a5c5396c8e60113e9066ef893da0d", size = 24332, upload-time = "2025-09-27T18:36:32.813Z" }, - { url = "https://files.pythonhosted.org/packages/3c/2e/8d0c2ab90a8c1d9a24f0399058ab8519a3279d1bd4289511d74e909f060e/markupsafe-3.0.3-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:d6dd0be5b5b189d31db7cda48b91d7e0a9795f31430b7f271219ab30f1d3ac9d", size = 22947, upload-time = "2025-09-27T18:36:33.86Z" }, - { url = "https://files.pythonhosted.org/packages/2c/54/887f3092a85238093a0b2154bd629c89444f395618842e8b0c41783898ea/markupsafe-3.0.3-cp312-cp312-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:94c6f0bb423f739146aec64595853541634bde58b2135f27f61c1ffd1cd4d16a", size = 21962, upload-time = "2025-09-27T18:36:35.099Z" }, - { url = "https://files.pythonhosted.org/packages/c9/2f/336b8c7b6f4a4d95e91119dc8521402461b74a485558d8f238a68312f11c/markupsafe-3.0.3-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:be8813b57049a7dc738189df53d69395eba14fb99345e0a5994914a3864c8a4b", size = 23760, upload-time = "2025-09-27T18:36:36.001Z" }, - { url = 
"https://files.pythonhosted.org/packages/32/43/67935f2b7e4982ffb50a4d169b724d74b62a3964bc1a9a527f5ac4f1ee2b/markupsafe-3.0.3-cp312-cp312-musllinux_1_2_riscv64.whl", hash = "sha256:83891d0e9fb81a825d9a6d61e3f07550ca70a076484292a70fde82c4b807286f", size = 21529, upload-time = "2025-09-27T18:36:36.906Z" }, - { url = "https://files.pythonhosted.org/packages/89/e0/4486f11e51bbba8b0c041098859e869e304d1c261e59244baa3d295d47b7/markupsafe-3.0.3-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:77f0643abe7495da77fb436f50f8dab76dbc6e5fd25d39589a0f1fe6548bfa2b", size = 23015, upload-time = "2025-09-27T18:36:37.868Z" }, - { url = "https://files.pythonhosted.org/packages/2f/e1/78ee7a023dac597a5825441ebd17170785a9dab23de95d2c7508ade94e0e/markupsafe-3.0.3-cp312-cp312-win32.whl", hash = "sha256:d88b440e37a16e651bda4c7c2b930eb586fd15ca7406cb39e211fcff3bf3017d", size = 14540, upload-time = "2025-09-27T18:36:38.761Z" }, - { url = "https://files.pythonhosted.org/packages/aa/5b/bec5aa9bbbb2c946ca2733ef9c4ca91c91b6a24580193e891b5f7dbe8e1e/markupsafe-3.0.3-cp312-cp312-win_amd64.whl", hash = "sha256:26a5784ded40c9e318cfc2bdb30fe164bdb8665ded9cd64d500a34fb42067b1c", size = 15105, upload-time = "2025-09-27T18:36:39.701Z" }, - { url = "https://files.pythonhosted.org/packages/e5/f1/216fc1bbfd74011693a4fd837e7026152e89c4bcf3e77b6692fba9923123/markupsafe-3.0.3-cp312-cp312-win_arm64.whl", hash = "sha256:35add3b638a5d900e807944a078b51922212fb3dedb01633a8defc4b01a3c85f", size = 13906, upload-time = "2025-09-27T18:36:40.689Z" }, - { url = "https://files.pythonhosted.org/packages/38/2f/907b9c7bbba283e68f20259574b13d005c121a0fa4c175f9bed27c4597ff/markupsafe-3.0.3-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:e1cf1972137e83c5d4c136c43ced9ac51d0e124706ee1c8aa8532c1287fa8795", size = 11622, upload-time = "2025-09-27T18:36:41.777Z" }, - { url = 
"https://files.pythonhosted.org/packages/9c/d9/5f7756922cdd676869eca1c4e3c0cd0df60ed30199ffd775e319089cb3ed/markupsafe-3.0.3-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:116bb52f642a37c115f517494ea5feb03889e04df47eeff5b130b1808ce7c219", size = 12029, upload-time = "2025-09-27T18:36:43.257Z" }, - { url = "https://files.pythonhosted.org/packages/00/07/575a68c754943058c78f30db02ee03a64b3c638586fba6a6dd56830b30a3/markupsafe-3.0.3-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:133a43e73a802c5562be9bbcd03d090aa5a1fe899db609c29e8c8d815c5f6de6", size = 24374, upload-time = "2025-09-27T18:36:44.508Z" }, - { url = "https://files.pythonhosted.org/packages/a9/21/9b05698b46f218fc0e118e1f8168395c65c8a2c750ae2bab54fc4bd4e0e8/markupsafe-3.0.3-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:ccfcd093f13f0f0b7fdd0f198b90053bf7b2f02a3927a30e63f3ccc9df56b676", size = 22980, upload-time = "2025-09-27T18:36:45.385Z" }, - { url = "https://files.pythonhosted.org/packages/7f/71/544260864f893f18b6827315b988c146b559391e6e7e8f7252839b1b846a/markupsafe-3.0.3-cp313-cp313-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:509fa21c6deb7a7a273d629cf5ec029bc209d1a51178615ddf718f5918992ab9", size = 21990, upload-time = "2025-09-27T18:36:46.916Z" }, - { url = "https://files.pythonhosted.org/packages/c2/28/b50fc2f74d1ad761af2f5dcce7492648b983d00a65b8c0e0cb457c82ebbe/markupsafe-3.0.3-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:a4afe79fb3de0b7097d81da19090f4df4f8d3a2b3adaa8764138aac2e44f3af1", size = 23784, upload-time = "2025-09-27T18:36:47.884Z" }, - { url = "https://files.pythonhosted.org/packages/ed/76/104b2aa106a208da8b17a2fb72e033a5a9d7073c68f7e508b94916ed47a9/markupsafe-3.0.3-cp313-cp313-musllinux_1_2_riscv64.whl", hash = "sha256:795e7751525cae078558e679d646ae45574b47ed6e7771863fcc079a6171a0fc", size = 21588, upload-time = "2025-09-27T18:36:48.82Z" }, - { url = 
"https://files.pythonhosted.org/packages/b5/99/16a5eb2d140087ebd97180d95249b00a03aa87e29cc224056274f2e45fd6/markupsafe-3.0.3-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:8485f406a96febb5140bfeca44a73e3ce5116b2501ac54fe953e488fb1d03b12", size = 23041, upload-time = "2025-09-27T18:36:49.797Z" }, - { url = "https://files.pythonhosted.org/packages/19/bc/e7140ed90c5d61d77cea142eed9f9c303f4c4806f60a1044c13e3f1471d0/markupsafe-3.0.3-cp313-cp313-win32.whl", hash = "sha256:bdd37121970bfd8be76c5fb069c7751683bdf373db1ed6c010162b2a130248ed", size = 14543, upload-time = "2025-09-27T18:36:51.584Z" }, - { url = "https://files.pythonhosted.org/packages/05/73/c4abe620b841b6b791f2edc248f556900667a5a1cf023a6646967ae98335/markupsafe-3.0.3-cp313-cp313-win_amd64.whl", hash = "sha256:9a1abfdc021a164803f4d485104931fb8f8c1efd55bc6b748d2f5774e78b62c5", size = 15113, upload-time = "2025-09-27T18:36:52.537Z" }, - { url = "https://files.pythonhosted.org/packages/f0/3a/fa34a0f7cfef23cf9500d68cb7c32dd64ffd58a12b09225fb03dd37d5b80/markupsafe-3.0.3-cp313-cp313-win_arm64.whl", hash = "sha256:7e68f88e5b8799aa49c85cd116c932a1ac15caaa3f5db09087854d218359e485", size = 13911, upload-time = "2025-09-27T18:36:53.513Z" }, - { url = "https://files.pythonhosted.org/packages/e4/d7/e05cd7efe43a88a17a37b3ae96e79a19e846f3f456fe79c57ca61356ef01/markupsafe-3.0.3-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:218551f6df4868a8d527e3062d0fb968682fe92054e89978594c28e642c43a73", size = 11658, upload-time = "2025-09-27T18:36:54.819Z" }, - { url = "https://files.pythonhosted.org/packages/99/9e/e412117548182ce2148bdeacdda3bb494260c0b0184360fe0d56389b523b/markupsafe-3.0.3-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:3524b778fe5cfb3452a09d31e7b5adefeea8c5be1d43c4f810ba09f2ceb29d37", size = 12066, upload-time = "2025-09-27T18:36:55.714Z" }, - { url = 
"https://files.pythonhosted.org/packages/bc/e6/fa0ffcda717ef64a5108eaa7b4f5ed28d56122c9a6d70ab8b72f9f715c80/markupsafe-3.0.3-cp313-cp313t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:4e885a3d1efa2eadc93c894a21770e4bc67899e3543680313b09f139e149ab19", size = 25639, upload-time = "2025-09-27T18:36:56.908Z" }, - { url = "https://files.pythonhosted.org/packages/96/ec/2102e881fe9d25fc16cb4b25d5f5cde50970967ffa5dddafdb771237062d/markupsafe-3.0.3-cp313-cp313t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:8709b08f4a89aa7586de0aadc8da56180242ee0ada3999749b183aa23df95025", size = 23569, upload-time = "2025-09-27T18:36:57.913Z" }, - { url = "https://files.pythonhosted.org/packages/4b/30/6f2fce1f1f205fc9323255b216ca8a235b15860c34b6798f810f05828e32/markupsafe-3.0.3-cp313-cp313t-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:b8512a91625c9b3da6f127803b166b629725e68af71f8184ae7e7d54686a56d6", size = 23284, upload-time = "2025-09-27T18:36:58.833Z" }, - { url = "https://files.pythonhosted.org/packages/58/47/4a0ccea4ab9f5dcb6f79c0236d954acb382202721e704223a8aafa38b5c8/markupsafe-3.0.3-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:9b79b7a16f7fedff2495d684f2b59b0457c3b493778c9eed31111be64d58279f", size = 24801, upload-time = "2025-09-27T18:36:59.739Z" }, - { url = "https://files.pythonhosted.org/packages/6a/70/3780e9b72180b6fecb83a4814d84c3bf4b4ae4bf0b19c27196104149734c/markupsafe-3.0.3-cp313-cp313t-musllinux_1_2_riscv64.whl", hash = "sha256:12c63dfb4a98206f045aa9563db46507995f7ef6d83b2f68eda65c307c6829eb", size = 22769, upload-time = "2025-09-27T18:37:00.719Z" }, - { url = "https://files.pythonhosted.org/packages/98/c5/c03c7f4125180fc215220c035beac6b9cb684bc7a067c84fc69414d315f5/markupsafe-3.0.3-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:8f71bc33915be5186016f675cd83a1e08523649b0e33efdb898db577ef5bb009", size = 23642, upload-time = "2025-09-27T18:37:01.673Z" }, - 
{ url = "https://files.pythonhosted.org/packages/80/d6/2d1b89f6ca4bff1036499b1e29a1d02d282259f3681540e16563f27ebc23/markupsafe-3.0.3-cp313-cp313t-win32.whl", hash = "sha256:69c0b73548bc525c8cb9a251cddf1931d1db4d2258e9599c28c07ef3580ef354", size = 14612, upload-time = "2025-09-27T18:37:02.639Z" }, - { url = "https://files.pythonhosted.org/packages/2b/98/e48a4bfba0a0ffcf9925fe2d69240bfaa19c6f7507b8cd09c70684a53c1e/markupsafe-3.0.3-cp313-cp313t-win_amd64.whl", hash = "sha256:1b4b79e8ebf6b55351f0d91fe80f893b4743f104bff22e90697db1590e47a218", size = 15200, upload-time = "2025-09-27T18:37:03.582Z" }, - { url = "https://files.pythonhosted.org/packages/0e/72/e3cc540f351f316e9ed0f092757459afbc595824ca724cbc5a5d4263713f/markupsafe-3.0.3-cp313-cp313t-win_arm64.whl", hash = "sha256:ad2cf8aa28b8c020ab2fc8287b0f823d0a7d8630784c31e9ee5edea20f406287", size = 13973, upload-time = "2025-09-27T18:37:04.929Z" }, - { url = "https://files.pythonhosted.org/packages/33/8a/8e42d4838cd89b7dde187011e97fe6c3af66d8c044997d2183fbd6d31352/markupsafe-3.0.3-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:eaa9599de571d72e2daf60164784109f19978b327a3910d3e9de8c97b5b70cfe", size = 11619, upload-time = "2025-09-27T18:37:06.342Z" }, - { url = "https://files.pythonhosted.org/packages/b5/64/7660f8a4a8e53c924d0fa05dc3a55c9cee10bbd82b11c5afb27d44b096ce/markupsafe-3.0.3-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:c47a551199eb8eb2121d4f0f15ae0f923d31350ab9280078d1e5f12b249e0026", size = 12029, upload-time = "2025-09-27T18:37:07.213Z" }, - { url = "https://files.pythonhosted.org/packages/da/ef/e648bfd021127bef5fa12e1720ffed0c6cbb8310c8d9bea7266337ff06de/markupsafe-3.0.3-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:f34c41761022dd093b4b6896d4810782ffbabe30f2d443ff5f083e0cbbb8c737", size = 24408, upload-time = "2025-09-27T18:37:09.572Z" }, - { url = 
"https://files.pythonhosted.org/packages/41/3c/a36c2450754618e62008bf7435ccb0f88053e07592e6028a34776213d877/markupsafe-3.0.3-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:457a69a9577064c05a97c41f4e65148652db078a3a509039e64d3467b9e7ef97", size = 23005, upload-time = "2025-09-27T18:37:10.58Z" }, - { url = "https://files.pythonhosted.org/packages/bc/20/b7fdf89a8456b099837cd1dc21974632a02a999ec9bf7ca3e490aacd98e7/markupsafe-3.0.3-cp314-cp314-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:e8afc3f2ccfa24215f8cb28dcf43f0113ac3c37c2f0f0806d8c70e4228c5cf4d", size = 22048, upload-time = "2025-09-27T18:37:11.547Z" }, - { url = "https://files.pythonhosted.org/packages/9a/a7/591f592afdc734f47db08a75793a55d7fbcc6902a723ae4cfbab61010cc5/markupsafe-3.0.3-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:ec15a59cf5af7be74194f7ab02d0f59a62bdcf1a537677ce67a2537c9b87fcda", size = 23821, upload-time = "2025-09-27T18:37:12.48Z" }, - { url = "https://files.pythonhosted.org/packages/7d/33/45b24e4f44195b26521bc6f1a82197118f74df348556594bd2262bda1038/markupsafe-3.0.3-cp314-cp314-musllinux_1_2_riscv64.whl", hash = "sha256:0eb9ff8191e8498cca014656ae6b8d61f39da5f95b488805da4bb029cccbfbaf", size = 21606, upload-time = "2025-09-27T18:37:13.485Z" }, - { url = "https://files.pythonhosted.org/packages/ff/0e/53dfaca23a69fbfbbf17a4b64072090e70717344c52eaaaa9c5ddff1e5f0/markupsafe-3.0.3-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:2713baf880df847f2bece4230d4d094280f4e67b1e813eec43b4c0e144a34ffe", size = 23043, upload-time = "2025-09-27T18:37:14.408Z" }, - { url = "https://files.pythonhosted.org/packages/46/11/f333a06fc16236d5238bfe74daccbca41459dcd8d1fa952e8fbd5dccfb70/markupsafe-3.0.3-cp314-cp314-win32.whl", hash = "sha256:729586769a26dbceff69f7a7dbbf59ab6572b99d94576a5592625d5b411576b9", size = 14747, upload-time = "2025-09-27T18:37:15.36Z" }, - { url = 
"https://files.pythonhosted.org/packages/28/52/182836104b33b444e400b14f797212f720cbc9ed6ba34c800639d154e821/markupsafe-3.0.3-cp314-cp314-win_amd64.whl", hash = "sha256:bdc919ead48f234740ad807933cdf545180bfbe9342c2bb451556db2ed958581", size = 15341, upload-time = "2025-09-27T18:37:16.496Z" }, - { url = "https://files.pythonhosted.org/packages/6f/18/acf23e91bd94fd7b3031558b1f013adfa21a8e407a3fdb32745538730382/markupsafe-3.0.3-cp314-cp314-win_arm64.whl", hash = "sha256:5a7d5dc5140555cf21a6fefbdbf8723f06fcd2f63ef108f2854de715e4422cb4", size = 14073, upload-time = "2025-09-27T18:37:17.476Z" }, - { url = "https://files.pythonhosted.org/packages/3c/f0/57689aa4076e1b43b15fdfa646b04653969d50cf30c32a102762be2485da/markupsafe-3.0.3-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:1353ef0c1b138e1907ae78e2f6c63ff67501122006b0f9abad68fda5f4ffc6ab", size = 11661, upload-time = "2025-09-27T18:37:18.453Z" }, - { url = "https://files.pythonhosted.org/packages/89/c3/2e67a7ca217c6912985ec766c6393b636fb0c2344443ff9d91404dc4c79f/markupsafe-3.0.3-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:1085e7fbddd3be5f89cc898938f42c0b3c711fdcb37d75221de2666af647c175", size = 12069, upload-time = "2025-09-27T18:37:19.332Z" }, - { url = "https://files.pythonhosted.org/packages/f0/00/be561dce4e6ca66b15276e184ce4b8aec61fe83662cce2f7d72bd3249d28/markupsafe-3.0.3-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:1b52b4fb9df4eb9ae465f8d0c228a00624de2334f216f178a995ccdcf82c4634", size = 25670, upload-time = "2025-09-27T18:37:20.245Z" }, - { url = "https://files.pythonhosted.org/packages/50/09/c419f6f5a92e5fadde27efd190eca90f05e1261b10dbd8cbcb39cd8ea1dc/markupsafe-3.0.3-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:fed51ac40f757d41b7c48425901843666a6677e3e8eb0abcff09e4ba6e664f50", size = 23598, upload-time = "2025-09-27T18:37:21.177Z" }, - { url = 
"https://files.pythonhosted.org/packages/22/44/a0681611106e0b2921b3033fc19bc53323e0b50bc70cffdd19f7d679bb66/markupsafe-3.0.3-cp314-cp314t-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:f190daf01f13c72eac4efd5c430a8de82489d9cff23c364c3ea822545032993e", size = 23261, upload-time = "2025-09-27T18:37:22.167Z" }, - { url = "https://files.pythonhosted.org/packages/5f/57/1b0b3f100259dc9fffe780cfb60d4be71375510e435efec3d116b6436d43/markupsafe-3.0.3-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:e56b7d45a839a697b5eb268c82a71bd8c7f6c94d6fd50c3d577fa39a9f1409f5", size = 24835, upload-time = "2025-09-27T18:37:23.296Z" }, - { url = "https://files.pythonhosted.org/packages/26/6a/4bf6d0c97c4920f1597cc14dd720705eca0bf7c787aebc6bb4d1bead5388/markupsafe-3.0.3-cp314-cp314t-musllinux_1_2_riscv64.whl", hash = "sha256:f3e98bb3798ead92273dc0e5fd0f31ade220f59a266ffd8a4f6065e0a3ce0523", size = 22733, upload-time = "2025-09-27T18:37:24.237Z" }, - { url = "https://files.pythonhosted.org/packages/14/c7/ca723101509b518797fedc2fdf79ba57f886b4aca8a7d31857ba3ee8281f/markupsafe-3.0.3-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:5678211cb9333a6468fb8d8be0305520aa073f50d17f089b5b4b477ea6e67fdc", size = 23672, upload-time = "2025-09-27T18:37:25.271Z" }, - { url = "https://files.pythonhosted.org/packages/fb/df/5bd7a48c256faecd1d36edc13133e51397e41b73bb77e1a69deab746ebac/markupsafe-3.0.3-cp314-cp314t-win32.whl", hash = "sha256:915c04ba3851909ce68ccc2b8e2cd691618c4dc4c4232fb7982bca3f41fd8c3d", size = 14819, upload-time = "2025-09-27T18:37:26.285Z" }, - { url = "https://files.pythonhosted.org/packages/1a/8a/0402ba61a2f16038b48b39bccca271134be00c5c9f0f623208399333c448/markupsafe-3.0.3-cp314-cp314t-win_amd64.whl", hash = "sha256:4faffd047e07c38848ce017e8725090413cd80cbc23d86e55c587bf979e579c9", size = 15426, upload-time = "2025-09-27T18:37:27.316Z" }, - { url = 
"https://files.pythonhosted.org/packages/70/bc/6f1c2f612465f5fa89b95bead1f44dcb607670fd42891d8fdcd5d039f4f4/markupsafe-3.0.3-cp314-cp314t-win_arm64.whl", hash = "sha256:32001d6a8fc98c8cb5c947787c5d08b0a50663d139f1305bac5885d98d9b40fa", size = 14146, upload-time = "2025-09-27T18:37:28.327Z" }, -] - [[package]] name = "mcp" version = "1.26.0" @@ -1433,7 +1271,7 @@ wheels = [ [[package]] name = "mcpbr" -version = "0.14.0" +version = "0.14.1" source = { editable = "." } dependencies = [ { name = "anthropic" }, @@ -1468,12 +1306,6 @@ dev = [ { name = "types-pyyaml" }, { name = "types-requests" }, ] -docs = [ - { name = "mkdocs" }, - { name = "mkdocs-material" }, - { name = "mkdocs-minify-plugin" }, - { name = "mkdocstrings", extra = ["python"] }, -] gemini = [ { name = "google-generativeai" }, ] @@ -1496,10 +1328,6 @@ requires-dist = [ { name = "google-generativeai", marker = "extra == 'all-providers'", specifier = ">=0.3.0" }, { name = "google-generativeai", marker = "extra == 'gemini'", specifier = ">=0.3.0" }, { name = "mcp", specifier = ">=1.0.0" }, - { name = "mkdocs", marker = "extra == 'docs'", specifier = ">=1.5.0" }, - { name = "mkdocs-material", marker = "extra == 'docs'", specifier = ">=9.5.0" }, - { name = "mkdocs-minify-plugin", marker = "extra == 'docs'", specifier = ">=0.7.0" }, - { name = "mkdocstrings", extras = ["python"], marker = "extra == 'docs'", specifier = ">=0.24.0" }, { name = "mypy", marker = "extra == 'dev'", specifier = ">=1.11.0" }, { name = "openai", marker = "extra == 'all-providers'", specifier = ">=1.0.0" }, { name = "openai", marker = "extra == 'openai'", specifier = ">=1.0.0" }, @@ -1534,149 +1362,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/b3/38/89ba8ad64ae25be8de66a6d463314cf1eb366222074cfda9ee839c56a4b4/mdurl-0.1.2-py3-none-any.whl", hash = "sha256:84008a41e51615a49fc9966191ff91509e3c40b939176e643fd50a5c2196b8f8", size = 9979, upload-time = "2022-08-14T12:40:09.779Z" }, ] -[[package]] -name = "mergedeep" 
-version = "1.3.4" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/3a/41/580bb4006e3ed0361b8151a01d324fb03f420815446c7def45d02f74c270/mergedeep-1.3.4.tar.gz", hash = "sha256:0096d52e9dad9939c3d975a774666af186eda617e6ca84df4c94dec30004f2a8", size = 4661, upload-time = "2021-02-05T18:55:30.623Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/2c/19/04f9b178c2d8a15b076c8b5140708fa6ffc5601fb6f1e975537072df5b2a/mergedeep-1.3.4-py3-none-any.whl", hash = "sha256:70775750742b25c0d8f36c55aed03d24c3384d17c951b3175d898bd778ef0307", size = 6354, upload-time = "2021-02-05T18:55:29.583Z" }, -] - -[[package]] -name = "mkdocs" -version = "1.6.1" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "click" }, - { name = "colorama", marker = "sys_platform == 'win32'" }, - { name = "ghp-import" }, - { name = "jinja2" }, - { name = "markdown" }, - { name = "markupsafe" }, - { name = "mergedeep" }, - { name = "mkdocs-get-deps" }, - { name = "packaging" }, - { name = "pathspec" }, - { name = "pyyaml" }, - { name = "pyyaml-env-tag" }, - { name = "watchdog" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/bc/c6/bbd4f061bd16b378247f12953ffcb04786a618ce5e904b8c5a01a0309061/mkdocs-1.6.1.tar.gz", hash = "sha256:7b432f01d928c084353ab39c57282f29f92136665bdd6abf7c1ec8d822ef86f2", size = 3889159, upload-time = "2024-08-30T12:24:06.899Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/22/5b/dbc6a8cddc9cfa9c4971d59fb12bb8d42e161b7e7f8cc89e49137c5b279c/mkdocs-1.6.1-py3-none-any.whl", hash = "sha256:db91759624d1647f3f34aa0c3f327dd2601beae39a366d6e064c03468d35c20e", size = 3864451, upload-time = "2024-08-30T12:24:05.054Z" }, -] - -[[package]] -name = "mkdocs-autorefs" -version = "1.4.3" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "markdown" }, - { name = "markupsafe" }, - { name = "mkdocs" }, -] -sdist = { url = 
"https://files.pythonhosted.org/packages/51/fa/9124cd63d822e2bcbea1450ae68cdc3faf3655c69b455f3a7ed36ce6c628/mkdocs_autorefs-1.4.3.tar.gz", hash = "sha256:beee715b254455c4aa93b6ef3c67579c399ca092259cc41b7d9342573ff1fc75", size = 55425, upload-time = "2025-08-26T14:23:17.223Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/9f/4d/7123b6fa2278000688ebd338e2a06d16870aaf9eceae6ba047ea05f92df1/mkdocs_autorefs-1.4.3-py3-none-any.whl", hash = "sha256:469d85eb3114801d08e9cc55d102b3ba65917a869b893403b8987b601cf55dc9", size = 25034, upload-time = "2025-08-26T14:23:15.906Z" }, -] - -[[package]] -name = "mkdocs-get-deps" -version = "0.2.0" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "mergedeep" }, - { name = "platformdirs" }, - { name = "pyyaml" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/98/f5/ed29cd50067784976f25ed0ed6fcd3c2ce9eb90650aa3b2796ddf7b6870b/mkdocs_get_deps-0.2.0.tar.gz", hash = "sha256:162b3d129c7fad9b19abfdcb9c1458a651628e4b1dea628ac68790fb3061c60c", size = 10239, upload-time = "2023-11-20T17:51:09.981Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/9f/d4/029f984e8d3f3b6b726bd33cafc473b75e9e44c0f7e80a5b29abc466bdea/mkdocs_get_deps-0.2.0-py3-none-any.whl", hash = "sha256:2bf11d0b133e77a0dd036abeeb06dec8775e46efa526dc70667d8863eefc6134", size = 9521, upload-time = "2023-11-20T17:51:08.587Z" }, -] - -[[package]] -name = "mkdocs-material" -version = "9.7.1" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "babel" }, - { name = "backrefs" }, - { name = "colorama" }, - { name = "jinja2" }, - { name = "markdown" }, - { name = "mkdocs" }, - { name = "mkdocs-material-extensions" }, - { name = "paginate" }, - { name = "pygments" }, - { name = "pymdown-extensions" }, - { name = "requests" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/27/e2/2ffc356cd72f1473d07c7719d82a8f2cbd261666828614ecb95b12169f41/mkdocs_material-9.7.1.tar.gz", hash 
= "sha256:89601b8f2c3e6c6ee0a918cc3566cb201d40bf37c3cd3c2067e26fadb8cce2b8", size = 4094392, upload-time = "2025-12-18T09:49:00.308Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/3e/32/ed071cb721aca8c227718cffcf7bd539620e9799bbf2619e90c757bfd030/mkdocs_material-9.7.1-py3-none-any.whl", hash = "sha256:3f6100937d7d731f87f1e3e3b021c97f7239666b9ba1151ab476cabb96c60d5c", size = 9297166, upload-time = "2025-12-18T09:48:56.664Z" }, -] - -[[package]] -name = "mkdocs-material-extensions" -version = "1.3.1" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/79/9b/9b4c96d6593b2a541e1cb8b34899a6d021d208bb357042823d4d2cabdbe7/mkdocs_material_extensions-1.3.1.tar.gz", hash = "sha256:10c9511cea88f568257f960358a467d12b970e1f7b2c0e5fb2bb48cab1928443", size = 11847, upload-time = "2023-11-22T19:09:45.208Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/5b/54/662a4743aa81d9582ee9339d4ffa3c8fd40a4965e033d77b9da9774d3960/mkdocs_material_extensions-1.3.1-py3-none-any.whl", hash = "sha256:adff8b62700b25cb77b53358dad940f3ef973dd6db797907c49e3c2ef3ab4e31", size = 8728, upload-time = "2023-11-22T19:09:43.465Z" }, -] - -[[package]] -name = "mkdocs-minify-plugin" -version = "0.8.0" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "csscompressor" }, - { name = "htmlmin2" }, - { name = "jsmin" }, - { name = "mkdocs" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/52/67/fe4b77e7a8ae7628392e28b14122588beaf6078b53eb91c7ed000fd158ac/mkdocs-minify-plugin-0.8.0.tar.gz", hash = "sha256:bc11b78b8120d79e817308e2b11539d790d21445eb63df831e393f76e52e753d", size = 8366, upload-time = "2024-01-29T16:11:32.982Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/1b/cd/2e8d0d92421916e2ea4ff97f10a544a9bd5588eb747556701c983581df13/mkdocs_minify_plugin-0.8.0-py3-none-any.whl", hash = "sha256:5fba1a3f7bd9a2142c9954a6559a57e946587b21f133165ece30ea145c66aee6", 
size = 6723, upload-time = "2024-01-29T16:11:31.851Z" }, -] - -[[package]] -name = "mkdocstrings" -version = "1.0.2" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "jinja2" }, - { name = "markdown" }, - { name = "markupsafe" }, - { name = "mkdocs" }, - { name = "mkdocs-autorefs" }, - { name = "pymdown-extensions" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/63/4d/1ca8a9432579184599714aaeb36591414cc3d3bfd9d494f6db540c995ae4/mkdocstrings-1.0.2.tar.gz", hash = "sha256:48edd0ccbcb9e30a3121684e165261a9d6af4d63385fc4f39a54a49ac3b32ea8", size = 101048, upload-time = "2026-01-24T15:57:25.735Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/57/32/407a9a5fdd7d8ecb4af8d830b9bcdf47ea68f916869b3f44bac31f081250/mkdocstrings-1.0.2-py3-none-any.whl", hash = "sha256:41897815a8026c3634fe5d51472c3a569f92ded0ad8c7a640550873eea3b6817", size = 35443, upload-time = "2026-01-24T15:57:23.933Z" }, -] - -[package.optional-dependencies] -python = [ - { name = "mkdocstrings-python" }, -] - -[[package]] -name = "mkdocstrings-python" -version = "2.0.1" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "griffe" }, - { name = "mkdocs-autorefs" }, - { name = "mkdocstrings" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/24/75/d30af27a2906f00eb90143470272376d728521997800f5dce5b340ba35bc/mkdocstrings_python-2.0.1.tar.gz", hash = "sha256:843a562221e6a471fefdd4b45cc6c22d2607ccbad632879234fa9692e9cf7732", size = 199345, upload-time = "2025-12-03T14:26:11.755Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/81/06/c5f8deba7d2cbdfa7967a716ae801aa9ca5f734b8f54fd473ef77a088dbe/mkdocstrings_python-2.0.1-py3-none-any.whl", hash = "sha256:66ecff45c5f8b71bf174e11d49afc845c2dfc7fc0ab17a86b6b337e0f24d8d90", size = 105055, upload-time = "2025-12-03T14:26:10.184Z" }, -] - [[package]] name = "multidict" version = "6.7.1" @@ -1978,15 +1663,6 @@ wheels = [ { url = 
"https://files.pythonhosted.org/packages/b7/b9/c538f279a4e237a006a2c98387d081e9eb060d203d8ed34467cc0f0b9b53/packaging-26.0-py3-none-any.whl", hash = "sha256:b36f1fef9334a5588b4166f8bcd26a14e521f2b55e6b9de3aaa80d3ff7a37529", size = 74366, upload-time = "2026-01-21T20:50:37.788Z" }, ] -[[package]] -name = "paginate" -version = "0.5.7" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/ec/46/68dde5b6bc00c1296ec6466ab27dddede6aec9af1b99090e1107091b3b84/paginate-0.5.7.tar.gz", hash = "sha256:22bd083ab41e1a8b4f3690544afb2c60c25e5c9a63a30fa2f483f6c60c8e5945", size = 19252, upload-time = "2024-08-25T14:17:24.139Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/90/96/04b8e52da071d28f5e21a805b19cb9390aa17a47462ac87f5e2696b9566d/paginate-0.5.7-py2.py3-none-any.whl", hash = "sha256:b885e2af73abcf01d9559fd5216b57ef722f8c42affbb63942377668e35c7591", size = 13746, upload-time = "2024-08-25T14:17:22.55Z" }, -] - [[package]] name = "pandas" version = "3.0.0" @@ -2487,19 +2163,6 @@ crypto = [ { name = "cryptography" }, ] -[[package]] -name = "pymdown-extensions" -version = "10.20.1" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "markdown" }, - { name = "pyyaml" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/1e/6c/9e370934bfa30e889d12e61d0dae009991294f40055c238980066a7fbd83/pymdown_extensions-10.20.1.tar.gz", hash = "sha256:e7e39c865727338d434b55f1dd8da51febcffcaebd6e1a0b9c836243f660740a", size = 852860, upload-time = "2026-01-24T05:56:56.758Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/40/6d/b6ee155462a0156b94312bdd82d2b92ea56e909740045a87ccb98bf52405/pymdown_extensions-10.20.1-py3-none-any.whl", hash = "sha256:24af7feacbca56504b313b7b418c4f5e1317bb5fea60f03d57be7fcc40912aa0", size = 268768, upload-time = "2026-01-24T05:56:54.537Z" }, -] - [[package]] name = "pynacl" version = "1.6.2" @@ -2677,18 +2340,6 @@ wheels = [ { url = 
"https://files.pythonhosted.org/packages/f1/12/de94a39c2ef588c7e6455cfbe7343d3b2dc9d6b6b2f40c4c6565744c873d/pyyaml-6.0.3-cp314-cp314t-win_arm64.whl", hash = "sha256:ebc55a14a21cb14062aa4162f906cd962b28e2e9ea38f9b4391244cd8de4ae0b", size = 149341, upload-time = "2025-09-25T21:32:56.828Z" }, ] -[[package]] -name = "pyyaml-env-tag" -version = "1.1" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "pyyaml" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/eb/2e/79c822141bfd05a853236b504869ebc6b70159afc570e1d5a20641782eaa/pyyaml_env_tag-1.1.tar.gz", hash = "sha256:2eb38b75a2d21ee0475d6d97ec19c63287a7e140231e4214969d0eac923cd7ff", size = 5737, upload-time = "2025-05-13T15:24:01.64Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/04/11/432f32f8097b03e3cd5fe57e88efb685d964e2e5178a48ed61e841f7fdce/pyyaml_env_tag-1.1-py3-none-any.whl", hash = "sha256:17109e1a528561e32f026364712fee1264bc2ea6715120891174ed1b980d2e04", size = 4722, upload-time = "2025-05-13T15:23:59.629Z" }, -] - [[package]] name = "referencing" version = "0.37.0" @@ -3146,33 +2797,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/3a/9a/f3919d7ee7ba99dabf0aac7e299c6c328f5eae94f9f6b28c76005f882d5d/wandb-0.24.2-py3-none-win_arm64.whl", hash = "sha256:b42614b99f8b9af69f88c15a84283a973c8cd5750e9c4752aa3ce21f13dbac9a", size = 20268261, upload-time = "2026-02-05T00:12:14.353Z" }, ] -[[package]] -name = "watchdog" -version = "6.0.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/db/7d/7f3d619e951c88ed75c6037b246ddcf2d322812ee8ea189be89511721d54/watchdog-6.0.0.tar.gz", hash = "sha256:9ddf7c82fda3ae8e24decda1338ede66e1c99883db93711d8fb941eaa2d8c282", size = 131220, upload-time = "2024-11-01T14:07:13.037Z" } -wheels = [ - { url = 
"https://files.pythonhosted.org/packages/e0/24/d9be5cd6642a6aa68352ded4b4b10fb0d7889cb7f45814fb92cecd35f101/watchdog-6.0.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:6eb11feb5a0d452ee41f824e271ca311a09e250441c262ca2fd7ebcf2461a06c", size = 96393, upload-time = "2024-11-01T14:06:31.756Z" }, - { url = "https://files.pythonhosted.org/packages/63/7a/6013b0d8dbc56adca7fdd4f0beed381c59f6752341b12fa0886fa7afc78b/watchdog-6.0.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:ef810fbf7b781a5a593894e4f439773830bdecb885e6880d957d5b9382a960d2", size = 88392, upload-time = "2024-11-01T14:06:32.99Z" }, - { url = "https://files.pythonhosted.org/packages/d1/40/b75381494851556de56281e053700e46bff5b37bf4c7267e858640af5a7f/watchdog-6.0.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:afd0fe1b2270917c5e23c2a65ce50c2a4abb63daafb0d419fde368e272a76b7c", size = 89019, upload-time = "2024-11-01T14:06:34.963Z" }, - { url = "https://files.pythonhosted.org/packages/39/ea/3930d07dafc9e286ed356a679aa02d777c06e9bfd1164fa7c19c288a5483/watchdog-6.0.0-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:bdd4e6f14b8b18c334febb9c4425a878a2ac20efd1e0b231978e7b150f92a948", size = 96471, upload-time = "2024-11-01T14:06:37.745Z" }, - { url = "https://files.pythonhosted.org/packages/12/87/48361531f70b1f87928b045df868a9fd4e253d9ae087fa4cf3f7113be363/watchdog-6.0.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:c7c15dda13c4eb00d6fb6fc508b3c0ed88b9d5d374056b239c4ad1611125c860", size = 88449, upload-time = "2024-11-01T14:06:39.748Z" }, - { url = "https://files.pythonhosted.org/packages/5b/7e/8f322f5e600812e6f9a31b75d242631068ca8f4ef0582dd3ae6e72daecc8/watchdog-6.0.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:6f10cb2d5902447c7d0da897e2c6768bca89174d0c6e1e30abec5421af97a5b0", size = 89054, upload-time = "2024-11-01T14:06:41.009Z" }, - { url = 
"https://files.pythonhosted.org/packages/68/98/b0345cabdce2041a01293ba483333582891a3bd5769b08eceb0d406056ef/watchdog-6.0.0-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:490ab2ef84f11129844c23fb14ecf30ef3d8a6abafd3754a6f75ca1e6654136c", size = 96480, upload-time = "2024-11-01T14:06:42.952Z" }, - { url = "https://files.pythonhosted.org/packages/85/83/cdf13902c626b28eedef7ec4f10745c52aad8a8fe7eb04ed7b1f111ca20e/watchdog-6.0.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:76aae96b00ae814b181bb25b1b98076d5fc84e8a53cd8885a318b42b6d3a5134", size = 88451, upload-time = "2024-11-01T14:06:45.084Z" }, - { url = "https://files.pythonhosted.org/packages/fe/c4/225c87bae08c8b9ec99030cd48ae9c4eca050a59bf5c2255853e18c87b50/watchdog-6.0.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:a175f755fc2279e0b7312c0035d52e27211a5bc39719dd529625b1930917345b", size = 89057, upload-time = "2024-11-01T14:06:47.324Z" }, - { url = "https://files.pythonhosted.org/packages/a9/c7/ca4bf3e518cb57a686b2feb4f55a1892fd9a3dd13f470fca14e00f80ea36/watchdog-6.0.0-py3-none-manylinux2014_aarch64.whl", hash = "sha256:7607498efa04a3542ae3e05e64da8202e58159aa1fa4acddf7678d34a35d4f13", size = 79079, upload-time = "2024-11-01T14:06:59.472Z" }, - { url = "https://files.pythonhosted.org/packages/5c/51/d46dc9332f9a647593c947b4b88e2381c8dfc0942d15b8edc0310fa4abb1/watchdog-6.0.0-py3-none-manylinux2014_armv7l.whl", hash = "sha256:9041567ee8953024c83343288ccc458fd0a2d811d6a0fd68c4c22609e3490379", size = 79078, upload-time = "2024-11-01T14:07:01.431Z" }, - { url = "https://files.pythonhosted.org/packages/d4/57/04edbf5e169cd318d5f07b4766fee38e825d64b6913ca157ca32d1a42267/watchdog-6.0.0-py3-none-manylinux2014_i686.whl", hash = "sha256:82dc3e3143c7e38ec49d61af98d6558288c415eac98486a5c581726e0737c00e", size = 79076, upload-time = "2024-11-01T14:07:02.568Z" }, - { url = 
"https://files.pythonhosted.org/packages/ab/cc/da8422b300e13cb187d2203f20b9253e91058aaf7db65b74142013478e66/watchdog-6.0.0-py3-none-manylinux2014_ppc64.whl", hash = "sha256:212ac9b8bf1161dc91bd09c048048a95ca3a4c4f5e5d4a7d1b1a7d5752a7f96f", size = 79077, upload-time = "2024-11-01T14:07:03.893Z" }, - { url = "https://files.pythonhosted.org/packages/2c/3b/b8964e04ae1a025c44ba8e4291f86e97fac443bca31de8bd98d3263d2fcf/watchdog-6.0.0-py3-none-manylinux2014_ppc64le.whl", hash = "sha256:e3df4cbb9a450c6d49318f6d14f4bbc80d763fa587ba46ec86f99f9e6876bb26", size = 79078, upload-time = "2024-11-01T14:07:05.189Z" }, - { url = "https://files.pythonhosted.org/packages/62/ae/a696eb424bedff7407801c257d4b1afda455fe40821a2be430e173660e81/watchdog-6.0.0-py3-none-manylinux2014_s390x.whl", hash = "sha256:2cce7cfc2008eb51feb6aab51251fd79b85d9894e98ba847408f662b3395ca3c", size = 79077, upload-time = "2024-11-01T14:07:06.376Z" }, - { url = "https://files.pythonhosted.org/packages/b5/e8/dbf020b4d98251a9860752a094d09a65e1b436ad181faf929983f697048f/watchdog-6.0.0-py3-none-manylinux2014_x86_64.whl", hash = "sha256:20ffe5b202af80ab4266dcd3e91aae72bf2da48c0d33bdb15c66658e685e94e2", size = 79078, upload-time = "2024-11-01T14:07:07.547Z" }, - { url = "https://files.pythonhosted.org/packages/07/f6/d0e5b343768e8bcb4cda79f0f2f55051bf26177ecd5651f84c07567461cf/watchdog-6.0.0-py3-none-win32.whl", hash = "sha256:07df1fdd701c5d4c8e55ef6cf55b8f0120fe1aef7ef39a1c6fc6bc2e606d517a", size = 79065, upload-time = "2024-11-01T14:07:09.525Z" }, - { url = "https://files.pythonhosted.org/packages/db/d9/c495884c6e548fce18a8f40568ff120bc3a4b7b99813081c8ac0c936fa64/watchdog-6.0.0-py3-none-win_amd64.whl", hash = "sha256:cbafb470cf848d93b5d013e2ecb245d4aa1c8fd0504e863ccefa32445359d680", size = 79070, upload-time = "2024-11-01T14:07:10.686Z" }, - { url = "https://files.pythonhosted.org/packages/33/e8/e40370e6d74ddba47f002a32919d91310d6074130fe4e17dabcafc15cbf1/watchdog-6.0.0-py3-none-win_ia64.whl", hash = 
"sha256:a1914259fa9e1454315171103c6a30961236f508b9b623eae470268bbcc6a22f", size = 79067, upload-time = "2024-11-01T14:07:11.845Z" }, -] - [[package]] name = "xxhash" version = "3.6.0" From df54b1902cb5f5412a7c9d7bc4ed2b9faa5387a2 Mon Sep 17 00:00:00 2001 From: Grey Newell Date: Wed, 25 Feb 2026 17:00:31 -0500 Subject: [PATCH 02/14] fix: use shlex.quote for test IDs with special characters Test IDs from SWE-bench Pro datasets can contain single quotes and literal \uXXXX escape sequences that break shell command construction. - Replace bare single-quote wrapping with shlex.quote() in _build_test_command and _build_pro_test_command - Add _normalize_test_id() to decode \uXXXX to actual unicode chars - Fixes qutebrowser PTP test failures where 8/10 tests had IDs containing embedded single quotes Co-Authored-By: Claude Opus 4.6 --- src/mcpbr/benchmarks/codegraph.py | 8 +++++--- src/mcpbr/benchmarks/swebench_pro.py | 16 ++++++++++------ src/mcpbr/evaluation.py | 20 ++++++++++++++++---- 3 files changed, 31 insertions(+), 13 deletions(-) diff --git a/src/mcpbr/benchmarks/codegraph.py b/src/mcpbr/benchmarks/codegraph.py index 459a712..5396914 100644 --- a/src/mcpbr/benchmarks/codegraph.py +++ b/src/mcpbr/benchmarks/codegraph.py @@ -15,7 +15,7 @@ import json import logging import re -from datetime import datetime, timezone +from datetime import UTC, datetime from typing import Any from datasets import load_dataset @@ -266,7 +266,7 @@ async def _setup_environment(self, env: TaskEnvironment, task: dict[str, Any]) - "version": 1, "repoName": cache_name, "commitHash": None, - "savedAt": datetime.now(timezone.utc).isoformat(), + "savedAt": datetime.now(UTC).isoformat(), "raw": result, } @@ -382,7 +382,9 @@ def _count_steps(self, text: str) -> int: return 1 # Count tool call patterns in the output - tool_calls = len(re.findall(r"(?:tool_use|tool_call||Tool:|Calling)", text, re.IGNORECASE)) + tool_calls = len( + re.findall(r"(?:tool_use|tool_call||Tool:|Calling)", text, 
re.IGNORECASE) + ) return max(tool_calls, 1) def get_prebuilt_image(self, task: dict[str, Any]) -> str | None: diff --git a/src/mcpbr/benchmarks/swebench_pro.py b/src/mcpbr/benchmarks/swebench_pro.py index 96e692b..3339f03 100644 --- a/src/mcpbr/benchmarks/swebench_pro.py +++ b/src/mcpbr/benchmarks/swebench_pro.py @@ -396,11 +396,15 @@ def _build_pro_test_command(test: str, language: str, uses_prebuilt: bool = Fals Returns: Shell command string to run the test. """ - if language == "python": - from ..evaluation import _build_test_command + import shlex + + from ..evaluation import _build_test_command, _normalize_test_id + if language == "python": return _build_test_command(test, uses_prebuilt) + test = _normalize_test_id(test) + if uses_prebuilt: activate = "source /opt/miniconda3/etc/profile.d/conda.sh && conda activate testbed && " else: @@ -410,19 +414,19 @@ def _build_pro_test_command(test: str, language: str, uses_prebuilt: bool = Fals # Go test identifiers can be package paths or test function names if "/" in test or test.startswith("."): # Package path: go test -v ./path/to/package - return f"{activate}go test -v -count=1 {test} 2>&1" + return f"{activate}go test -v -count=1 {shlex.quote(test)} 2>&1" else: # Test function name: go test -v -run TestName ./... - return f"{activate}go test -v -count=1 -run '{test}' ./... 2>&1" + return f"{activate}go test -v -count=1 -run {shlex.quote(test)} ./... 
2>&1" if language in ("typescript", "javascript"): # Jest-style test identifiers if "/" in test or test.endswith((".ts", ".js", ".tsx", ".jsx")): # File path - return f"{activate}npx jest {test} --verbose --no-cache 2>&1" + return f"{activate}npx jest {shlex.quote(test)} --verbose --no-cache 2>&1" else: # Test name pattern - return f"{activate}npx jest -t '{test}' --verbose --no-cache 2>&1" + return f"{activate}npx jest -t {shlex.quote(test)} --verbose --no-cache 2>&1" # Fallback: try running as-is return f"{activate}{test} 2>&1" diff --git a/src/mcpbr/evaluation.py b/src/mcpbr/evaluation.py index d7861fb..523a918 100644 --- a/src/mcpbr/evaluation.py +++ b/src/mcpbr/evaluation.py @@ -3,6 +3,8 @@ import ast import contextlib import json +import re +import shlex from dataclasses import dataclass from typing import Any @@ -235,6 +237,16 @@ async def run_tests( ) +def _normalize_test_id(test: str) -> str: + """Normalize a test identifier for shell-safe command construction. + + Decodes literal ``\\uXXXX`` escape sequences to actual unicode characters. + Some datasets (e.g. SWE-bench Pro) store pytest parametrize IDs with + escaped unicode (``\\u2026``) instead of the real character (``…``). + """ + return re.sub(r"\\u([0-9a-fA-F]{4})", lambda m: chr(int(m.group(1), 16)), test) + + def _build_test_command(test: str, uses_prebuilt: bool = False, repo: str | None = None) -> str: """Build a test command for the given test identifier. @@ -249,10 +261,10 @@ def _build_test_command(test: str, uses_prebuilt: bool = False, repo: str | None Returns: Shell command string to run the test. 
""" - import re - from .swebench_test_specs import get_repo_test_command + test = _normalize_test_id(test) + # Pre-built SWE-bench images use a conda environment called 'testbed' if uses_prebuilt: activate = "source /opt/miniconda3/etc/profile.d/conda.sh && conda activate testbed && " @@ -289,9 +301,9 @@ def _build_test_command(test: str, uses_prebuilt: bool = False, repo: str | None test_module = ".".join(test.split(".")[:2]) # Extract test_utils.tests return f"{activate}cd /testbed/tests && ./runtests.py {test_module}" elif "::" in test or test.endswith(".py"): - return f"{activate}python -m pytest '{test}' -xvs 2>&1" + return f"{activate}python -m pytest {shlex.quote(test)} -xvs 2>&1" else: - return f"{activate}python -m pytest -k '{test}' -xvs 2>&1" + return f"{activate}python -m pytest -k {shlex.quote(test)} -xvs 2>&1" async def _apply_test_patch( From 78487e8575c3e8691b38d9b94bdeb4431a75e3c9 Mon Sep 17 00:00:00 2001 From: Grey Newell Date: Wed, 25 Feb 2026 17:05:32 -0500 Subject: [PATCH 03/14] fix: remove broken unicode normalization from test ID handling MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit _normalize_test_id was incorrectly converting literal \uXXXX sequences to actual unicode characters. SWE-bench Pro test IDs already contain the correct literal sequences that match pytest node IDs — converting them breaks test matching. Co-Authored-By: Claude Opus 4.6 --- src/mcpbr/evaluation.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/mcpbr/evaluation.py b/src/mcpbr/evaluation.py index 523a918..0a45ea5 100644 --- a/src/mcpbr/evaluation.py +++ b/src/mcpbr/evaluation.py @@ -240,11 +240,12 @@ async def run_tests( def _normalize_test_id(test: str) -> str: """Normalize a test identifier for shell-safe command construction. - Decodes literal ``\\uXXXX`` escape sequences to actual unicode characters. - Some datasets (e.g. 
SWE-bench Pro) store pytest parametrize IDs with - escaped unicode (``\\u2026``) instead of the real character (``…``). + Currently a no-op pass-through. SWE-bench Pro test IDs already contain + the correct literal sequences (e.g. ``\\u2026`` as 7 ASCII characters) + that match what pytest uses in its node IDs. Converting them to actual + unicode characters would break matching. """ - return re.sub(r"\\u([0-9a-fA-F]{4})", lambda m: chr(int(m.group(1), 16)), test) + return test def _build_test_command(test: str, uses_prebuilt: bool = False, repo: str | None = None) -> str: From cd5378f4a98ad4c3eaf98f1c35c8b9768cc458c2 Mon Sep 17 00:00:00 2001 From: Grey Newell Date: Wed, 25 Feb 2026 17:57:49 -0500 Subject: [PATCH 04/14] fix: resolve language alias mismatch in SWE-bench Pro filter The dataset uses 'js' and 'ts' as language values, but filter_category accepted 'javascript' and 'typescript' without mapping to the dataset values. Added _LANGUAGE_ALIASES dict to resolve user-friendly names to dataset values. 
Co-Authored-By: Claude Opus 4.6 --- src/mcpbr/benchmarks/swebench_pro.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/src/mcpbr/benchmarks/swebench_pro.py b/src/mcpbr/benchmarks/swebench_pro.py index 3339f03..d28da57 100644 --- a/src/mcpbr/benchmarks/swebench_pro.py +++ b/src/mcpbr/benchmarks/swebench_pro.py @@ -26,7 +26,13 @@ from .base import BenchmarkTask # Supported languages in SWE-bench Pro -PRO_LANGUAGES = {"python", "go", "typescript", "javascript"} +PRO_LANGUAGES = {"python", "go", "typescript", "javascript", "ts", "js"} + +# Aliases: user-friendly names → dataset values +_LANGUAGE_ALIASES: dict[str, str] = { + "javascript": "js", + "typescript": "ts", +} # DockerHub registry prefix for SWE-bench Pro pre-built images SWEBENCH_PRO_IMAGE_PREFIX = "jefzda/sweap-images" @@ -94,7 +100,9 @@ def load_tasks( cat_lower = category.lower() # If the category is a known language, match by language only if cat_lower in PRO_LANGUAGES: - if cat_lower == language: + # Resolve aliases (e.g., "javascript" -> "js") + resolved = _LANGUAGE_ALIASES.get(cat_lower, cat_lower) + if resolved == language: filtered.append(task) break elif cat_lower in repo.lower(): From 6472bbb93f7b40f873c5fc6cff1f6c7dd0bef2e5 Mon Sep 17 00:00:00 2001 From: Grey Newell Date: Wed, 25 Feb 2026 18:24:24 -0500 Subject: [PATCH 05/14] fix: use language-specific test runners in preflight validation Preflight was using the Python-only run_tests() (pytest) for all languages, causing 100% failure for Go, JavaScript, and TypeScript instances. Now routes non-Python languages through _build_pro_test_command() which generates the correct commands: - Go: go test -v -count=1 ... - JS/TS: npx jest ... Also fixes conda activation bug in _run_lang_tests: SWE-bench Pro images don't have conda, so uses_prebuilt=False is passed for non-Python test commands to avoid prepending conda activation. 
Co-Authored-By: Claude Opus 4.6 --- src/mcpbr/benchmark_preflight.py | 88 ++++++++++++++++++++++++++-- src/mcpbr/benchmarks/swebench_pro.py | 4 +- tests/test_swebench_pro.py | 2 +- 3 files changed, 88 insertions(+), 6 deletions(-) diff --git a/src/mcpbr/benchmark_preflight.py b/src/mcpbr/benchmark_preflight.py index e95efe1..971d648 100644 --- a/src/mcpbr/benchmark_preflight.py +++ b/src/mcpbr/benchmark_preflight.py @@ -12,6 +12,7 @@ from .docker_env import DockerEnvironmentManager, TaskEnvironment from .evaluation import ( + TestResults, _apply_test_patch, apply_patch, get_test_list_field, @@ -54,6 +55,83 @@ def success_rate(self) -> float: return (self.passed / self.total) * 100.0 +async def _run_preflight_tests( + env: TaskEnvironment, + tests: list[str], + language: str, + timeout: int = 300, + uses_conda: bool = False, + workdir: str | None = None, + repo: str | None = None, +) -> TestResults: + """Run tests using the appropriate language-specific runner. + + For Python, delegates to the standard run_tests(). For Go, JavaScript, + and TypeScript, builds language-specific commands (go test, npx jest). + + Args: + env: Task environment. + tests: List of test identifiers. + language: Programming language. + timeout: Timeout per test in seconds. + uses_conda: Whether to activate conda environment. + workdir: Working directory inside container. + repo: Repository name (used for Python test specs). + + Returns: + TestResults with pass/fail counts. 
+ """ + if language == "python": + return await run_tests( + env, + tests, + timeout=timeout, + uses_prebuilt=uses_conda, + workdir=workdir, + repo=repo, + ) + + # Non-Python: use language-specific test commands + from .benchmarks.swebench_pro import _build_pro_test_command + + if not tests: + return TestResults(passed=0, total=0, details=[]) + + results = [] + passed = 0 + + for test in tests: + test_cmd = _build_pro_test_command(test, language, uses_conda) + try: + exit_code, stdout, stderr = await env.exec_command( + test_cmd, timeout=timeout, workdir=workdir + ) + test_passed = exit_code == 0 + if test_passed: + passed += 1 + results.append( + { + "test": test, + "passed": test_passed, + "exit_code": exit_code, + "output": stdout[:1000] if stdout else "", + "error": stderr[:1000] if stderr else "", + } + ) + except TimeoutError: + results.append( + { + "test": test, + "passed": False, + "exit_code": -1, + "output": "", + "error": "Test timed out", + } + ) + + return TestResults(passed=passed, total=len(tests), details=results) + + async def _check_single_instance( benchmark: Any, task: dict[str, Any], @@ -137,21 +215,23 @@ async def _check_single_instance( uses_conda = env.uses_prebuilt and not task.get("dockerhub_tag") # Run fail_to_pass tests (all must PASS with golden patch) - ftp_results = await run_tests( + ftp_results = await _run_preflight_tests( env, fail_to_pass_tests, + language=language, timeout=timeout, - uses_prebuilt=uses_conda, + uses_conda=uses_conda, workdir=eval_workdir, repo=task.get("repo"), ) # Run pass_to_pass tests (all must still PASS) - ptp_results = await run_tests( + ptp_results = await _run_preflight_tests( env, pass_to_pass_tests[:10], + language=language, timeout=timeout, - uses_prebuilt=uses_conda, + uses_conda=uses_conda, workdir=eval_workdir, repo=task.get("repo"), ) diff --git a/src/mcpbr/benchmarks/swebench_pro.py b/src/mcpbr/benchmarks/swebench_pro.py index d28da57..7795338 100644 --- a/src/mcpbr/benchmarks/swebench_pro.py 
+++ b/src/mcpbr/benchmarks/swebench_pro.py @@ -304,7 +304,9 @@ async def _run_lang_tests( passed = 0 for test in tests: - test_cmd = _build_pro_test_command(test, language, env.uses_prebuilt) + # SWE-bench Pro images don't use conda — never prepend conda activation + # for non-Python languages (uses_prebuilt=False disables it) + test_cmd = _build_pro_test_command(test, language, uses_prebuilt=False) try: exit_code, stdout, stderr = await env.exec_command( test_cmd, timeout=timeout, workdir=workdir diff --git a/tests/test_swebench_pro.py b/tests/test_swebench_pro.py index fde8b8e..c9a1c97 100644 --- a/tests/test_swebench_pro.py +++ b/tests/test_swebench_pro.py @@ -237,7 +237,7 @@ def test_filter_by_repo_substring(self, mock_load: MagicMock) -> None: assert tasks[0]["instance_id"] == "t1" def test_pro_languages_set(self) -> None: - assert {"python", "go", "typescript", "javascript"} == PRO_LANGUAGES + assert {"python", "go", "typescript", "javascript", "ts", "js"} == PRO_LANGUAGES class TestSWEBenchProLoadTasks: From 351abd4787bb53a417eb4abda7f463f1928a0a1f Mon Sep 17 00:00:00 2001 From: Grey Newell Date: Wed, 25 Feb 2026 18:39:14 -0500 Subject: [PATCH 06/14] fix: correct test command parsing for Go subtests and JS/TS pipe format Go: Test IDs like "TestFoo/#00" and "TestFoo//api/v1" are subtests, not package paths. Always extract the top-level function name and use -run with ./... to search all packages. JS/TS: SWE-bench Pro uses "file.js | test description" format. Split on " | " to get the file path and test name separately. "test suite" as description runs the whole file without -t filter. 
Co-Authored-By: Claude Opus 4.6 --- src/mcpbr/benchmarks/swebench_pro.py | 42 ++++++++++++++++++++-------- tests/test_swebench_pro.py | 29 +++++++++++++++---- 2 files changed, 54 insertions(+), 17 deletions(-) diff --git a/src/mcpbr/benchmarks/swebench_pro.py b/src/mcpbr/benchmarks/swebench_pro.py index 7795338..f92ccbc 100644 --- a/src/mcpbr/benchmarks/swebench_pro.py +++ b/src/mcpbr/benchmarks/swebench_pro.py @@ -398,10 +398,15 @@ def get_default_sandbox_level(self) -> str | None: def _build_pro_test_command(test: str, language: str, uses_prebuilt: bool = False) -> str: """Build a language-specific test command for SWE-bench Pro. + Test ID formats by language: + Go: "TestFoo", "TestFoo/subtest", "TestFoo/#00" + JS/TS: "file.js | test description", "file.ts | suite name" + Python: "tests/test_foo.py::TestClass::test_method" + Args: test: Test identifier. - language: Programming language (python, go, typescript, javascript). - uses_prebuilt: Whether a pre-built image is being used. + language: Programming language (python, go, typescript, javascript, js, ts). + uses_prebuilt: Whether a pre-built image is being used (adds conda activation). Returns: Shell command string to run the test. @@ -421,18 +426,33 @@ def _build_pro_test_command(test: str, language: str, uses_prebuilt: bool = Fals activate = "" if language == "go": - # Go test identifiers can be package paths or test function names - if "/" in test or test.startswith("."): - # Package path: go test -v ./path/to/package - return f"{activate}go test -v -count=1 {shlex.quote(test)} 2>&1" + # Go test IDs are always function names, optionally with subtests via / + # e.g., "TestFoo", "TestFoo/subtest", "TestFoo/#00", "TestFoo//api/v1" + # Always use -run with the top-level test name and ./... to search all packages + if "/" in test: + # Extract top-level test name (before first /) + top_level = test.split("/", 1)[0] + return f"{activate}go test -v -count=1 -run {shlex.quote(top_level)} ./... 
2>&1" else: - # Test function name: go test -v -run TestName ./... return f"{activate}go test -v -count=1 -run {shlex.quote(test)} ./... 2>&1" - if language in ("typescript", "javascript"): - # Jest-style test identifiers - if "/" in test or test.endswith((".ts", ".js", ".tsx", ".jsx")): - # File path + if language in ("typescript", "javascript", "ts", "js"): + # SWE-bench Pro format: "file_path | test description" + if " | " in test: + parts = test.split(" | ", 1) + file_path = parts[0].strip() + test_name = parts[1].strip() + if test_name and test_name != "test suite": + # Run specific test file with test name filter + return ( + f"{activate}npx jest {shlex.quote(file_path)}" + f" -t {shlex.quote(test_name)} --verbose --no-cache 2>&1" + ) + else: + # "test suite" means run the whole file + return f"{activate}npx jest {shlex.quote(file_path)} --verbose --no-cache 2>&1" + elif "/" in test or test.endswith((".ts", ".js", ".tsx", ".jsx")): + # Plain file path return f"{activate}npx jest {shlex.quote(test)} --verbose --no-cache 2>&1" else: # Test name pattern diff --git a/tests/test_swebench_pro.py b/tests/test_swebench_pro.py index c9a1c97..d7e12ed 100644 --- a/tests/test_swebench_pro.py +++ b/tests/test_swebench_pro.py @@ -98,17 +98,20 @@ def test_python_delegates(self) -> None: cmd = _build_pro_test_command("tests/test_foo.py::test_bar", "python") assert "pytest" in cmd or "test_foo" in cmd - def test_go_package_path(self) -> None: - cmd = _build_pro_test_command("./pkg/router", "go") - assert "go test" in cmd - assert "./pkg/router" in cmd - assert "-v" in cmd - def test_go_function_name(self) -> None: cmd = _build_pro_test_command("TestRouteMatching", "go") assert "go test" in cmd assert "-run" in cmd assert "TestRouteMatching" in cmd + assert "./..." 
in cmd + + def test_go_subtest(self) -> None: + """Go subtests (TestFoo/#00, TestFoo/subtest) use top-level name with -run.""" + cmd = _build_pro_test_command("TestParseResourcePath/#00", "go") + assert "go test" in cmd + assert "-run" in cmd + assert "TestParseResourcePath" in cmd + assert "./..." in cmd def test_typescript_file(self) -> None: cmd = _build_pro_test_command("src/__tests__/parser.test.ts", "typescript") @@ -128,7 +131,21 @@ def test_javascript_file(self) -> None: def test_javascript_pattern(self) -> None: cmd = _build_pro_test_command("handles edge case", "javascript") assert "npx jest" in cmd + + def test_js_pipe_format(self) -> None: + """SWE-bench Pro JS format: 'file.js | test description'.""" + cmd = _build_pro_test_command("test/database.js | Test database key methods", "js") + assert "npx jest" in cmd + assert "test/database.js" in cmd assert "-t" in cmd + assert "Test database key methods" in cmd + + def test_ts_test_suite_format(self) -> None: + """TS 'test suite' format runs the whole file without -t filter.""" + cmd = _build_pro_test_command("test/tests/LoginFacadeTest.js | test suite", "ts") + assert "npx jest" in cmd + assert "test/tests/LoginFacadeTest.js" in cmd + assert "-t" not in cmd def test_prebuilt_conda_activation(self) -> None: cmd = _build_pro_test_command("TestFoo", "go", uses_prebuilt=True) From e5ccd29ca4674a1c67c91199606ae6608e7c80de Mon Sep 17 00:00:00 2001 From: Grey Newell Date: Wed, 25 Feb 2026 18:54:04 -0500 Subject: [PATCH 07/14] fix: auto-detect JS/TS test runner (jest/mocha/vitest) in containers Different SWE-bench Pro JS/TS repos use different test frameworks: - NodeBB: mocha - element-web: jest - protonmail/webclients: jest - tutanota: custom (testdouble) Added _detect_js_runner() that checks for runner binaries in node_modules/.bin/ inside the container. Mocha uses --grep for filtering, jest uses -t, vitest uses -t. 
Also refactored JS/TS command building into _build_js_test_command() for clearer separation of runner-specific logic. Co-Authored-By: Claude Opus 4.6 --- src/mcpbr/benchmark_preflight.py | 9 +- src/mcpbr/benchmarks/swebench_pro.py | 127 ++++++++++++++++++++++----- tests/test_swebench_pro.py | 31 ++++++- 3 files changed, 139 insertions(+), 28 deletions(-) diff --git a/src/mcpbr/benchmark_preflight.py b/src/mcpbr/benchmark_preflight.py index 971d648..fd11807 100644 --- a/src/mcpbr/benchmark_preflight.py +++ b/src/mcpbr/benchmark_preflight.py @@ -92,16 +92,21 @@ async def _run_preflight_tests( ) # Non-Python: use language-specific test commands - from .benchmarks.swebench_pro import _build_pro_test_command + from .benchmarks.swebench_pro import _build_pro_test_command, _detect_js_runner if not tests: return TestResults(passed=0, total=0, details=[]) + # Detect JS/TS test runner once per instance + js_runner = "jest" + if language in ("typescript", "javascript", "ts", "js"): + js_runner = await _detect_js_runner(env, workdir=workdir) + results = [] passed = 0 for test in tests: - test_cmd = _build_pro_test_command(test, language, uses_conda) + test_cmd = _build_pro_test_command(test, language, uses_conda, js_runner=js_runner) try: exit_code, stdout, stderr = await env.exec_command( test_cmd, timeout=timeout, workdir=workdir diff --git a/src/mcpbr/benchmarks/swebench_pro.py b/src/mcpbr/benchmarks/swebench_pro.py index f92ccbc..7e93f27 100644 --- a/src/mcpbr/benchmarks/swebench_pro.py +++ b/src/mcpbr/benchmarks/swebench_pro.py @@ -11,6 +11,7 @@ - Language metadata per task (repo_language field) """ +import logging from typing import Any from datasets import load_dataset @@ -25,6 +26,8 @@ ) from .base import BenchmarkTask +logger = logging.getLogger(__name__) + # Supported languages in SWE-bench Pro PRO_LANGUAGES = {"python", "go", "typescript", "javascript", "ts", "js"} @@ -300,13 +303,20 @@ async def _run_lang_tests( if not tests: return TestResults(passed=0, total=0, 
details=[]) + # Detect JS/TS test runner once (avoids repeated detection per test) + js_runner = "jest" + if language in ("typescript", "javascript", "ts", "js"): + js_runner = await _detect_js_runner(env, workdir=workdir) + results = [] passed = 0 for test in tests: # SWE-bench Pro images don't use conda — never prepend conda activation # for non-Python languages (uses_prebuilt=False disables it) - test_cmd = _build_pro_test_command(test, language, uses_prebuilt=False) + test_cmd = _build_pro_test_command( + test, language, uses_prebuilt=False, js_runner=js_runner + ) try: exit_code, stdout, stderr = await env.exec_command( test_cmd, timeout=timeout, workdir=workdir @@ -395,7 +405,43 @@ def get_default_sandbox_level(self) -> str | None: return None -def _build_pro_test_command(test: str, language: str, uses_prebuilt: bool = False) -> str: +async def _detect_js_runner(env: "TaskEnvironment", workdir: str | None = None) -> str: + """Detect the JavaScript/TypeScript test runner installed in a container. + + Checks for common test runners in order of preference: + jest, mocha, vitest. Falls back to "jest" if none detected. + + Args: + env: Task environment with exec_command. + workdir: Working directory inside container. + + Returns: + Runner name: "jest", "mocha", or "vitest". 
+ """ + # Check for runner binaries in node_modules + detect_cmd = ( + "if [ -f node_modules/.bin/jest ]; then echo jest; " + "elif [ -f node_modules/.bin/mocha ]; then echo mocha; " + "elif [ -f node_modules/.bin/vitest ]; then echo vitest; " + "else echo jest; fi" + ) + try: + exit_code, stdout, _ = await env.exec_command(detect_cmd, timeout=10, workdir=workdir) + if exit_code == 0 and stdout: + runner = stdout.strip().split("\n")[-1].strip() + if runner in ("jest", "mocha", "vitest"): + return runner + except Exception: + logger.debug("Failed to detect JS test runner, defaulting to jest") + return "jest" + + +def _build_pro_test_command( + test: str, + language: str, + uses_prebuilt: bool = False, + js_runner: str = "jest", +) -> str: """Build a language-specific test command for SWE-bench Pro. Test ID formats by language: @@ -407,6 +453,7 @@ def _build_pro_test_command(test: str, language: str, uses_prebuilt: bool = Fals test: Test identifier. language: Programming language (python, go, typescript, javascript, js, ts). uses_prebuilt: Whether a pre-built image is being used (adds conda activation). + js_runner: JavaScript test runner ("jest", "mocha", or "vitest"). Returns: Shell command string to run the test. @@ -437,26 +484,62 @@ def _build_pro_test_command(test: str, language: str, uses_prebuilt: bool = Fals return f"{activate}go test -v -count=1 -run {shlex.quote(test)} ./... 
2>&1" if language in ("typescript", "javascript", "ts", "js"): - # SWE-bench Pro format: "file_path | test description" - if " | " in test: - parts = test.split(" | ", 1) - file_path = parts[0].strip() - test_name = parts[1].strip() - if test_name and test_name != "test suite": - # Run specific test file with test name filter - return ( - f"{activate}npx jest {shlex.quote(file_path)}" - f" -t {shlex.quote(test_name)} --verbose --no-cache 2>&1" - ) - else: - # "test suite" means run the whole file - return f"{activate}npx jest {shlex.quote(file_path)} --verbose --no-cache 2>&1" - elif "/" in test or test.endswith((".ts", ".js", ".tsx", ".jsx")): - # Plain file path - return f"{activate}npx jest {shlex.quote(test)} --verbose --no-cache 2>&1" - else: - # Test name pattern - return f"{activate}npx jest -t {shlex.quote(test)} --verbose --no-cache 2>&1" + return _build_js_test_command(test, js_runner, activate) # Fallback: try running as-is return f"{activate}{test} 2>&1" + + +def _build_js_test_command(test: str, runner: str, activate: str = "") -> str: + """Build a JS/TS test command for the detected runner. + + Args: + test: Test identifier in "file | description" format. + runner: Test runner name ("jest", "mocha", or "vitest"). + activate: Optional conda activation prefix. + + Returns: + Shell command string. 
+ """ + import shlex + + # Parse "file | description" format + file_path = "" + test_name = "" + if " | " in test: + parts = test.split(" | ", 1) + file_path = parts[0].strip() + test_name = parts[1].strip() + elif "/" in test or test.endswith((".ts", ".js", ".tsx", ".jsx")): + file_path = test + else: + test_name = test + + if runner == "mocha": + # mocha: npx mocha --grep "pattern" + cmd = f"{activate}npx mocha" + if file_path: + cmd += f" {shlex.quote(file_path)}" + if test_name and test_name != "test suite": + cmd += f" --grep {shlex.quote(test_name)}" + cmd += " --timeout 30000 2>&1" + return cmd + + if runner == "vitest": + # vitest: npx vitest run -t "pattern" + cmd = f"{activate}npx vitest run" + if file_path: + cmd += f" {shlex.quote(file_path)}" + if test_name and test_name != "test suite": + cmd += f" -t {shlex.quote(test_name)}" + cmd += " 2>&1" + return cmd + + # Default: jest + cmd = f"{activate}npx jest" + if file_path: + cmd += f" {shlex.quote(file_path)}" + if test_name and test_name != "test suite": + cmd += f" -t {shlex.quote(test_name)}" + cmd += " --verbose --no-cache 2>&1" + return cmd diff --git a/tests/test_swebench_pro.py b/tests/test_swebench_pro.py index d7e12ed..afa44c5 100644 --- a/tests/test_swebench_pro.py +++ b/tests/test_swebench_pro.py @@ -132,21 +132,44 @@ def test_javascript_pattern(self) -> None: cmd = _build_pro_test_command("handles edge case", "javascript") assert "npx jest" in cmd - def test_js_pipe_format(self) -> None: - """SWE-bench Pro JS format: 'file.js | test description'.""" - cmd = _build_pro_test_command("test/database.js | Test database key methods", "js") + def test_js_pipe_format_jest(self) -> None: + """SWE-bench Pro JS format with jest runner.""" + cmd = _build_pro_test_command( + "test/database.js | Test database key methods", "js", js_runner="jest" + ) assert "npx jest" in cmd assert "test/database.js" in cmd assert "-t" in cmd assert "Test database key methods" in cmd + def test_js_pipe_format_mocha(self) 
-> None: + """SWE-bench Pro JS format with mocha runner.""" + cmd = _build_pro_test_command( + "test/database.js | Test database key methods", "js", js_runner="mocha" + ) + assert "npx mocha" in cmd + assert "test/database.js" in cmd + assert "--grep" in cmd + assert "Test database key methods" in cmd + def test_ts_test_suite_format(self) -> None: """TS 'test suite' format runs the whole file without -t filter.""" - cmd = _build_pro_test_command("test/tests/LoginFacadeTest.js | test suite", "ts") + cmd = _build_pro_test_command( + "test/tests/LoginFacadeTest.js | test suite", "ts", js_runner="jest" + ) assert "npx jest" in cmd assert "test/tests/LoginFacadeTest.js" in cmd assert "-t" not in cmd + def test_mocha_test_suite_format(self) -> None: + """Mocha 'test suite' runs whole file without --grep.""" + cmd = _build_pro_test_command( + "test/tests/LoginFacadeTest.js | test suite", "js", js_runner="mocha" + ) + assert "npx mocha" in cmd + assert "test/tests/LoginFacadeTest.js" in cmd + assert "--grep" not in cmd + def test_prebuilt_conda_activation(self) -> None: cmd = _build_pro_test_command("TestFoo", "go", uses_prebuilt=True) assert "conda activate testbed" in cmd From 135c73f6b89507a9910d1019661e5865789d575d Mon Sep 17 00:00:00 2001 From: Grey Newell Date: Thu, 26 Feb 2026 10:22:58 -0500 Subject: [PATCH 08/14] fix: prune Docker images after each preflight instance and improve JS/TS runner detection Add docker image prune after each preflight instance to prevent disk exhaustion on CI runners (each SWE-bench Pro image is ~1.5GB). Enhance JS/TS test runner detection to check package.json scripts.test and support ospec, ava, and npm test fallback. Previously, unrecognized projects (like tutanota) defaulted to jest, causing 100% test failures. 
Co-Authored-By: Claude Opus 4.6 --- src/mcpbr/benchmark_preflight.py | 24 ++++++++++ src/mcpbr/benchmarks/swebench_pro.py | 69 ++++++++++++++++++++++++---- tests/test_swebench_pro.py | 29 ++++++++++++ 3 files changed, 112 insertions(+), 10 deletions(-) diff --git a/src/mcpbr/benchmark_preflight.py b/src/mcpbr/benchmark_preflight.py index fd11807..1996ce0 100644 --- a/src/mcpbr/benchmark_preflight.py +++ b/src/mcpbr/benchmark_preflight.py @@ -7,6 +7,7 @@ import asyncio import logging +import subprocess from dataclasses import dataclass, field from typing import Any @@ -137,6 +138,27 @@ async def _run_preflight_tests( return TestResults(passed=passed, total=len(tests), details=results) +async def _prune_docker_images() -> None: + """Remove unused Docker images to free disk space. + + Called after each preflight instance to prevent disk exhaustion. + Each SWE-bench Pro image is ~1.5GB and each instance uses a unique image, + so pruning after cleanup is critical for processing many instances. 
+ """ + try: + proc = await asyncio.create_subprocess_exec( + "docker", + "image", + "prune", + "-af", + stdout=subprocess.DEVNULL, + stderr=subprocess.DEVNULL, + ) + await proc.wait() + except Exception: + logger.debug("Failed to prune Docker images") + + async def _check_single_instance( benchmark: Any, task: dict[str, Any], @@ -283,6 +305,8 @@ async def _check_single_instance( await env.cleanup() except Exception: logger.warning(f"Failed to clean up container for {instance_id}") + # Prune unused images to free disk space (each image is ~1.5GB) + await _prune_docker_images() async def run_benchmark_preflight( diff --git a/src/mcpbr/benchmarks/swebench_pro.py b/src/mcpbr/benchmarks/swebench_pro.py index 7e93f27..ac5a447 100644 --- a/src/mcpbr/benchmarks/swebench_pro.py +++ b/src/mcpbr/benchmarks/swebench_pro.py @@ -405,35 +405,59 @@ def get_default_sandbox_level(self) -> str | None: return None +_KNOWN_RUNNERS = ("jest", "mocha", "vitest", "ospec", "ava") + + async def _detect_js_runner(env: "TaskEnvironment", workdir: str | None = None) -> str: """Detect the JavaScript/TypeScript test runner installed in a container. - Checks for common test runners in order of preference: - jest, mocha, vitest. Falls back to "jest" if none detected. + Detection strategy: + 1. Check node_modules/.bin/ for known runner binaries + 2. Parse package.json scripts.test for runner hints + 3. Fall back to "npm" (runs npm test) if nothing is detected Args: env: Task environment with exec_command. workdir: Working directory inside container. Returns: - Runner name: "jest", "mocha", or "vitest". + Runner name: "jest", "mocha", "vitest", "ospec", "ava", or "npm". 
""" # Check for runner binaries in node_modules detect_cmd = ( "if [ -f node_modules/.bin/jest ]; then echo jest; " "elif [ -f node_modules/.bin/mocha ]; then echo mocha; " "elif [ -f node_modules/.bin/vitest ]; then echo vitest; " - "else echo jest; fi" + "elif [ -f node_modules/.bin/ospec ]; then echo ospec; " + "elif [ -f node_modules/.bin/ava ]; then echo ava; " + "else echo none; fi" ) try: exit_code, stdout, _ = await env.exec_command(detect_cmd, timeout=10, workdir=workdir) if exit_code == 0 and stdout: runner = stdout.strip().split("\n")[-1].strip() - if runner in ("jest", "mocha", "vitest"): + if runner in _KNOWN_RUNNERS: return runner except Exception: - logger.debug("Failed to detect JS test runner, defaulting to jest") - return "jest" + logger.debug("Failed to detect JS test runner from node_modules") + + # Fallback: parse package.json scripts.test for runner hints + pkg_cmd = ( + "node -e \"try{const p=require('./package.json');" + "console.log(p.scripts&&p.scripts.test||'')}catch(e){console.log('')}\" 2>/dev/null" + ) + try: + exit_code, stdout, _ = await env.exec_command(pkg_cmd, timeout=10, workdir=workdir) + if exit_code == 0 and stdout: + test_script = stdout.strip().split("\n")[-1].strip().lower() + for runner in _KNOWN_RUNNERS: + if runner in test_script: + return runner + except Exception: + logger.debug("Failed to detect JS test runner from package.json") + + # Ultimate fallback: use npm test + return "npm" def _build_pro_test_command( @@ -495,7 +519,7 @@ def _build_js_test_command(test: str, runner: str, activate: str = "") -> str: Args: test: Test identifier in "file | description" format. - runner: Test runner name ("jest", "mocha", or "vitest"). + runner: Test runner name ("jest", "mocha", "vitest", "ospec", "ava", "npm"). activate: Optional conda activation prefix. 
Returns: @@ -516,7 +540,6 @@ def _build_js_test_command(test: str, runner: str, activate: str = "") -> str: test_name = test if runner == "mocha": - # mocha: npx mocha --grep "pattern" cmd = f"{activate}npx mocha" if file_path: cmd += f" {shlex.quote(file_path)}" @@ -526,7 +549,6 @@ def _build_js_test_command(test: str, runner: str, activate: str = "") -> str: return cmd if runner == "vitest": - # vitest: npx vitest run -t "pattern" cmd = f"{activate}npx vitest run" if file_path: cmd += f" {shlex.quote(file_path)}" @@ -535,6 +557,33 @@ def _build_js_test_command(test: str, runner: str, activate: str = "") -> str: cmd += " 2>&1" return cmd + if runner == "ospec": + # ospec: run file directly with node (ospec tests are self-executing) + if file_path: + cmd = f"{activate}node {shlex.quote(file_path)} 2>&1" + else: + cmd = f"{activate}npx ospec 2>&1" + return cmd + + if runner == "ava": + cmd = f"{activate}npx ava" + if file_path: + cmd += f" {shlex.quote(file_path)}" + if test_name and test_name != "test suite": + cmd += f" -m {shlex.quote(test_name)}" + cmd += " 2>&1" + return cmd + + if runner == "npm": + # Fallback: use npm test, passing file as argument if possible + if file_path: + cmd = f"{activate}npm test -- {shlex.quote(file_path)} 2>&1" + elif test_name: + cmd = f"{activate}npm test 2>&1" + else: + cmd = f"{activate}npm test 2>&1" + return cmd + # Default: jest cmd = f"{activate}npx jest" if file_path: diff --git a/tests/test_swebench_pro.py b/tests/test_swebench_pro.py index afa44c5..551d6a4 100644 --- a/tests/test_swebench_pro.py +++ b/tests/test_swebench_pro.py @@ -170,6 +170,35 @@ def test_mocha_test_suite_format(self) -> None: assert "test/tests/LoginFacadeTest.js" in cmd assert "--grep" not in cmd + def test_ospec_runner_file(self) -> None: + """ospec runs test files directly with node.""" + cmd = _build_pro_test_command( + "test/tests/LoginFacadeTest.js | test suite", "ts", js_runner="ospec" + ) + assert "node" in cmd + assert 
"test/tests/LoginFacadeTest.js" in cmd + + def test_ava_runner(self) -> None: + """ava runner uses -m for test name matching.""" + cmd = _build_pro_test_command("test/database.js | Test db methods", "js", js_runner="ava") + assert "npx ava" in cmd + assert "test/database.js" in cmd + assert "-m" in cmd + assert "Test db methods" in cmd + + def test_npm_fallback_with_file(self) -> None: + """npm fallback passes file via -- to npm test.""" + cmd = _build_pro_test_command( + "test/tests/LoginFacadeTest.js | test suite", "ts", js_runner="npm" + ) + assert "npm test" in cmd + assert "test/tests/LoginFacadeTest.js" in cmd + + def test_npm_fallback_no_file(self) -> None: + """npm fallback with no file runs plain npm test.""" + cmd = _build_pro_test_command("should work", "js", js_runner="npm") + assert "npm test" in cmd + def test_prebuilt_conda_activation(self) -> None: cmd = _build_pro_test_command("TestFoo", "go", uses_prebuilt=True) assert "conda activate testbed" in cmd From 136a331232d8a3cb77d26452d1a3cb589d8468d3 Mon Sep 17 00:00:00 2001 From: Grey Newell Date: Thu, 26 Feb 2026 11:07:10 -0500 Subject: [PATCH 09/14] feat: use official SWE-bench Pro run scripts for test execution Replace custom language-specific test command building (jest/mocha/go test) with official run_script.sh + parser.py from scaleapi/SWE-bench_Pro-os. Each of the 11 repos has unique test infrastructure that the official scripts handle correctly (e.g., Redis for NodeBB, ansible-test for ansible, custom runners for tutanota). Parser runs locally on the host, avoiding Python dependency in Go/JS/TS container images. 
Co-Authored-By: Claude Opus 4.6 --- CHANGELOG.md | 10 +- src/mcpbr/benchmark_preflight.py | 305 +++++++------ src/mcpbr/benchmarks/swebench_pro.py | 657 +++++++++++++++------------ tests/test_swebench_pro.py | 427 ++++++++++++----- 4 files changed, 857 insertions(+), 542 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 54bf643..7af89fe 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,7 +11,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - **SWE-bench Pro benchmark**: Multi-language benchmark support (Python, Go, TypeScript, JavaScript) with 731 instances across 11 repositories - DockerHub-hosted pre-built images via `dockerhub_tag` field - - Language-aware test runners (Go `go test`, TS/JS `npx jest`, Python delegates to existing) + - Official run scripts from `scaleapi/SWE-bench_Pro-os` for per-repo test infrastructure - Filter by language or repository substring with `--filter-category` - **Preflight check command**: `mcpbr preflight` validates golden patches pass all tests before evaluation - Concurrent validation with configurable parallelism (`--max-concurrent`) @@ -20,6 +20,14 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - **Case-insensitive test list field access**: `get_test_list_field()` helper supports both SWE-bench (`FAIL_TO_PASS`) and SWE-bench Pro (`fail_to_pass`) conventions - **Docker image override support**: `_image_override` task field allows benchmarks to specify custom Docker images +### Changed + +- **SWE-bench Pro test execution**: Replaced custom language-specific test command building (jest/mocha/go test) with official `run_script.sh` + `parser.py` from `scaleapi/SWE-bench_Pro-os` + - Each of the 11 repos has unique test infrastructure (e.g., Redis for NodeBB, `ansible-test` for ansible, custom runners for tutanota) that the official scripts handle correctly + - Parser runs locally on the host, avoiding Python dependency in Go/JS/TS container images + 
- Scripts repo is shallow-cloned and cached in `~/.cache/mcpbr/swebench-pro-scripts/` + - Falls back to standard `evaluate_patch()` for Python tasks without official scripts + ## [0.14.0] - 2026-02-13 ### Added diff --git a/src/mcpbr/benchmark_preflight.py b/src/mcpbr/benchmark_preflight.py index 1996ce0..a1a9af9 100644 --- a/src/mcpbr/benchmark_preflight.py +++ b/src/mcpbr/benchmark_preflight.py @@ -11,9 +11,14 @@ from dataclasses import dataclass, field from typing import Any +from .benchmarks.swebench_pro import ( + _ensure_run_scripts_repo, + _get_instance_scripts, + _match_test_results, + _run_official_tests, +) from .docker_env import DockerEnvironmentManager, TaskEnvironment from .evaluation import ( - TestResults, _apply_test_patch, apply_patch, get_test_list_field, @@ -56,88 +61,6 @@ def success_rate(self) -> float: return (self.passed / self.total) * 100.0 -async def _run_preflight_tests( - env: TaskEnvironment, - tests: list[str], - language: str, - timeout: int = 300, - uses_conda: bool = False, - workdir: str | None = None, - repo: str | None = None, -) -> TestResults: - """Run tests using the appropriate language-specific runner. - - For Python, delegates to the standard run_tests(). For Go, JavaScript, - and TypeScript, builds language-specific commands (go test, npx jest). - - Args: - env: Task environment. - tests: List of test identifiers. - language: Programming language. - timeout: Timeout per test in seconds. - uses_conda: Whether to activate conda environment. - workdir: Working directory inside container. - repo: Repository name (used for Python test specs). - - Returns: - TestResults with pass/fail counts. 
- """ - if language == "python": - return await run_tests( - env, - tests, - timeout=timeout, - uses_prebuilt=uses_conda, - workdir=workdir, - repo=repo, - ) - - # Non-Python: use language-specific test commands - from .benchmarks.swebench_pro import _build_pro_test_command, _detect_js_runner - - if not tests: - return TestResults(passed=0, total=0, details=[]) - - # Detect JS/TS test runner once per instance - js_runner = "jest" - if language in ("typescript", "javascript", "ts", "js"): - js_runner = await _detect_js_runner(env, workdir=workdir) - - results = [] - passed = 0 - - for test in tests: - test_cmd = _build_pro_test_command(test, language, uses_conda, js_runner=js_runner) - try: - exit_code, stdout, stderr = await env.exec_command( - test_cmd, timeout=timeout, workdir=workdir - ) - test_passed = exit_code == 0 - if test_passed: - passed += 1 - results.append( - { - "test": test, - "passed": test_passed, - "exit_code": exit_code, - "output": stdout[:1000] if stdout else "", - "error": stderr[:1000] if stderr else "", - } - ) - except TimeoutError: - results.append( - { - "test": test, - "passed": False, - "exit_code": -1, - "output": "", - "error": "Test timed out", - } - ) - - return TestResults(passed=passed, total=len(tests), details=results) - - async def _prune_docker_images() -> None: """Remove unused Docker images to free disk space. @@ -167,6 +90,10 @@ async def _check_single_instance( ) -> PreflightResult: """Validate a single benchmark instance by applying the golden patch. + For SWE-bench Pro tasks (identified by having a dockerhub_tag), uses + official run scripts from scaleapi/SWE-bench_Pro-os. For standard + SWE-bench tasks, falls back to the existing test runner logic. + Args: benchmark: Benchmark instance with create_environment method. task: Task dictionary with patch, test_patch, fail_to_pass, pass_to_pass. 
@@ -231,63 +158,13 @@ async def _check_single_instance( workdir=eval_workdir, ) - # Parse test lists (handle both uppercase and lowercase field names) - fail_to_pass_str = get_test_list_field(task, "fail_to_pass") - pass_to_pass_str = get_test_list_field(task, "pass_to_pass") - fail_to_pass_tests = parse_test_list(fail_to_pass_str) - pass_to_pass_tests = parse_test_list(pass_to_pass_str) - - # SWE-bench Pro images don't use conda, so skip conda activation - # even though uses_prebuilt is True (it only means "image was pulled") - uses_conda = env.uses_prebuilt and not task.get("dockerhub_tag") - - # Run fail_to_pass tests (all must PASS with golden patch) - ftp_results = await _run_preflight_tests( - env, - fail_to_pass_tests, - language=language, - timeout=timeout, - uses_conda=uses_conda, - workdir=eval_workdir, - repo=task.get("repo"), - ) + # Try official SWE-bench Pro run scripts first (for all languages) + if task.get("dockerhub_tag"): + return await _check_with_official_scripts(env, task, instance_id, language, timeout) - # Run pass_to_pass tests (all must still PASS) - ptp_results = await _run_preflight_tests( - env, - pass_to_pass_tests[:10], - language=language, - timeout=timeout, - uses_conda=uses_conda, - workdir=eval_workdir, - repo=task.get("repo"), - ) - - # Determine status - all_ftp_pass = ftp_results.passed == ftp_results.total and ftp_results.total > 0 - all_ptp_pass = ptp_results.passed == ptp_results.total - - if all_ftp_pass and all_ptp_pass: - status = "passed" - error_msg = None - else: - status = "failed" - parts = [] - if not all_ftp_pass: - parts.append(f"fail_to_pass: {ftp_results.passed}/{ftp_results.total} passed") - if not all_ptp_pass: - parts.append(f"pass_to_pass: {ptp_results.passed}/{ptp_results.total} passed") - error_msg = "; ".join(parts) - - return PreflightResult( - instance_id=instance_id, - status=status, - fail_to_pass_passed=ftp_results.passed, - fail_to_pass_total=ftp_results.total, - 
pass_to_pass_passed=ptp_results.passed, - pass_to_pass_total=ptp_results.total, - error=error_msg, - language=language, + # Fallback: standard SWE-bench test runner (Python only) + return await _check_with_standard_runner( + env, task, instance_id, language, eval_workdir, timeout ) except Exception as e: @@ -309,6 +186,156 @@ async def _check_single_instance( await _prune_docker_images() +async def _check_with_official_scripts( + env: TaskEnvironment, + task: dict[str, Any], + instance_id: str, + language: str, + timeout: int, +) -> PreflightResult: + """Run preflight using official SWE-bench Pro run scripts. + + Args: + env: Task environment. + task: Task dictionary. + instance_id: Instance ID. + language: Programming language. + timeout: Test timeout in seconds. + + Returns: + PreflightResult. + """ + try: + scripts_repo = _ensure_run_scripts_repo() + run_script, parser_script = _get_instance_scripts(scripts_repo, instance_id) + except (FileNotFoundError, subprocess.CalledProcessError) as e: + return PreflightResult( + instance_id=instance_id, + status="error", + error=f"Failed to get run scripts: {e}", + language=language, + ) + + # Run tests using official scripts + parsed_results = await _run_official_tests( + env, task, run_script, parser_script, timeout=timeout + ) + + # Parse expected test lists + fail_to_pass_str = get_test_list_field(task, "fail_to_pass") + pass_to_pass_str = get_test_list_field(task, "pass_to_pass") + fail_to_pass_tests = parse_test_list(fail_to_pass_str) + pass_to_pass_tests = parse_test_list(pass_to_pass_str) + + # Match parsed results against expectations + ftp_results, ptp_results = _match_test_results( + parsed_results, fail_to_pass_tests, pass_to_pass_tests + ) + + # Determine status + all_ftp_pass = ftp_results.passed == ftp_results.total and ftp_results.total > 0 + all_ptp_pass = ptp_results.passed == ptp_results.total + + if all_ftp_pass and all_ptp_pass: + status = "passed" + error_msg = None + else: + status = "failed" + parts 
= [] + if not all_ftp_pass: + parts.append(f"fail_to_pass: {ftp_results.passed}/{ftp_results.total} passed") + if not all_ptp_pass: + parts.append(f"pass_to_pass: {ptp_results.passed}/{ptp_results.total} passed") + error_msg = "; ".join(parts) + + return PreflightResult( + instance_id=instance_id, + status=status, + fail_to_pass_passed=ftp_results.passed, + fail_to_pass_total=ftp_results.total, + pass_to_pass_passed=ptp_results.passed, + pass_to_pass_total=ptp_results.total, + error=error_msg, + language=language, + ) + + +async def _check_with_standard_runner( + env: TaskEnvironment, + task: dict[str, Any], + instance_id: str, + language: str, + eval_workdir: str | None, + timeout: int, +) -> PreflightResult: + """Run preflight using standard SWE-bench test runner (Python). + + Falls back to this for standard SWE-bench tasks that don't have + official run scripts (non-Pro tasks). + + Args: + env: Task environment. + task: Task dictionary. + instance_id: Instance ID. + language: Programming language. + eval_workdir: Working directory inside container. + timeout: Test timeout in seconds. + + Returns: + PreflightResult. 
+ """ + fail_to_pass_str = get_test_list_field(task, "fail_to_pass") + pass_to_pass_str = get_test_list_field(task, "pass_to_pass") + fail_to_pass_tests = parse_test_list(fail_to_pass_str) + pass_to_pass_tests = parse_test_list(pass_to_pass_str) + + uses_conda = env.uses_prebuilt and not task.get("dockerhub_tag") + + ftp_results = await run_tests( + env, + fail_to_pass_tests, + timeout=timeout, + uses_prebuilt=uses_conda, + workdir=eval_workdir, + repo=task.get("repo"), + ) + + ptp_results = await run_tests( + env, + pass_to_pass_tests[:10], + timeout=timeout, + uses_prebuilt=uses_conda, + workdir=eval_workdir, + repo=task.get("repo"), + ) + + all_ftp_pass = ftp_results.passed == ftp_results.total and ftp_results.total > 0 + all_ptp_pass = ptp_results.passed == ptp_results.total + + if all_ftp_pass and all_ptp_pass: + status = "passed" + error_msg = None + else: + status = "failed" + parts = [] + if not all_ftp_pass: + parts.append(f"fail_to_pass: {ftp_results.passed}/{ftp_results.total} passed") + if not all_ptp_pass: + parts.append(f"pass_to_pass: {ptp_results.passed}/{ptp_results.total} passed") + error_msg = "; ".join(parts) + + return PreflightResult( + instance_id=instance_id, + status=status, + fail_to_pass_passed=ftp_results.passed, + fail_to_pass_total=ftp_results.total, + pass_to_pass_passed=ptp_results.passed, + pass_to_pass_total=ptp_results.total, + error=error_msg, + language=language, + ) + + async def run_benchmark_preflight( benchmark: Any, tasks: list[dict[str, Any]], diff --git a/src/mcpbr/benchmarks/swebench_pro.py b/src/mcpbr/benchmarks/swebench_pro.py index ac5a447..406ed66 100644 --- a/src/mcpbr/benchmarks/swebench_pro.py +++ b/src/mcpbr/benchmarks/swebench_pro.py @@ -9,9 +9,17 @@ - Multi-language test runners (Python, Go, TypeScript, JavaScript) - Lowercase field names (fail_to_pass instead of FAIL_TO_PASS) - Language metadata per task (repo_language field) + +Test execution uses official run scripts from scaleapi/SWE-bench_Pro-os, +which 
handle per-repo test infrastructure (e.g., Redis for NodeBB, +ansible-test for ansible, custom runners for tutanota). """ +import json import logging +import subprocess +import tempfile +from pathlib import Path from typing import Any from datasets import load_dataset @@ -19,10 +27,12 @@ from ..docker_env import DockerEnvironmentManager, TaskEnvironment from ..evaluation import ( EvaluationResult, + TestResults, + _apply_test_patch, + apply_patch, evaluate_patch, get_test_list_field, parse_test_list, - run_tests, ) from .base import BenchmarkTask @@ -40,6 +50,298 @@ # DockerHub registry prefix for SWE-bench Pro pre-built images SWEBENCH_PRO_IMAGE_PREFIX = "jefzda/sweap-images" +# Git URL for the official SWE-bench Pro run scripts repository +_RUN_SCRIPTS_REPO = "https://github.com/scaleapi/SWE-bench_Pro-os.git" + +# Default cache directory for cloned run scripts +_DEFAULT_CACHE_DIR = Path.home() / ".cache" / "mcpbr" / "swebench-pro-scripts" + + +def _ensure_run_scripts_repo(cache_dir: Path | None = None) -> Path: + """Clone or update the official SWE-bench Pro run scripts repository. + + Performs a shallow clone of scaleapi/SWE-bench_Pro-os into the cache + directory. If the repo already exists, reuses it. + + Args: + cache_dir: Directory to clone into. Defaults to ~/.cache/mcpbr/swebench-pro-scripts/. + + Returns: + Path to the cloned repository root. 
+ """ + repo_dir = cache_dir or _DEFAULT_CACHE_DIR + + if (repo_dir / "run_scripts").is_dir(): + logger.debug("Run scripts repo already cached at %s", repo_dir) + return repo_dir + + repo_dir.mkdir(parents=True, exist_ok=True) + + logger.info("Cloning SWE-bench Pro run scripts to %s", repo_dir) + subprocess.run( + [ + "git", + "clone", + "--depth", + "1", + "--filter=blob:none", + "--sparse", + _RUN_SCRIPTS_REPO, + str(repo_dir), + ], + check=True, + capture_output=True, + text=True, + ) + # Sparse checkout only the run_scripts directory + subprocess.run( + ["git", "sparse-checkout", "set", "run_scripts"], + cwd=str(repo_dir), + check=True, + capture_output=True, + text=True, + ) + + return repo_dir + + +def _get_instance_scripts(repo_path: Path, instance_id: str) -> tuple[str, str]: + """Read the run_script.sh and parser.py for a specific instance. + + Args: + repo_path: Path to the cloned SWE-bench_Pro-os repository. + instance_id: Instance ID matching the directory name in run_scripts/. + + Returns: + Tuple of (run_script_content, parser_content). + + Raises: + FileNotFoundError: If instance scripts don't exist. + """ + instance_dir = repo_path / "run_scripts" / instance_id + + run_script_path = instance_dir / "run_script.sh" + parser_path = instance_dir / "parser.py" + + if not run_script_path.exists(): + raise FileNotFoundError( + f"No run_script.sh found for instance {instance_id} at {run_script_path}" + ) + if not parser_path.exists(): + raise FileNotFoundError(f"No parser.py found for instance {instance_id} at {parser_path}") + + return run_script_path.read_text(), parser_path.read_text() + + +async def _run_official_tests( + env: TaskEnvironment, + task: dict[str, Any], + run_script: str, + parser_script: str, + timeout: int = 300, +) -> TestResults: + """Run tests using the official SWE-bench Pro run scripts. 
+ + Copies run_script.sh into the container, executes it with the selected + test files, captures stdout/stderr, then runs parser.py locally on + the host to parse results. + + Args: + env: Task environment with a running container. + task: SWE-bench Pro task dictionary (needs selected_test_files_to_run). + run_script: Content of run_script.sh. + parser_script: Content of parser.py. + timeout: Timeout for test execution in seconds. + + Returns: + TestResults with parsed pass/fail counts. + """ + eval_workdir = "/app" if env.uses_prebuilt else None + + # Build test files argument from selected_test_files_to_run + selected_files_raw = task.get("selected_test_files_to_run", "[]") + try: + selected_files = ( + json.loads(selected_files_raw) + if isinstance(selected_files_raw, str) + else selected_files_raw + ) + except (json.JSONDecodeError, TypeError): + selected_files = [] + + if not selected_files: + logger.warning("No selected_test_files_to_run for %s", task.get("instance_id")) + return TestResults(passed=0, total=0, details=[]) + + # Write run_script.sh to container + await env.write_file("run_script.sh", run_script, workdir=eval_workdir) + await env.exec_command("chmod +x /app/run_script.sh", timeout=10, workdir=eval_workdir) + + # Join test files as comma-separated argument + test_files_arg = ",".join(str(f) for f in selected_files) + + # Run the official test script + try: + _exit_code, stdout, stderr = await env.exec_command( + f"bash /app/run_script.sh '{test_files_arg}'", + timeout=timeout, + workdir=eval_workdir, + ) + except TimeoutError: + logger.warning("Test execution timed out for %s", task.get("instance_id")) + return TestResults(passed=0, total=0, details=[{"error": "Test timed out"}]) + + # Run parser.py locally on host to parse the test output + return _parse_test_output_locally( + parser_script, stdout, stderr, task.get("instance_id", "unknown") + ) + + +def _parse_test_output_locally( + parser_script: str, + stdout: str, + stderr: str, + 
instance_id: str, +) -> TestResults: + """Run parser.py as a local subprocess to parse test output. + + The parser runs on the host (not in the container) because Go/JS/TS + container images may not have Python installed. + + Args: + parser_script: Content of parser.py. + stdout: Captured stdout from test execution. + stderr: Captured stderr from test execution. + instance_id: Instance ID for logging. + + Returns: + TestResults parsed from the output. + """ + with tempfile.TemporaryDirectory() as tmpdir: + tmp = Path(tmpdir) + parser_path = tmp / "parser.py" + stdout_path = tmp / "stdout.log" + stderr_path = tmp / "stderr.log" + output_path = tmp / "output.json" + + parser_path.write_text(parser_script) + stdout_path.write_text(stdout or "") + stderr_path.write_text(stderr or "") + + try: + result = subprocess.run( + [ + "python3", + str(parser_path), + str(stdout_path), + str(stderr_path), + str(output_path), + ], + capture_output=True, + text=True, + timeout=30, + ) + except subprocess.TimeoutExpired: + logger.warning("Parser timed out for %s", instance_id) + return TestResults(passed=0, total=0, details=[{"error": "Parser timed out"}]) + + if result.returncode != 0: + logger.warning("Parser failed for %s: %s", instance_id, result.stderr[:500]) + return TestResults( + passed=0, + total=0, + details=[{"error": f"Parser failed: {result.stderr[:500]}"}], + ) + + if not output_path.exists(): + logger.warning("Parser produced no output.json for %s", instance_id) + return TestResults(passed=0, total=0, details=[]) + + try: + output_data = json.loads(output_path.read_text()) + except json.JSONDecodeError: + logger.warning("Parser output is not valid JSON for %s", instance_id) + return TestResults(passed=0, total=0, details=[]) + + tests = output_data.get("tests", []) + passed = sum(1 for t in tests if t.get("status") == "PASSED") + total = len(tests) + + details = [ + { + "test": t.get("name", "unknown"), + "passed": t.get("status") == "PASSED", + "status": 
t.get("status", "UNKNOWN"), + } + for t in tests + ] + + return TestResults(passed=passed, total=total, details=details) + + +def _match_test_results( + parsed_results: TestResults, + fail_to_pass: list[str], + pass_to_pass: list[str], +) -> tuple[TestResults, TestResults]: + """Match parsed test results against expected fail_to_pass and pass_to_pass lists. + + The parser produces test names like "file.js | test description" or + "TestFoo" etc. We match these against the expected test lists from + the dataset. + + Args: + parsed_results: TestResults from _run_official_tests. + fail_to_pass: Expected tests that should pass (were failing before fix). + pass_to_pass: Expected tests that should still pass (regression check). + + Returns: + Tuple of (fail_to_pass_results, pass_to_pass_results). + """ + # Build a lookup of parsed test name → status + parsed_status: dict[str, str] = {} + for detail in parsed_results.details: + name = detail.get("test", "") + status = detail.get("status", "UNKNOWN") + if name: + parsed_status[name] = status + + def _check_tests(expected: list[str]) -> TestResults: + if not expected: + return TestResults(passed=0, total=0, details=[]) + + passed = 0 + details = [] + for test_name in expected: + # Try exact match first + status = parsed_status.get(test_name) + + # If no exact match, try substring matching (parser may add + # file prefixes or slightly different formatting) + if status is None: + for parsed_name, parsed_stat in parsed_status.items(): + if test_name in parsed_name or parsed_name in test_name: + status = parsed_stat + break + + test_passed = status == "PASSED" + if test_passed: + passed += 1 + details.append( + { + "test": test_name, + "passed": test_passed, + "status": status or "NOT_FOUND", + } + ) + + return TestResults(passed=passed, total=len(expected), details=details) + + ftp_results = _check_tests(fail_to_pass) + ptp_results = _check_tests(pass_to_pass) + + return ftp_results, ptp_results + class SWEBenchProBenchmark: 
"""SWE-bench Pro benchmark implementation. @@ -50,13 +352,26 @@ class SWEBenchProBenchmark: name = "swe-bench-pro" - def __init__(self, dataset: str = "ScaleAI/SWE-bench_Pro"): + def __init__( + self, + dataset: str = "ScaleAI/SWE-bench_Pro", + scripts_cache_dir: Path | None = None, + ): """Initialize SWE-bench Pro benchmark. Args: dataset: HuggingFace dataset identifier. + scripts_cache_dir: Override cache dir for run scripts repo. """ self.dataset = dataset + self._scripts_cache_dir = scripts_cache_dir + self._scripts_repo_path: Path | None = None + + def _get_scripts_repo(self) -> Path: + """Lazily clone and return the run scripts repo path.""" + if self._scripts_repo_path is None: + self._scripts_repo_path = _ensure_run_scripts_repo(self._scripts_cache_dir) + return self._scripts_repo_path def load_tasks( self, @@ -180,8 +495,9 @@ async def evaluate( ) -> dict[str, Any]: """Evaluate a patch for SWE-bench Pro task. - For Python tasks, delegates to the existing evaluate_patch(). - For Go/TypeScript/JavaScript, uses language-specific test runners. + Uses official run scripts from scaleapi/SWE-bench_Pro-os for all + languages. Falls back to the standard evaluate_patch() for Python + tasks when official scripts are not available. Args: env: Task environment. @@ -191,37 +507,54 @@ async def evaluate( Returns: Dictionary with evaluation results including 'resolved' boolean. 
""" - language = task.get("repo_language", "python").lower() + instance_id = task.get("instance_id", "") + + # Try to use official run scripts + try: + scripts_repo = self._get_scripts_repo() + run_script, parser_script = _get_instance_scripts(scripts_repo, instance_id) + return await self._evaluate_with_official_scripts( + env, task, solution, run_script, parser_script + ) + except FileNotFoundError: + logger.info( + "No official scripts for %s, falling back to standard evaluation", + instance_id, + ) + # Fallback for Python tasks without official scripts + language = task.get("repo_language", "python").lower() if language == "python": - # Delegate Python evaluation to existing logic eval_result: EvaluationResult = await evaluate_patch(env, task, solution) return self._eval_result_to_dict(eval_result) - # For non-Python languages, use language-specific evaluation - return await self._evaluate_multilang(env, task, solution, language) + return { + "resolved": False, + "patch_applied": False, + "eval_error": f"No official run scripts found for {instance_id}", + } - async def _evaluate_multilang( + async def _evaluate_with_official_scripts( self, env: TaskEnvironment, task: dict[str, Any], patch: str, - language: str, + run_script: str, + parser_script: str, ) -> dict[str, Any]: - """Evaluate a patch using language-specific test runners. + """Evaluate using official SWE-bench Pro run scripts. Args: env: Task environment. task: SWE-bench Pro task dictionary. patch: Unified diff patch to evaluate. - language: Programming language (go, typescript, javascript). + run_script: Content of run_script.sh. + parser_script: Content of parser.py. Returns: Dictionary with evaluation results. 
""" - from ..evaluation import _apply_test_patch, apply_patch - - # SWE-bench Pro images use /app as their working directory + language = task.get("repo_language", "python").lower() eval_workdir = "/app" if env.uses_prebuilt else None applied, error = await apply_patch(env, patch, workdir=eval_workdir) @@ -241,111 +574,40 @@ async def _evaluate_multilang( workdir=eval_workdir, ) + # Run tests using official scripts + parsed_results = await _run_official_tests( + env, task, run_script, parser_script, timeout=300 + ) + + # Match against expected test lists fail_to_pass_str = get_test_list_field(task, "fail_to_pass") pass_to_pass_str = get_test_list_field(task, "pass_to_pass") fail_to_pass_tests = parse_test_list(fail_to_pass_str) pass_to_pass_tests = parse_test_list(pass_to_pass_str) - fail_to_pass_results = await self._run_lang_tests( - env, fail_to_pass_tests, language, workdir=eval_workdir - ) - pass_to_pass_results = await self._run_lang_tests( - env, pass_to_pass_tests[:10], language, workdir=eval_workdir + ftp_results, ptp_results = _match_test_results( + parsed_results, fail_to_pass_tests, pass_to_pass_tests ) resolved = ( - fail_to_pass_results.passed == fail_to_pass_results.total - and fail_to_pass_results.total > 0 - and pass_to_pass_results.passed == pass_to_pass_results.total + ftp_results.passed == ftp_results.total + and ftp_results.total > 0 + and ptp_results.passed == ptp_results.total ) result: dict[str, Any] = {"resolved": resolved, "patch_applied": True} - if fail_to_pass_results: + if ftp_results: result["fail_to_pass"] = { - "passed": fail_to_pass_results.passed, - "total": fail_to_pass_results.total, + "passed": ftp_results.passed, + "total": ftp_results.total, } - if pass_to_pass_results: + if ptp_results: result["pass_to_pass"] = { - "passed": pass_to_pass_results.passed, - "total": pass_to_pass_results.total, + "passed": ptp_results.passed, + "total": ptp_results.total, } return result - async def _run_lang_tests( - self, - env: 
TaskEnvironment, - tests: list[str], - language: str, - workdir: str | None = None, - timeout: int = 120, - ) -> Any: - """Run tests using language-specific commands. - - Args: - env: Task environment. - tests: List of test identifiers. - language: Programming language. - workdir: Working directory. - timeout: Timeout per test in seconds. - - Returns: - TestResults instance. - """ - if language == "python": - return await run_tests( - env, tests, timeout=timeout, uses_prebuilt=env.uses_prebuilt, workdir=workdir - ) - - # For non-Python, build language-specific commands and run - from ..evaluation import TestResults - - if not tests: - return TestResults(passed=0, total=0, details=[]) - - # Detect JS/TS test runner once (avoids repeated detection per test) - js_runner = "jest" - if language in ("typescript", "javascript", "ts", "js"): - js_runner = await _detect_js_runner(env, workdir=workdir) - - results = [] - passed = 0 - - for test in tests: - # SWE-bench Pro images don't use conda — never prepend conda activation - # for non-Python languages (uses_prebuilt=False disables it) - test_cmd = _build_pro_test_command( - test, language, uses_prebuilt=False, js_runner=js_runner - ) - try: - exit_code, stdout, stderr = await env.exec_command( - test_cmd, timeout=timeout, workdir=workdir - ) - test_passed = exit_code == 0 - if test_passed: - passed += 1 - results.append( - { - "test": test, - "passed": test_passed, - "exit_code": exit_code, - "output": stdout[:1000] if stdout else "", - "error": stderr[:1000] if stderr else "", - } - ) - except TimeoutError: - results.append( - { - "test": test, - "passed": False, - "exit_code": -1, - "output": "", - "error": "Test timed out", - } - ) - - return TestResults(passed=passed, total=len(tests), details=results) - def _eval_result_to_dict(self, eval_result: EvaluationResult) -> dict[str, Any]: """Convert EvaluationResult to dictionary format.""" result: dict[str, Any] = { @@ -403,192 +665,3 @@ def get_prompt_template(self) -> 
str: def get_default_sandbox_level(self) -> str | None: """Get default sandbox level for SWE-bench Pro.""" return None - - -_KNOWN_RUNNERS = ("jest", "mocha", "vitest", "ospec", "ava") - - -async def _detect_js_runner(env: "TaskEnvironment", workdir: str | None = None) -> str: - """Detect the JavaScript/TypeScript test runner installed in a container. - - Detection strategy: - 1. Check node_modules/.bin/ for known runner binaries - 2. Parse package.json scripts.test for runner hints - 3. Fall back to "npm" (runs npm test) if nothing is detected - - Args: - env: Task environment with exec_command. - workdir: Working directory inside container. - - Returns: - Runner name: "jest", "mocha", "vitest", "ospec", "ava", or "npm". - """ - # Check for runner binaries in node_modules - detect_cmd = ( - "if [ -f node_modules/.bin/jest ]; then echo jest; " - "elif [ -f node_modules/.bin/mocha ]; then echo mocha; " - "elif [ -f node_modules/.bin/vitest ]; then echo vitest; " - "elif [ -f node_modules/.bin/ospec ]; then echo ospec; " - "elif [ -f node_modules/.bin/ava ]; then echo ava; " - "else echo none; fi" - ) - try: - exit_code, stdout, _ = await env.exec_command(detect_cmd, timeout=10, workdir=workdir) - if exit_code == 0 and stdout: - runner = stdout.strip().split("\n")[-1].strip() - if runner in _KNOWN_RUNNERS: - return runner - except Exception: - logger.debug("Failed to detect JS test runner from node_modules") - - # Fallback: parse package.json scripts.test for runner hints - pkg_cmd = ( - "node -e \"try{const p=require('./package.json');" - "console.log(p.scripts&&p.scripts.test||'')}catch(e){console.log('')}\" 2>/dev/null" - ) - try: - exit_code, stdout, _ = await env.exec_command(pkg_cmd, timeout=10, workdir=workdir) - if exit_code == 0 and stdout: - test_script = stdout.strip().split("\n")[-1].strip().lower() - for runner in _KNOWN_RUNNERS: - if runner in test_script: - return runner - except Exception: - logger.debug("Failed to detect JS test runner from 
package.json") - - # Ultimate fallback: use npm test - return "npm" - - -def _build_pro_test_command( - test: str, - language: str, - uses_prebuilt: bool = False, - js_runner: str = "jest", -) -> str: - """Build a language-specific test command for SWE-bench Pro. - - Test ID formats by language: - Go: "TestFoo", "TestFoo/subtest", "TestFoo/#00" - JS/TS: "file.js | test description", "file.ts | suite name" - Python: "tests/test_foo.py::TestClass::test_method" - - Args: - test: Test identifier. - language: Programming language (python, go, typescript, javascript, js, ts). - uses_prebuilt: Whether a pre-built image is being used (adds conda activation). - js_runner: JavaScript test runner ("jest", "mocha", or "vitest"). - - Returns: - Shell command string to run the test. - """ - import shlex - - from ..evaluation import _build_test_command, _normalize_test_id - - if language == "python": - return _build_test_command(test, uses_prebuilt) - - test = _normalize_test_id(test) - - if uses_prebuilt: - activate = "source /opt/miniconda3/etc/profile.d/conda.sh && conda activate testbed && " - else: - activate = "" - - if language == "go": - # Go test IDs are always function names, optionally with subtests via / - # e.g., "TestFoo", "TestFoo/subtest", "TestFoo/#00", "TestFoo//api/v1" - # Always use -run with the top-level test name and ./... to search all packages - if "/" in test: - # Extract top-level test name (before first /) - top_level = test.split("/", 1)[0] - return f"{activate}go test -v -count=1 -run {shlex.quote(top_level)} ./... 2>&1" - else: - return f"{activate}go test -v -count=1 -run {shlex.quote(test)} ./... 2>&1" - - if language in ("typescript", "javascript", "ts", "js"): - return _build_js_test_command(test, js_runner, activate) - - # Fallback: try running as-is - return f"{activate}{test} 2>&1" - - -def _build_js_test_command(test: str, runner: str, activate: str = "") -> str: - """Build a JS/TS test command for the detected runner. 
- - Args: - test: Test identifier in "file | description" format. - runner: Test runner name ("jest", "mocha", "vitest", "ospec", "ava", "npm"). - activate: Optional conda activation prefix. - - Returns: - Shell command string. - """ - import shlex - - # Parse "file | description" format - file_path = "" - test_name = "" - if " | " in test: - parts = test.split(" | ", 1) - file_path = parts[0].strip() - test_name = parts[1].strip() - elif "/" in test or test.endswith((".ts", ".js", ".tsx", ".jsx")): - file_path = test - else: - test_name = test - - if runner == "mocha": - cmd = f"{activate}npx mocha" - if file_path: - cmd += f" {shlex.quote(file_path)}" - if test_name and test_name != "test suite": - cmd += f" --grep {shlex.quote(test_name)}" - cmd += " --timeout 30000 2>&1" - return cmd - - if runner == "vitest": - cmd = f"{activate}npx vitest run" - if file_path: - cmd += f" {shlex.quote(file_path)}" - if test_name and test_name != "test suite": - cmd += f" -t {shlex.quote(test_name)}" - cmd += " 2>&1" - return cmd - - if runner == "ospec": - # ospec: run file directly with node (ospec tests are self-executing) - if file_path: - cmd = f"{activate}node {shlex.quote(file_path)} 2>&1" - else: - cmd = f"{activate}npx ospec 2>&1" - return cmd - - if runner == "ava": - cmd = f"{activate}npx ava" - if file_path: - cmd += f" {shlex.quote(file_path)}" - if test_name and test_name != "test suite": - cmd += f" -m {shlex.quote(test_name)}" - cmd += " 2>&1" - return cmd - - if runner == "npm": - # Fallback: use npm test, passing file as argument if possible - if file_path: - cmd = f"{activate}npm test -- {shlex.quote(file_path)} 2>&1" - elif test_name: - cmd = f"{activate}npm test 2>&1" - else: - cmd = f"{activate}npm test 2>&1" - return cmd - - # Default: jest - cmd = f"{activate}npx jest" - if file_path: - cmd += f" {shlex.quote(file_path)}" - if test_name and test_name != "test suite": - cmd += f" -t {shlex.quote(test_name)}" - cmd += " --verbose --no-cache 2>&1" - return 
cmd diff --git a/tests/test_swebench_pro.py b/tests/test_swebench_pro.py index 551d6a4..5c265e1 100644 --- a/tests/test_swebench_pro.py +++ b/tests/test_swebench_pro.py @@ -1,13 +1,20 @@ """Tests for SWE-bench Pro benchmark implementation.""" -from unittest.mock import MagicMock, patch +import json +from pathlib import Path +from unittest.mock import AsyncMock, MagicMock, patch + +import pytest from mcpbr.benchmarks.swebench_pro import ( PRO_LANGUAGES, SWEBENCH_PRO_IMAGE_PREFIX, SWEBenchProBenchmark, - _build_pro_test_command, + _get_instance_scripts, + _match_test_results, + _parse_test_output_locally, ) +from mcpbr.evaluation import TestResults class TestSWEBenchProInit: @@ -25,6 +32,10 @@ def test_name(self) -> None: benchmark = SWEBenchProBenchmark() assert benchmark.name == "swe-bench-pro" + def test_custom_scripts_cache_dir(self) -> None: + benchmark = SWEBenchProBenchmark(scripts_cache_dir=Path("/tmp/test-scripts")) + assert benchmark._scripts_cache_dir == Path("/tmp/test-scripts") + class TestSWEBenchProNormalizeTask: """Tests for task normalization.""" @@ -90,122 +101,318 @@ def test_normalize_go_task(self) -> None: assert bt.metadata["repo_language"] == "go" -class TestBuildProTestCommand: - """Tests for language-specific test command building.""" - - def test_python_delegates(self) -> None: - """Python should delegate to existing _build_test_command.""" - cmd = _build_pro_test_command("tests/test_foo.py::test_bar", "python") - assert "pytest" in cmd or "test_foo" in cmd - - def test_go_function_name(self) -> None: - cmd = _build_pro_test_command("TestRouteMatching", "go") - assert "go test" in cmd - assert "-run" in cmd - assert "TestRouteMatching" in cmd - assert "./..." 
in cmd - - def test_go_subtest(self) -> None: - """Go subtests (TestFoo/#00, TestFoo/subtest) use top-level name with -run.""" - cmd = _build_pro_test_command("TestParseResourcePath/#00", "go") - assert "go test" in cmd - assert "-run" in cmd - assert "TestParseResourcePath" in cmd - assert "./..." in cmd - - def test_typescript_file(self) -> None: - cmd = _build_pro_test_command("src/__tests__/parser.test.ts", "typescript") - assert "npx jest" in cmd - assert "parser.test.ts" in cmd - - def test_typescript_pattern(self) -> None: - cmd = _build_pro_test_command("should parse tokens", "typescript") - assert "npx jest" in cmd - assert "-t" in cmd - - def test_javascript_file(self) -> None: - cmd = _build_pro_test_command("test/index.test.js", "javascript") - assert "npx jest" in cmd - assert "index.test.js" in cmd - - def test_javascript_pattern(self) -> None: - cmd = _build_pro_test_command("handles edge case", "javascript") - assert "npx jest" in cmd - - def test_js_pipe_format_jest(self) -> None: - """SWE-bench Pro JS format with jest runner.""" - cmd = _build_pro_test_command( - "test/database.js | Test database key methods", "js", js_runner="jest" +class TestGetInstanceScripts: + """Tests for _get_instance_scripts.""" + + def test_reads_scripts_from_directory(self, tmp_path: Path) -> None: + instance_id = "instance_test__repo-abc123" + instance_dir = tmp_path / "run_scripts" / instance_id + instance_dir.mkdir(parents=True) + + run_script_content = "#!/bin/bash\necho 'test'" + parser_content = "import sys\nprint('parser')" + + (instance_dir / "run_script.sh").write_text(run_script_content) + (instance_dir / "parser.py").write_text(parser_content) + + run_script, parser = _get_instance_scripts(tmp_path, instance_id) + assert run_script == run_script_content + assert parser == parser_content + + def test_raises_on_missing_run_script(self, tmp_path: Path) -> None: + instance_id = "instance_missing__repo-abc123" + instance_dir = tmp_path / "run_scripts" / instance_id 
+ instance_dir.mkdir(parents=True) + (instance_dir / "parser.py").write_text("parser") + + with pytest.raises(FileNotFoundError, match=r"run_script\.sh"): + _get_instance_scripts(tmp_path, instance_id) + + def test_raises_on_missing_parser(self, tmp_path: Path) -> None: + instance_id = "instance_missing__repo-abc123" + instance_dir = tmp_path / "run_scripts" / instance_id + instance_dir.mkdir(parents=True) + (instance_dir / "run_script.sh").write_text("script") + + with pytest.raises(FileNotFoundError, match=r"parser\.py"): + _get_instance_scripts(tmp_path, instance_id) + + def test_raises_on_missing_directory(self, tmp_path: Path) -> None: + (tmp_path / "run_scripts").mkdir(parents=True) + with pytest.raises(FileNotFoundError): + _get_instance_scripts(tmp_path, "nonexistent_instance") + + +class TestMatchTestResults: + """Tests for _match_test_results.""" + + def test_all_tests_pass(self) -> None: + parsed = TestResults( + passed=2, + total=2, + details=[ + {"test": "TestFoo", "passed": True, "status": "PASSED"}, + {"test": "TestBar", "passed": True, "status": "PASSED"}, + ], ) - assert "npx jest" in cmd - assert "test/database.js" in cmd - assert "-t" in cmd - assert "Test database key methods" in cmd - - def test_js_pipe_format_mocha(self) -> None: - """SWE-bench Pro JS format with mocha runner.""" - cmd = _build_pro_test_command( - "test/database.js | Test database key methods", "js", js_runner="mocha" + ftp, ptp = _match_test_results(parsed, ["TestFoo"], ["TestBar"]) + assert ftp.passed == 1 + assert ftp.total == 1 + assert ptp.passed == 1 + assert ptp.total == 1 + + def test_fail_to_pass_fails(self) -> None: + parsed = TestResults( + passed=1, + total=2, + details=[ + {"test": "TestFoo", "passed": False, "status": "FAILED"}, + {"test": "TestBar", "passed": True, "status": "PASSED"}, + ], ) - assert "npx mocha" in cmd - assert "test/database.js" in cmd - assert "--grep" in cmd - assert "Test database key methods" in cmd - - def test_ts_test_suite_format(self) 
-> None: - """TS 'test suite' format runs the whole file without -t filter.""" - cmd = _build_pro_test_command( - "test/tests/LoginFacadeTest.js | test suite", "ts", js_runner="jest" + ftp, ptp = _match_test_results(parsed, ["TestFoo"], ["TestBar"]) + assert ftp.passed == 0 + assert ftp.total == 1 + assert ptp.passed == 1 + assert ptp.total == 1 + + def test_substring_matching(self) -> None: + """Tests that weren't found by exact match fall back to substring.""" + parsed = TestResults( + passed=1, + total=1, + details=[ + { + "test": "test/database.js | Test database key methods", + "passed": True, + "status": "PASSED", + }, + ], ) - assert "npx jest" in cmd - assert "test/tests/LoginFacadeTest.js" in cmd - assert "-t" not in cmd - - def test_mocha_test_suite_format(self) -> None: - """Mocha 'test suite' runs whole file without --grep.""" - cmd = _build_pro_test_command( - "test/tests/LoginFacadeTest.js | test suite", "js", js_runner="mocha" + ftp, _ptp = _match_test_results( + parsed, + ["test/database.js | Test database key methods"], + [], ) - assert "npx mocha" in cmd - assert "test/tests/LoginFacadeTest.js" in cmd - assert "--grep" not in cmd - - def test_ospec_runner_file(self) -> None: - """ospec runs test files directly with node.""" - cmd = _build_pro_test_command( - "test/tests/LoginFacadeTest.js | test suite", "ts", js_runner="ospec" + assert ftp.passed == 1 + assert ftp.total == 1 + + def test_empty_lists(self) -> None: + parsed = TestResults(passed=0, total=0, details=[]) + ftp, ptp = _match_test_results(parsed, [], []) + assert ftp.total == 0 + assert ptp.total == 0 + + def test_test_not_found(self) -> None: + parsed = TestResults( + passed=1, + total=1, + details=[ + {"test": "TestFoo", "passed": True, "status": "PASSED"}, + ], ) - assert "node" in cmd - assert "test/tests/LoginFacadeTest.js" in cmd - - def test_ava_runner(self) -> None: - """ava runner uses -m for test name matching.""" - cmd = _build_pro_test_command("test/database.js | Test db 
methods", "js", js_runner="ava") - assert "npx ava" in cmd - assert "test/database.js" in cmd - assert "-m" in cmd - assert "Test db methods" in cmd - - def test_npm_fallback_with_file(self) -> None: - """npm fallback passes file via -- to npm test.""" - cmd = _build_pro_test_command( - "test/tests/LoginFacadeTest.js | test suite", "ts", js_runner="npm" + ftp, _ptp = _match_test_results(parsed, ["TestMissing"], []) + assert ftp.passed == 0 + assert ftp.total == 1 + assert ftp.details[0]["status"] == "NOT_FOUND" + + def test_multiple_fail_to_pass(self) -> None: + parsed = TestResults( + passed=2, + total=3, + details=[ + {"test": "TestA", "passed": True, "status": "PASSED"}, + {"test": "TestB", "passed": True, "status": "PASSED"}, + {"test": "TestC", "passed": False, "status": "FAILED"}, + ], ) - assert "npm test" in cmd - assert "test/tests/LoginFacadeTest.js" in cmd + ftp, _ptp = _match_test_results(parsed, ["TestA", "TestB", "TestC"], []) + assert ftp.passed == 2 + assert ftp.total == 3 - def test_npm_fallback_no_file(self) -> None: - """npm fallback with no file runs plain npm test.""" - cmd = _build_pro_test_command("should work", "js", js_runner="npm") - assert "npm test" in cmd - def test_prebuilt_conda_activation(self) -> None: - cmd = _build_pro_test_command("TestFoo", "go", uses_prebuilt=True) - assert "conda activate testbed" in cmd +class TestParseTestOutputLocally: + """Tests for _parse_test_output_locally.""" + + def test_parses_mocha_json(self) -> None: + """Test parsing mocha JSON output (NodeBB style).""" + mocha_output = json.dumps( + { + "passes": [ + {"file": "test/database.js", "fullTitle": "Test db key methods"}, + {"file": "test/meta.js", "fullTitle": "Meta functions"}, + ], + "failures": [ + {"file": "test/translator.js", "fullTitle": "Translator shim"}, + ], + "pending": [], + } + ) - def test_unknown_language_fallback(self) -> None: - cmd = _build_pro_test_command("test_something", "rust") - assert "test_something" in cmd + # Create a 
minimal parser.py that handles mocha JSON + parser_script = """ +import json +import sys +import dataclasses +from enum import Enum +from pathlib import Path +from typing import List + +class TestStatus(Enum): + PASSED = 1 + FAILED = 2 + SKIPPED = 3 + ERROR = 4 + +@dataclasses.dataclass +class TestResult: + name: str + status: TestStatus + +def parse_test_output(stdout_content, stderr_content): + results = [] + try: + data = json.loads(stdout_content) + for t in data.get("passes", []): + results.append(TestResult(name=t.get("fullTitle", ""), status=TestStatus.PASSED)) + for t in data.get("failures", []): + results.append(TestResult(name=t.get("fullTitle", ""), status=TestStatus.FAILED)) + except json.JSONDecodeError: + pass + return results + +def export_to_json(results, output_path): + json_results = { + "tests": [ + {"name": r.name, "status": r.status.name} for r in results + ] + } + with open(output_path, "w") as f: + json.dump(json_results, f) + +def main(stdout_path, stderr_path, output_path): + with open(stdout_path) as f: + stdout_content = f.read() + with open(stderr_path) as f: + stderr_content = f.read() + results = parse_test_output(stdout_content, stderr_content) + export_to_json(results, output_path) + +if __name__ == "__main__": + main(Path(sys.argv[1]), Path(sys.argv[2]), Path(sys.argv[3])) +""" + + result = _parse_test_output_locally(parser_script, mocha_output, "", "test-instance") + assert result.total == 3 + assert result.passed == 2 + + def test_handles_parser_error(self) -> None: + """Test that parser errors are handled gracefully.""" + bad_parser = "raise ValueError('broken')" + result = _parse_test_output_locally(bad_parser, "output", "err", "test") + assert result.total == 0 + assert result.passed == 0 + + def test_handles_empty_output(self) -> None: + """Test parsing with no test output.""" + parser_script = """ +import json +import sys +from pathlib import Path + +def main(stdout_path, stderr_path, output_path): + with open(output_path, 
"w") as f: + json.dump({"tests": []}, f) + +if __name__ == "__main__": + main(Path(sys.argv[1]), Path(sys.argv[2]), Path(sys.argv[3])) +""" + result = _parse_test_output_locally(parser_script, "", "", "test") + assert result.total == 0 + assert result.passed == 0 + + +class TestRunOfficialTests: + """Tests for _run_official_tests orchestration.""" + + @pytest.mark.asyncio + async def test_runs_script_in_container(self) -> None: + """Test that run_script.sh is written to container and executed.""" + env = MagicMock() + env.uses_prebuilt = True + env.write_file = AsyncMock() + env.exec_command = AsyncMock(return_value=(0, '{"tests":[]}', "")) + + task = { + "instance_id": "test-instance", + "selected_test_files_to_run": '["test/foo.js", "test/bar.js"]', + } + + # Use a simple parser that outputs empty test list + parser = """ +import json +import sys +from pathlib import Path + +def main(stdout_path, stderr_path, output_path): + with open(output_path, "w") as f: + json.dump({"tests": []}, f) + +if __name__ == "__main__": + main(Path(sys.argv[1]), Path(sys.argv[2]), Path(sys.argv[3])) +""" + from mcpbr.benchmarks.swebench_pro import _run_official_tests + + await _run_official_tests(env, task, "#!/bin/bash\necho test", parser) + + # Verify run_script.sh was written to container + env.write_file.assert_called_once_with( + "run_script.sh", "#!/bin/bash\necho test", workdir="/app" + ) + + # Verify exec_command was called (chmod + the actual script run) + assert env.exec_command.call_count == 2 + + @pytest.mark.asyncio + async def test_handles_no_selected_files(self) -> None: + """Test graceful handling when no test files are specified.""" + env = MagicMock() + env.uses_prebuilt = True + + task = { + "instance_id": "test-instance", + "selected_test_files_to_run": "[]", + } + + from mcpbr.benchmarks.swebench_pro import _run_official_tests + + result = await _run_official_tests(env, task, "#!/bin/bash", "parser") + assert result.total == 0 + assert result.passed == 0 + + 
@pytest.mark.asyncio + async def test_handles_timeout(self) -> None: + """Test graceful handling when test execution times out.""" + env = MagicMock() + env.uses_prebuilt = True + env.write_file = AsyncMock() + env.exec_command = AsyncMock( + side_effect=[ + (0, "", ""), # chmod succeeds + TimeoutError("timed out"), # script times out + ] + ) + + task = { + "instance_id": "test-instance", + "selected_test_files_to_run": '["test/foo.js"]', + } + + from mcpbr.benchmarks.swebench_pro import _run_official_tests + + result = await _run_official_tests(env, task, "#!/bin/bash", "parser") + assert result.total == 0 + assert result.passed == 0 class TestSWEBenchProDockerImage: @@ -422,7 +629,7 @@ class TestSWEBenchProEvalResultToDict: """Tests for _eval_result_to_dict helper.""" def test_basic_conversion(self) -> None: - from mcpbr.evaluation import EvaluationResult, TestResults + from mcpbr.evaluation import EvaluationResult benchmark = SWEBenchProBenchmark() result = EvaluationResult( From 8e6beec4aa68291bc53c4c4ef2792659deae2cbf Mon Sep 17 00:00:00 2001 From: Grey Newell Date: Thu, 26 Feb 2026 11:44:04 -0500 Subject: [PATCH 10/14] fix: run before_repo_set_cmd to restore test files after patch application The official SWE-bench Pro evaluation harness runs before_repo_set_cmd (last line) between patch application and test execution. This typically restores specific test files from the fix commit, e.g.: git checkout -- test/tests/SomeTest.ts Without this, 2 tutanota instances fail because their test files end up in a broken state after patch + test_patch application. 
Co-Authored-By: Claude Opus 4.6 --- src/mcpbr/benchmark_preflight.py | 4 +++ src/mcpbr/benchmarks/swebench_pro.py | 52 ++++++++++++++++++++++++++++ 2 files changed, 56 insertions(+) diff --git a/src/mcpbr/benchmark_preflight.py b/src/mcpbr/benchmark_preflight.py index a1a9af9..2675fcc 100644 --- a/src/mcpbr/benchmark_preflight.py +++ b/src/mcpbr/benchmark_preflight.py @@ -15,6 +15,7 @@ _ensure_run_scripts_repo, _get_instance_scripts, _match_test_results, + _run_before_repo_set_cmd, _run_official_tests, ) from .docker_env import DockerEnvironmentManager, TaskEnvironment @@ -148,6 +149,9 @@ async def _check_single_instance( if test_patch: await _apply_test_patch(env, test_patch, workdir=eval_workdir) + # Run before_repo_set_cmd (restores specific test files from fix commit) + await _run_before_repo_set_cmd(env, task, workdir=eval_workdir) + # Reinstall package in editable mode so patched code is used. # SWE-bench Pro images install the package into site-packages; # without this step, tests would import the old (unpatched) code. diff --git a/src/mcpbr/benchmarks/swebench_pro.py b/src/mcpbr/benchmarks/swebench_pro.py index 406ed66..348b881 100644 --- a/src/mcpbr/benchmarks/swebench_pro.py +++ b/src/mcpbr/benchmarks/swebench_pro.py @@ -133,6 +133,53 @@ def _get_instance_scripts(repo_path: Path, instance_id: str) -> tuple[str, str]: return run_script_path.read_text(), parser_path.read_text() +async def _run_before_repo_set_cmd( + env: TaskEnvironment, + task: dict[str, Any], + workdir: str | None = None, +) -> None: + """Run the before_repo_set_cmd from the dataset after patch application. + + The official SWE-bench Pro evaluation harness runs the last line of + before_repo_set_cmd between applying the patch and running tests. + This typically restores specific test files from the fix commit, e.g.: + git checkout -- test/tests/SomeTest.ts + + The earlier lines (git reset, git clean, git checkout ) are + redundant because our apply_patch() already handles that. 
+ + Args: + env: Task environment. + task: SWE-bench Pro task dictionary. + workdir: Working directory inside container. + """ + before_cmd = task.get("before_repo_set_cmd", "") + if not before_cmd or not before_cmd.strip(): + return + + # The official harness only uses the last line + last_line = before_cmd.strip().split("\n")[-1].strip() + if not last_line: + return + + # Skip if it's just a git reset/clean/checkout (already done by apply_patch) + # We only care about "git checkout -- " which restores specific files + if last_line.startswith("git checkout") and " -- " not in last_line: + return + if last_line.startswith(("git reset", "git clean")): + return + + logger.debug("Running before_repo_set_cmd for %s: %s", task.get("instance_id"), last_line) + try: + await env.exec_command(last_line, timeout=60, workdir=workdir) + except Exception: + logger.warning( + "before_repo_set_cmd failed for %s: %s", + task.get("instance_id"), + last_line, + ) + + async def _run_official_tests( env: TaskEnvironment, task: dict[str, Any], @@ -565,6 +612,11 @@ async def _evaluate_with_official_scripts( if test_patch: await _apply_test_patch(env, test_patch, workdir=eval_workdir) + # Run before_repo_set_cmd (last line only, matching official harness). 
+ # This typically restores specific test files from the fix commit, + # e.g., "git checkout -- test/file.ts" + await _run_before_repo_set_cmd(env, task, workdir=eval_workdir) + # Reinstall package so patched code is active (SWE-bench Pro images # install into site-packages, not editable mode) if eval_workdir and language == "python": From e05964c99b377a51a2542d4bc6da7a196f727f62 Mon Sep 17 00:00:00 2001 From: Grey Newell Date: Thu, 26 Feb 2026 15:06:48 -0500 Subject: [PATCH 11/14] feat: add --shard-index/--shard-total to preflight CLI --- src/mcpbr/cli.py | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/src/mcpbr/cli.py b/src/mcpbr/cli.py index a41cdc9..17b33f0 100644 --- a/src/mcpbr/cli.py +++ b/src/mcpbr/cli.py @@ -1588,6 +1588,18 @@ def benchmarks() -> None: default=300, help="Timeout per test in seconds (default: 300).", ) +@click.option( + "--shard-index", + type=int, + default=None, + help="Shard index for parallel runs (0-based).", +) +@click.option( + "--shard-total", + type=int, + default=None, + help="Total number of shards for parallel runs.", +) def preflight( config_path: str | None, benchmark_name: str, @@ -1597,6 +1609,8 @@ def preflight( fail_fast: bool, filter_category: tuple[str, ...], timeout: int, + shard_index: int | None, + shard_total: int | None, ) -> None: """Validate golden patches pass all tests before evaluation. 
@@ -1635,6 +1649,20 @@ def preflight( console.print("[yellow]No tasks found matching the criteria.[/yellow]") return + # Apply sharding if requested + if shard_index is not None and shard_total is not None: + if shard_index < 0 or shard_index >= shard_total: + console.print( + f"[red]Invalid shard-index {shard_index} for shard-total {shard_total}[/red]" + ) + sys.exit(1) + tasks = tasks[shard_index::shard_total] + console.print(f"Shard {shard_index + 1}/{shard_total}: {len(tasks)} instance(s)\n") + + if not tasks: + console.print("[yellow]No tasks in this shard.[/yellow]") + return + console.print(f"Validating {len(tasks)} instance(s)...\n") # Create Docker manager From 9bd511359839eea5a00d4f8aff0572b487e46545 Mon Sep 17 00:00:00 2001 From: Grey Newell Date: Thu, 26 Feb 2026 16:48:56 -0500 Subject: [PATCH 12/14] fix: use docker system prune for aggressive disk cleanup Protonmail/webclients images are ~4.8GB compressed. docker image prune alone doesn't clear build cache and volumes. docker system prune -af --volumes reclaims all reclaimable space between instances. --- src/mcpbr/benchmark_preflight.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/src/mcpbr/benchmark_preflight.py b/src/mcpbr/benchmark_preflight.py index 2675fcc..df65e22 100644 --- a/src/mcpbr/benchmark_preflight.py +++ b/src/mcpbr/benchmark_preflight.py @@ -62,25 +62,27 @@ def success_rate(self) -> float: return (self.passed / self.total) * 100.0 -async def _prune_docker_images() -> None: - """Remove unused Docker images to free disk space. +async def _prune_docker() -> None: + """Remove unused Docker images, build cache, and volumes to free disk space. Called after each preflight instance to prevent disk exhaustion. - Each SWE-bench Pro image is ~1.5GB and each instance uses a unique image, - so pruning after cleanup is critical for processing many instances. 
+ Each SWE-bench Pro image is ~1-5GB (protonmail is ~4.8GB compressed) + and each instance uses a unique image, so aggressive pruning after + cleanup is critical for processing many instances. """ try: proc = await asyncio.create_subprocess_exec( "docker", - "image", + "system", "prune", "-af", + "--volumes", stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, ) await proc.wait() except Exception: - logger.debug("Failed to prune Docker images") + logger.debug("Failed to prune Docker system") async def _check_single_instance( @@ -187,7 +189,7 @@ async def _check_single_instance( except Exception: logger.warning(f"Failed to clean up container for {instance_id}") # Prune unused images to free disk space (each image is ~1.5GB) - await _prune_docker_images() + await _prune_docker() async def _check_with_official_scripts( From ef15f255c69e041e686c5e3a4d0ac76ea88ef101 Mon Sep 17 00:00:00 2001 From: Grey Newell Date: Sun, 15 Mar 2026 11:27:28 -0400 Subject: [PATCH 13/14] docs: remove dead links to greynewell.com blog post Co-Authored-By: Claude Sonnet 4.6 --- README.md | 3 +-- site/pages/about.njk | 4 ++-- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 3a71ecd..99353ee 100644 --- a/README.md +++ b/README.md @@ -55,7 +55,6 @@ mcpbr runs controlled experiments: same model, same tasks, same environment - th - **Real GitHub issues** from SWE-bench (not toy examples) - **Reproducible results** via Docker containers with pinned dependencies -> Read the full origin story: **[Why I Built mcpbr](https://greynewell.com/blog/why-i-built-mcpbr/)** — the problem, the approach, and where the project is headed. ## Research Paper @@ -1536,4 +1535,4 @@ MIT - see [LICENSE](LICENSE) for details. 
--- -Built by [Grey Newell](https://greynewell.com) | [Why I Built mcpbr](https://greynewell.com/blog/why-i-built-mcpbr/) | [About](https://mcpbr.org/about/) +Built by [Grey Newell](https://greynewell.com) | [About](https://mcpbr.org/about/) diff --git a/site/pages/about.njk b/site/pages/about.njk index eb7df5c..5f039a0 100644 --- a/site/pages/about.njk +++ b/site/pages/about.njk @@ -49,7 +49,7 @@ headExtra: |

mcpbr was created by Grey Newell after identifying a critical gap in the MCP ecosystem: no tool existed to measure whether an MCP server actually made an AI agent better at its job.

Existing coding benchmarks like SWE-bench measured raw language model capabilities. MCP server developers relied on anecdotal evidence and demo videos. There was no way to answer the fundamental question: does adding this MCP server to an agent improve its performance on real tasks?

mcpbr was built to answer that question with hard data.

-

"No available tool allowed users to easily measure the performance improvement of introducing their MCP server to an agent."

Grey Newell, "Why I Built mcpbr"

+

"No available tool allowed users to easily measure the performance improvement of introducing their MCP server to an agent."

— Grey Newell

The Problem mcpbr Solves

Before mcpbr, MCP server evaluation looked like this:

@@ -84,7 +84,7 @@ headExtra: | GitHubgithub.com/greynewell/mcpbr PyPIpypi.org/project/mcpbr npmnpmjs.com/package/mcpbr-cli - Blog PostWhy I Built mcpbr + Creatorgreynewell.com SchemaFluxschemaflux.dev LicenseMIT From 8b8be831ed19d2fedad5b8a3ff32e181d0ba3510 Mon Sep 17 00:00:00 2001 From: Grey Newell Date: Sun, 15 Mar 2026 11:33:44 -0400 Subject: [PATCH 14/14] docs: add blog section with SWE-bench posts Co-Authored-By: Claude Sonnet 4.6 --- README.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/README.md b/README.md index 99353ee..0bde867 100644 --- a/README.md +++ b/README.md @@ -55,6 +55,10 @@ mcpbr runs controlled experiments: same model, same tasks, same environment - th - **Real GitHub issues** from SWE-bench (not toy examples) - **Reproducible results** via Docker containers with pinned dependencies +## Blog + +- [SWE-bench Verified Is Broken: 5 Things I Found in the Source Code](https://greynewell.com/blog/swe-bench-verified-broken-5-things-source-code/) +- [SWE-bench Tests Run 6x Faster on ARM64 with Native Containers](https://greynewell.com/blog/swe-bench-arm64-native-containers-6x-faster/) ## Research Paper