From 5108867c35ddfbf9224eb20f4344a1096d2d4f2e Mon Sep 17 00:00:00 2001
From: Grey Newell <greyshipscode@gmail.com>
Date: Wed, 25 Feb 2026 16:30:59 -0500
Subject: [PATCH 01/14] feat: add SWE-bench Pro benchmark and preflight check
 system

Add multi-language benchmark support (Python, Go, TypeScript, JavaScript)
with 731 instances from ScaleAI/SWE-bench_Pro, plus a preflight validation
system that verifies golden patches pass all tests before agent evaluation.

Key changes:
- SWEBenchProBenchmark class with DockerHub image support and
  language-specific test runners (Go: go test, TS/JS: npx jest)
- Preflight check system (mcpbr preflight CLI command) that validates
  golden patches in Docker environments
- Docker image override support (_image_override, _workdir_override)
  for non-GHCR registries
- Entrypoint override for images with /bin/bash entrypoint
- Editable reinstall after patching for SWE-bench Pro Python images
- Case-insensitive test list field access (fail_to_pass/FAIL_TO_PASS)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 CHANGELOG.md                         |  13 +
 src/mcpbr/benchmark_preflight.py     | 271 +++++++++++++++++
 src/mcpbr/benchmarks/__init__.py     |   6 +
 src/mcpbr/benchmarks/swebench_pro.py | 428 +++++++++++++++++++++++++++
 src/mcpbr/cli.py                     | 160 ++++++++++
 src/mcpbr/config.py                  |   1 +
 src/mcpbr/docker_env.py              |  82 ++++-
 src/mcpbr/evaluation.py              |  38 ++-
 src/mcpbr/swebench_test_specs.py     |  21 ++
 tests/test_benchmark_preflight.py    | 340 +++++++++++++++++++++
 tests/test_swebench_pro.py           | 383 ++++++++++++++++++++++++
 uv.lock                              | 378 +----------------------
 12 files changed, 1732 insertions(+), 389 deletions(-)
 create mode 100644 src/mcpbr/benchmark_preflight.py
 create mode 100644 src/mcpbr/benchmarks/swebench_pro.py
 create mode 100644 tests/test_benchmark_preflight.py
 create mode 100644 tests/test_swebench_pro.py

diff --git a/CHANGELOG.md b/CHANGELOG.md
index e642c0b..54bf643 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,19 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+### Added
+
+- **SWE-bench Pro benchmark**: Multi-language benchmark support (Python, Go, TypeScript, JavaScript) with 731 instances across 11 repositories
+  - DockerHub-hosted pre-built images via `dockerhub_tag` field
+  - Language-aware test runners (Go `go test`, TS/JS `npx jest`, Python delegates to existing)
+  - Filter by language or repository substring with `--filter-category`
+- **Preflight check command**: `mcpbr preflight` validates golden patches pass all tests before evaluation
+  - Concurrent validation with configurable parallelism (`--max-concurrent`)
+  - Fail-fast mode (`--fail-fast`) for quick CI checks
+  - Per-instance and aggregate reporting with language breakdown
+- **Case-insensitive test list field access**: `get_test_list_field()` helper supports both SWE-bench (`FAIL_TO_PASS`) and SWE-bench Pro (`fail_to_pass`) conventions
+- **Docker image override support**: `_image_override` task field allows benchmarks to specify custom Docker images
+
 ## [0.14.0] - 2026-02-13
 
 ### Added
diff --git a/src/mcpbr/benchmark_preflight.py b/src/mcpbr/benchmark_preflight.py
new file mode 100644
index 0000000..e95efe1
--- /dev/null
+++ b/src/mcpbr/benchmark_preflight.py
@@ -0,0 +1,271 @@
+"""Preflight validation for benchmarks.
+
+Validates that golden patches pass all tests in Docker environments before
+running agent evaluations. This catches environment/configuration issues
+early, ensuring evaluation infrastructure works correctly.
+"""
+
+import asyncio
+import logging
+from dataclasses import dataclass, field
+from typing import Any
+
+from .docker_env import DockerEnvironmentManager, TaskEnvironment
+from .evaluation import (
+    _apply_test_patch,
+    apply_patch,
+    get_test_list_field,
+    parse_test_list,
+    run_tests,
+)
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class PreflightResult:
+    """Result of a single preflight instance check."""
+
+    instance_id: str
+    status: str  # "passed", "failed", "error"
+    fail_to_pass_passed: int = 0
+    fail_to_pass_total: int = 0
+    pass_to_pass_passed: int = 0
+    pass_to_pass_total: int = 0
+    error: str | None = None
+    language: str = "unknown"
+
+
+@dataclass
+class PreflightReport:
+    """Aggregate preflight validation report."""
+
+    total: int = 0
+    passed: int = 0
+    failed: int = 0
+    errors: int = 0
+    results: list[PreflightResult] = field(default_factory=list)
+
+    @property
+    def success_rate(self) -> float:
+        """Calculate success rate as a percentage."""
+        if self.total == 0:
+            return 0.0
+        return (self.passed / self.total) * 100.0
+
+
+async def _check_single_instance(
+    benchmark: Any,
+    task: dict[str, Any],
+    docker_manager: DockerEnvironmentManager,
+    timeout: int = 300,
+) -> PreflightResult:
+    """Validate a single benchmark instance by applying the golden patch.
+
+    Args:
+        benchmark: Benchmark instance with create_environment method.
+        task: Task dictionary with patch, test_patch, fail_to_pass, pass_to_pass.
+        docker_manager: Docker environment manager.
+        timeout: Timeout per test in seconds.
+
+    Returns:
+        PreflightResult for this instance.
+    """
+    instance_id = task.get("instance_id", "unknown")
+    language = task.get("repo_language", "python").lower()
+    env: TaskEnvironment | None = None
+
+    try:
+        # Create Docker environment (skip Claude CLI install — not needed for preflight)
+        preflight_task = dict(task)
+        preflight_task["_skip_cli_install"] = True
+        env = await benchmark.create_environment(preflight_task, docker_manager)
+
+        # Determine eval workdir: SWE-bench Pro images use /app (indicated by
+        # dockerhub_tag), standard SWE-bench uses /testbed.
+        eval_workdir: str | None
+        if env.uses_prebuilt:
+            if task.get("dockerhub_tag"):
+                eval_workdir = "/app"
+            else:
+                eval_workdir = "/testbed"
+        else:
+            eval_workdir = None
+
+        # Apply golden patch
+        golden_patch = task.get("patch", "")
+        if not golden_patch:
+            return PreflightResult(
+                instance_id=instance_id,
+                status="error",
+                error="No golden patch found in task",
+                language=language,
+            )
+
+        applied, error = await apply_patch(env, golden_patch, workdir=eval_workdir)
+        if not applied:
+            return PreflightResult(
+                instance_id=instance_id,
+                status="failed",
+                error=f"Golden patch failed to apply: {error}",
+                language=language,
+            )
+
+        # Apply test patch
+        test_patch = task.get("test_patch", "")
+        if test_patch:
+            await _apply_test_patch(env, test_patch, workdir=eval_workdir)
+
+        # Reinstall package in editable mode so patched code is used.
+        # SWE-bench Pro images install the package into site-packages;
+        # without this step, tests would import the old (unpatched) code.
+        if eval_workdir and language == "python":
+            await env.exec_command(
+                "pip install -e . -q 2>/dev/null || true",
+                timeout=120,
+                workdir=eval_workdir,
+            )
+
+        # Parse test lists (handle both uppercase and lowercase field names)
+        fail_to_pass_str = get_test_list_field(task, "fail_to_pass")
+        pass_to_pass_str = get_test_list_field(task, "pass_to_pass")
+        fail_to_pass_tests = parse_test_list(fail_to_pass_str)
+        pass_to_pass_tests = parse_test_list(pass_to_pass_str)
+
+        # SWE-bench Pro images don't use conda, so skip conda activation
+        # even though uses_prebuilt is True (it only means "image was pulled")
+        uses_conda = env.uses_prebuilt and not task.get("dockerhub_tag")
+
+        # Run fail_to_pass tests (all must PASS with golden patch)
+        ftp_results = await run_tests(
+            env,
+            fail_to_pass_tests,
+            timeout=timeout,
+            uses_prebuilt=uses_conda,
+            workdir=eval_workdir,
+            repo=task.get("repo"),
+        )
+
+        # Run pass_to_pass tests (all must still PASS)
+        ptp_results = await run_tests(
+            env,
+            pass_to_pass_tests[:10],
+            timeout=timeout,
+            uses_prebuilt=uses_conda,
+            workdir=eval_workdir,
+            repo=task.get("repo"),
+        )
+
+        # Determine status
+        all_ftp_pass = ftp_results.passed == ftp_results.total and ftp_results.total > 0
+        all_ptp_pass = ptp_results.passed == ptp_results.total
+
+        if all_ftp_pass and all_ptp_pass:
+            status = "passed"
+            error_msg = None
+        else:
+            status = "failed"
+            parts = []
+            if not all_ftp_pass:
+                parts.append(f"fail_to_pass: {ftp_results.passed}/{ftp_results.total} passed")
+            if not all_ptp_pass:
+                parts.append(f"pass_to_pass: {ptp_results.passed}/{ptp_results.total} passed")
+            error_msg = "; ".join(parts)
+
+        return PreflightResult(
+            instance_id=instance_id,
+            status=status,
+            fail_to_pass_passed=ftp_results.passed,
+            fail_to_pass_total=ftp_results.total,
+            pass_to_pass_passed=ptp_results.passed,
+            pass_to_pass_total=ptp_results.total,
+            error=error_msg,
+            language=language,
+        )
+
+    except Exception as e:
+        logger.exception(f"Preflight error for {instance_id}")
+        return PreflightResult(
+            instance_id=instance_id,
+            status="error",
+            error=str(e),
+            language=language,
+        )
+
+    finally:
+        if env is not None:
+            try:
+                await env.cleanup()
+            except Exception:
+                logger.warning(f"Failed to clean up container for {instance_id}")
+
+
+async def run_benchmark_preflight(
+    benchmark: Any,
+    tasks: list[dict[str, Any]],
+    docker_manager: DockerEnvironmentManager,
+    max_concurrent: int = 4,
+    timeout: int = 300,
+    fail_fast: bool = False,
+) -> PreflightReport:
+    """Run preflight validation on benchmark tasks.
+
+    Applies golden patches and verifies all tests pass for each instance.
+
+    Args:
+        benchmark: Benchmark instance.
+        tasks: List of task dictionaries to validate.
+        docker_manager: Docker environment manager.
+        max_concurrent: Maximum concurrent validations.
+        timeout: Timeout per test in seconds.
+        fail_fast: Stop on first failure.
+
+    Returns:
+        PreflightReport with aggregate results.
+    """
+    report = PreflightReport(total=len(tasks))
+    semaphore = asyncio.Semaphore(max_concurrent)
+
+    async def _check_with_semaphore(task: dict[str, Any]) -> PreflightResult:
+        async with semaphore:
+            return await _check_single_instance(benchmark, task, docker_manager, timeout)
+
+    if fail_fast:
+        # Sequential execution with early exit
+        for task in tasks:
+            result = await _check_with_semaphore(task)
+            report.results.append(result)
+            if result.status == "passed":
+                report.passed += 1
+            elif result.status == "failed":
+                report.failed += 1
+                break
+            else:
+                report.errors += 1
+                break
+    else:
+        # Concurrent execution
+        coros = [_check_with_semaphore(task) for task in tasks]
+        results = await asyncio.gather(*coros, return_exceptions=True)
+
+        for r in results:
+            if isinstance(r, BaseException):
+                report.errors += 1
+                report.results.append(
+                    PreflightResult(
+                        instance_id="unknown",
+                        status="error",
+                        error=str(r),
+                    )
+                )
+            else:
+                preflight_result: PreflightResult = r
+                report.results.append(preflight_result)
+                if preflight_result.status == "passed":
+                    report.passed += 1
+                elif preflight_result.status == "failed":
+                    report.failed += 1
+                else:
+                    report.errors += 1
+
+    return report
diff --git a/src/mcpbr/benchmarks/__init__.py b/src/mcpbr/benchmarks/__init__.py
index 419ceb9..99f48f6 100644
--- a/src/mcpbr/benchmarks/__init__.py
+++ b/src/mcpbr/benchmarks/__init__.py
@@ -29,6 +29,7 @@
 from .mmmu import MMMUBenchmark
 from .repoqa import RepoQABenchmark
 from .swebench import SWEBenchmark
+from .swebench_pro import SWEBenchProBenchmark
 from .terminalbench import TerminalBenchBenchmark
 from .toolbench import ToolBenchBenchmark
 from .truthfulqa import TruthfulQABenchmark
@@ -63,6 +64,7 @@
     "MLAgentBenchBenchmark",
     "MMMUBenchmark",
     "RepoQABenchmark",
+    "SWEBenchProBenchmark",
     "SWEBenchmark",
     "TerminalBenchBenchmark",
     "ToolBenchBenchmark",
@@ -106,6 +108,7 @@
     "mmmu": MMMUBenchmark,
     "longbench": LongBenchBenchmark,
     "adversarial": AdversarialBenchmark,
+    "swe-bench-pro": SWEBenchProBenchmark,
 }
 
 
@@ -137,6 +140,9 @@ def create_benchmark(name: str, **kwargs: Any) -> Benchmark:
     if name in swebench_datasets:
         kwargs["dataset"] = swebench_datasets[name]
 
+    if name == "swe-bench-pro":
+        kwargs["dataset"] = "ScaleAI/SWE-bench_Pro"
+
     return benchmark_class(**kwargs)
 
 
diff --git a/src/mcpbr/benchmarks/swebench_pro.py b/src/mcpbr/benchmarks/swebench_pro.py
new file mode 100644
index 0000000..96e692b
--- /dev/null
+++ b/src/mcpbr/benchmarks/swebench_pro.py
@@ -0,0 +1,428 @@
+"""SWE-bench Pro benchmark implementation.
+
+SWE-bench Pro is a multi-language benchmark with 731 instances across 11 repos
+in Python, Go, TypeScript, and JavaScript. Average solutions span 107.4 lines
+across 4.1 files. Top models achieve ~23% resolution (vs 70%+ on Verified).
+
+Key differences from SWE-bench:
+- Docker images from DockerHub (dockerhub_tag field) instead of GHCR
+- Multi-language test runners (Python, Go, TypeScript, JavaScript)
+- Lowercase field names (fail_to_pass instead of FAIL_TO_PASS)
+- Language metadata per task (repo_language field)
+"""
+
+from typing import Any
+
+from datasets import load_dataset
+
+from ..docker_env import DockerEnvironmentManager, TaskEnvironment
+from ..evaluation import (
+    EvaluationResult,
+    evaluate_patch,
+    get_test_list_field,
+    parse_test_list,
+    run_tests,
+)
+from .base import BenchmarkTask
+
+# Supported languages in SWE-bench Pro
+PRO_LANGUAGES = {"python", "go", "typescript", "javascript"}
+
+# DockerHub registry prefix for SWE-bench Pro pre-built images
+SWEBENCH_PRO_IMAGE_PREFIX = "jefzda/sweap-images"
+
+
+class SWEBenchProBenchmark:
+    """SWE-bench Pro benchmark implementation.
+
+    Multi-language benchmark for evaluating coding agents on real-world
+    software engineering tasks across Python, Go, TypeScript, and JavaScript.
+    """
+
+    name = "swe-bench-pro"
+
+    def __init__(self, dataset: str = "ScaleAI/SWE-bench_Pro"):
+        """Initialize SWE-bench Pro benchmark.
+
+        Args:
+            dataset: HuggingFace dataset identifier.
+        """
+        self.dataset = dataset
+
+    def load_tasks(
+        self,
+        sample_size: int | None = None,
+        task_ids: list[str] | None = None,
+        level: int | None = None,
+        filter_difficulty: list[str] | None = None,
+        filter_category: list[str] | None = None,
+        filter_tags: list[str] | None = None,
+    ) -> list[dict[str, Any]]:
+        """Load tasks from SWE-bench Pro dataset.
+
+        Args:
+            sample_size: Maximum number of tasks to load (None for all).
+            task_ids: Specific task IDs to load (None for all).
+            level: Unused for SWE-bench Pro.
+            filter_difficulty: Unused for SWE-bench Pro.
+            filter_category: Filter by language name (e.g., "python", "go")
+                or repository substring (e.g., "django", "gin-gonic").
+            filter_tags: Unused for SWE-bench Pro.
+
+        Returns:
+            List of SWE-bench Pro task dictionaries.
+        """
+        dataset = load_dataset(self.dataset, split="test")
+
+        # Optimization: early truncation when no filtering is needed
+        needs_full_scan = bool(task_ids) or bool(filter_category)
+        if not needs_full_scan and sample_size is not None and len(dataset) > sample_size:
+            dataset = dataset.select(range(sample_size))
+
+        if task_ids:
+            task_id_set = set(task_ids)
+            tasks = [item for item in dataset if item["instance_id"] in task_id_set]
+        else:
+            tasks = list(dataset)
+
+        if filter_category:
+            filtered = []
+            for task in tasks:
+                repo = task.get("repo", "")
+                language = task.get("repo_language", "").lower()
+                for category in filter_category:
+                    cat_lower = category.lower()
+                    # If the category is a known language, match by language only
+                    if cat_lower in PRO_LANGUAGES:
+                        if cat_lower == language:
+                            filtered.append(task)
+                            break
+                    elif cat_lower in repo.lower():
+                        # Otherwise, match by repo substring
+                        filtered.append(task)
+                        break
+            tasks = filtered
+
+        if sample_size is not None and len(tasks) > sample_size:
+            tasks = tasks[:sample_size]
+
+        return tasks
+
+    def normalize_task(self, task: dict[str, Any]) -> BenchmarkTask:
+        """Convert SWE-bench Pro task to normalized format.
+
+        Handles both lowercase (SWE-bench Pro) and uppercase (SWE-bench)
+        field names for test lists.
+
+        Args:
+            task: SWE-bench Pro task dictionary.
+
+        Returns:
+            Normalized BenchmarkTask.
+        """
+        return BenchmarkTask(
+            task_id=task["instance_id"],
+            problem_statement=task["problem_statement"],
+            repo=task["repo"],
+            commit=task["base_commit"],
+            metadata={
+                "fail_to_pass": get_test_list_field(task, "fail_to_pass"),
+                "pass_to_pass": get_test_list_field(task, "pass_to_pass"),
+                "test_patch": task.get("test_patch", ""),
+                "repo_language": task.get("repo_language", "unknown"),
+            },
+        )
+
+    async def create_environment(
+        self,
+        task: dict[str, Any],
+        docker_manager: DockerEnvironmentManager,
+    ) -> TaskEnvironment:
+        """Create environment for SWE-bench Pro task.
+
+        Injects the DockerHub image override so DockerEnvironmentManager
+        pulls from DockerHub instead of GHCR.
+
+        Args:
+            task: SWE-bench Pro task dictionary.
+            docker_manager: Docker environment manager.
+
+        Returns:
+            TaskEnvironment for the task.
+        """
+        # Inject image override for DockerHub-hosted images
+        # The dockerhub_tag field is the tag portion; prepend the registry prefix
+        # SWE-bench Pro images use /app as workdir (not /testbed)
+        task_copy = dict(task)
+        dockerhub_tag = task.get("dockerhub_tag")
+        if dockerhub_tag:
+            task_copy["_image_override"] = f"{SWEBENCH_PRO_IMAGE_PREFIX}:{dockerhub_tag}"
+            task_copy["_workdir_override"] = "/app"
+
+        return await docker_manager.create_environment(task_copy)
+
+    async def evaluate(
+        self,
+        env: TaskEnvironment,
+        task: dict[str, Any],
+        solution: str,
+    ) -> dict[str, Any]:
+        """Evaluate a patch for SWE-bench Pro task.
+
+        For Python tasks, delegates to the existing evaluate_patch().
+        For Go/TypeScript/JavaScript, uses language-specific test runners.
+
+        Args:
+            env: Task environment.
+            task: SWE-bench Pro task dictionary.
+            solution: Unified diff patch to evaluate.
+
+        Returns:
+            Dictionary with evaluation results including 'resolved' boolean.
+        """
+        language = task.get("repo_language", "python").lower()
+
+        if language == "python":
+            # Delegate Python evaluation to existing logic
+            eval_result: EvaluationResult = await evaluate_patch(env, task, solution)
+            return self._eval_result_to_dict(eval_result)
+
+        # For non-Python languages, use language-specific evaluation
+        return await self._evaluate_multilang(env, task, solution, language)
+
+    async def _evaluate_multilang(
+        self,
+        env: TaskEnvironment,
+        task: dict[str, Any],
+        patch: str,
+        language: str,
+    ) -> dict[str, Any]:
+        """Evaluate a patch using language-specific test runners.
+
+        Args:
+            env: Task environment.
+            task: SWE-bench Pro task dictionary.
+            patch: Unified diff patch to evaluate.
+            language: Programming language (go, typescript, javascript).
+
+        Returns:
+            Dictionary with evaluation results.
+        """
+        from ..evaluation import _apply_test_patch, apply_patch
+
+        # SWE-bench Pro images use /app as their working directory
+        eval_workdir = "/app" if env.uses_prebuilt else None
+
+        applied, error = await apply_patch(env, patch, workdir=eval_workdir)
+        if not applied:
+            return {"resolved": False, "patch_applied": False, "eval_error": error}
+
+        test_patch = task.get("test_patch", "")
+        if test_patch:
+            await _apply_test_patch(env, test_patch, workdir=eval_workdir)
+
+        # Reinstall package so patched code is active (SWE-bench Pro images
+        # install into site-packages, not editable mode)
+        if eval_workdir and language == "python":
+            await env.exec_command(
+                "pip install -e . -q 2>/dev/null || true",
+                timeout=120,
+                workdir=eval_workdir,
+            )
+
+        fail_to_pass_str = get_test_list_field(task, "fail_to_pass")
+        pass_to_pass_str = get_test_list_field(task, "pass_to_pass")
+        fail_to_pass_tests = parse_test_list(fail_to_pass_str)
+        pass_to_pass_tests = parse_test_list(pass_to_pass_str)
+
+        fail_to_pass_results = await self._run_lang_tests(
+            env, fail_to_pass_tests, language, workdir=eval_workdir
+        )
+        pass_to_pass_results = await self._run_lang_tests(
+            env, pass_to_pass_tests[:10], language, workdir=eval_workdir
+        )
+
+        resolved = (
+            fail_to_pass_results.passed == fail_to_pass_results.total
+            and fail_to_pass_results.total > 0
+            and pass_to_pass_results.passed == pass_to_pass_results.total
+        )
+
+        result: dict[str, Any] = {"resolved": resolved, "patch_applied": True}
+        if fail_to_pass_results:
+            result["fail_to_pass"] = {
+                "passed": fail_to_pass_results.passed,
+                "total": fail_to_pass_results.total,
+            }
+        if pass_to_pass_results:
+            result["pass_to_pass"] = {
+                "passed": pass_to_pass_results.passed,
+                "total": pass_to_pass_results.total,
+            }
+        return result
+
+    async def _run_lang_tests(
+        self,
+        env: TaskEnvironment,
+        tests: list[str],
+        language: str,
+        workdir: str | None = None,
+        timeout: int = 120,
+    ) -> Any:
+        """Run tests using language-specific commands.
+
+        Args:
+            env: Task environment.
+            tests: List of test identifiers.
+            language: Programming language.
+            workdir: Working directory.
+            timeout: Timeout per test in seconds.
+
+        Returns:
+            TestResults instance.
+        """
+        if language == "python":
+            return await run_tests(
+                env, tests, timeout=timeout, uses_prebuilt=env.uses_prebuilt, workdir=workdir
+            )
+
+        # For non-Python, build language-specific commands and run
+        from ..evaluation import TestResults
+
+        if not tests:
+            return TestResults(passed=0, total=0, details=[])
+
+        results = []
+        passed = 0
+
+        for test in tests:
+            test_cmd = _build_pro_test_command(test, language, env.uses_prebuilt)
+            try:
+                exit_code, stdout, stderr = await env.exec_command(
+                    test_cmd, timeout=timeout, workdir=workdir
+                )
+                test_passed = exit_code == 0
+                if test_passed:
+                    passed += 1
+                results.append(
+                    {
+                        "test": test,
+                        "passed": test_passed,
+                        "exit_code": exit_code,
+                        "output": stdout[:1000] if stdout else "",
+                        "error": stderr[:1000] if stderr else "",
+                    }
+                )
+            except TimeoutError:
+                results.append(
+                    {
+                        "test": test,
+                        "passed": False,
+                        "exit_code": -1,
+                        "output": "",
+                        "error": "Test timed out",
+                    }
+                )
+
+        return TestResults(passed=passed, total=len(tests), details=results)
+
+    def _eval_result_to_dict(self, eval_result: EvaluationResult) -> dict[str, Any]:
+        """Convert EvaluationResult to dictionary format."""
+        result: dict[str, Any] = {
+            "resolved": eval_result.resolved,
+            "patch_applied": eval_result.patch_applied,
+        }
+        if eval_result.fail_to_pass:
+            result["fail_to_pass"] = {
+                "passed": eval_result.fail_to_pass.passed,
+                "total": eval_result.fail_to_pass.total,
+            }
+        if eval_result.pass_to_pass:
+            result["pass_to_pass"] = {
+                "passed": eval_result.pass_to_pass.passed,
+                "total": eval_result.pass_to_pass.total,
+            }
+        if eval_result.error:
+            result["eval_error"] = eval_result.error
+        return result
+
+    def get_prebuilt_image(self, task: dict[str, Any]) -> str | None:
+        """Get pre-built Docker image name for the task.
+
+        SWE-bench Pro uses DockerHub images specified in the dockerhub_tag field.
+
+        Args:
+            task: SWE-bench Pro task dictionary.
+
+        Returns:
+            Full DockerHub image name, or None if not available.
+        """
+        tag = task.get("dockerhub_tag")
+        if tag:
+            return f"{SWEBENCH_PRO_IMAGE_PREFIX}:{tag}"
+        return None
+
+    def get_prompt_template(self) -> str:
+        """Get SWE-bench Pro prompt template.
+
+        Returns:
+            Prompt template for fixing bugs across multiple languages.
+        """
+        return (
+            "Fix the following bug in this repository:\n\n"
+            "{problem_statement}\n\n"
+            "IMPORTANT CONSTRAINTS:\n"
+            "- Only modify the minimum files necessary to fix the bug\n"
+            "- Do NOT create new test files\n"
+            "- Do NOT create documentation files\n"
+            "- Do NOT create reproduction scripts\n"
+            "- Focus solely on the fix in existing source files\n"
+            "- This may be a Python, Go, TypeScript, or JavaScript project"
+        )
+
+    def get_default_sandbox_level(self) -> str | None:
+        """Get default sandbox level for SWE-bench Pro."""
+        return None
+
+
+def _build_pro_test_command(test: str, language: str, uses_prebuilt: bool = False) -> str:
+    """Build a language-specific test command for SWE-bench Pro.
+
+    Args:
+        test: Test identifier.
+        language: Programming language (python, go, typescript, javascript).
+        uses_prebuilt: Whether a pre-built image is being used.
+
+    Returns:
+        Shell command string to run the test.
+    """
+    if language == "python":
+        from ..evaluation import _build_test_command
+
+        return _build_test_command(test, uses_prebuilt)
+
+    if uses_prebuilt:
+        activate = "source /opt/miniconda3/etc/profile.d/conda.sh && conda activate testbed && "
+    else:
+        activate = ""
+
+    if language == "go":
+        # Go test identifiers can be package paths or test function names
+        if "/" in test or test.startswith("."):
+            # Package path: go test -v ./path/to/package
+            return f"{activate}go test -v -count=1 {test} 2>&1"
+        else:
+            # Test function name: go test -v -run TestName ./...
+            return f"{activate}go test -v -count=1 -run '{test}' ./... 2>&1"
+
+    if language in ("typescript", "javascript"):
+        # Jest-style test identifiers
+        if "/" in test or test.endswith((".ts", ".js", ".tsx", ".jsx")):
+            # File path
+            return f"{activate}npx jest {test} --verbose --no-cache 2>&1"
+        else:
+            # Test name pattern
+            return f"{activate}npx jest -t '{test}' --verbose --no-cache 2>&1"
+
+    # Fallback: try running as-is
+    return f"{activate}{test} 2>&1"
diff --git a/src/mcpbr/cli.py b/src/mcpbr/cli.py
index df18461..a41cdc9 100644
--- a/src/mcpbr/cli.py
+++ b/src/mcpbr/cli.py
@@ -1510,6 +1510,11 @@ def benchmarks() -> None:
         "2,294",
         "Bug fixing (complete benchmark, research)",
     )
+    table.add_row(
+        "swe-bench-pro",
+        "731",
+        "Multi-language bug fixing (Python, Go, TS, JS — harder)",
+    )
     # Other benchmarks
     table.add_row(
         "cybergym",
@@ -1531,6 +1536,161 @@ def benchmarks() -> None:
     console.print("[dim]  mcpbr run -c config.yaml -b mcptoolbench[/dim]")
 
 
+@main.command(context_settings={"help_option_names": ["-h", "--help"]})
+@click.option(
+    "-c",
+    "--config",
+    "config_path",
+    type=click.Path(exists=True),
+    help="Path to configuration YAML file (used for Docker settings).",
+)
+@click.option(
+    "-b",
+    "--benchmark",
+    "benchmark_name",
+    type=click.Choice(list(VALID_BENCHMARKS)),
+    default="swe-bench-pro",
+    help="Benchmark to validate (default: swe-bench-pro).",
+)
+@click.option(
+    "-n",
+    "--sample",
+    "sample_size",
+    type=int,
+    default=None,
+    help="Number of instances to validate (default: all).",
+)
+@click.option(
+    "--task",
+    "task_ids",
+    multiple=True,
+    help="Specific task ID(s) to validate.",
+)
+@click.option(
+    "--max-concurrent",
+    type=int,
+    default=4,
+    help="Maximum concurrent validations (default: 4).",
+)
+@click.option(
+    "--fail-fast",
+    is_flag=True,
+    help="Stop on first failure.",
+)
+@click.option(
+    "--filter-category",
+    multiple=True,
+    help="Filter by language or repo substring.",
+)
+@click.option(
+    "--timeout",
+    type=int,
+    default=300,
+    help="Timeout per test in seconds (default: 300).",
+)
+def preflight(
+    config_path: str | None,
+    benchmark_name: str,
+    sample_size: int | None,
+    task_ids: tuple[str, ...],
+    max_concurrent: int,
+    fail_fast: bool,
+    filter_category: tuple[str, ...],
+    timeout: int,
+) -> None:
+    """Validate golden patches pass all tests before evaluation.
+
+    Runs the benchmark's golden (reference) patches against Docker
+    environments and verifies all tests pass. Use this to catch
+    environment or configuration issues before running agent evaluations.
+
+    \b
+    Examples:
+      mcpbr preflight -b swe-bench-pro -n 5          # Check 5 instances
+      mcpbr preflight --fail-fast                      # Stop on first failure
+      mcpbr preflight --filter-category python -n 10   # Check 10 Python instances
+      mcpbr preflight --task django__django-16046      # Check specific instance
+    """
+    from .benchmark_preflight import run_benchmark_preflight
+    from .benchmarks import create_benchmark
+    from .docker_env import DockerEnvironmentManager
+
+    benchmark = create_benchmark(benchmark_name)
+
+    # Load tasks
+    task_id_list = list(task_ids) if task_ids else None
+    category_list = list(filter_category) if filter_category else None
+
+    console.print(f"[bold]Preflight Check: {benchmark_name}[/bold]\n")
+    dataset_name = getattr(benchmark, "dataset", benchmark_name)
+    console.print(f"Loading tasks from {dataset_name}...")
+
+    tasks = benchmark.load_tasks(
+        sample_size=sample_size,
+        task_ids=task_id_list,
+        filter_category=category_list,
+    )
+
+    if not tasks:
+        console.print("[yellow]No tasks found matching the criteria.[/yellow]")
+        return
+
+    console.print(f"Validating {len(tasks)} instance(s)...\n")
+
+    # Create Docker manager
+    docker_manager = DockerEnvironmentManager(use_prebuilt=True)
+
+    try:
+        report = asyncio.run(
+            run_benchmark_preflight(
+                benchmark=benchmark,
+                tasks=tasks,
+                docker_manager=docker_manager,
+                max_concurrent=max_concurrent,
+                timeout=timeout,
+                fail_fast=fail_fast,
+            )
+        )
+    finally:
+        with contextlib.suppress(Exception):
+            docker_manager.cleanup_all_sync()
+
+    # Display results
+    result_table = Table()
+    result_table.add_column("Instance", style="cyan")
+    result_table.add_column("Language")
+    result_table.add_column("Status")
+    result_table.add_column("FTP (pass/total)")
+    result_table.add_column("PTP (pass/total)")
+    result_table.add_column("Error")
+
+    for r in report.results:
+        status_style = {
+            "passed": "[green]PASS[/green]",
+            "failed": "[red]FAIL[/red]",
+            "error": "[yellow]ERROR[/yellow]",
+        }.get(r.status, r.status)
+
+        result_table.add_row(
+            r.instance_id,
+            r.language,
+            status_style,
+            f"{r.fail_to_pass_passed}/{r.fail_to_pass_total}",
+            f"{r.pass_to_pass_passed}/{r.pass_to_pass_total}",
+            r.error or "",
+        )
+
+    console.print(result_table)
+    console.print(
+        f"\n[bold]Summary:[/bold] {report.passed}/{report.total} passed "
+        f"({report.success_rate:.1f}%), "
+        f"{report.failed} failed, {report.errors} errors"
+    )
+
+    if report.failed > 0 or report.errors > 0:
+        sys.exit(1)
+
+
 @main.group(context_settings={"help_option_names": ["-h", "--help"]})
 def config() -> None:
     """Configuration file management commands.
diff --git a/src/mcpbr/config.py b/src/mcpbr/config.py
index 132d3e9..aa03647 100644
--- a/src/mcpbr/config.py
+++ b/src/mcpbr/config.py
@@ -47,6 +47,7 @@
     "longbench",
     "adversarial",
     "codegraph",
+    "swe-bench-pro",
 )
 VALID_INFRASTRUCTURE_MODES = ("local", "azure", "aws", "gcp", "kubernetes", "cloudflare")
 
diff --git a/src/mcpbr/docker_env.py b/src/mcpbr/docker_env.py
index 2f0863a..a7b2b4a 100644
--- a/src/mcpbr/docker_env.py
+++ b/src/mcpbr/docker_env.py
@@ -445,6 +445,30 @@ def _pull() -> str | None:
         loop = asyncio.get_event_loop()
         return await loop.run_in_executor(None, _pull)
 
+    async def _try_pull_image(self, image_name: str) -> str | None:
+        """Try to pull a Docker image by its full name.
+
+        Used for explicit image overrides (e.g., SWE-bench Pro DockerHub images).
+
+        Args:
+            image_name: Full Docker image name (e.g., "dockerhub_user/image:tag").
+
+        Returns:
+            Image name if successful, None if not available.
+        """
+
+        def _pull() -> str | None:
+            try:
+                self.client.images.pull(image_name, platform="linux/amd64")
+                return image_name
+            except docker.errors.ImageNotFound:
+                return None
+            except docker.errors.APIError:
+                return None
+
+        loop = asyncio.get_event_loop()
+        return await loop.run_in_executor(None, _pull)
+
     async def _ensure_fallback_image(self) -> None:
         """Ensure the fallback Docker image is built."""
         if self._fallback_image_built:
@@ -557,9 +581,16 @@ async def create_environment(
         uses_prebuilt = False
 
         if self.use_prebuilt:
-            image_name = await self._try_pull_prebuilt(instance_id)
-            if image_name:
-                uses_prebuilt = True
+            # Check for explicit image override (e.g., SWE-bench Pro DockerHub images)
+            image_override = task.get("_image_override")
+            if image_override:
+                image_name = await self._try_pull_image(image_override)
+                if image_name:
+                    uses_prebuilt = True
+            if not image_name:
+                image_name = await self._try_pull_prebuilt(instance_id)
+                if image_name:
+                    uses_prebuilt = True
 
         if not image_name:
             await self._ensure_fallback_image()
@@ -574,7 +605,18 @@ async def create_environment(
         unique_suffix = uuid.uuid4().hex[:6]
         container_name = f"mcpbr-{self._session_id}-{instance_id}-{unique_suffix}"
 
-        container_workdir = "/testbed" if uses_prebuilt else "/workspace"
+        # SWE-bench Pro images use /app, standard SWE-bench uses /testbed
+        workdir_override = task.get("_workdir_override")
+        if workdir_override:
+            container_workdir = workdir_override
+        elif uses_prebuilt:
+            container_workdir = "/testbed"
+        else:
+            container_workdir = "/workspace"
+
+        # Some pre-built images set an entrypoint (e.g., /bin/bash) that
+        # conflicts with our "tail -f /dev/null" keep-alive command.
+        has_entrypoint_override = bool(task.get("_image_override"))
 
         def _create_container() -> Container:
             max_retries = 3
@@ -599,9 +641,20 @@ def _create_container() -> Container:
                     # Default network mode; sandbox may override
                     network_mode = sandbox_kwargs.pop("network_mode", "bridge")
 
+                    # Override entrypoint for images that set one (e.g.,
+                    # SWE-bench Pro's /bin/bash entrypoint conflicts with
+                    # our "tail -f /dev/null" keep-alive command).
+                    entrypoint_kwargs: dict = {}
+                    if has_entrypoint_override:
+                        entrypoint_kwargs["entrypoint"] = [
+                            "/bin/sh",
+                            "-c",
+                            "tail -f /dev/null",
+                        ]
+
                     container = self.client.containers.run(
                         image_name,
-                        command="tail -f /dev/null",
+                        command="tail -f /dev/null" if not has_entrypoint_override else None,
                         name=container_name,
                         detach=True,
                         platform="linux/amd64" if uses_prebuilt else None,
@@ -617,6 +670,7 @@ def _create_container() -> Container:
                             MCPBR_SESSION_LABEL: self._session_id,
                             MCPBR_TIMESTAMP_LABEL: self._session_timestamp,
                         },
+                        **entrypoint_kwargs,
                         **sandbox_kwargs,
                     )
                     return container
@@ -716,8 +770,10 @@ def _create_container() -> Container:
         if uses_prebuilt:
             await self._copy_repo_to_workspace(env)
             # Install Claude CLI for running agent inside container
-            await self._install_claude_cli(env)
-            env.claude_cli_installed = True
+            # (skip when running preflight checks or evaluation-only workflows)
+            if not task.get("_skip_cli_install"):
+                await self._install_claude_cli(env)
+                env.claude_cli_installed = True
         else:
             await self._setup_repo(env, repo, base_commit)
 
@@ -742,7 +798,7 @@ async def _check_workspace_file_count(self, env: TaskEnvironment) -> int:
             return 0
 
     async def _copy_repo_to_workspace(self, env: TaskEnvironment) -> None:
-        """Copy repo from pre-built image /testbed to /workspace for agent access.
+        """Copy repo from pre-built image source dir to /workspace for agent access.
 
         Under high concurrency the Docker filesystem copy can silently produce
         an empty workspace.  This method retries with a sync and, if necessary,
@@ -751,9 +807,13 @@ async def _copy_repo_to_workspace(self, env: TaskEnvironment) -> None:
         Args:
             env: Task environment with pre-built image.
         """
+        # The source directory is the container's working directory (e.g.,
+        # /testbed for standard SWE-bench, /app for SWE-bench Pro).
+        source_dir = env.workdir if env.workdir != "/workspace" else "/testbed"
+
         # --- Phase 1: initial copy + verify ---
         exit_code, _stdout, stderr = await env.exec_command(
-            "cp -r /testbed/. /workspace/",
+            f"cp -r {source_dir}/. /workspace/",
             timeout=120,
         )
         if exit_code != 0:
@@ -799,11 +859,11 @@ async def _copy_repo_to_workspace(self, env: TaskEnvironment) -> None:
 
         # --- Phase 3: full copy retry ---
         logger.warning(
-            "Workspace still empty after sync retry — re-copying from /testbed "
+            f"Workspace still empty after sync retry — re-copying from {source_dir} "
             f"(instance={env.instance_id})"
         )
         exit_code, _, stderr = await env.exec_command(
-            "cp -r /testbed/. /workspace/",
+            f"cp -r {source_dir}/. /workspace/",
             timeout=120,
         )
         if exit_code != 0:
diff --git a/src/mcpbr/evaluation.py b/src/mcpbr/evaluation.py
index bfb2614..d7861fb 100644
--- a/src/mcpbr/evaluation.py
+++ b/src/mcpbr/evaluation.py
@@ -1,6 +1,7 @@
 """Evaluation logic for applying patches and running tests."""
 
 import ast
+import contextlib
 import json
 from dataclasses import dataclass
 from typing import Any
@@ -28,6 +29,31 @@ class EvaluationResult:
     error: str | None = None
 
 
+def get_test_list_field(task: dict[str, Any], field_name: str) -> str:
+    """Get a test list field from a task, checking both lowercase and uppercase names.
+
+    SWE-bench uses FAIL_TO_PASS/PASS_TO_PASS while SWE-bench Pro uses
+    fail_to_pass/pass_to_pass. This helper provides backward-compatible access.
+
+    Args:
+        task: Task dictionary.
+        field_name: Field name in lowercase (e.g., "fail_to_pass").
+
+    Returns:
+        Field value as string, or "[]" if not found.
+    """
+    # Try lowercase first (SWE-bench Pro convention)
+    value = task.get(field_name)
+    if value is not None:
+        return str(value)
+    # Fall back to uppercase (SWE-bench convention)
+    upper_name = field_name.upper()
+    value = task.get(upper_name)
+    if value is not None:
+        return str(value)
+    return "[]"
+
+
 def parse_test_list(test_str: str) -> list[str]:
     """Parse test list from SWE-bench format (JSON string or Python literal).
 
@@ -263,7 +289,7 @@ def _build_test_command(test: str, uses_prebuilt: bool = False, repo: str | None
         test_module = ".".join(test.split(".")[:2])  # Extract test_utils.tests
         return f"{activate}cd /testbed/tests && ./runtests.py {test_module}"
     elif "::" in test or test.endswith(".py"):
-        return f"{activate}python -m pytest {test} -xvs 2>&1"
+        return f"{activate}python -m pytest '{test}' -xvs 2>&1"
     else:
         return f"{activate}python -m pytest -k '{test}' -xvs 2>&1"
 
@@ -375,6 +401,16 @@ async def evaluate_patch(
                 patch_applied=True,
                 error="Docker exec timed out during dependency installation",
             )
+    elif task.get("dockerhub_tag") and task.get("repo_language", "python").lower() == "python":
+        # SWE-bench Pro images install packages into site-packages (not
+        # editable).  After patching we must reinstall so the new code is
+        # importable.
+        with contextlib.suppress(TimeoutError):
+            await env.exec_command(
+                "pip install -e . -q 2>/dev/null || true",
+                timeout=120,
+                workdir=eval_workdir,
+            )
 
     repo = task.get("repo")
 
diff --git a/src/mcpbr/swebench_test_specs.py b/src/mcpbr/swebench_test_specs.py
index 9c1bdde..0afeb08 100644
--- a/src/mcpbr/swebench_test_specs.py
+++ b/src/mcpbr/swebench_test_specs.py
@@ -31,3 +31,24 @@ def get_repo_test_command(repo: str) -> str | None:
     Returns None if repo uses standard pytest (handled by existing logic).
     """
     return REPO_TO_TEST_CMD.get(repo)
+
+
+# Language → default test command for SWE-bench Pro multi-language support
+LANGUAGE_TO_TEST_CMD: dict[str, str] = {
+    "python": TEST_PYTEST,
+    "go": "go test -v -count=1",
+    "typescript": "npx jest --verbose --no-cache",
+    "javascript": "npx jest --verbose --no-cache",
+}
+
+
+def get_language_test_command(language: str) -> str | None:
+    """Look up the default test command for a programming language.
+
+    Args:
+        language: Programming language name (lowercase).
+
+    Returns:
+        Default test command string, or None if language is not recognized.
+    """
+    return LANGUAGE_TO_TEST_CMD.get(language.lower())
diff --git a/tests/test_benchmark_preflight.py b/tests/test_benchmark_preflight.py
new file mode 100644
index 0000000..d95e586
--- /dev/null
+++ b/tests/test_benchmark_preflight.py
@@ -0,0 +1,340 @@
+"""Tests for benchmark preflight validation system."""
+
+from unittest.mock import AsyncMock, MagicMock, patch
+
+import pytest
+
+from mcpbr.benchmark_preflight import (
+    PreflightReport,
+    PreflightResult,
+    _check_single_instance,
+    run_benchmark_preflight,
+)
+
+
+class TestPreflightResult:
+    """Tests for PreflightResult data structure."""
+
+    def test_basic_construction(self) -> None:
+        result = PreflightResult(
+            instance_id="test-123",
+            status="passed",
+            fail_to_pass_passed=3,
+            fail_to_pass_total=3,
+            pass_to_pass_passed=5,
+            pass_to_pass_total=5,
+            language="python",
+        )
+        assert result.instance_id == "test-123"
+        assert result.status == "passed"
+        assert result.error is None
+
+    def test_failed_result(self) -> None:
+        result = PreflightResult(
+            instance_id="test-456",
+            status="failed",
+            fail_to_pass_passed=1,
+            fail_to_pass_total=3,
+            error="fail_to_pass: 1/3 passed",
+            language="go",
+        )
+        assert result.status == "failed"
+        assert result.error is not None
+
+    def test_error_result(self) -> None:
+        result = PreflightResult(
+            instance_id="test-789",
+            status="error",
+            error="Docker connection failed",
+        )
+        assert result.status == "error"
+        assert result.language == "unknown"
+
+
+class TestPreflightReport:
+    """Tests for PreflightReport aggregate results."""
+
+    def test_all_passed(self) -> None:
+        report = PreflightReport(total=3, passed=3, failed=0, errors=0)
+        assert report.success_rate == 100.0
+
+    def test_empty_report(self) -> None:
+        report = PreflightReport(total=0, passed=0, failed=0, errors=0)
+        assert report.success_rate == 0.0
+
+    def test_partial_success(self) -> None:
+        report = PreflightReport(total=10, passed=7, failed=2, errors=1)
+        assert report.success_rate == 70.0
+
+    def test_all_failed(self) -> None:
+        report = PreflightReport(total=5, passed=0, failed=5, errors=0)
+        assert report.success_rate == 0.0
+
+    def test_default_results_list(self) -> None:
+        report = PreflightReport()
+        assert report.results == []
+
+
+class TestCheckSingleInstance:
+    """Tests for single instance preflight check."""
+
+    @pytest.mark.asyncio
+    async def test_successful_check(self) -> None:
+        mock_env = MagicMock()
+        mock_env.uses_prebuilt = True
+        mock_env.cleanup = AsyncMock()
+        mock_env.exec_command = AsyncMock(return_value=(0, "", ""))
+
+        mock_benchmark = MagicMock()
+        mock_benchmark.create_environment = AsyncMock(return_value=mock_env)
+
+        task = {
+            "instance_id": "django__django-16046",
+            "repo": "django/django",
+            "repo_language": "python",
+            "patch": "diff --git a/fix.py",
+            "test_patch": "",
+            "fail_to_pass": '["test_one"]',
+            "pass_to_pass": '["test_two"]',
+        }
+
+        mock_docker = MagicMock()
+
+        with (
+            patch("mcpbr.benchmark_preflight.apply_patch", new_callable=AsyncMock) as mock_apply,
+            patch("mcpbr.benchmark_preflight.run_tests", new_callable=AsyncMock) as mock_tests,
+        ):
+            mock_apply.return_value = (True, "")
+            # fail_to_pass: 1/1 passed, pass_to_pass: 1/1 passed
+            mock_tests.side_effect = [
+                MagicMock(passed=1, total=1),
+                MagicMock(passed=1, total=1),
+            ]
+
+            result = await _check_single_instance(mock_benchmark, task, mock_docker)
+
+        assert result.status == "passed"
+        assert result.instance_id == "django__django-16046"
+        assert result.language == "python"
+        mock_env.cleanup.assert_called_once()
+
+    @pytest.mark.asyncio
+    async def test_patch_apply_failure(self) -> None:
+        mock_env = MagicMock()
+        mock_env.uses_prebuilt = True
+        mock_env.cleanup = AsyncMock()
+
+        mock_benchmark = MagicMock()
+        mock_benchmark.create_environment = AsyncMock(return_value=mock_env)
+
+        task = {
+            "instance_id": "test-fail",
+            "repo": "org/repo",
+            "repo_language": "go",
+            "patch": "bad patch",
+            "fail_to_pass": '["test"]',
+            "pass_to_pass": "[]",
+        }
+
+        mock_docker = MagicMock()
+
+        with patch("mcpbr.benchmark_preflight.apply_patch", new_callable=AsyncMock) as mock_apply:
+            mock_apply.return_value = (False, "Patch does not apply")
+
+            result = await _check_single_instance(mock_benchmark, task, mock_docker)
+
+        assert result.status == "failed"
+        assert "Golden patch failed to apply" in (result.error or "")
+
+    @pytest.mark.asyncio
+    async def test_no_golden_patch(self) -> None:
+        mock_env = MagicMock()
+        mock_env.uses_prebuilt = True
+        mock_env.cleanup = AsyncMock()
+
+        mock_benchmark = MagicMock()
+        mock_benchmark.create_environment = AsyncMock(return_value=mock_env)
+
+        task = {
+            "instance_id": "no-patch",
+            "repo": "org/repo",
+            "repo_language": "python",
+            "patch": "",
+            "fail_to_pass": '["test"]',
+            "pass_to_pass": "[]",
+        }
+
+        mock_docker = MagicMock()
+        result = await _check_single_instance(mock_benchmark, task, mock_docker)
+
+        assert result.status == "error"
+        assert "No golden patch" in (result.error or "")
+
+    @pytest.mark.asyncio
+    async def test_exception_handling(self) -> None:
+        mock_benchmark = MagicMock()
+        mock_benchmark.create_environment = AsyncMock(
+            side_effect=RuntimeError("Docker not available")
+        )
+
+        task = {
+            "instance_id": "error-task",
+            "repo": "org/repo",
+            "patch": "diff",
+        }
+
+        mock_docker = MagicMock()
+        result = await _check_single_instance(mock_benchmark, task, mock_docker)
+
+        assert result.status == "error"
+        assert "Docker not available" in (result.error or "")
+
+
+class TestRunBenchmarkPreflight:
+    """Tests for the main preflight runner."""
+
+    @pytest.mark.asyncio
+    async def test_concurrent_execution(self) -> None:
+        mock_benchmark = MagicMock()
+        mock_env = MagicMock()
+        mock_env.uses_prebuilt = True
+        mock_env.cleanup = AsyncMock()
+        mock_env.exec_command = AsyncMock(return_value=(0, "", ""))
+        mock_benchmark.create_environment = AsyncMock(return_value=mock_env)
+
+        tasks = [
+            {
+                "instance_id": f"task-{i}",
+                "repo": "org/repo",
+                "repo_language": "python",
+                "patch": "diff --git",
+                "test_patch": "",
+                "fail_to_pass": '["test"]',
+                "pass_to_pass": "[]",
+            }
+            for i in range(3)
+        ]
+
+        mock_docker = MagicMock()
+
+        with (
+            patch("mcpbr.benchmark_preflight.apply_patch", new_callable=AsyncMock) as mock_apply,
+            patch("mcpbr.benchmark_preflight.run_tests", new_callable=AsyncMock) as mock_tests,
+        ):
+            mock_apply.return_value = (True, "")
+            mock_tests.return_value = MagicMock(passed=1, total=1)
+
+            report = await run_benchmark_preflight(
+                benchmark=mock_benchmark,
+                tasks=tasks,
+                docker_manager=mock_docker,
+                max_concurrent=2,
+            )
+
+        assert report.total == 3
+        assert report.passed == 3
+        assert report.failed == 0
+        assert report.success_rate == 100.0
+
+    @pytest.mark.asyncio
+    async def test_fail_fast(self) -> None:
+        call_count = 0
+
+        async def mock_check(
+            benchmark: object, task: dict, docker: object, timeout: int = 300
+        ) -> PreflightResult:
+            nonlocal call_count
+            call_count += 1
+            if call_count == 2:
+                return PreflightResult(
+                    instance_id=task["instance_id"],
+                    status="failed",
+                    error="Test failure",
+                    language="python",
+                )
+            return PreflightResult(
+                instance_id=task["instance_id"],
+                status="passed",
+                language="python",
+            )
+
+        tasks = [{"instance_id": f"task-{i}", "repo": "r", "patch": "d"} for i in range(5)]
+
+        mock_docker = MagicMock()
+        mock_benchmark = MagicMock()
+
+        with patch(
+            "mcpbr.benchmark_preflight._check_single_instance",
+            side_effect=mock_check,
+        ):
+            report = await run_benchmark_preflight(
+                benchmark=mock_benchmark,
+                tasks=tasks,
+                docker_manager=mock_docker,
+                fail_fast=True,
+            )
+
+        # Should stop after the failure (task 2)
+        assert report.total == 5
+        assert report.passed == 1
+        assert report.failed == 1
+        assert len(report.results) == 2
+
+    @pytest.mark.asyncio
+    async def test_error_handling_in_gather(self) -> None:
+        mock_benchmark = MagicMock()
+
+        async def failing_create(*args: object, **kwargs: object) -> None:
+            raise RuntimeError("Docker error")
+
+        mock_benchmark.create_environment = AsyncMock(side_effect=failing_create)
+
+        tasks = [
+            {
+                "instance_id": "err-task",
+                "repo": "org/repo",
+                "patch": "diff",
+                "fail_to_pass": '["test"]',
+                "pass_to_pass": "[]",
+            }
+        ]
+
+        mock_docker = MagicMock()
+
+        report = await run_benchmark_preflight(
+            benchmark=mock_benchmark,
+            tasks=tasks,
+            docker_manager=mock_docker,
+        )
+
+        assert report.total == 1
+        assert report.errors == 1
+        assert report.success_rate == 0.0
+
+
+class TestGetTestListField:
+    """Tests for the get_test_list_field helper."""
+
+    def test_lowercase_field(self) -> None:
+        from mcpbr.evaluation import get_test_list_field
+
+        task = {"fail_to_pass": '["test_a"]'}
+        assert get_test_list_field(task, "fail_to_pass") == '["test_a"]'
+
+    def test_uppercase_field(self) -> None:
+        from mcpbr.evaluation import get_test_list_field
+
+        task = {"FAIL_TO_PASS": '["test_b"]'}
+        assert get_test_list_field(task, "fail_to_pass") == '["test_b"]'
+
+    def test_lowercase_preferred(self) -> None:
+        from mcpbr.evaluation import get_test_list_field
+
+        task = {"fail_to_pass": '["lower"]', "FAIL_TO_PASS": '["upper"]'}
+        assert get_test_list_field(task, "fail_to_pass") == '["lower"]'
+
+    def test_missing_field(self) -> None:
+        from mcpbr.evaluation import get_test_list_field
+
+        task = {"something_else": "value"}
+        assert get_test_list_field(task, "fail_to_pass") == "[]"
diff --git a/tests/test_swebench_pro.py b/tests/test_swebench_pro.py
new file mode 100644
index 0000000..fde8b8e
--- /dev/null
+++ b/tests/test_swebench_pro.py
@@ -0,0 +1,383 @@
+"""Tests for SWE-bench Pro benchmark implementation."""
+
+from unittest.mock import MagicMock, patch
+
+from mcpbr.benchmarks.swebench_pro import (
+    PRO_LANGUAGES,
+    SWEBENCH_PRO_IMAGE_PREFIX,
+    SWEBenchProBenchmark,
+    _build_pro_test_command,
+)
+
+
+class TestSWEBenchProInit:
+    """Tests for SWEBenchProBenchmark initialization."""
+
+    def test_default_dataset(self) -> None:
+        benchmark = SWEBenchProBenchmark()
+        assert benchmark.dataset == "ScaleAI/SWE-bench_Pro"
+
+    def test_custom_dataset(self) -> None:
+        benchmark = SWEBenchProBenchmark(dataset="custom/dataset")
+        assert benchmark.dataset == "custom/dataset"
+
+    def test_name(self) -> None:
+        benchmark = SWEBenchProBenchmark()
+        assert benchmark.name == "swe-bench-pro"
+
+
+class TestSWEBenchProNormalizeTask:
+    """Tests for task normalization."""
+
+    def test_normalize_basic_task(self) -> None:
+        task = {
+            "instance_id": "django__django-16046",
+            "problem_statement": "Fix the bug",
+            "repo": "django/django",
+            "base_commit": "abc123",
+            "fail_to_pass": '["test_one"]',
+            "pass_to_pass": '["test_two"]',
+            "test_patch": "diff --git a/test.py",
+            "repo_language": "python",
+        }
+        benchmark = SWEBenchProBenchmark()
+        bt = benchmark.normalize_task(task)
+        assert bt.task_id == "django__django-16046"
+        assert bt.problem_statement == "Fix the bug"
+        assert bt.repo == "django/django"
+        assert bt.commit == "abc123"
+        assert bt.metadata["repo_language"] == "python"
+
+    def test_normalize_with_uppercase_fields(self) -> None:
+        """Test that uppercase FAIL_TO_PASS/PASS_TO_PASS are handled."""
+        task = {
+            "instance_id": "test-123",
+            "problem_statement": "desc",
+            "repo": "org/repo",
+            "base_commit": "def456",
+            "FAIL_TO_PASS": '["test_a"]',
+            "PASS_TO_PASS": '["test_b"]',
+        }
+        benchmark = SWEBenchProBenchmark()
+        bt = benchmark.normalize_task(task)
+        assert bt.task_id == "test-123"
+        assert bt.metadata["fail_to_pass"] == '["test_a"]'
+        assert bt.metadata["pass_to_pass"] == '["test_b"]'
+
+    def test_normalize_missing_language(self) -> None:
+        task = {
+            "instance_id": "test-456",
+            "problem_statement": "desc",
+            "repo": "org/repo",
+            "base_commit": "ghi789",
+        }
+        benchmark = SWEBenchProBenchmark()
+        bt = benchmark.normalize_task(task)
+        assert bt.metadata["repo_language"] == "unknown"
+
+    def test_normalize_go_task(self) -> None:
+        task = {
+            "instance_id": "gin-gonic__gin-3890",
+            "problem_statement": "Fix routing",
+            "repo": "gin-gonic/gin",
+            "base_commit": "jkl012",
+            "fail_to_pass": '["TestRoute"]',
+            "pass_to_pass": "[]",
+            "repo_language": "go",
+        }
+        benchmark = SWEBenchProBenchmark()
+        bt = benchmark.normalize_task(task)
+        assert bt.metadata["repo_language"] == "go"
+
+
+class TestBuildProTestCommand:
+    """Tests for language-specific test command building."""
+
+    def test_python_delegates(self) -> None:
+        """Python should delegate to existing _build_test_command."""
+        cmd = _build_pro_test_command("tests/test_foo.py::test_bar", "python")
+        assert "pytest" in cmd or "test_foo" in cmd
+
+    def test_go_package_path(self) -> None:
+        cmd = _build_pro_test_command("./pkg/router", "go")
+        assert "go test" in cmd
+        assert "./pkg/router" in cmd
+        assert "-v" in cmd
+
+    def test_go_function_name(self) -> None:
+        cmd = _build_pro_test_command("TestRouteMatching", "go")
+        assert "go test" in cmd
+        assert "-run" in cmd
+        assert "TestRouteMatching" in cmd
+
+    def test_typescript_file(self) -> None:
+        cmd = _build_pro_test_command("src/__tests__/parser.test.ts", "typescript")
+        assert "npx jest" in cmd
+        assert "parser.test.ts" in cmd
+
+    def test_typescript_pattern(self) -> None:
+        cmd = _build_pro_test_command("should parse tokens", "typescript")
+        assert "npx jest" in cmd
+        assert "-t" in cmd
+
+    def test_javascript_file(self) -> None:
+        cmd = _build_pro_test_command("test/index.test.js", "javascript")
+        assert "npx jest" in cmd
+        assert "index.test.js" in cmd
+
+    def test_javascript_pattern(self) -> None:
+        cmd = _build_pro_test_command("handles edge case", "javascript")
+        assert "npx jest" in cmd
+        assert "-t" in cmd
+
+    def test_prebuilt_conda_activation(self) -> None:
+        cmd = _build_pro_test_command("TestFoo", "go", uses_prebuilt=True)
+        assert "conda activate testbed" in cmd
+
+    def test_unknown_language_fallback(self) -> None:
+        cmd = _build_pro_test_command("test_something", "rust")
+        assert "test_something" in cmd
+
+
+class TestSWEBenchProDockerImage:
+    """Tests for pre-built image lookup."""
+
+    def test_get_prebuilt_image_with_tag(self) -> None:
+        task = {"dockerhub_tag": "django.django-django__django-abc123"}
+        benchmark = SWEBenchProBenchmark()
+        expected = f"{SWEBENCH_PRO_IMAGE_PREFIX}:django.django-django__django-abc123"
+        assert benchmark.get_prebuilt_image(task) == expected
+
+    def test_get_prebuilt_image_missing(self) -> None:
+        task = {"instance_id": "test-123"}
+        benchmark = SWEBenchProBenchmark()
+        assert benchmark.get_prebuilt_image(task) is None
+
+
+class TestSWEBenchProPromptTemplate:
+    """Tests for prompt template."""
+
+    def test_has_placeholder(self) -> None:
+        benchmark = SWEBenchProBenchmark()
+        template = benchmark.get_prompt_template()
+        assert "{problem_statement}" in template
+
+    def test_mentions_multiple_languages(self) -> None:
+        benchmark = SWEBenchProBenchmark()
+        template = benchmark.get_prompt_template()
+        assert "Go" in template
+        assert "TypeScript" in template
+        assert "JavaScript" in template
+
+
+class TestSWEBenchProFilterCategory:
+    """Tests for category filtering in load_tasks."""
+
+    @patch("mcpbr.benchmarks.swebench_pro.load_dataset")
+    def test_filter_by_language(self, mock_load: MagicMock) -> None:
+        mock_dataset = [
+            {
+                "instance_id": "t1",
+                "repo": "django/django",
+                "repo_language": "python",
+                "problem_statement": "p",
+                "base_commit": "c",
+            },
+            {
+                "instance_id": "t2",
+                "repo": "gin-gonic/gin",
+                "repo_language": "go",
+                "problem_statement": "p",
+                "base_commit": "c",
+            },
+            {
+                "instance_id": "t3",
+                "repo": "vercel/next.js",
+                "repo_language": "typescript",
+                "problem_statement": "p",
+                "base_commit": "c",
+            },
+        ]
+        mock_load.return_value = MagicMock(
+            __iter__=lambda self: iter(mock_dataset),
+            __len__=lambda self: len(mock_dataset),
+        )
+
+        benchmark = SWEBenchProBenchmark()
+        tasks = benchmark.load_tasks(filter_category=["go"])
+        assert len(tasks) == 1
+        assert tasks[0]["instance_id"] == "t2"
+
+    @patch("mcpbr.benchmarks.swebench_pro.load_dataset")
+    def test_filter_by_repo_substring(self, mock_load: MagicMock) -> None:
+        mock_dataset = [
+            {
+                "instance_id": "t1",
+                "repo": "django/django",
+                "repo_language": "python",
+                "problem_statement": "p",
+                "base_commit": "c",
+            },
+            {
+                "instance_id": "t2",
+                "repo": "gin-gonic/gin",
+                "repo_language": "go",
+                "problem_statement": "p",
+                "base_commit": "c",
+            },
+        ]
+        mock_load.return_value = MagicMock(
+            __iter__=lambda self: iter(mock_dataset),
+            __len__=lambda self: len(mock_dataset),
+        )
+
+        benchmark = SWEBenchProBenchmark()
+        tasks = benchmark.load_tasks(filter_category=["django"])
+        assert len(tasks) == 1
+        assert tasks[0]["instance_id"] == "t1"
+
+    def test_pro_languages_set(self) -> None:
+        assert {"python", "go", "typescript", "javascript"} == PRO_LANGUAGES
+
+
+class TestSWEBenchProLoadTasks:
+    """Tests for task loading."""
+
+    @patch("mcpbr.benchmarks.swebench_pro.load_dataset")
+    def test_sample_size(self, mock_load: MagicMock) -> None:
+        mock_dataset = [
+            {
+                "instance_id": f"t{i}",
+                "repo": "r",
+                "problem_statement": "p",
+                "base_commit": "c",
+            }
+            for i in range(10)
+        ]
+        mock_ds = MagicMock()
+        mock_ds.__iter__ = lambda self: iter(mock_dataset)
+        mock_ds.__len__ = lambda self: len(mock_dataset)
+        mock_ds.select = MagicMock(return_value=mock_dataset[:3])
+        mock_load.return_value = mock_ds
+
+        benchmark = SWEBenchProBenchmark()
+        tasks = benchmark.load_tasks(sample_size=3)
+        assert len(tasks) == 3
+
+    @patch("mcpbr.benchmarks.swebench_pro.load_dataset")
+    def test_task_ids(self, mock_load: MagicMock) -> None:
+        mock_dataset = [
+            {
+                "instance_id": f"t{i}",
+                "repo": "r",
+                "problem_statement": "p",
+                "base_commit": "c",
+            }
+            for i in range(5)
+        ]
+        mock_load.return_value = MagicMock(
+            __iter__=lambda self: iter(mock_dataset),
+            __len__=lambda self: len(mock_dataset),
+        )
+
+        benchmark = SWEBenchProBenchmark()
+        tasks = benchmark.load_tasks(task_ids=["t1", "t3"])
+        assert len(tasks) == 2
+        ids = {t["instance_id"] for t in tasks}
+        assert ids == {"t1", "t3"}
+
+    @patch("mcpbr.benchmarks.swebench_pro.load_dataset")
+    def test_combined_filters(self, mock_load: MagicMock) -> None:
+        mock_dataset = [
+            {
+                "instance_id": "t1",
+                "repo": "django/django",
+                "repo_language": "python",
+                "problem_statement": "p",
+                "base_commit": "c",
+            },
+            {
+                "instance_id": "t2",
+                "repo": "gin-gonic/gin",
+                "repo_language": "go",
+                "problem_statement": "p",
+                "base_commit": "c",
+            },
+            {
+                "instance_id": "t3",
+                "repo": "vercel/next.js",
+                "repo_language": "typescript",
+                "problem_statement": "p",
+                "base_commit": "c",
+            },
+        ]
+        mock_load.return_value = MagicMock(
+            __iter__=lambda self: iter(mock_dataset),
+            __len__=lambda self: len(mock_dataset),
+        )
+
+        benchmark = SWEBenchProBenchmark()
+        tasks = benchmark.load_tasks(
+            task_ids=["t1", "t2"],
+            filter_category=["python"],
+        )
+        assert len(tasks) == 1
+        assert tasks[0]["instance_id"] == "t1"
+
+
+class TestSWEBenchProSandboxLevel:
+    """Tests for sandbox level."""
+
+    def test_default_sandbox_level(self) -> None:
+        benchmark = SWEBenchProBenchmark()
+        assert benchmark.get_default_sandbox_level() is None
+
+
+class TestSWEBenchProRegistry:
+    """Tests for benchmark registry integration."""
+
+    def test_create_swebench_pro(self) -> None:
+        from mcpbr.benchmarks import create_benchmark
+
+        benchmark = create_benchmark("swe-bench-pro")
+        assert isinstance(benchmark, SWEBenchProBenchmark)
+        assert benchmark.dataset == "ScaleAI/SWE-bench_Pro"
+
+    def test_listed_in_registry(self) -> None:
+        from mcpbr.benchmarks import list_benchmarks
+
+        assert "swe-bench-pro" in list_benchmarks()
+
+
+class TestSWEBenchProEvalResultToDict:
+    """Tests for _eval_result_to_dict helper."""
+
+    def test_basic_conversion(self) -> None:
+        from mcpbr.evaluation import EvaluationResult, TestResults
+
+        benchmark = SWEBenchProBenchmark()
+        result = EvaluationResult(
+            resolved=True,
+            patch_applied=True,
+            fail_to_pass=TestResults(passed=2, total=2, details=[]),
+            pass_to_pass=TestResults(passed=5, total=5, details=[]),
+        )
+        d = benchmark._eval_result_to_dict(result)
+        assert d["resolved"] is True
+        assert d["patch_applied"] is True
+        assert d["fail_to_pass"]["passed"] == 2
+        assert d["pass_to_pass"]["passed"] == 5
+
+    def test_with_error(self) -> None:
+        from mcpbr.evaluation import EvaluationResult
+
+        benchmark = SWEBenchProBenchmark()
+        result = EvaluationResult(
+            resolved=False,
+            patch_applied=False,
+            error="Patch failed",
+        )
+        d = benchmark._eval_result_to_dict(result)
+        assert d["resolved"] is False
+        assert d["eval_error"] == "Patch failed"
+        assert "fail_to_pass" not in d
diff --git a/uv.lock b/uv.lock
index ef2aaf8..bfeb7e8 100644
--- a/uv.lock
+++ b/uv.lock
@@ -190,29 +190,6 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/3a/2a/7cc015f5b9f5db42b7d48157e23356022889fc354a2813c15934b7cb5c0e/attrs-25.4.0-py3-none-any.whl", hash = "sha256:adcf7e2a1fb3b36ac48d97835bb6d8ade15b8dcce26aba8bf1d14847b57a3373", size = 67615, upload-time = "2025-10-06T13:54:43.17Z" },
 ]
 
-[[package]]
-name = "babel"
-version = "2.17.0"
-source = { registry = "https://pypi.org/simple" }
-sdist = { url = "https://files.pythonhosted.org/packages/7d/6b/d52e42361e1aa00709585ecc30b3f9684b3ab62530771402248b1b1d6240/babel-2.17.0.tar.gz", hash = "sha256:0c54cffb19f690cdcc52a3b50bcbf71e07a808d1c80d549f2459b9d2cf0afb9d", size = 9951852, upload-time = "2025-02-01T15:17:41.026Z" }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/b7/b8/3fe70c75fe32afc4bb507f75563d39bc5642255d1d94f1f23604725780bf/babel-2.17.0-py3-none-any.whl", hash = "sha256:4d0b53093fdfb4b21c92b5213dba5a1b23885afa8383709427046b21c366e5f2", size = 10182537, upload-time = "2025-02-01T15:17:37.39Z" },
-]
-
-[[package]]
-name = "backrefs"
-version = "6.1"
-source = { registry = "https://pypi.org/simple" }
-sdist = { url = "https://files.pythonhosted.org/packages/86/e3/bb3a439d5cb255c4774724810ad8073830fac9c9dee123555820c1bcc806/backrefs-6.1.tar.gz", hash = "sha256:3bba1749aafe1db9b915f00e0dd166cba613b6f788ffd63060ac3485dc9be231", size = 7011962, upload-time = "2025-11-15T14:52:08.323Z" }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/3b/ee/c216d52f58ea75b5e1841022bbae24438b19834a29b163cb32aa3a2a7c6e/backrefs-6.1-py310-none-any.whl", hash = "sha256:2a2ccb96302337ce61ee4717ceacfbf26ba4efb1d55af86564b8bbaeda39cac1", size = 381059, upload-time = "2025-11-15T14:51:59.758Z" },
-    { url = "https://files.pythonhosted.org/packages/e6/9a/8da246d988ded941da96c7ed945d63e94a445637eaad985a0ed88787cb89/backrefs-6.1-py311-none-any.whl", hash = "sha256:e82bba3875ee4430f4de4b6db19429a27275d95a5f3773c57e9e18abc23fd2b7", size = 392854, upload-time = "2025-11-15T14:52:01.194Z" },
-    { url = "https://files.pythonhosted.org/packages/37/c9/fd117a6f9300c62bbc33bc337fd2b3c6bfe28b6e9701de336b52d7a797ad/backrefs-6.1-py312-none-any.whl", hash = "sha256:c64698c8d2269343d88947c0735cb4b78745bd3ba590e10313fbf3f78c34da5a", size = 398770, upload-time = "2025-11-15T14:52:02.584Z" },
-    { url = "https://files.pythonhosted.org/packages/eb/95/7118e935b0b0bd3f94dfec2d852fd4e4f4f9757bdb49850519acd245cd3a/backrefs-6.1-py313-none-any.whl", hash = "sha256:4c9d3dc1e2e558965202c012304f33d4e0e477e1c103663fd2c3cc9bb18b0d05", size = 400726, upload-time = "2025-11-15T14:52:04.093Z" },
-    { url = "https://files.pythonhosted.org/packages/1d/72/6296bad135bfafd3254ae3648cd152980a424bd6fed64a101af00cc7ba31/backrefs-6.1-py314-none-any.whl", hash = "sha256:13eafbc9ccd5222e9c1f0bec563e6d2a6d21514962f11e7fc79872fd56cbc853", size = 412584, upload-time = "2025-11-15T14:52:05.233Z" },
-    { url = "https://files.pythonhosted.org/packages/02/e3/a4fa1946722c4c7b063cc25043a12d9ce9b4323777f89643be74cef2993c/backrefs-6.1-py39-none-any.whl", hash = "sha256:a9e99b8a4867852cad177a6430e31b0f6e495d65f8c6c134b68c14c3c95bf4b0", size = 381058, upload-time = "2025-11-15T14:52:06.698Z" },
-]
-
 [[package]]
 name = "bcrypt"
 version = "5.0.0"
@@ -524,12 +501,6 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/bc/58/6b3d24e6b9bc474a2dcdee65dfd1f008867015408a271562e4b690561a4d/cryptography-46.0.5-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:8456928655f856c6e1533ff59d5be76578a7157224dbd9ce6872f25055ab9ab7", size = 3407605, upload-time = "2026-02-10T19:18:29.233Z" },
 ]
 
-[[package]]
-name = "csscompressor"
-version = "0.9.5"
-source = { registry = "https://pypi.org/simple" }
-sdist = { url = "https://files.pythonhosted.org/packages/f1/2a/8c3ac3d8bc94e6de8d7ae270bb5bc437b210bb9d6d9e46630c98f4abd20c/csscompressor-0.9.5.tar.gz", hash = "sha256:afa22badbcf3120a4f392e4d22f9fff485c044a1feda4a950ecc5eba9dd31a05", size = 237808, upload-time = "2017-11-26T21:13:08.238Z" }
-
 [[package]]
 name = "datasets"
 version = "4.5.0"
@@ -733,18 +704,6 @@ http = [
     { name = "aiohttp" },
 ]
 
-[[package]]
-name = "ghp-import"
-version = "2.1.0"
-source = { registry = "https://pypi.org/simple" }
-dependencies = [
-    { name = "python-dateutil" },
-]
-sdist = { url = "https://files.pythonhosted.org/packages/d9/29/d40217cbe2f6b1359e00c6c307bb3fc876ba74068cbab3dde77f03ca0dc4/ghp-import-2.1.0.tar.gz", hash = "sha256:9c535c4c61193c2df8871222567d7fd7e5014d835f97dc7b7439069e2413d343", size = 10943, upload-time = "2022-05-02T15:47:16.11Z" }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/f7/ec/67fbef5d497f86283db54c22eec6f6140243aae73265799baaaa19cd17fb/ghp_import-2.1.0-py3-none-any.whl", hash = "sha256:8337dd7b50877f163d4c0289bc1f1c7f127550241988d568c1db512c4324a619", size = 11034, upload-time = "2022-05-02T15:47:14.552Z" },
-]
-
 [[package]]
 name = "gitdb"
 version = "4.0.12"
@@ -879,18 +838,6 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/c4/ab/09169d5a4612a5f92490806649ac8d41e3ec9129c636754575b3553f4ea4/googleapis_common_protos-1.72.0-py3-none-any.whl", hash = "sha256:4299c5a82d5ae1a9702ada957347726b167f9f8d1fc352477702a1e851ff4038", size = 297515, upload-time = "2025-11-06T18:29:13.14Z" },
 ]
 
-[[package]]
-name = "griffe"
-version = "1.15.0"
-source = { registry = "https://pypi.org/simple" }
-dependencies = [
-    { name = "colorama" },
-]
-sdist = { url = "https://files.pythonhosted.org/packages/0d/0c/3a471b6e31951dce2360477420d0a8d1e00dea6cf33b70f3e8c3ab6e28e1/griffe-1.15.0.tar.gz", hash = "sha256:7726e3afd6f298fbc3696e67958803e7ac843c1cfe59734b6251a40cdbfb5eea", size = 424112, upload-time = "2025-11-10T15:03:15.52Z" }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/9c/83/3b1d03d36f224edded98e9affd0467630fc09d766c0e56fb1498cbb04a9b/griffe-1.15.0-py3-none-any.whl", hash = "sha256:6f6762661949411031f5fcda9593f586e6ce8340f0ba88921a0f2ef7a81eb9a3", size = 150705, upload-time = "2025-11-10T15:03:13.549Z" },
-]
-
 [[package]]
 name = "grpcio"
 version = "1.78.0"
@@ -994,14 +941,6 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/cb/44/870d44b30e1dcfb6a65932e3e1506c103a8a5aea9103c337e7a53180322c/hf_xet-1.2.0-cp37-abi3-win_amd64.whl", hash = "sha256:e6584a52253f72c9f52f9e549d5895ca7a471608495c4ecaa6cc73dba2b24d69", size = 2905735, upload-time = "2025-10-24T19:04:35.928Z" },
 ]
 
-[[package]]
-name = "htmlmin2"
-version = "0.1.13"
-source = { registry = "https://pypi.org/simple" }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/be/31/a76f4bfa885f93b8167cb4c85cf32b54d1f64384d0b897d45bc6d19b7b45/htmlmin2-0.1.13-py3-none-any.whl", hash = "sha256:75609f2a42e64f7ce57dbff28a39890363bde9e7e5885db633317efbdf8c79a2", size = 34486, upload-time = "2023-03-14T21:28:30.388Z" },
-]
-
 [[package]]
 name = "httpcore"
 version = "1.0.9"
@@ -1108,18 +1047,6 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/32/4b/b99e37f88336009971405cbb7630610322ed6fbfa31e1d7ab3fbf3049a2d/invoke-2.2.1-py3-none-any.whl", hash = "sha256:2413bc441b376e5cd3f55bb5d364f973ad8bdd7bf87e53c79de3c11bf3feecc8", size = 160287, upload-time = "2025-10-11T00:36:33.703Z" },
 ]
 
-[[package]]
-name = "jinja2"
-version = "3.1.6"
-source = { registry = "https://pypi.org/simple" }
-dependencies = [
-    { name = "markupsafe" },
-]
-sdist = { url = "https://files.pythonhosted.org/packages/df/bf/f7da0350254c0ed7c72f3e33cef02e048281fec7ecec5f032d4aac52226b/jinja2-3.1.6.tar.gz", hash = "sha256:0137fb05990d35f1275a587e9aee6d56da821fc83491a0fb838183be43f66d6d", size = 245115, upload-time = "2025-03-05T20:05:02.478Z" }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/62/a1/3d680cbfd5f4b8f15abc1d571870c5fc3e594bb582bc3b64ea099db13e56/jinja2-3.1.6-py3-none-any.whl", hash = "sha256:85ece4451f492d0c13c5dd7c13a64681a86afae63a5f347908daf103ce6d2f67", size = 134899, upload-time = "2025-03-05T20:05:00.369Z" },
-]
-
 [[package]]
 name = "jiter"
 version = "0.12.0"
@@ -1205,12 +1132,6 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/2f/9c/6753e6522b8d0ef07d3a3d239426669e984fb0eba15a315cdbc1253904e4/jiter-0.12.0-graalpy312-graalpy250_312_native-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c24e864cb30ab82311c6425655b0cdab0a98c5d973b065c66a3f020740c2324c", size = 346110, upload-time = "2025-11-09T20:49:21.817Z" },
 ]
 
-[[package]]
-name = "jsmin"
-version = "3.0.1"
-source = { registry = "https://pypi.org/simple" }
-sdist = { url = "https://files.pythonhosted.org/packages/5e/73/e01e4c5e11ad0494f4407a3f623ad4d87714909f50b17a06ed121034ff6e/jsmin-3.0.1.tar.gz", hash = "sha256:c0959a121ef94542e807a674142606f7e90214a2b3d1eb17300244bbb5cc2bfc", size = 13925, upload-time = "2022-01-16T20:35:59.13Z" }
-
 [[package]]
 name = "jsonschema"
 version = "4.26.0"
@@ -1311,15 +1232,6 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/94/d1/433b3c06e78f23486fe4fdd19bc134657eb30997d2054b0dbf52bbf3382e/librt-0.8.0-cp314-cp314t-win_arm64.whl", hash = "sha256:92249938ab744a5890580d3cb2b22042f0dce71cdaa7c1369823df62bedf7cbc", size = 48753, upload-time = "2026-02-12T14:53:38.539Z" },
 ]
 
-[[package]]
-name = "markdown"
-version = "3.10.1"
-source = { registry = "https://pypi.org/simple" }
-sdist = { url = "https://files.pythonhosted.org/packages/b7/b1/af95bcae8549f1f3fd70faacb29075826a0d689a27f232e8cee315efa053/markdown-3.10.1.tar.gz", hash = "sha256:1c19c10bd5c14ac948c53d0d762a04e2fa35a6d58a6b7b1e6bfcbe6fefc0001a", size = 365402, upload-time = "2026-01-21T18:09:28.206Z" }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/59/1b/6ef961f543593969d25b2afe57a3564200280528caa9bd1082eecdd7b3bc/markdown-3.10.1-py3-none-any.whl", hash = "sha256:867d788939fe33e4b736426f5b9f651ad0c0ae0ecf89df0ca5d1176c70812fe3", size = 107684, upload-time = "2026-01-21T18:09:27.203Z" },
-]
-
 [[package]]
 name = "markdown-it-py"
 version = "4.0.0"
@@ -1332,80 +1244,6 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/94/54/e7d793b573f298e1c9013b8c4dade17d481164aa517d1d7148619c2cedbf/markdown_it_py-4.0.0-py3-none-any.whl", hash = "sha256:87327c59b172c5011896038353a81343b6754500a08cd7a4973bb48c6d578147", size = 87321, upload-time = "2025-08-11T12:57:51.923Z" },
 ]
 
-[[package]]
-name = "markupsafe"
-version = "3.0.3"
-source = { registry = "https://pypi.org/simple" }
-sdist = { url = "https://files.pythonhosted.org/packages/7e/99/7690b6d4034fffd95959cbe0c02de8deb3098cc577c67bb6a24fe5d7caa7/markupsafe-3.0.3.tar.gz", hash = "sha256:722695808f4b6457b320fdc131280796bdceb04ab50fe1795cd540799ebe1698", size = 80313, upload-time = "2025-09-27T18:37:40.426Z" }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/08/db/fefacb2136439fc8dd20e797950e749aa1f4997ed584c62cfb8ef7c2be0e/markupsafe-3.0.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:1cc7ea17a6824959616c525620e387f6dd30fec8cb44f649e31712db02123dad", size = 11631, upload-time = "2025-09-27T18:36:18.185Z" },
-    { url = "https://files.pythonhosted.org/packages/e1/2e/5898933336b61975ce9dc04decbc0a7f2fee78c30353c5efba7f2d6ff27a/markupsafe-3.0.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:4bd4cd07944443f5a265608cc6aab442e4f74dff8088b0dfc8238647b8f6ae9a", size = 12058, upload-time = "2025-09-27T18:36:19.444Z" },
-    { url = "https://files.pythonhosted.org/packages/1d/09/adf2df3699d87d1d8184038df46a9c80d78c0148492323f4693df54e17bb/markupsafe-3.0.3-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6b5420a1d9450023228968e7e6a9ce57f65d148ab56d2313fcd589eee96a7a50", size = 24287, upload-time = "2025-09-27T18:36:20.768Z" },
-    { url = "https://files.pythonhosted.org/packages/30/ac/0273f6fcb5f42e314c6d8cd99effae6a5354604d461b8d392b5ec9530a54/markupsafe-3.0.3-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:0bf2a864d67e76e5c9a34dc26ec616a66b9888e25e7b9460e1c76d3293bd9dbf", size = 22940, upload-time = "2025-09-27T18:36:22.249Z" },
-    { url = "https://files.pythonhosted.org/packages/19/ae/31c1be199ef767124c042c6c3e904da327a2f7f0cd63a0337e1eca2967a8/markupsafe-3.0.3-cp311-cp311-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:bc51efed119bc9cfdf792cdeaa4d67e8f6fcccab66ed4bfdd6bde3e59bfcbb2f", size = 21887, upload-time = "2025-09-27T18:36:23.535Z" },
-    { url = "https://files.pythonhosted.org/packages/b2/76/7edcab99d5349a4532a459e1fe64f0b0467a3365056ae550d3bcf3f79e1e/markupsafe-3.0.3-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:068f375c472b3e7acbe2d5318dea141359e6900156b5b2ba06a30b169086b91a", size = 23692, upload-time = "2025-09-27T18:36:24.823Z" },
-    { url = "https://files.pythonhosted.org/packages/a4/28/6e74cdd26d7514849143d69f0bf2399f929c37dc2b31e6829fd2045b2765/markupsafe-3.0.3-cp311-cp311-musllinux_1_2_riscv64.whl", hash = "sha256:7be7b61bb172e1ed687f1754f8e7484f1c8019780f6f6b0786e76bb01c2ae115", size = 21471, upload-time = "2025-09-27T18:36:25.95Z" },
-    { url = "https://files.pythonhosted.org/packages/62/7e/a145f36a5c2945673e590850a6f8014318d5577ed7e5920a4b3448e0865d/markupsafe-3.0.3-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:f9e130248f4462aaa8e2552d547f36ddadbeaa573879158d721bbd33dfe4743a", size = 22923, upload-time = "2025-09-27T18:36:27.109Z" },
-    { url = "https://files.pythonhosted.org/packages/0f/62/d9c46a7f5c9adbeeeda52f5b8d802e1094e9717705a645efc71b0913a0a8/markupsafe-3.0.3-cp311-cp311-win32.whl", hash = "sha256:0db14f5dafddbb6d9208827849fad01f1a2609380add406671a26386cdf15a19", size = 14572, upload-time = "2025-09-27T18:36:28.045Z" },
-    { url = "https://files.pythonhosted.org/packages/83/8a/4414c03d3f891739326e1783338e48fb49781cc915b2e0ee052aa490d586/markupsafe-3.0.3-cp311-cp311-win_amd64.whl", hash = "sha256:de8a88e63464af587c950061a5e6a67d3632e36df62b986892331d4620a35c01", size = 15077, upload-time = "2025-09-27T18:36:29.025Z" },
-    { url = "https://files.pythonhosted.org/packages/35/73/893072b42e6862f319b5207adc9ae06070f095b358655f077f69a35601f0/markupsafe-3.0.3-cp311-cp311-win_arm64.whl", hash = "sha256:3b562dd9e9ea93f13d53989d23a7e775fdfd1066c33494ff43f5418bc8c58a5c", size = 13876, upload-time = "2025-09-27T18:36:29.954Z" },
-    { url = "https://files.pythonhosted.org/packages/5a/72/147da192e38635ada20e0a2e1a51cf8823d2119ce8883f7053879c2199b5/markupsafe-3.0.3-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:d53197da72cc091b024dd97249dfc7794d6a56530370992a5e1a08983ad9230e", size = 11615, upload-time = "2025-09-27T18:36:30.854Z" },
-    { url = "https://files.pythonhosted.org/packages/9a/81/7e4e08678a1f98521201c3079f77db69fb552acd56067661f8c2f534a718/markupsafe-3.0.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:1872df69a4de6aead3491198eaf13810b565bdbeec3ae2dc8780f14458ec73ce", size = 12020, upload-time = "2025-09-27T18:36:31.971Z" },
-    { url = "https://files.pythonhosted.org/packages/1e/2c/799f4742efc39633a1b54a92eec4082e4f815314869865d876824c257c1e/markupsafe-3.0.3-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:3a7e8ae81ae39e62a41ec302f972ba6ae23a5c5396c8e60113e9066ef893da0d", size = 24332, upload-time = "2025-09-27T18:36:32.813Z" },
-    { url = "https://files.pythonhosted.org/packages/3c/2e/8d0c2ab90a8c1d9a24f0399058ab8519a3279d1bd4289511d74e909f060e/markupsafe-3.0.3-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:d6dd0be5b5b189d31db7cda48b91d7e0a9795f31430b7f271219ab30f1d3ac9d", size = 22947, upload-time = "2025-09-27T18:36:33.86Z" },
-    { url = "https://files.pythonhosted.org/packages/2c/54/887f3092a85238093a0b2154bd629c89444f395618842e8b0c41783898ea/markupsafe-3.0.3-cp312-cp312-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:94c6f0bb423f739146aec64595853541634bde58b2135f27f61c1ffd1cd4d16a", size = 21962, upload-time = "2025-09-27T18:36:35.099Z" },
-    { url = "https://files.pythonhosted.org/packages/c9/2f/336b8c7b6f4a4d95e91119dc8521402461b74a485558d8f238a68312f11c/markupsafe-3.0.3-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:be8813b57049a7dc738189df53d69395eba14fb99345e0a5994914a3864c8a4b", size = 23760, upload-time = "2025-09-27T18:36:36.001Z" },
-    { url = "https://files.pythonhosted.org/packages/32/43/67935f2b7e4982ffb50a4d169b724d74b62a3964bc1a9a527f5ac4f1ee2b/markupsafe-3.0.3-cp312-cp312-musllinux_1_2_riscv64.whl", hash = "sha256:83891d0e9fb81a825d9a6d61e3f07550ca70a076484292a70fde82c4b807286f", size = 21529, upload-time = "2025-09-27T18:36:36.906Z" },
-    { url = "https://files.pythonhosted.org/packages/89/e0/4486f11e51bbba8b0c041098859e869e304d1c261e59244baa3d295d47b7/markupsafe-3.0.3-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:77f0643abe7495da77fb436f50f8dab76dbc6e5fd25d39589a0f1fe6548bfa2b", size = 23015, upload-time = "2025-09-27T18:36:37.868Z" },
-    { url = "https://files.pythonhosted.org/packages/2f/e1/78ee7a023dac597a5825441ebd17170785a9dab23de95d2c7508ade94e0e/markupsafe-3.0.3-cp312-cp312-win32.whl", hash = "sha256:d88b440e37a16e651bda4c7c2b930eb586fd15ca7406cb39e211fcff3bf3017d", size = 14540, upload-time = "2025-09-27T18:36:38.761Z" },
-    { url = "https://files.pythonhosted.org/packages/aa/5b/bec5aa9bbbb2c946ca2733ef9c4ca91c91b6a24580193e891b5f7dbe8e1e/markupsafe-3.0.3-cp312-cp312-win_amd64.whl", hash = "sha256:26a5784ded40c9e318cfc2bdb30fe164bdb8665ded9cd64d500a34fb42067b1c", size = 15105, upload-time = "2025-09-27T18:36:39.701Z" },
-    { url = "https://files.pythonhosted.org/packages/e5/f1/216fc1bbfd74011693a4fd837e7026152e89c4bcf3e77b6692fba9923123/markupsafe-3.0.3-cp312-cp312-win_arm64.whl", hash = "sha256:35add3b638a5d900e807944a078b51922212fb3dedb01633a8defc4b01a3c85f", size = 13906, upload-time = "2025-09-27T18:36:40.689Z" },
-    { url = "https://files.pythonhosted.org/packages/38/2f/907b9c7bbba283e68f20259574b13d005c121a0fa4c175f9bed27c4597ff/markupsafe-3.0.3-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:e1cf1972137e83c5d4c136c43ced9ac51d0e124706ee1c8aa8532c1287fa8795", size = 11622, upload-time = "2025-09-27T18:36:41.777Z" },
-    { url = "https://files.pythonhosted.org/packages/9c/d9/5f7756922cdd676869eca1c4e3c0cd0df60ed30199ffd775e319089cb3ed/markupsafe-3.0.3-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:116bb52f642a37c115f517494ea5feb03889e04df47eeff5b130b1808ce7c219", size = 12029, upload-time = "2025-09-27T18:36:43.257Z" },
-    { url = "https://files.pythonhosted.org/packages/00/07/575a68c754943058c78f30db02ee03a64b3c638586fba6a6dd56830b30a3/markupsafe-3.0.3-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:133a43e73a802c5562be9bbcd03d090aa5a1fe899db609c29e8c8d815c5f6de6", size = 24374, upload-time = "2025-09-27T18:36:44.508Z" },
-    { url = "https://files.pythonhosted.org/packages/a9/21/9b05698b46f218fc0e118e1f8168395c65c8a2c750ae2bab54fc4bd4e0e8/markupsafe-3.0.3-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:ccfcd093f13f0f0b7fdd0f198b90053bf7b2f02a3927a30e63f3ccc9df56b676", size = 22980, upload-time = "2025-09-27T18:36:45.385Z" },
-    { url = "https://files.pythonhosted.org/packages/7f/71/544260864f893f18b6827315b988c146b559391e6e7e8f7252839b1b846a/markupsafe-3.0.3-cp313-cp313-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:509fa21c6deb7a7a273d629cf5ec029bc209d1a51178615ddf718f5918992ab9", size = 21990, upload-time = "2025-09-27T18:36:46.916Z" },
-    { url = "https://files.pythonhosted.org/packages/c2/28/b50fc2f74d1ad761af2f5dcce7492648b983d00a65b8c0e0cb457c82ebbe/markupsafe-3.0.3-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:a4afe79fb3de0b7097d81da19090f4df4f8d3a2b3adaa8764138aac2e44f3af1", size = 23784, upload-time = "2025-09-27T18:36:47.884Z" },
-    { url = "https://files.pythonhosted.org/packages/ed/76/104b2aa106a208da8b17a2fb72e033a5a9d7073c68f7e508b94916ed47a9/markupsafe-3.0.3-cp313-cp313-musllinux_1_2_riscv64.whl", hash = "sha256:795e7751525cae078558e679d646ae45574b47ed6e7771863fcc079a6171a0fc", size = 21588, upload-time = "2025-09-27T18:36:48.82Z" },
-    { url = "https://files.pythonhosted.org/packages/b5/99/16a5eb2d140087ebd97180d95249b00a03aa87e29cc224056274f2e45fd6/markupsafe-3.0.3-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:8485f406a96febb5140bfeca44a73e3ce5116b2501ac54fe953e488fb1d03b12", size = 23041, upload-time = "2025-09-27T18:36:49.797Z" },
-    { url = "https://files.pythonhosted.org/packages/19/bc/e7140ed90c5d61d77cea142eed9f9c303f4c4806f60a1044c13e3f1471d0/markupsafe-3.0.3-cp313-cp313-win32.whl", hash = "sha256:bdd37121970bfd8be76c5fb069c7751683bdf373db1ed6c010162b2a130248ed", size = 14543, upload-time = "2025-09-27T18:36:51.584Z" },
-    { url = "https://files.pythonhosted.org/packages/05/73/c4abe620b841b6b791f2edc248f556900667a5a1cf023a6646967ae98335/markupsafe-3.0.3-cp313-cp313-win_amd64.whl", hash = "sha256:9a1abfdc021a164803f4d485104931fb8f8c1efd55bc6b748d2f5774e78b62c5", size = 15113, upload-time = "2025-09-27T18:36:52.537Z" },
-    { url = "https://files.pythonhosted.org/packages/f0/3a/fa34a0f7cfef23cf9500d68cb7c32dd64ffd58a12b09225fb03dd37d5b80/markupsafe-3.0.3-cp313-cp313-win_arm64.whl", hash = "sha256:7e68f88e5b8799aa49c85cd116c932a1ac15caaa3f5db09087854d218359e485", size = 13911, upload-time = "2025-09-27T18:36:53.513Z" },
-    { url = "https://files.pythonhosted.org/packages/e4/d7/e05cd7efe43a88a17a37b3ae96e79a19e846f3f456fe79c57ca61356ef01/markupsafe-3.0.3-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:218551f6df4868a8d527e3062d0fb968682fe92054e89978594c28e642c43a73", size = 11658, upload-time = "2025-09-27T18:36:54.819Z" },
-    { url = "https://files.pythonhosted.org/packages/99/9e/e412117548182ce2148bdeacdda3bb494260c0b0184360fe0d56389b523b/markupsafe-3.0.3-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:3524b778fe5cfb3452a09d31e7b5adefeea8c5be1d43c4f810ba09f2ceb29d37", size = 12066, upload-time = "2025-09-27T18:36:55.714Z" },
-    { url = "https://files.pythonhosted.org/packages/bc/e6/fa0ffcda717ef64a5108eaa7b4f5ed28d56122c9a6d70ab8b72f9f715c80/markupsafe-3.0.3-cp313-cp313t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:4e885a3d1efa2eadc93c894a21770e4bc67899e3543680313b09f139e149ab19", size = 25639, upload-time = "2025-09-27T18:36:56.908Z" },
-    { url = "https://files.pythonhosted.org/packages/96/ec/2102e881fe9d25fc16cb4b25d5f5cde50970967ffa5dddafdb771237062d/markupsafe-3.0.3-cp313-cp313t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:8709b08f4a89aa7586de0aadc8da56180242ee0ada3999749b183aa23df95025", size = 23569, upload-time = "2025-09-27T18:36:57.913Z" },
-    { url = "https://files.pythonhosted.org/packages/4b/30/6f2fce1f1f205fc9323255b216ca8a235b15860c34b6798f810f05828e32/markupsafe-3.0.3-cp313-cp313t-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:b8512a91625c9b3da6f127803b166b629725e68af71f8184ae7e7d54686a56d6", size = 23284, upload-time = "2025-09-27T18:36:58.833Z" },
-    { url = "https://files.pythonhosted.org/packages/58/47/4a0ccea4ab9f5dcb6f79c0236d954acb382202721e704223a8aafa38b5c8/markupsafe-3.0.3-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:9b79b7a16f7fedff2495d684f2b59b0457c3b493778c9eed31111be64d58279f", size = 24801, upload-time = "2025-09-27T18:36:59.739Z" },
-    { url = "https://files.pythonhosted.org/packages/6a/70/3780e9b72180b6fecb83a4814d84c3bf4b4ae4bf0b19c27196104149734c/markupsafe-3.0.3-cp313-cp313t-musllinux_1_2_riscv64.whl", hash = "sha256:12c63dfb4a98206f045aa9563db46507995f7ef6d83b2f68eda65c307c6829eb", size = 22769, upload-time = "2025-09-27T18:37:00.719Z" },
-    { url = "https://files.pythonhosted.org/packages/98/c5/c03c7f4125180fc215220c035beac6b9cb684bc7a067c84fc69414d315f5/markupsafe-3.0.3-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:8f71bc33915be5186016f675cd83a1e08523649b0e33efdb898db577ef5bb009", size = 23642, upload-time = "2025-09-27T18:37:01.673Z" },
-    { url = "https://files.pythonhosted.org/packages/80/d6/2d1b89f6ca4bff1036499b1e29a1d02d282259f3681540e16563f27ebc23/markupsafe-3.0.3-cp313-cp313t-win32.whl", hash = "sha256:69c0b73548bc525c8cb9a251cddf1931d1db4d2258e9599c28c07ef3580ef354", size = 14612, upload-time = "2025-09-27T18:37:02.639Z" },
-    { url = "https://files.pythonhosted.org/packages/2b/98/e48a4bfba0a0ffcf9925fe2d69240bfaa19c6f7507b8cd09c70684a53c1e/markupsafe-3.0.3-cp313-cp313t-win_amd64.whl", hash = "sha256:1b4b79e8ebf6b55351f0d91fe80f893b4743f104bff22e90697db1590e47a218", size = 15200, upload-time = "2025-09-27T18:37:03.582Z" },
-    { url = "https://files.pythonhosted.org/packages/0e/72/e3cc540f351f316e9ed0f092757459afbc595824ca724cbc5a5d4263713f/markupsafe-3.0.3-cp313-cp313t-win_arm64.whl", hash = "sha256:ad2cf8aa28b8c020ab2fc8287b0f823d0a7d8630784c31e9ee5edea20f406287", size = 13973, upload-time = "2025-09-27T18:37:04.929Z" },
-    { url = "https://files.pythonhosted.org/packages/33/8a/8e42d4838cd89b7dde187011e97fe6c3af66d8c044997d2183fbd6d31352/markupsafe-3.0.3-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:eaa9599de571d72e2daf60164784109f19978b327a3910d3e9de8c97b5b70cfe", size = 11619, upload-time = "2025-09-27T18:37:06.342Z" },
-    { url = "https://files.pythonhosted.org/packages/b5/64/7660f8a4a8e53c924d0fa05dc3a55c9cee10bbd82b11c5afb27d44b096ce/markupsafe-3.0.3-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:c47a551199eb8eb2121d4f0f15ae0f923d31350ab9280078d1e5f12b249e0026", size = 12029, upload-time = "2025-09-27T18:37:07.213Z" },
-    { url = "https://files.pythonhosted.org/packages/da/ef/e648bfd021127bef5fa12e1720ffed0c6cbb8310c8d9bea7266337ff06de/markupsafe-3.0.3-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:f34c41761022dd093b4b6896d4810782ffbabe30f2d443ff5f083e0cbbb8c737", size = 24408, upload-time = "2025-09-27T18:37:09.572Z" },
-    { url = "https://files.pythonhosted.org/packages/41/3c/a36c2450754618e62008bf7435ccb0f88053e07592e6028a34776213d877/markupsafe-3.0.3-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:457a69a9577064c05a97c41f4e65148652db078a3a509039e64d3467b9e7ef97", size = 23005, upload-time = "2025-09-27T18:37:10.58Z" },
-    { url = "https://files.pythonhosted.org/packages/bc/20/b7fdf89a8456b099837cd1dc21974632a02a999ec9bf7ca3e490aacd98e7/markupsafe-3.0.3-cp314-cp314-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:e8afc3f2ccfa24215f8cb28dcf43f0113ac3c37c2f0f0806d8c70e4228c5cf4d", size = 22048, upload-time = "2025-09-27T18:37:11.547Z" },
-    { url = "https://files.pythonhosted.org/packages/9a/a7/591f592afdc734f47db08a75793a55d7fbcc6902a723ae4cfbab61010cc5/markupsafe-3.0.3-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:ec15a59cf5af7be74194f7ab02d0f59a62bdcf1a537677ce67a2537c9b87fcda", size = 23821, upload-time = "2025-09-27T18:37:12.48Z" },
-    { url = "https://files.pythonhosted.org/packages/7d/33/45b24e4f44195b26521bc6f1a82197118f74df348556594bd2262bda1038/markupsafe-3.0.3-cp314-cp314-musllinux_1_2_riscv64.whl", hash = "sha256:0eb9ff8191e8498cca014656ae6b8d61f39da5f95b488805da4bb029cccbfbaf", size = 21606, upload-time = "2025-09-27T18:37:13.485Z" },
-    { url = "https://files.pythonhosted.org/packages/ff/0e/53dfaca23a69fbfbbf17a4b64072090e70717344c52eaaaa9c5ddff1e5f0/markupsafe-3.0.3-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:2713baf880df847f2bece4230d4d094280f4e67b1e813eec43b4c0e144a34ffe", size = 23043, upload-time = "2025-09-27T18:37:14.408Z" },
-    { url = "https://files.pythonhosted.org/packages/46/11/f333a06fc16236d5238bfe74daccbca41459dcd8d1fa952e8fbd5dccfb70/markupsafe-3.0.3-cp314-cp314-win32.whl", hash = "sha256:729586769a26dbceff69f7a7dbbf59ab6572b99d94576a5592625d5b411576b9", size = 14747, upload-time = "2025-09-27T18:37:15.36Z" },
-    { url = "https://files.pythonhosted.org/packages/28/52/182836104b33b444e400b14f797212f720cbc9ed6ba34c800639d154e821/markupsafe-3.0.3-cp314-cp314-win_amd64.whl", hash = "sha256:bdc919ead48f234740ad807933cdf545180bfbe9342c2bb451556db2ed958581", size = 15341, upload-time = "2025-09-27T18:37:16.496Z" },
-    { url = "https://files.pythonhosted.org/packages/6f/18/acf23e91bd94fd7b3031558b1f013adfa21a8e407a3fdb32745538730382/markupsafe-3.0.3-cp314-cp314-win_arm64.whl", hash = "sha256:5a7d5dc5140555cf21a6fefbdbf8723f06fcd2f63ef108f2854de715e4422cb4", size = 14073, upload-time = "2025-09-27T18:37:17.476Z" },
-    { url = "https://files.pythonhosted.org/packages/3c/f0/57689aa4076e1b43b15fdfa646b04653969d50cf30c32a102762be2485da/markupsafe-3.0.3-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:1353ef0c1b138e1907ae78e2f6c63ff67501122006b0f9abad68fda5f4ffc6ab", size = 11661, upload-time = "2025-09-27T18:37:18.453Z" },
-    { url = "https://files.pythonhosted.org/packages/89/c3/2e67a7ca217c6912985ec766c6393b636fb0c2344443ff9d91404dc4c79f/markupsafe-3.0.3-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:1085e7fbddd3be5f89cc898938f42c0b3c711fdcb37d75221de2666af647c175", size = 12069, upload-time = "2025-09-27T18:37:19.332Z" },
-    { url = "https://files.pythonhosted.org/packages/f0/00/be561dce4e6ca66b15276e184ce4b8aec61fe83662cce2f7d72bd3249d28/markupsafe-3.0.3-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:1b52b4fb9df4eb9ae465f8d0c228a00624de2334f216f178a995ccdcf82c4634", size = 25670, upload-time = "2025-09-27T18:37:20.245Z" },
-    { url = "https://files.pythonhosted.org/packages/50/09/c419f6f5a92e5fadde27efd190eca90f05e1261b10dbd8cbcb39cd8ea1dc/markupsafe-3.0.3-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:fed51ac40f757d41b7c48425901843666a6677e3e8eb0abcff09e4ba6e664f50", size = 23598, upload-time = "2025-09-27T18:37:21.177Z" },
-    { url = "https://files.pythonhosted.org/packages/22/44/a0681611106e0b2921b3033fc19bc53323e0b50bc70cffdd19f7d679bb66/markupsafe-3.0.3-cp314-cp314t-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:f190daf01f13c72eac4efd5c430a8de82489d9cff23c364c3ea822545032993e", size = 23261, upload-time = "2025-09-27T18:37:22.167Z" },
-    { url = "https://files.pythonhosted.org/packages/5f/57/1b0b3f100259dc9fffe780cfb60d4be71375510e435efec3d116b6436d43/markupsafe-3.0.3-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:e56b7d45a839a697b5eb268c82a71bd8c7f6c94d6fd50c3d577fa39a9f1409f5", size = 24835, upload-time = "2025-09-27T18:37:23.296Z" },
-    { url = "https://files.pythonhosted.org/packages/26/6a/4bf6d0c97c4920f1597cc14dd720705eca0bf7c787aebc6bb4d1bead5388/markupsafe-3.0.3-cp314-cp314t-musllinux_1_2_riscv64.whl", hash = "sha256:f3e98bb3798ead92273dc0e5fd0f31ade220f59a266ffd8a4f6065e0a3ce0523", size = 22733, upload-time = "2025-09-27T18:37:24.237Z" },
-    { url = "https://files.pythonhosted.org/packages/14/c7/ca723101509b518797fedc2fdf79ba57f886b4aca8a7d31857ba3ee8281f/markupsafe-3.0.3-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:5678211cb9333a6468fb8d8be0305520aa073f50d17f089b5b4b477ea6e67fdc", size = 23672, upload-time = "2025-09-27T18:37:25.271Z" },
-    { url = "https://files.pythonhosted.org/packages/fb/df/5bd7a48c256faecd1d36edc13133e51397e41b73bb77e1a69deab746ebac/markupsafe-3.0.3-cp314-cp314t-win32.whl", hash = "sha256:915c04ba3851909ce68ccc2b8e2cd691618c4dc4c4232fb7982bca3f41fd8c3d", size = 14819, upload-time = "2025-09-27T18:37:26.285Z" },
-    { url = "https://files.pythonhosted.org/packages/1a/8a/0402ba61a2f16038b48b39bccca271134be00c5c9f0f623208399333c448/markupsafe-3.0.3-cp314-cp314t-win_amd64.whl", hash = "sha256:4faffd047e07c38848ce017e8725090413cd80cbc23d86e55c587bf979e579c9", size = 15426, upload-time = "2025-09-27T18:37:27.316Z" },
-    { url = "https://files.pythonhosted.org/packages/70/bc/6f1c2f612465f5fa89b95bead1f44dcb607670fd42891d8fdcd5d039f4f4/markupsafe-3.0.3-cp314-cp314t-win_arm64.whl", hash = "sha256:32001d6a8fc98c8cb5c947787c5d08b0a50663d139f1305bac5885d98d9b40fa", size = 14146, upload-time = "2025-09-27T18:37:28.327Z" },
-]
-
 [[package]]
 name = "mcp"
 version = "1.26.0"
@@ -1433,7 +1271,7 @@ wheels = [
 
 [[package]]
 name = "mcpbr"
-version = "0.14.0"
+version = "0.14.1"
 source = { editable = "." }
 dependencies = [
     { name = "anthropic" },
@@ -1468,12 +1306,6 @@ dev = [
     { name = "types-pyyaml" },
     { name = "types-requests" },
 ]
-docs = [
-    { name = "mkdocs" },
-    { name = "mkdocs-material" },
-    { name = "mkdocs-minify-plugin" },
-    { name = "mkdocstrings", extra = ["python"] },
-]
 gemini = [
     { name = "google-generativeai" },
 ]
@@ -1496,10 +1328,6 @@ requires-dist = [
     { name = "google-generativeai", marker = "extra == 'all-providers'", specifier = ">=0.3.0" },
     { name = "google-generativeai", marker = "extra == 'gemini'", specifier = ">=0.3.0" },
     { name = "mcp", specifier = ">=1.0.0" },
-    { name = "mkdocs", marker = "extra == 'docs'", specifier = ">=1.5.0" },
-    { name = "mkdocs-material", marker = "extra == 'docs'", specifier = ">=9.5.0" },
-    { name = "mkdocs-minify-plugin", marker = "extra == 'docs'", specifier = ">=0.7.0" },
-    { name = "mkdocstrings", extras = ["python"], marker = "extra == 'docs'", specifier = ">=0.24.0" },
     { name = "mypy", marker = "extra == 'dev'", specifier = ">=1.11.0" },
     { name = "openai", marker = "extra == 'all-providers'", specifier = ">=1.0.0" },
     { name = "openai", marker = "extra == 'openai'", specifier = ">=1.0.0" },
@@ -1534,149 +1362,6 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/b3/38/89ba8ad64ae25be8de66a6d463314cf1eb366222074cfda9ee839c56a4b4/mdurl-0.1.2-py3-none-any.whl", hash = "sha256:84008a41e51615a49fc9966191ff91509e3c40b939176e643fd50a5c2196b8f8", size = 9979, upload-time = "2022-08-14T12:40:09.779Z" },
 ]
 
-[[package]]
-name = "mergedeep"
-version = "1.3.4"
-source = { registry = "https://pypi.org/simple" }
-sdist = { url = "https://files.pythonhosted.org/packages/3a/41/580bb4006e3ed0361b8151a01d324fb03f420815446c7def45d02f74c270/mergedeep-1.3.4.tar.gz", hash = "sha256:0096d52e9dad9939c3d975a774666af186eda617e6ca84df4c94dec30004f2a8", size = 4661, upload-time = "2021-02-05T18:55:30.623Z" }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/2c/19/04f9b178c2d8a15b076c8b5140708fa6ffc5601fb6f1e975537072df5b2a/mergedeep-1.3.4-py3-none-any.whl", hash = "sha256:70775750742b25c0d8f36c55aed03d24c3384d17c951b3175d898bd778ef0307", size = 6354, upload-time = "2021-02-05T18:55:29.583Z" },
-]
-
-[[package]]
-name = "mkdocs"
-version = "1.6.1"
-source = { registry = "https://pypi.org/simple" }
-dependencies = [
-    { name = "click" },
-    { name = "colorama", marker = "sys_platform == 'win32'" },
-    { name = "ghp-import" },
-    { name = "jinja2" },
-    { name = "markdown" },
-    { name = "markupsafe" },
-    { name = "mergedeep" },
-    { name = "mkdocs-get-deps" },
-    { name = "packaging" },
-    { name = "pathspec" },
-    { name = "pyyaml" },
-    { name = "pyyaml-env-tag" },
-    { name = "watchdog" },
-]
-sdist = { url = "https://files.pythonhosted.org/packages/bc/c6/bbd4f061bd16b378247f12953ffcb04786a618ce5e904b8c5a01a0309061/mkdocs-1.6.1.tar.gz", hash = "sha256:7b432f01d928c084353ab39c57282f29f92136665bdd6abf7c1ec8d822ef86f2", size = 3889159, upload-time = "2024-08-30T12:24:06.899Z" }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/22/5b/dbc6a8cddc9cfa9c4971d59fb12bb8d42e161b7e7f8cc89e49137c5b279c/mkdocs-1.6.1-py3-none-any.whl", hash = "sha256:db91759624d1647f3f34aa0c3f327dd2601beae39a366d6e064c03468d35c20e", size = 3864451, upload-time = "2024-08-30T12:24:05.054Z" },
-]
-
-[[package]]
-name = "mkdocs-autorefs"
-version = "1.4.3"
-source = { registry = "https://pypi.org/simple" }
-dependencies = [
-    { name = "markdown" },
-    { name = "markupsafe" },
-    { name = "mkdocs" },
-]
-sdist = { url = "https://files.pythonhosted.org/packages/51/fa/9124cd63d822e2bcbea1450ae68cdc3faf3655c69b455f3a7ed36ce6c628/mkdocs_autorefs-1.4.3.tar.gz", hash = "sha256:beee715b254455c4aa93b6ef3c67579c399ca092259cc41b7d9342573ff1fc75", size = 55425, upload-time = "2025-08-26T14:23:17.223Z" }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/9f/4d/7123b6fa2278000688ebd338e2a06d16870aaf9eceae6ba047ea05f92df1/mkdocs_autorefs-1.4.3-py3-none-any.whl", hash = "sha256:469d85eb3114801d08e9cc55d102b3ba65917a869b893403b8987b601cf55dc9", size = 25034, upload-time = "2025-08-26T14:23:15.906Z" },
-]
-
-[[package]]
-name = "mkdocs-get-deps"
-version = "0.2.0"
-source = { registry = "https://pypi.org/simple" }
-dependencies = [
-    { name = "mergedeep" },
-    { name = "platformdirs" },
-    { name = "pyyaml" },
-]
-sdist = { url = "https://files.pythonhosted.org/packages/98/f5/ed29cd50067784976f25ed0ed6fcd3c2ce9eb90650aa3b2796ddf7b6870b/mkdocs_get_deps-0.2.0.tar.gz", hash = "sha256:162b3d129c7fad9b19abfdcb9c1458a651628e4b1dea628ac68790fb3061c60c", size = 10239, upload-time = "2023-11-20T17:51:09.981Z" }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/9f/d4/029f984e8d3f3b6b726bd33cafc473b75e9e44c0f7e80a5b29abc466bdea/mkdocs_get_deps-0.2.0-py3-none-any.whl", hash = "sha256:2bf11d0b133e77a0dd036abeeb06dec8775e46efa526dc70667d8863eefc6134", size = 9521, upload-time = "2023-11-20T17:51:08.587Z" },
-]
-
-[[package]]
-name = "mkdocs-material"
-version = "9.7.1"
-source = { registry = "https://pypi.org/simple" }
-dependencies = [
-    { name = "babel" },
-    { name = "backrefs" },
-    { name = "colorama" },
-    { name = "jinja2" },
-    { name = "markdown" },
-    { name = "mkdocs" },
-    { name = "mkdocs-material-extensions" },
-    { name = "paginate" },
-    { name = "pygments" },
-    { name = "pymdown-extensions" },
-    { name = "requests" },
-]
-sdist = { url = "https://files.pythonhosted.org/packages/27/e2/2ffc356cd72f1473d07c7719d82a8f2cbd261666828614ecb95b12169f41/mkdocs_material-9.7.1.tar.gz", hash = "sha256:89601b8f2c3e6c6ee0a918cc3566cb201d40bf37c3cd3c2067e26fadb8cce2b8", size = 4094392, upload-time = "2025-12-18T09:49:00.308Z" }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/3e/32/ed071cb721aca8c227718cffcf7bd539620e9799bbf2619e90c757bfd030/mkdocs_material-9.7.1-py3-none-any.whl", hash = "sha256:3f6100937d7d731f87f1e3e3b021c97f7239666b9ba1151ab476cabb96c60d5c", size = 9297166, upload-time = "2025-12-18T09:48:56.664Z" },
-]
-
-[[package]]
-name = "mkdocs-material-extensions"
-version = "1.3.1"
-source = { registry = "https://pypi.org/simple" }
-sdist = { url = "https://files.pythonhosted.org/packages/79/9b/9b4c96d6593b2a541e1cb8b34899a6d021d208bb357042823d4d2cabdbe7/mkdocs_material_extensions-1.3.1.tar.gz", hash = "sha256:10c9511cea88f568257f960358a467d12b970e1f7b2c0e5fb2bb48cab1928443", size = 11847, upload-time = "2023-11-22T19:09:45.208Z" }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/5b/54/662a4743aa81d9582ee9339d4ffa3c8fd40a4965e033d77b9da9774d3960/mkdocs_material_extensions-1.3.1-py3-none-any.whl", hash = "sha256:adff8b62700b25cb77b53358dad940f3ef973dd6db797907c49e3c2ef3ab4e31", size = 8728, upload-time = "2023-11-22T19:09:43.465Z" },
-]
-
-[[package]]
-name = "mkdocs-minify-plugin"
-version = "0.8.0"
-source = { registry = "https://pypi.org/simple" }
-dependencies = [
-    { name = "csscompressor" },
-    { name = "htmlmin2" },
-    { name = "jsmin" },
-    { name = "mkdocs" },
-]
-sdist = { url = "https://files.pythonhosted.org/packages/52/67/fe4b77e7a8ae7628392e28b14122588beaf6078b53eb91c7ed000fd158ac/mkdocs-minify-plugin-0.8.0.tar.gz", hash = "sha256:bc11b78b8120d79e817308e2b11539d790d21445eb63df831e393f76e52e753d", size = 8366, upload-time = "2024-01-29T16:11:32.982Z" }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/1b/cd/2e8d0d92421916e2ea4ff97f10a544a9bd5588eb747556701c983581df13/mkdocs_minify_plugin-0.8.0-py3-none-any.whl", hash = "sha256:5fba1a3f7bd9a2142c9954a6559a57e946587b21f133165ece30ea145c66aee6", size = 6723, upload-time = "2024-01-29T16:11:31.851Z" },
-]
-
-[[package]]
-name = "mkdocstrings"
-version = "1.0.2"
-source = { registry = "https://pypi.org/simple" }
-dependencies = [
-    { name = "jinja2" },
-    { name = "markdown" },
-    { name = "markupsafe" },
-    { name = "mkdocs" },
-    { name = "mkdocs-autorefs" },
-    { name = "pymdown-extensions" },
-]
-sdist = { url = "https://files.pythonhosted.org/packages/63/4d/1ca8a9432579184599714aaeb36591414cc3d3bfd9d494f6db540c995ae4/mkdocstrings-1.0.2.tar.gz", hash = "sha256:48edd0ccbcb9e30a3121684e165261a9d6af4d63385fc4f39a54a49ac3b32ea8", size = 101048, upload-time = "2026-01-24T15:57:25.735Z" }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/57/32/407a9a5fdd7d8ecb4af8d830b9bcdf47ea68f916869b3f44bac31f081250/mkdocstrings-1.0.2-py3-none-any.whl", hash = "sha256:41897815a8026c3634fe5d51472c3a569f92ded0ad8c7a640550873eea3b6817", size = 35443, upload-time = "2026-01-24T15:57:23.933Z" },
-]
-
-[package.optional-dependencies]
-python = [
-    { name = "mkdocstrings-python" },
-]
-
-[[package]]
-name = "mkdocstrings-python"
-version = "2.0.1"
-source = { registry = "https://pypi.org/simple" }
-dependencies = [
-    { name = "griffe" },
-    { name = "mkdocs-autorefs" },
-    { name = "mkdocstrings" },
-]
-sdist = { url = "https://files.pythonhosted.org/packages/24/75/d30af27a2906f00eb90143470272376d728521997800f5dce5b340ba35bc/mkdocstrings_python-2.0.1.tar.gz", hash = "sha256:843a562221e6a471fefdd4b45cc6c22d2607ccbad632879234fa9692e9cf7732", size = 199345, upload-time = "2025-12-03T14:26:11.755Z" }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/81/06/c5f8deba7d2cbdfa7967a716ae801aa9ca5f734b8f54fd473ef77a088dbe/mkdocstrings_python-2.0.1-py3-none-any.whl", hash = "sha256:66ecff45c5f8b71bf174e11d49afc845c2dfc7fc0ab17a86b6b337e0f24d8d90", size = 105055, upload-time = "2025-12-03T14:26:10.184Z" },
-]
-
 [[package]]
 name = "multidict"
 version = "6.7.1"
@@ -1978,15 +1663,6 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/b7/b9/c538f279a4e237a006a2c98387d081e9eb060d203d8ed34467cc0f0b9b53/packaging-26.0-py3-none-any.whl", hash = "sha256:b36f1fef9334a5588b4166f8bcd26a14e521f2b55e6b9de3aaa80d3ff7a37529", size = 74366, upload-time = "2026-01-21T20:50:37.788Z" },
 ]
 
-[[package]]
-name = "paginate"
-version = "0.5.7"
-source = { registry = "https://pypi.org/simple" }
-sdist = { url = "https://files.pythonhosted.org/packages/ec/46/68dde5b6bc00c1296ec6466ab27dddede6aec9af1b99090e1107091b3b84/paginate-0.5.7.tar.gz", hash = "sha256:22bd083ab41e1a8b4f3690544afb2c60c25e5c9a63a30fa2f483f6c60c8e5945", size = 19252, upload-time = "2024-08-25T14:17:24.139Z" }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/90/96/04b8e52da071d28f5e21a805b19cb9390aa17a47462ac87f5e2696b9566d/paginate-0.5.7-py2.py3-none-any.whl", hash = "sha256:b885e2af73abcf01d9559fd5216b57ef722f8c42affbb63942377668e35c7591", size = 13746, upload-time = "2024-08-25T14:17:22.55Z" },
-]
-
 [[package]]
 name = "pandas"
 version = "3.0.0"
@@ -2487,19 +2163,6 @@ crypto = [
     { name = "cryptography" },
 ]
 
-[[package]]
-name = "pymdown-extensions"
-version = "10.20.1"
-source = { registry = "https://pypi.org/simple" }
-dependencies = [
-    { name = "markdown" },
-    { name = "pyyaml" },
-]
-sdist = { url = "https://files.pythonhosted.org/packages/1e/6c/9e370934bfa30e889d12e61d0dae009991294f40055c238980066a7fbd83/pymdown_extensions-10.20.1.tar.gz", hash = "sha256:e7e39c865727338d434b55f1dd8da51febcffcaebd6e1a0b9c836243f660740a", size = 852860, upload-time = "2026-01-24T05:56:56.758Z" }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/40/6d/b6ee155462a0156b94312bdd82d2b92ea56e909740045a87ccb98bf52405/pymdown_extensions-10.20.1-py3-none-any.whl", hash = "sha256:24af7feacbca56504b313b7b418c4f5e1317bb5fea60f03d57be7fcc40912aa0", size = 268768, upload-time = "2026-01-24T05:56:54.537Z" },
-]
-
 [[package]]
 name = "pynacl"
 version = "1.6.2"
@@ -2677,18 +2340,6 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/f1/12/de94a39c2ef588c7e6455cfbe7343d3b2dc9d6b6b2f40c4c6565744c873d/pyyaml-6.0.3-cp314-cp314t-win_arm64.whl", hash = "sha256:ebc55a14a21cb14062aa4162f906cd962b28e2e9ea38f9b4391244cd8de4ae0b", size = 149341, upload-time = "2025-09-25T21:32:56.828Z" },
 ]
 
-[[package]]
-name = "pyyaml-env-tag"
-version = "1.1"
-source = { registry = "https://pypi.org/simple" }
-dependencies = [
-    { name = "pyyaml" },
-]
-sdist = { url = "https://files.pythonhosted.org/packages/eb/2e/79c822141bfd05a853236b504869ebc6b70159afc570e1d5a20641782eaa/pyyaml_env_tag-1.1.tar.gz", hash = "sha256:2eb38b75a2d21ee0475d6d97ec19c63287a7e140231e4214969d0eac923cd7ff", size = 5737, upload-time = "2025-05-13T15:24:01.64Z" }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/04/11/432f32f8097b03e3cd5fe57e88efb685d964e2e5178a48ed61e841f7fdce/pyyaml_env_tag-1.1-py3-none-any.whl", hash = "sha256:17109e1a528561e32f026364712fee1264bc2ea6715120891174ed1b980d2e04", size = 4722, upload-time = "2025-05-13T15:23:59.629Z" },
-]
-
 [[package]]
 name = "referencing"
 version = "0.37.0"
@@ -3146,33 +2797,6 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/3a/9a/f3919d7ee7ba99dabf0aac7e299c6c328f5eae94f9f6b28c76005f882d5d/wandb-0.24.2-py3-none-win_arm64.whl", hash = "sha256:b42614b99f8b9af69f88c15a84283a973c8cd5750e9c4752aa3ce21f13dbac9a", size = 20268261, upload-time = "2026-02-05T00:12:14.353Z" },
 ]
 
-[[package]]
-name = "watchdog"
-version = "6.0.0"
-source = { registry = "https://pypi.org/simple" }
-sdist = { url = "https://files.pythonhosted.org/packages/db/7d/7f3d619e951c88ed75c6037b246ddcf2d322812ee8ea189be89511721d54/watchdog-6.0.0.tar.gz", hash = "sha256:9ddf7c82fda3ae8e24decda1338ede66e1c99883db93711d8fb941eaa2d8c282", size = 131220, upload-time = "2024-11-01T14:07:13.037Z" }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/e0/24/d9be5cd6642a6aa68352ded4b4b10fb0d7889cb7f45814fb92cecd35f101/watchdog-6.0.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:6eb11feb5a0d452ee41f824e271ca311a09e250441c262ca2fd7ebcf2461a06c", size = 96393, upload-time = "2024-11-01T14:06:31.756Z" },
-    { url = "https://files.pythonhosted.org/packages/63/7a/6013b0d8dbc56adca7fdd4f0beed381c59f6752341b12fa0886fa7afc78b/watchdog-6.0.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:ef810fbf7b781a5a593894e4f439773830bdecb885e6880d957d5b9382a960d2", size = 88392, upload-time = "2024-11-01T14:06:32.99Z" },
-    { url = "https://files.pythonhosted.org/packages/d1/40/b75381494851556de56281e053700e46bff5b37bf4c7267e858640af5a7f/watchdog-6.0.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:afd0fe1b2270917c5e23c2a65ce50c2a4abb63daafb0d419fde368e272a76b7c", size = 89019, upload-time = "2024-11-01T14:06:34.963Z" },
-    { url = "https://files.pythonhosted.org/packages/39/ea/3930d07dafc9e286ed356a679aa02d777c06e9bfd1164fa7c19c288a5483/watchdog-6.0.0-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:bdd4e6f14b8b18c334febb9c4425a878a2ac20efd1e0b231978e7b150f92a948", size = 96471, upload-time = "2024-11-01T14:06:37.745Z" },
-    { url = "https://files.pythonhosted.org/packages/12/87/48361531f70b1f87928b045df868a9fd4e253d9ae087fa4cf3f7113be363/watchdog-6.0.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:c7c15dda13c4eb00d6fb6fc508b3c0ed88b9d5d374056b239c4ad1611125c860", size = 88449, upload-time = "2024-11-01T14:06:39.748Z" },
-    { url = "https://files.pythonhosted.org/packages/5b/7e/8f322f5e600812e6f9a31b75d242631068ca8f4ef0582dd3ae6e72daecc8/watchdog-6.0.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:6f10cb2d5902447c7d0da897e2c6768bca89174d0c6e1e30abec5421af97a5b0", size = 89054, upload-time = "2024-11-01T14:06:41.009Z" },
-    { url = "https://files.pythonhosted.org/packages/68/98/b0345cabdce2041a01293ba483333582891a3bd5769b08eceb0d406056ef/watchdog-6.0.0-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:490ab2ef84f11129844c23fb14ecf30ef3d8a6abafd3754a6f75ca1e6654136c", size = 96480, upload-time = "2024-11-01T14:06:42.952Z" },
-    { url = "https://files.pythonhosted.org/packages/85/83/cdf13902c626b28eedef7ec4f10745c52aad8a8fe7eb04ed7b1f111ca20e/watchdog-6.0.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:76aae96b00ae814b181bb25b1b98076d5fc84e8a53cd8885a318b42b6d3a5134", size = 88451, upload-time = "2024-11-01T14:06:45.084Z" },
-    { url = "https://files.pythonhosted.org/packages/fe/c4/225c87bae08c8b9ec99030cd48ae9c4eca050a59bf5c2255853e18c87b50/watchdog-6.0.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:a175f755fc2279e0b7312c0035d52e27211a5bc39719dd529625b1930917345b", size = 89057, upload-time = "2024-11-01T14:06:47.324Z" },
-    { url = "https://files.pythonhosted.org/packages/a9/c7/ca4bf3e518cb57a686b2feb4f55a1892fd9a3dd13f470fca14e00f80ea36/watchdog-6.0.0-py3-none-manylinux2014_aarch64.whl", hash = "sha256:7607498efa04a3542ae3e05e64da8202e58159aa1fa4acddf7678d34a35d4f13", size = 79079, upload-time = "2024-11-01T14:06:59.472Z" },
-    { url = "https://files.pythonhosted.org/packages/5c/51/d46dc9332f9a647593c947b4b88e2381c8dfc0942d15b8edc0310fa4abb1/watchdog-6.0.0-py3-none-manylinux2014_armv7l.whl", hash = "sha256:9041567ee8953024c83343288ccc458fd0a2d811d6a0fd68c4c22609e3490379", size = 79078, upload-time = "2024-11-01T14:07:01.431Z" },
-    { url = "https://files.pythonhosted.org/packages/d4/57/04edbf5e169cd318d5f07b4766fee38e825d64b6913ca157ca32d1a42267/watchdog-6.0.0-py3-none-manylinux2014_i686.whl", hash = "sha256:82dc3e3143c7e38ec49d61af98d6558288c415eac98486a5c581726e0737c00e", size = 79076, upload-time = "2024-11-01T14:07:02.568Z" },
-    { url = "https://files.pythonhosted.org/packages/ab/cc/da8422b300e13cb187d2203f20b9253e91058aaf7db65b74142013478e66/watchdog-6.0.0-py3-none-manylinux2014_ppc64.whl", hash = "sha256:212ac9b8bf1161dc91bd09c048048a95ca3a4c4f5e5d4a7d1b1a7d5752a7f96f", size = 79077, upload-time = "2024-11-01T14:07:03.893Z" },
-    { url = "https://files.pythonhosted.org/packages/2c/3b/b8964e04ae1a025c44ba8e4291f86e97fac443bca31de8bd98d3263d2fcf/watchdog-6.0.0-py3-none-manylinux2014_ppc64le.whl", hash = "sha256:e3df4cbb9a450c6d49318f6d14f4bbc80d763fa587ba46ec86f99f9e6876bb26", size = 79078, upload-time = "2024-11-01T14:07:05.189Z" },
-    { url = "https://files.pythonhosted.org/packages/62/ae/a696eb424bedff7407801c257d4b1afda455fe40821a2be430e173660e81/watchdog-6.0.0-py3-none-manylinux2014_s390x.whl", hash = "sha256:2cce7cfc2008eb51feb6aab51251fd79b85d9894e98ba847408f662b3395ca3c", size = 79077, upload-time = "2024-11-01T14:07:06.376Z" },
-    { url = "https://files.pythonhosted.org/packages/b5/e8/dbf020b4d98251a9860752a094d09a65e1b436ad181faf929983f697048f/watchdog-6.0.0-py3-none-manylinux2014_x86_64.whl", hash = "sha256:20ffe5b202af80ab4266dcd3e91aae72bf2da48c0d33bdb15c66658e685e94e2", size = 79078, upload-time = "2024-11-01T14:07:07.547Z" },
-    { url = "https://files.pythonhosted.org/packages/07/f6/d0e5b343768e8bcb4cda79f0f2f55051bf26177ecd5651f84c07567461cf/watchdog-6.0.0-py3-none-win32.whl", hash = "sha256:07df1fdd701c5d4c8e55ef6cf55b8f0120fe1aef7ef39a1c6fc6bc2e606d517a", size = 79065, upload-time = "2024-11-01T14:07:09.525Z" },
-    { url = "https://files.pythonhosted.org/packages/db/d9/c495884c6e548fce18a8f40568ff120bc3a4b7b99813081c8ac0c936fa64/watchdog-6.0.0-py3-none-win_amd64.whl", hash = "sha256:cbafb470cf848d93b5d013e2ecb245d4aa1c8fd0504e863ccefa32445359d680", size = 79070, upload-time = "2024-11-01T14:07:10.686Z" },
-    { url = "https://files.pythonhosted.org/packages/33/e8/e40370e6d74ddba47f002a32919d91310d6074130fe4e17dabcafc15cbf1/watchdog-6.0.0-py3-none-win_ia64.whl", hash = "sha256:a1914259fa9e1454315171103c6a30961236f508b9b623eae470268bbcc6a22f", size = 79067, upload-time = "2024-11-01T14:07:11.845Z" },
-]
-
 [[package]]
 name = "xxhash"
 version = "3.6.0"

From df54b1902cb5f5412a7c9d7bc4ed2b9faa5387a2 Mon Sep 17 00:00:00 2001
From: Grey Newell <greyshipscode@gmail.com>
Date: Wed, 25 Feb 2026 17:00:31 -0500
Subject: [PATCH 02/14] fix: use shlex.quote for test IDs with special
 characters

Test IDs from SWE-bench Pro datasets can contain single quotes and
literal \uXXXX escape sequences that break shell command construction.

- Replace bare single-quote wrapping with shlex.quote() in
  _build_test_command and _build_pro_test_command
- Add _normalize_test_id() to decode \uXXXX to actual unicode chars
- Fixes qutebrowser PTP test failures where 8/10 tests had IDs
  containing embedded single quotes

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 src/mcpbr/benchmarks/codegraph.py    |  8 +++++---
 src/mcpbr/benchmarks/swebench_pro.py | 16 ++++++++++------
 src/mcpbr/evaluation.py              | 20 ++++++++++++++++----
 3 files changed, 31 insertions(+), 13 deletions(-)

diff --git a/src/mcpbr/benchmarks/codegraph.py b/src/mcpbr/benchmarks/codegraph.py
index 459a712..5396914 100644
--- a/src/mcpbr/benchmarks/codegraph.py
+++ b/src/mcpbr/benchmarks/codegraph.py
@@ -15,7 +15,7 @@
 import json
 import logging
 import re
-from datetime import datetime, timezone
+from datetime import UTC, datetime
 from typing import Any
 
 from datasets import load_dataset
@@ -266,7 +266,7 @@ async def _setup_environment(self, env: TaskEnvironment, task: dict[str, Any]) -
             "version": 1,
             "repoName": cache_name,
             "commitHash": None,
-            "savedAt": datetime.now(timezone.utc).isoformat(),
+            "savedAt": datetime.now(UTC).isoformat(),
             "raw": result,
         }
 
@@ -382,7 +382,9 @@ def _count_steps(self, text: str) -> int:
             return 1
 
         # Count tool call patterns in the output
-        tool_calls = len(re.findall(r"(?:tool_use|tool_call|<tool>|Tool:|Calling)", text, re.IGNORECASE))
+        tool_calls = len(
+            re.findall(r"(?:tool_use|tool_call|<tool>|Tool:|Calling)", text, re.IGNORECASE)
+        )
         return max(tool_calls, 1)
 
     def get_prebuilt_image(self, task: dict[str, Any]) -> str | None:
diff --git a/src/mcpbr/benchmarks/swebench_pro.py b/src/mcpbr/benchmarks/swebench_pro.py
index 96e692b..3339f03 100644
--- a/src/mcpbr/benchmarks/swebench_pro.py
+++ b/src/mcpbr/benchmarks/swebench_pro.py
@@ -396,11 +396,15 @@ def _build_pro_test_command(test: str, language: str, uses_prebuilt: bool = Fals
     Returns:
         Shell command string to run the test.
     """
-    if language == "python":
-        from ..evaluation import _build_test_command
+    import shlex
+
+    from ..evaluation import _build_test_command, _normalize_test_id
 
+    if language == "python":
         return _build_test_command(test, uses_prebuilt)
 
+    test = _normalize_test_id(test)
+
     if uses_prebuilt:
         activate = "source /opt/miniconda3/etc/profile.d/conda.sh && conda activate testbed && "
     else:
@@ -410,19 +414,19 @@ def _build_pro_test_command(test: str, language: str, uses_prebuilt: bool = Fals
         # Go test identifiers can be package paths or test function names
         if "/" in test or test.startswith("."):
             # Package path: go test -v ./path/to/package
-            return f"{activate}go test -v -count=1 {test} 2>&1"
+            return f"{activate}go test -v -count=1 {shlex.quote(test)} 2>&1"
         else:
             # Test function name: go test -v -run TestName ./...
-            return f"{activate}go test -v -count=1 -run '{test}' ./... 2>&1"
+            return f"{activate}go test -v -count=1 -run {shlex.quote(test)} ./... 2>&1"
 
     if language in ("typescript", "javascript"):
         # Jest-style test identifiers
         if "/" in test or test.endswith((".ts", ".js", ".tsx", ".jsx")):
             # File path
-            return f"{activate}npx jest {test} --verbose --no-cache 2>&1"
+            return f"{activate}npx jest {shlex.quote(test)} --verbose --no-cache 2>&1"
         else:
             # Test name pattern
-            return f"{activate}npx jest -t '{test}' --verbose --no-cache 2>&1"
+            return f"{activate}npx jest -t {shlex.quote(test)} --verbose --no-cache 2>&1"
 
     # Fallback: try running as-is
     return f"{activate}{test} 2>&1"
diff --git a/src/mcpbr/evaluation.py b/src/mcpbr/evaluation.py
index d7861fb..523a918 100644
--- a/src/mcpbr/evaluation.py
+++ b/src/mcpbr/evaluation.py
@@ -3,6 +3,8 @@
 import ast
 import contextlib
 import json
+import re
+import shlex
 from dataclasses import dataclass
 from typing import Any
 
@@ -235,6 +237,16 @@ async def run_tests(
     )
 
 
+def _normalize_test_id(test: str) -> str:
+    """Normalize a test identifier for shell-safe command construction.
+
+    Decodes literal ``\\uXXXX`` escape sequences to actual unicode characters.
+    Some datasets (e.g. SWE-bench Pro) store pytest parametrize IDs with
+    escaped unicode (``\\u2026``) instead of the real character (``…``).
+    """
+    return re.sub(r"\\u([0-9a-fA-F]{4})", lambda m: chr(int(m.group(1), 16)), test)
+
+
 def _build_test_command(test: str, uses_prebuilt: bool = False, repo: str | None = None) -> str:
     """Build a test command for the given test identifier.
 
@@ -249,10 +261,10 @@ def _build_test_command(test: str, uses_prebuilt: bool = False, repo: str | None
     Returns:
         Shell command string to run the test.
     """
-    import re
-
     from .swebench_test_specs import get_repo_test_command
 
+    test = _normalize_test_id(test)
+
     # Pre-built SWE-bench images use a conda environment called 'testbed'
     if uses_prebuilt:
         activate = "source /opt/miniconda3/etc/profile.d/conda.sh && conda activate testbed && "
@@ -289,9 +301,9 @@ def _build_test_command(test: str, uses_prebuilt: bool = False, repo: str | None
         test_module = ".".join(test.split(".")[:2])  # Extract test_utils.tests
         return f"{activate}cd /testbed/tests && ./runtests.py {test_module}"
     elif "::" in test or test.endswith(".py"):
-        return f"{activate}python -m pytest '{test}' -xvs 2>&1"
+        return f"{activate}python -m pytest {shlex.quote(test)} -xvs 2>&1"
     else:
-        return f"{activate}python -m pytest -k '{test}' -xvs 2>&1"
+        return f"{activate}python -m pytest -k {shlex.quote(test)} -xvs 2>&1"
 
 
 async def _apply_test_patch(

From 78487e8575c3e8691b38d9b94bdeb4431a75e3c9 Mon Sep 17 00:00:00 2001
From: Grey Newell <greyshipscode@gmail.com>
Date: Wed, 25 Feb 2026 17:05:32 -0500
Subject: [PATCH 03/14] fix: remove broken unicode normalization from test ID
 handling
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

_normalize_test_id was incorrectly converting literal \uXXXX sequences
to actual unicode characters. SWE-bench Pro test IDs already contain
the correct literal sequences that match pytest node IDs — converting
them breaks test matching.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 src/mcpbr/evaluation.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/src/mcpbr/evaluation.py b/src/mcpbr/evaluation.py
index 523a918..0a45ea5 100644
--- a/src/mcpbr/evaluation.py
+++ b/src/mcpbr/evaluation.py
@@ -240,11 +240,12 @@ async def run_tests(
 def _normalize_test_id(test: str) -> str:
     """Normalize a test identifier for shell-safe command construction.
 
-    Decodes literal ``\\uXXXX`` escape sequences to actual unicode characters.
-    Some datasets (e.g. SWE-bench Pro) store pytest parametrize IDs with
-    escaped unicode (``\\u2026``) instead of the real character (``…``).
+    Currently a no-op pass-through. SWE-bench Pro test IDs already contain
+    the correct literal sequences (e.g. ``\\u2026`` as 7 ASCII characters)
+    that match what pytest uses in its node IDs. Converting them to actual
+    unicode characters would break matching.
     """
-    return re.sub(r"\\u([0-9a-fA-F]{4})", lambda m: chr(int(m.group(1), 16)), test)
+    return test
 
 
 def _build_test_command(test: str, uses_prebuilt: bool = False, repo: str | None = None) -> str:

From cd5378f4a98ad4c3eaf98f1c35c8b9768cc458c2 Mon Sep 17 00:00:00 2001
From: Grey Newell <greyshipscode@gmail.com>
Date: Wed, 25 Feb 2026 17:57:49 -0500
Subject: [PATCH 04/14] fix: resolve language alias mismatch in SWE-bench Pro
 filter

The dataset uses 'js' and 'ts' as language values, but
filter_category accepted 'javascript' and 'typescript' without
mapping to the dataset values. Added _LANGUAGE_ALIASES dict
to resolve user-friendly names to dataset values.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 src/mcpbr/benchmarks/swebench_pro.py | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/src/mcpbr/benchmarks/swebench_pro.py b/src/mcpbr/benchmarks/swebench_pro.py
index 3339f03..d28da57 100644
--- a/src/mcpbr/benchmarks/swebench_pro.py
+++ b/src/mcpbr/benchmarks/swebench_pro.py
@@ -26,7 +26,13 @@
 from .base import BenchmarkTask
 
 # Supported languages in SWE-bench Pro
-PRO_LANGUAGES = {"python", "go", "typescript", "javascript"}
+PRO_LANGUAGES = {"python", "go", "typescript", "javascript", "ts", "js"}
+
+# Aliases: user-friendly names → dataset values
+_LANGUAGE_ALIASES: dict[str, str] = {
+    "javascript": "js",
+    "typescript": "ts",
+}
 
 # DockerHub registry prefix for SWE-bench Pro pre-built images
 SWEBENCH_PRO_IMAGE_PREFIX = "jefzda/sweap-images"
@@ -94,7 +100,9 @@ def load_tasks(
                     cat_lower = category.lower()
                     # If the category is a known language, match by language only
                     if cat_lower in PRO_LANGUAGES:
-                        if cat_lower == language:
+                        # Resolve aliases (e.g., "javascript" -> "js")
+                        resolved = _LANGUAGE_ALIASES.get(cat_lower, cat_lower)
+                        if resolved == language:
                             filtered.append(task)
                             break
                     elif cat_lower in repo.lower():

From 6472bbb93f7b40f873c5fc6cff1f6c7dd0bef2e5 Mon Sep 17 00:00:00 2001
From: Grey Newell <greyshipscode@gmail.com>
Date: Wed, 25 Feb 2026 18:24:24 -0500
Subject: [PATCH 05/14] fix: use language-specific test runners in preflight
 validation

Preflight was using the Python-only run_tests() (pytest) for all
languages, causing 100% failure for Go, JavaScript, and TypeScript
instances. Now routes non-Python languages through
_build_pro_test_command() which generates the correct commands:
- Go: go test -v -count=1 ...
- JS/TS: npx jest ...

Also fixes conda activation bug in _run_lang_tests: SWE-bench Pro
images don't have conda, so uses_prebuilt=False is passed for
non-Python test commands to avoid prepending conda activation.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 src/mcpbr/benchmark_preflight.py     | 88 ++++++++++++++++++++++++++--
 src/mcpbr/benchmarks/swebench_pro.py |  4 +-
 tests/test_swebench_pro.py           |  2 +-
 3 files changed, 88 insertions(+), 6 deletions(-)

diff --git a/src/mcpbr/benchmark_preflight.py b/src/mcpbr/benchmark_preflight.py
index e95efe1..971d648 100644
--- a/src/mcpbr/benchmark_preflight.py
+++ b/src/mcpbr/benchmark_preflight.py
@@ -12,6 +12,7 @@
 
 from .docker_env import DockerEnvironmentManager, TaskEnvironment
 from .evaluation import (
+    TestResults,
     _apply_test_patch,
     apply_patch,
     get_test_list_field,
@@ -54,6 +55,83 @@ def success_rate(self) -> float:
         return (self.passed / self.total) * 100.0
 
 
+async def _run_preflight_tests(
+    env: TaskEnvironment,
+    tests: list[str],
+    language: str,
+    timeout: int = 300,
+    uses_conda: bool = False,
+    workdir: str | None = None,
+    repo: str | None = None,
+) -> TestResults:
+    """Run tests using the appropriate language-specific runner.
+
+    For Python, delegates to the standard run_tests(). For Go, JavaScript,
+    and TypeScript, builds language-specific commands (go test, npx jest).
+
+    Args:
+        env: Task environment.
+        tests: List of test identifiers.
+        language: Programming language.
+        timeout: Timeout per test in seconds.
+        uses_conda: Whether to activate conda environment.
+        workdir: Working directory inside container.
+        repo: Repository name (used for Python test specs).
+
+    Returns:
+        TestResults with pass/fail counts.
+    """
+    if language == "python":
+        return await run_tests(
+            env,
+            tests,
+            timeout=timeout,
+            uses_prebuilt=uses_conda,
+            workdir=workdir,
+            repo=repo,
+        )
+
+    # Non-Python: use language-specific test commands
+    from .benchmarks.swebench_pro import _build_pro_test_command
+
+    if not tests:
+        return TestResults(passed=0, total=0, details=[])
+
+    results = []
+    passed = 0
+
+    for test in tests:
+        test_cmd = _build_pro_test_command(test, language, uses_conda)
+        try:
+            exit_code, stdout, stderr = await env.exec_command(
+                test_cmd, timeout=timeout, workdir=workdir
+            )
+            test_passed = exit_code == 0
+            if test_passed:
+                passed += 1
+            results.append(
+                {
+                    "test": test,
+                    "passed": test_passed,
+                    "exit_code": exit_code,
+                    "output": stdout[:1000] if stdout else "",
+                    "error": stderr[:1000] if stderr else "",
+                }
+            )
+        except TimeoutError:
+            results.append(
+                {
+                    "test": test,
+                    "passed": False,
+                    "exit_code": -1,
+                    "output": "",
+                    "error": "Test timed out",
+                }
+            )
+
+    return TestResults(passed=passed, total=len(tests), details=results)
+
+
 async def _check_single_instance(
     benchmark: Any,
     task: dict[str, Any],
@@ -137,21 +215,23 @@ async def _check_single_instance(
         uses_conda = env.uses_prebuilt and not task.get("dockerhub_tag")
 
         # Run fail_to_pass tests (all must PASS with golden patch)
-        ftp_results = await run_tests(
+        ftp_results = await _run_preflight_tests(
             env,
             fail_to_pass_tests,
+            language=language,
             timeout=timeout,
-            uses_prebuilt=uses_conda,
+            uses_conda=uses_conda,
             workdir=eval_workdir,
             repo=task.get("repo"),
         )
 
         # Run pass_to_pass tests (all must still PASS)
-        ptp_results = await run_tests(
+        ptp_results = await _run_preflight_tests(
             env,
             pass_to_pass_tests[:10],
+            language=language,
             timeout=timeout,
-            uses_prebuilt=uses_conda,
+            uses_conda=uses_conda,
             workdir=eval_workdir,
             repo=task.get("repo"),
         )
diff --git a/src/mcpbr/benchmarks/swebench_pro.py b/src/mcpbr/benchmarks/swebench_pro.py
index d28da57..7795338 100644
--- a/src/mcpbr/benchmarks/swebench_pro.py
+++ b/src/mcpbr/benchmarks/swebench_pro.py
@@ -304,7 +304,9 @@ async def _run_lang_tests(
         passed = 0
 
         for test in tests:
-            test_cmd = _build_pro_test_command(test, language, env.uses_prebuilt)
+            # SWE-bench Pro images don't use conda — never prepend conda activation
+            # for non-Python languages (uses_prebuilt=False disables it)
+            test_cmd = _build_pro_test_command(test, language, uses_prebuilt=False)
             try:
                 exit_code, stdout, stderr = await env.exec_command(
                     test_cmd, timeout=timeout, workdir=workdir
diff --git a/tests/test_swebench_pro.py b/tests/test_swebench_pro.py
index fde8b8e..c9a1c97 100644
--- a/tests/test_swebench_pro.py
+++ b/tests/test_swebench_pro.py
@@ -237,7 +237,7 @@ def test_filter_by_repo_substring(self, mock_load: MagicMock) -> None:
         assert tasks[0]["instance_id"] == "t1"
 
     def test_pro_languages_set(self) -> None:
-        assert {"python", "go", "typescript", "javascript"} == PRO_LANGUAGES
+        assert {"python", "go", "typescript", "javascript", "ts", "js"} == PRO_LANGUAGES
 
 
 class TestSWEBenchProLoadTasks:

From 351abd4787bb53a417eb4abda7f463f1928a0a1f Mon Sep 17 00:00:00 2001
From: Grey Newell <greyshipscode@gmail.com>
Date: Wed, 25 Feb 2026 18:39:14 -0500
Subject: [PATCH 06/14] fix: correct test command parsing for Go subtests and
 JS/TS pipe format

Go: Test IDs like "TestFoo/#00" and "TestFoo//api/v1" are subtests,
not package paths. Always extract the top-level function name and
use -run with ./... to search all packages.

JS/TS: SWE-bench Pro uses "file.js | test description" format.
Split on " | " to get the file path and test name separately.
"test suite" as description runs the whole file without -t filter.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 src/mcpbr/benchmarks/swebench_pro.py | 42 ++++++++++++++++++++--------
 tests/test_swebench_pro.py           | 29 +++++++++++++++----
 2 files changed, 54 insertions(+), 17 deletions(-)

diff --git a/src/mcpbr/benchmarks/swebench_pro.py b/src/mcpbr/benchmarks/swebench_pro.py
index 7795338..f92ccbc 100644
--- a/src/mcpbr/benchmarks/swebench_pro.py
+++ b/src/mcpbr/benchmarks/swebench_pro.py
@@ -398,10 +398,15 @@ def get_default_sandbox_level(self) -> str | None:
 def _build_pro_test_command(test: str, language: str, uses_prebuilt: bool = False) -> str:
     """Build a language-specific test command for SWE-bench Pro.
 
+    Test ID formats by language:
+        Go: "TestFoo", "TestFoo/subtest", "TestFoo/#00"
+        JS/TS: "file.js | test description", "file.ts | suite name"
+        Python: "tests/test_foo.py::TestClass::test_method"
+
     Args:
         test: Test identifier.
-        language: Programming language (python, go, typescript, javascript).
-        uses_prebuilt: Whether a pre-built image is being used.
+        language: Programming language (python, go, typescript, javascript, js, ts).
+        uses_prebuilt: Whether a pre-built image is being used (adds conda activation).
 
     Returns:
         Shell command string to run the test.
@@ -421,18 +426,33 @@ def _build_pro_test_command(test: str, language: str, uses_prebuilt: bool = Fals
         activate = ""
 
     if language == "go":
-        # Go test identifiers can be package paths or test function names
-        if "/" in test or test.startswith("."):
-            # Package path: go test -v ./path/to/package
-            return f"{activate}go test -v -count=1 {shlex.quote(test)} 2>&1"
+        # Go test IDs are always function names, optionally with subtests via /
+        # e.g., "TestFoo", "TestFoo/subtest", "TestFoo/#00", "TestFoo//api/v1"
+        # Always use -run with the top-level test name and ./... to search all packages
+        if "/" in test:
+            # Extract top-level test name (before first /)
+            top_level = test.split("/", 1)[0]
+            return f"{activate}go test -v -count=1 -run {shlex.quote(top_level)} ./... 2>&1"
         else:
-            # Test function name: go test -v -run TestName ./...
             return f"{activate}go test -v -count=1 -run {shlex.quote(test)} ./... 2>&1"
 
-    if language in ("typescript", "javascript"):
-        # Jest-style test identifiers
-        if "/" in test or test.endswith((".ts", ".js", ".tsx", ".jsx")):
-            # File path
+    if language in ("typescript", "javascript", "ts", "js"):
+        # SWE-bench Pro format: "file_path | test description"
+        if " | " in test:
+            parts = test.split(" | ", 1)
+            file_path = parts[0].strip()
+            test_name = parts[1].strip()
+            if test_name and test_name != "test suite":
+                # Run specific test file with test name filter
+                return (
+                    f"{activate}npx jest {shlex.quote(file_path)}"
+                    f" -t {shlex.quote(test_name)} --verbose --no-cache 2>&1"
+                )
+            else:
+                # "test suite" means run the whole file
+                return f"{activate}npx jest {shlex.quote(file_path)} --verbose --no-cache 2>&1"
+        elif "/" in test or test.endswith((".ts", ".js", ".tsx", ".jsx")):
+            # Plain file path
             return f"{activate}npx jest {shlex.quote(test)} --verbose --no-cache 2>&1"
         else:
             # Test name pattern
diff --git a/tests/test_swebench_pro.py b/tests/test_swebench_pro.py
index c9a1c97..d7e12ed 100644
--- a/tests/test_swebench_pro.py
+++ b/tests/test_swebench_pro.py
@@ -98,17 +98,20 @@ def test_python_delegates(self) -> None:
         cmd = _build_pro_test_command("tests/test_foo.py::test_bar", "python")
         assert "pytest" in cmd or "test_foo" in cmd
 
-    def test_go_package_path(self) -> None:
-        cmd = _build_pro_test_command("./pkg/router", "go")
-        assert "go test" in cmd
-        assert "./pkg/router" in cmd
-        assert "-v" in cmd
-
     def test_go_function_name(self) -> None:
         cmd = _build_pro_test_command("TestRouteMatching", "go")
         assert "go test" in cmd
         assert "-run" in cmd
         assert "TestRouteMatching" in cmd
+        assert "./..." in cmd
+
+    def test_go_subtest(self) -> None:
+        """Go subtests (TestFoo/#00, TestFoo/subtest) use top-level name with -run."""
+        cmd = _build_pro_test_command("TestParseResourcePath/#00", "go")
+        assert "go test" in cmd
+        assert "-run" in cmd
+        assert "TestParseResourcePath" in cmd
+        assert "./..." in cmd
 
     def test_typescript_file(self) -> None:
         cmd = _build_pro_test_command("src/__tests__/parser.test.ts", "typescript")
@@ -128,7 +131,21 @@ def test_javascript_file(self) -> None:
     def test_javascript_pattern(self) -> None:
         cmd = _build_pro_test_command("handles edge case", "javascript")
         assert "npx jest" in cmd
+
+    def test_js_pipe_format(self) -> None:
+        """SWE-bench Pro JS format: 'file.js | test description'."""
+        cmd = _build_pro_test_command("test/database.js | Test database key methods", "js")
+        assert "npx jest" in cmd
+        assert "test/database.js" in cmd
         assert "-t" in cmd
+        assert "Test database key methods" in cmd
+
+    def test_ts_test_suite_format(self) -> None:
+        """TS 'test suite' format runs the whole file without -t filter."""
+        cmd = _build_pro_test_command("test/tests/LoginFacadeTest.js | test suite", "ts")
+        assert "npx jest" in cmd
+        assert "test/tests/LoginFacadeTest.js" in cmd
+        assert "-t" not in cmd
 
     def test_prebuilt_conda_activation(self) -> None:
         cmd = _build_pro_test_command("TestFoo", "go", uses_prebuilt=True)

From e5ccd29ca4674a1c67c91199606ae6608e7c80de Mon Sep 17 00:00:00 2001
From: Grey Newell <greyshipscode@gmail.com>
Date: Wed, 25 Feb 2026 18:54:04 -0500
Subject: [PATCH 07/14] fix: auto-detect JS/TS test runner (jest/mocha/vitest)
 in containers

Different SWE-bench Pro JS/TS repos use different test frameworks:
- NodeBB: mocha
- element-web: jest
- protonmail/webclients: jest
- tutanota: custom (testdouble)

Added _detect_js_runner() that checks for runner binaries in
node_modules/.bin/ inside the container. Mocha uses --grep for
filtering, jest uses -t, vitest uses -t.

Also refactored JS/TS command building into _build_js_test_command()
for clearer separation of runner-specific logic.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 src/mcpbr/benchmark_preflight.py     |   9 +-
 src/mcpbr/benchmarks/swebench_pro.py | 127 ++++++++++++++++++++++-----
 tests/test_swebench_pro.py           |  31 ++++++-
 3 files changed, 139 insertions(+), 28 deletions(-)

diff --git a/src/mcpbr/benchmark_preflight.py b/src/mcpbr/benchmark_preflight.py
index 971d648..fd11807 100644
--- a/src/mcpbr/benchmark_preflight.py
+++ b/src/mcpbr/benchmark_preflight.py
@@ -92,16 +92,21 @@ async def _run_preflight_tests(
         )
 
     # Non-Python: use language-specific test commands
-    from .benchmarks.swebench_pro import _build_pro_test_command
+    from .benchmarks.swebench_pro import _build_pro_test_command, _detect_js_runner
 
     if not tests:
         return TestResults(passed=0, total=0, details=[])
 
+    # Detect JS/TS test runner once per instance
+    js_runner = "jest"
+    if language in ("typescript", "javascript", "ts", "js"):
+        js_runner = await _detect_js_runner(env, workdir=workdir)
+
     results = []
     passed = 0
 
     for test in tests:
-        test_cmd = _build_pro_test_command(test, language, uses_conda)
+        test_cmd = _build_pro_test_command(test, language, uses_conda, js_runner=js_runner)
         try:
             exit_code, stdout, stderr = await env.exec_command(
                 test_cmd, timeout=timeout, workdir=workdir
diff --git a/src/mcpbr/benchmarks/swebench_pro.py b/src/mcpbr/benchmarks/swebench_pro.py
index f92ccbc..7e93f27 100644
--- a/src/mcpbr/benchmarks/swebench_pro.py
+++ b/src/mcpbr/benchmarks/swebench_pro.py
@@ -11,6 +11,7 @@
 - Language metadata per task (repo_language field)
 """
 
+import logging
 from typing import Any
 
 from datasets import load_dataset
@@ -25,6 +26,8 @@
 )
 from .base import BenchmarkTask
 
+logger = logging.getLogger(__name__)
+
 # Supported languages in SWE-bench Pro
 PRO_LANGUAGES = {"python", "go", "typescript", "javascript", "ts", "js"}
 
@@ -300,13 +303,20 @@ async def _run_lang_tests(
         if not tests:
             return TestResults(passed=0, total=0, details=[])
 
+        # Detect JS/TS test runner once (avoids repeated detection per test)
+        js_runner = "jest"
+        if language in ("typescript", "javascript", "ts", "js"):
+            js_runner = await _detect_js_runner(env, workdir=workdir)
+
         results = []
         passed = 0
 
         for test in tests:
             # SWE-bench Pro images don't use conda — never prepend conda activation
             # for non-Python languages (uses_prebuilt=False disables it)
-            test_cmd = _build_pro_test_command(test, language, uses_prebuilt=False)
+            test_cmd = _build_pro_test_command(
+                test, language, uses_prebuilt=False, js_runner=js_runner
+            )
             try:
                 exit_code, stdout, stderr = await env.exec_command(
                     test_cmd, timeout=timeout, workdir=workdir
@@ -395,7 +405,43 @@ def get_default_sandbox_level(self) -> str | None:
         return None
 
 
-def _build_pro_test_command(test: str, language: str, uses_prebuilt: bool = False) -> str:
+async def _detect_js_runner(env: "TaskEnvironment", workdir: str | None = None) -> str:
+    """Detect the JavaScript/TypeScript test runner installed in a container.
+
+    Checks for common test runners in order of preference:
+    jest, mocha, vitest. Falls back to "jest" if none detected.
+
+    Args:
+        env: Task environment with exec_command.
+        workdir: Working directory inside container.
+
+    Returns:
+        Runner name: "jest", "mocha", or "vitest".
+    """
+    # Check for runner binaries in node_modules
+    detect_cmd = (
+        "if [ -f node_modules/.bin/jest ]; then echo jest; "
+        "elif [ -f node_modules/.bin/mocha ]; then echo mocha; "
+        "elif [ -f node_modules/.bin/vitest ]; then echo vitest; "
+        "else echo jest; fi"
+    )
+    try:
+        exit_code, stdout, _ = await env.exec_command(detect_cmd, timeout=10, workdir=workdir)
+        if exit_code == 0 and stdout:
+            runner = stdout.strip().split("\n")[-1].strip()
+            if runner in ("jest", "mocha", "vitest"):
+                return runner
+    except Exception:
+        logger.debug("Failed to detect JS test runner, defaulting to jest")
+    return "jest"
+
+
+def _build_pro_test_command(
+    test: str,
+    language: str,
+    uses_prebuilt: bool = False,
+    js_runner: str = "jest",
+) -> str:
     """Build a language-specific test command for SWE-bench Pro.
 
     Test ID formats by language:
@@ -407,6 +453,7 @@ def _build_pro_test_command(test: str, language: str, uses_prebuilt: bool = Fals
         test: Test identifier.
         language: Programming language (python, go, typescript, javascript, js, ts).
         uses_prebuilt: Whether a pre-built image is being used (adds conda activation).
+        js_runner: JavaScript test runner ("jest", "mocha", or "vitest").
 
     Returns:
         Shell command string to run the test.
@@ -437,26 +484,62 @@ def _build_pro_test_command(test: str, language: str, uses_prebuilt: bool = Fals
             return f"{activate}go test -v -count=1 -run {shlex.quote(test)} ./... 2>&1"
 
     if language in ("typescript", "javascript", "ts", "js"):
-        # SWE-bench Pro format: "file_path | test description"
-        if " | " in test:
-            parts = test.split(" | ", 1)
-            file_path = parts[0].strip()
-            test_name = parts[1].strip()
-            if test_name and test_name != "test suite":
-                # Run specific test file with test name filter
-                return (
-                    f"{activate}npx jest {shlex.quote(file_path)}"
-                    f" -t {shlex.quote(test_name)} --verbose --no-cache 2>&1"
-                )
-            else:
-                # "test suite" means run the whole file
-                return f"{activate}npx jest {shlex.quote(file_path)} --verbose --no-cache 2>&1"
-        elif "/" in test or test.endswith((".ts", ".js", ".tsx", ".jsx")):
-            # Plain file path
-            return f"{activate}npx jest {shlex.quote(test)} --verbose --no-cache 2>&1"
-        else:
-            # Test name pattern
-            return f"{activate}npx jest -t {shlex.quote(test)} --verbose --no-cache 2>&1"
+        return _build_js_test_command(test, js_runner, activate)
 
     # Fallback: try running as-is
     return f"{activate}{test} 2>&1"
+
+
+def _build_js_test_command(test: str, runner: str, activate: str = "") -> str:
+    """Build a JS/TS test command for the detected runner.
+
+    Args:
+        test: Test identifier in "file | description" format.
+        runner: Test runner name ("jest", "mocha", or "vitest").
+        activate: Optional conda activation prefix.
+
+    Returns:
+        Shell command string.
+    """
+    import shlex
+
+    # Parse "file | description" format
+    file_path = ""
+    test_name = ""
+    if " | " in test:
+        parts = test.split(" | ", 1)
+        file_path = parts[0].strip()
+        test_name = parts[1].strip()
+    elif "/" in test or test.endswith((".ts", ".js", ".tsx", ".jsx")):
+        file_path = test
+    else:
+        test_name = test
+
+    if runner == "mocha":
+        # mocha: npx mocha <file> --grep "pattern"
+        cmd = f"{activate}npx mocha"
+        if file_path:
+            cmd += f" {shlex.quote(file_path)}"
+        if test_name and test_name != "test suite":
+            cmd += f" --grep {shlex.quote(test_name)}"
+        cmd += " --timeout 30000 2>&1"
+        return cmd
+
+    if runner == "vitest":
+        # vitest: npx vitest run <file> -t "pattern"
+        cmd = f"{activate}npx vitest run"
+        if file_path:
+            cmd += f" {shlex.quote(file_path)}"
+        if test_name and test_name != "test suite":
+            cmd += f" -t {shlex.quote(test_name)}"
+        cmd += " 2>&1"
+        return cmd
+
+    # Default: jest
+    cmd = f"{activate}npx jest"
+    if file_path:
+        cmd += f" {shlex.quote(file_path)}"
+    if test_name and test_name != "test suite":
+        cmd += f" -t {shlex.quote(test_name)}"
+    cmd += " --verbose --no-cache 2>&1"
+    return cmd
diff --git a/tests/test_swebench_pro.py b/tests/test_swebench_pro.py
index d7e12ed..afa44c5 100644
--- a/tests/test_swebench_pro.py
+++ b/tests/test_swebench_pro.py
@@ -132,21 +132,44 @@ def test_javascript_pattern(self) -> None:
         cmd = _build_pro_test_command("handles edge case", "javascript")
         assert "npx jest" in cmd
 
-    def test_js_pipe_format(self) -> None:
-        """SWE-bench Pro JS format: 'file.js | test description'."""
-        cmd = _build_pro_test_command("test/database.js | Test database key methods", "js")
+    def test_js_pipe_format_jest(self) -> None:
+        """SWE-bench Pro JS format with jest runner."""
+        cmd = _build_pro_test_command(
+            "test/database.js | Test database key methods", "js", js_runner="jest"
+        )
         assert "npx jest" in cmd
         assert "test/database.js" in cmd
         assert "-t" in cmd
         assert "Test database key methods" in cmd
 
+    def test_js_pipe_format_mocha(self) -> None:
+        """SWE-bench Pro JS format with mocha runner."""
+        cmd = _build_pro_test_command(
+            "test/database.js | Test database key methods", "js", js_runner="mocha"
+        )
+        assert "npx mocha" in cmd
+        assert "test/database.js" in cmd
+        assert "--grep" in cmd
+        assert "Test database key methods" in cmd
+
     def test_ts_test_suite_format(self) -> None:
         """TS 'test suite' format runs the whole file without -t filter."""
-        cmd = _build_pro_test_command("test/tests/LoginFacadeTest.js | test suite", "ts")
+        cmd = _build_pro_test_command(
+            "test/tests/LoginFacadeTest.js | test suite", "ts", js_runner="jest"
+        )
         assert "npx jest" in cmd
         assert "test/tests/LoginFacadeTest.js" in cmd
         assert "-t" not in cmd
 
+    def test_mocha_test_suite_format(self) -> None:
+        """Mocha 'test suite' runs whole file without --grep."""
+        cmd = _build_pro_test_command(
+            "test/tests/LoginFacadeTest.js | test suite", "js", js_runner="mocha"
+        )
+        assert "npx mocha" in cmd
+        assert "test/tests/LoginFacadeTest.js" in cmd
+        assert "--grep" not in cmd
+
     def test_prebuilt_conda_activation(self) -> None:
         cmd = _build_pro_test_command("TestFoo", "go", uses_prebuilt=True)
         assert "conda activate testbed" in cmd

From 135c73f6b89507a9910d1019661e5865789d575d Mon Sep 17 00:00:00 2001
From: Grey Newell <greyshipscode@gmail.com>
Date: Thu, 26 Feb 2026 10:22:58 -0500
Subject: [PATCH 08/14] fix: prune Docker images after each preflight instance
 and improve JS/TS runner detection

Add docker image prune after each preflight instance to prevent disk
exhaustion on CI runners (each SWE-bench Pro image is ~1.5GB).

Enhance JS/TS test runner detection to check package.json scripts.test
and support ospec, ava, and npm test fallback. Previously, unrecognized
projects (like tutanota) defaulted to jest, causing 100% test failures.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 src/mcpbr/benchmark_preflight.py     | 24 ++++++++++
 src/mcpbr/benchmarks/swebench_pro.py | 69 ++++++++++++++++++++++++----
 tests/test_swebench_pro.py           | 29 ++++++++++++
 3 files changed, 112 insertions(+), 10 deletions(-)

diff --git a/src/mcpbr/benchmark_preflight.py b/src/mcpbr/benchmark_preflight.py
index fd11807..1996ce0 100644
--- a/src/mcpbr/benchmark_preflight.py
+++ b/src/mcpbr/benchmark_preflight.py
@@ -7,6 +7,7 @@
 
 import asyncio
 import logging
+import subprocess
 from dataclasses import dataclass, field
 from typing import Any
 
@@ -137,6 +138,27 @@ async def _run_preflight_tests(
     return TestResults(passed=passed, total=len(tests), details=results)
 
 
+async def _prune_docker_images() -> None:
+    """Remove unused Docker images to free disk space.
+
+    Called after each preflight instance to prevent disk exhaustion.
+    Each SWE-bench Pro image is ~1.5GB and each instance uses a unique image,
+    so pruning after cleanup is critical for processing many instances.
+    """
+    try:
+        proc = await asyncio.create_subprocess_exec(
+            "docker",
+            "image",
+            "prune",
+            "-af",
+            stdout=subprocess.DEVNULL,
+            stderr=subprocess.DEVNULL,
+        )
+        await proc.wait()
+    except Exception:
+        logger.debug("Failed to prune Docker images")
+
+
 async def _check_single_instance(
     benchmark: Any,
     task: dict[str, Any],
@@ -283,6 +305,8 @@ async def _check_single_instance(
                 await env.cleanup()
             except Exception:
                 logger.warning(f"Failed to clean up container for {instance_id}")
+        # Prune unused images to free disk space (each image is ~1.5GB)
+        await _prune_docker_images()
 
 
 async def run_benchmark_preflight(
diff --git a/src/mcpbr/benchmarks/swebench_pro.py b/src/mcpbr/benchmarks/swebench_pro.py
index 7e93f27..ac5a447 100644
--- a/src/mcpbr/benchmarks/swebench_pro.py
+++ b/src/mcpbr/benchmarks/swebench_pro.py
@@ -405,35 +405,59 @@ def get_default_sandbox_level(self) -> str | None:
         return None
 
 
+_KNOWN_RUNNERS = ("jest", "mocha", "vitest", "ospec", "ava")
+
+
 async def _detect_js_runner(env: "TaskEnvironment", workdir: str | None = None) -> str:
     """Detect the JavaScript/TypeScript test runner installed in a container.
 
-    Checks for common test runners in order of preference:
-    jest, mocha, vitest. Falls back to "jest" if none detected.
+    Detection strategy:
+    1. Check node_modules/.bin/ for known runner binaries
+    2. Parse package.json scripts.test for runner hints
+    3. Fall back to "npm" (runs npm test) if nothing is detected
 
     Args:
         env: Task environment with exec_command.
         workdir: Working directory inside container.
 
     Returns:
-        Runner name: "jest", "mocha", or "vitest".
+        Runner name: "jest", "mocha", "vitest", "ospec", "ava", or "npm".
     """
     # Check for runner binaries in node_modules
     detect_cmd = (
         "if [ -f node_modules/.bin/jest ]; then echo jest; "
         "elif [ -f node_modules/.bin/mocha ]; then echo mocha; "
         "elif [ -f node_modules/.bin/vitest ]; then echo vitest; "
-        "else echo jest; fi"
+        "elif [ -f node_modules/.bin/ospec ]; then echo ospec; "
+        "elif [ -f node_modules/.bin/ava ]; then echo ava; "
+        "else echo none; fi"
     )
     try:
         exit_code, stdout, _ = await env.exec_command(detect_cmd, timeout=10, workdir=workdir)
         if exit_code == 0 and stdout:
             runner = stdout.strip().split("\n")[-1].strip()
-            if runner in ("jest", "mocha", "vitest"):
+            if runner in _KNOWN_RUNNERS:
                 return runner
     except Exception:
-        logger.debug("Failed to detect JS test runner, defaulting to jest")
-    return "jest"
+        logger.debug("Failed to detect JS test runner from node_modules")
+
+    # Fallback: parse package.json scripts.test for runner hints
+    pkg_cmd = (
+        "node -e \"try{const p=require('./package.json');"
+        "console.log(p.scripts&&p.scripts.test||'')}catch(e){console.log('')}\" 2>/dev/null"
+    )
+    try:
+        exit_code, stdout, _ = await env.exec_command(pkg_cmd, timeout=10, workdir=workdir)
+        if exit_code == 0 and stdout:
+            test_script = stdout.strip().split("\n")[-1].strip().lower()
+            for runner in _KNOWN_RUNNERS:
+                if runner in test_script:
+                    return runner
+    except Exception:
+        logger.debug("Failed to detect JS test runner from package.json")
+
+    # Ultimate fallback: use npm test
+    return "npm"
 
 
 def _build_pro_test_command(
@@ -495,7 +519,7 @@ def _build_js_test_command(test: str, runner: str, activate: str = "") -> str:
 
     Args:
         test: Test identifier in "file | description" format.
-        runner: Test runner name ("jest", "mocha", or "vitest").
+        runner: Test runner name ("jest", "mocha", "vitest", "ospec", "ava", "npm").
         activate: Optional conda activation prefix.
 
     Returns:
@@ -516,7 +540,6 @@ def _build_js_test_command(test: str, runner: str, activate: str = "") -> str:
         test_name = test
 
     if runner == "mocha":
-        # mocha: npx mocha <file> --grep "pattern"
         cmd = f"{activate}npx mocha"
         if file_path:
             cmd += f" {shlex.quote(file_path)}"
@@ -526,7 +549,6 @@ def _build_js_test_command(test: str, runner: str, activate: str = "") -> str:
         return cmd
 
     if runner == "vitest":
-        # vitest: npx vitest run <file> -t "pattern"
         cmd = f"{activate}npx vitest run"
         if file_path:
             cmd += f" {shlex.quote(file_path)}"
@@ -535,6 +557,33 @@ def _build_js_test_command(test: str, runner: str, activate: str = "") -> str:
         cmd += " 2>&1"
         return cmd
 
+    if runner == "ospec":
+        # ospec: run file directly with node (ospec tests are self-executing)
+        if file_path:
+            cmd = f"{activate}node {shlex.quote(file_path)} 2>&1"
+        else:
+            cmd = f"{activate}npx ospec 2>&1"
+        return cmd
+
+    if runner == "ava":
+        cmd = f"{activate}npx ava"
+        if file_path:
+            cmd += f" {shlex.quote(file_path)}"
+        if test_name and test_name != "test suite":
+            cmd += f" -m {shlex.quote(test_name)}"
+        cmd += " 2>&1"
+        return cmd
+
+    if runner == "npm":
+        # Fallback: use npm test, passing file as argument if possible
+        if file_path:
+            cmd = f"{activate}npm test -- {shlex.quote(file_path)} 2>&1"
+        elif test_name:
+            cmd = f"{activate}npm test 2>&1"
+        else:
+            cmd = f"{activate}npm test 2>&1"
+        return cmd
+
     # Default: jest
     cmd = f"{activate}npx jest"
     if file_path:
diff --git a/tests/test_swebench_pro.py b/tests/test_swebench_pro.py
index afa44c5..551d6a4 100644
--- a/tests/test_swebench_pro.py
+++ b/tests/test_swebench_pro.py
@@ -170,6 +170,35 @@ def test_mocha_test_suite_format(self) -> None:
         assert "test/tests/LoginFacadeTest.js" in cmd
         assert "--grep" not in cmd
 
+    def test_ospec_runner_file(self) -> None:
+        """ospec runs test files directly with node."""
+        cmd = _build_pro_test_command(
+            "test/tests/LoginFacadeTest.js | test suite", "ts", js_runner="ospec"
+        )
+        assert "node" in cmd
+        assert "test/tests/LoginFacadeTest.js" in cmd
+
+    def test_ava_runner(self) -> None:
+        """ava runner uses -m for test name matching."""
+        cmd = _build_pro_test_command("test/database.js | Test db methods", "js", js_runner="ava")
+        assert "npx ava" in cmd
+        assert "test/database.js" in cmd
+        assert "-m" in cmd
+        assert "Test db methods" in cmd
+
+    def test_npm_fallback_with_file(self) -> None:
+        """npm fallback passes file via -- to npm test."""
+        cmd = _build_pro_test_command(
+            "test/tests/LoginFacadeTest.js | test suite", "ts", js_runner="npm"
+        )
+        assert "npm test" in cmd
+        assert "test/tests/LoginFacadeTest.js" in cmd
+
+    def test_npm_fallback_no_file(self) -> None:
+        """npm fallback with no file runs plain npm test."""
+        cmd = _build_pro_test_command("should work", "js", js_runner="npm")
+        assert "npm test" in cmd
+
     def test_prebuilt_conda_activation(self) -> None:
         cmd = _build_pro_test_command("TestFoo", "go", uses_prebuilt=True)
         assert "conda activate testbed" in cmd

From 136a331232d8a3cb77d26452d1a3cb589d8468d3 Mon Sep 17 00:00:00 2001
From: Grey Newell <greyshipscode@gmail.com>
Date: Thu, 26 Feb 2026 11:07:10 -0500
Subject: [PATCH 09/14] feat: use official SWE-bench Pro run scripts for test
 execution

Replace custom language-specific test command building (jest/mocha/go test)
with official run_script.sh + parser.py from scaleapi/SWE-bench_Pro-os.
Each of the 11 repos has unique test infrastructure that the official
scripts handle correctly (e.g., Redis for NodeBB, ansible-test for ansible,
custom runners for tutanota). Parser runs locally on the host, avoiding
Python dependency in Go/JS/TS container images.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 CHANGELOG.md                         |  10 +-
 src/mcpbr/benchmark_preflight.py     | 305 +++++++------
 src/mcpbr/benchmarks/swebench_pro.py | 657 +++++++++++++++------------
 tests/test_swebench_pro.py           | 427 ++++++++++++-----
 4 files changed, 857 insertions(+), 542 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 54bf643..7af89fe 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -11,7 +11,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 - **SWE-bench Pro benchmark**: Multi-language benchmark support (Python, Go, TypeScript, JavaScript) with 731 instances across 11 repositories
   - DockerHub-hosted pre-built images via `dockerhub_tag` field
-  - Language-aware test runners (Go `go test`, TS/JS `npx jest`, Python delegates to existing)
+  - Official run scripts from `scaleapi/SWE-bench_Pro-os` for per-repo test infrastructure
   - Filter by language or repository substring with `--filter-category`
 - **Preflight check command**: `mcpbr preflight` validates golden patches pass all tests before evaluation
   - Concurrent validation with configurable parallelism (`--max-concurrent`)
@@ -20,6 +20,14 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - **Case-insensitive test list field access**: `get_test_list_field()` helper supports both SWE-bench (`FAIL_TO_PASS`) and SWE-bench Pro (`fail_to_pass`) conventions
 - **Docker image override support**: `_image_override` task field allows benchmarks to specify custom Docker images
 
+### Changed
+
+- **SWE-bench Pro test execution**: Replaced custom language-specific test command building (jest/mocha/go test) with official `run_script.sh` + `parser.py` from `scaleapi/SWE-bench_Pro-os`
+  - Each of the 11 repos has unique test infrastructure (e.g., Redis for NodeBB, `ansible-test` for ansible, custom runners for tutanota) that the official scripts handle correctly
+  - Parser runs locally on the host, avoiding Python dependency in Go/JS/TS container images
+  - Scripts repo is shallow-cloned and cached in `~/.cache/mcpbr/swebench-pro-scripts/`
+  - Falls back to standard `evaluate_patch()` for Python tasks without official scripts
+
 ## [0.14.0] - 2026-02-13
 
 ### Added
diff --git a/src/mcpbr/benchmark_preflight.py b/src/mcpbr/benchmark_preflight.py
index 1996ce0..a1a9af9 100644
--- a/src/mcpbr/benchmark_preflight.py
+++ b/src/mcpbr/benchmark_preflight.py
@@ -11,9 +11,14 @@
 from dataclasses import dataclass, field
 from typing import Any
 
+from .benchmarks.swebench_pro import (
+    _ensure_run_scripts_repo,
+    _get_instance_scripts,
+    _match_test_results,
+    _run_official_tests,
+)
 from .docker_env import DockerEnvironmentManager, TaskEnvironment
 from .evaluation import (
-    TestResults,
     _apply_test_patch,
     apply_patch,
     get_test_list_field,
@@ -56,88 +61,6 @@ def success_rate(self) -> float:
         return (self.passed / self.total) * 100.0
 
 
-async def _run_preflight_tests(
-    env: TaskEnvironment,
-    tests: list[str],
-    language: str,
-    timeout: int = 300,
-    uses_conda: bool = False,
-    workdir: str | None = None,
-    repo: str | None = None,
-) -> TestResults:
-    """Run tests using the appropriate language-specific runner.
-
-    For Python, delegates to the standard run_tests(). For Go, JavaScript,
-    and TypeScript, builds language-specific commands (go test, npx jest).
-
-    Args:
-        env: Task environment.
-        tests: List of test identifiers.
-        language: Programming language.
-        timeout: Timeout per test in seconds.
-        uses_conda: Whether to activate conda environment.
-        workdir: Working directory inside container.
-        repo: Repository name (used for Python test specs).
-
-    Returns:
-        TestResults with pass/fail counts.
-    """
-    if language == "python":
-        return await run_tests(
-            env,
-            tests,
-            timeout=timeout,
-            uses_prebuilt=uses_conda,
-            workdir=workdir,
-            repo=repo,
-        )
-
-    # Non-Python: use language-specific test commands
-    from .benchmarks.swebench_pro import _build_pro_test_command, _detect_js_runner
-
-    if not tests:
-        return TestResults(passed=0, total=0, details=[])
-
-    # Detect JS/TS test runner once per instance
-    js_runner = "jest"
-    if language in ("typescript", "javascript", "ts", "js"):
-        js_runner = await _detect_js_runner(env, workdir=workdir)
-
-    results = []
-    passed = 0
-
-    for test in tests:
-        test_cmd = _build_pro_test_command(test, language, uses_conda, js_runner=js_runner)
-        try:
-            exit_code, stdout, stderr = await env.exec_command(
-                test_cmd, timeout=timeout, workdir=workdir
-            )
-            test_passed = exit_code == 0
-            if test_passed:
-                passed += 1
-            results.append(
-                {
-                    "test": test,
-                    "passed": test_passed,
-                    "exit_code": exit_code,
-                    "output": stdout[:1000] if stdout else "",
-                    "error": stderr[:1000] if stderr else "",
-                }
-            )
-        except TimeoutError:
-            results.append(
-                {
-                    "test": test,
-                    "passed": False,
-                    "exit_code": -1,
-                    "output": "",
-                    "error": "Test timed out",
-                }
-            )
-
-    return TestResults(passed=passed, total=len(tests), details=results)
-
-
 async def _prune_docker_images() -> None:
     """Remove unused Docker images to free disk space.
 
@@ -167,6 +90,10 @@ async def _check_single_instance(
 ) -> PreflightResult:
     """Validate a single benchmark instance by applying the golden patch.
 
+    For SWE-bench Pro tasks (identified by having a dockerhub_tag), uses
+    official run scripts from scaleapi/SWE-bench_Pro-os. For standard
+    SWE-bench tasks, falls back to the existing test runner logic.
+
     Args:
         benchmark: Benchmark instance with create_environment method.
         task: Task dictionary with patch, test_patch, fail_to_pass, pass_to_pass.
@@ -231,63 +158,13 @@ async def _check_single_instance(
                 workdir=eval_workdir,
             )
 
-        # Parse test lists (handle both uppercase and lowercase field names)
-        fail_to_pass_str = get_test_list_field(task, "fail_to_pass")
-        pass_to_pass_str = get_test_list_field(task, "pass_to_pass")
-        fail_to_pass_tests = parse_test_list(fail_to_pass_str)
-        pass_to_pass_tests = parse_test_list(pass_to_pass_str)
-
-        # SWE-bench Pro images don't use conda, so skip conda activation
-        # even though uses_prebuilt is True (it only means "image was pulled")
-        uses_conda = env.uses_prebuilt and not task.get("dockerhub_tag")
-
-        # Run fail_to_pass tests (all must PASS with golden patch)
-        ftp_results = await _run_preflight_tests(
-            env,
-            fail_to_pass_tests,
-            language=language,
-            timeout=timeout,
-            uses_conda=uses_conda,
-            workdir=eval_workdir,
-            repo=task.get("repo"),
-        )
+        # Try official SWE-bench Pro run scripts first (for all languages)
+        if task.get("dockerhub_tag"):
+            return await _check_with_official_scripts(env, task, instance_id, language, timeout)
 
-        # Run pass_to_pass tests (all must still PASS)
-        ptp_results = await _run_preflight_tests(
-            env,
-            pass_to_pass_tests[:10],
-            language=language,
-            timeout=timeout,
-            uses_conda=uses_conda,
-            workdir=eval_workdir,
-            repo=task.get("repo"),
-        )
-
-        # Determine status
-        all_ftp_pass = ftp_results.passed == ftp_results.total and ftp_results.total > 0
-        all_ptp_pass = ptp_results.passed == ptp_results.total
-
-        if all_ftp_pass and all_ptp_pass:
-            status = "passed"
-            error_msg = None
-        else:
-            status = "failed"
-            parts = []
-            if not all_ftp_pass:
-                parts.append(f"fail_to_pass: {ftp_results.passed}/{ftp_results.total} passed")
-            if not all_ptp_pass:
-                parts.append(f"pass_to_pass: {ptp_results.passed}/{ptp_results.total} passed")
-            error_msg = "; ".join(parts)
-
-        return PreflightResult(
-            instance_id=instance_id,
-            status=status,
-            fail_to_pass_passed=ftp_results.passed,
-            fail_to_pass_total=ftp_results.total,
-            pass_to_pass_passed=ptp_results.passed,
-            pass_to_pass_total=ptp_results.total,
-            error=error_msg,
-            language=language,
+        # Fallback: standard SWE-bench test runner (Python only)
+        return await _check_with_standard_runner(
+            env, task, instance_id, language, eval_workdir, timeout
         )
 
     except Exception as e:
@@ -309,6 +186,156 @@ async def _check_single_instance(
         await _prune_docker_images()
 
 
+async def _check_with_official_scripts(
+    env: TaskEnvironment,
+    task: dict[str, Any],
+    instance_id: str,
+    language: str,
+    timeout: int,
+) -> PreflightResult:
+    """Run preflight using official SWE-bench Pro run scripts.
+
+    Args:
+        env: Task environment.
+        task: Task dictionary.
+        instance_id: Instance ID.
+        language: Programming language.
+        timeout: Test timeout in seconds.
+
+    Returns:
+        PreflightResult.
+    """
+    try:
+        scripts_repo = _ensure_run_scripts_repo()
+        run_script, parser_script = _get_instance_scripts(scripts_repo, instance_id)
+    except (FileNotFoundError, subprocess.CalledProcessError) as e:
+        return PreflightResult(
+            instance_id=instance_id,
+            status="error",
+            error=f"Failed to get run scripts: {e}",
+            language=language,
+        )
+
+    # Run tests using official scripts
+    parsed_results = await _run_official_tests(
+        env, task, run_script, parser_script, timeout=timeout
+    )
+
+    # Parse expected test lists
+    fail_to_pass_str = get_test_list_field(task, "fail_to_pass")
+    pass_to_pass_str = get_test_list_field(task, "pass_to_pass")
+    fail_to_pass_tests = parse_test_list(fail_to_pass_str)
+    pass_to_pass_tests = parse_test_list(pass_to_pass_str)
+
+    # Match parsed results against expectations
+    ftp_results, ptp_results = _match_test_results(
+        parsed_results, fail_to_pass_tests, pass_to_pass_tests
+    )
+
+    # Determine status
+    all_ftp_pass = ftp_results.passed == ftp_results.total and ftp_results.total > 0
+    all_ptp_pass = ptp_results.passed == ptp_results.total
+
+    if all_ftp_pass and all_ptp_pass:
+        status = "passed"
+        error_msg = None
+    else:
+        status = "failed"
+        parts = []
+        if not all_ftp_pass:
+            parts.append(f"fail_to_pass: {ftp_results.passed}/{ftp_results.total} passed")
+        if not all_ptp_pass:
+            parts.append(f"pass_to_pass: {ptp_results.passed}/{ptp_results.total} passed")
+        error_msg = "; ".join(parts)
+
+    return PreflightResult(
+        instance_id=instance_id,
+        status=status,
+        fail_to_pass_passed=ftp_results.passed,
+        fail_to_pass_total=ftp_results.total,
+        pass_to_pass_passed=ptp_results.passed,
+        pass_to_pass_total=ptp_results.total,
+        error=error_msg,
+        language=language,
+    )
+
+
+async def _check_with_standard_runner(
+    env: TaskEnvironment,
+    task: dict[str, Any],
+    instance_id: str,
+    language: str,
+    eval_workdir: str | None,
+    timeout: int,
+) -> PreflightResult:
+    """Run preflight using standard SWE-bench test runner (Python).
+
+    Falls back to this for standard SWE-bench tasks that don't have
+    official run scripts (non-Pro tasks).
+
+    Args:
+        env: Task environment.
+        task: Task dictionary.
+        instance_id: Instance ID.
+        language: Programming language.
+        eval_workdir: Working directory inside container.
+        timeout: Test timeout in seconds.
+
+    Returns:
+        PreflightResult.
+    """
+    fail_to_pass_str = get_test_list_field(task, "fail_to_pass")
+    pass_to_pass_str = get_test_list_field(task, "pass_to_pass")
+    fail_to_pass_tests = parse_test_list(fail_to_pass_str)
+    pass_to_pass_tests = parse_test_list(pass_to_pass_str)
+
+    uses_conda = env.uses_prebuilt and not task.get("dockerhub_tag")
+
+    ftp_results = await run_tests(
+        env,
+        fail_to_pass_tests,
+        timeout=timeout,
+        uses_prebuilt=uses_conda,
+        workdir=eval_workdir,
+        repo=task.get("repo"),
+    )
+
+    ptp_results = await run_tests(
+        env,
+        pass_to_pass_tests[:10],
+        timeout=timeout,
+        uses_prebuilt=uses_conda,
+        workdir=eval_workdir,
+        repo=task.get("repo"),
+    )
+
+    all_ftp_pass = ftp_results.passed == ftp_results.total and ftp_results.total > 0
+    all_ptp_pass = ptp_results.passed == ptp_results.total
+
+    if all_ftp_pass and all_ptp_pass:
+        status = "passed"
+        error_msg = None
+    else:
+        status = "failed"
+        parts = []
+        if not all_ftp_pass:
+            parts.append(f"fail_to_pass: {ftp_results.passed}/{ftp_results.total} passed")
+        if not all_ptp_pass:
+            parts.append(f"pass_to_pass: {ptp_results.passed}/{ptp_results.total} passed")
+        error_msg = "; ".join(parts)
+
+    return PreflightResult(
+        instance_id=instance_id,
+        status=status,
+        fail_to_pass_passed=ftp_results.passed,
+        fail_to_pass_total=ftp_results.total,
+        pass_to_pass_passed=ptp_results.passed,
+        pass_to_pass_total=ptp_results.total,
+        error=error_msg,
+        language=language,
+    )
+
+
 async def run_benchmark_preflight(
     benchmark: Any,
     tasks: list[dict[str, Any]],
diff --git a/src/mcpbr/benchmarks/swebench_pro.py b/src/mcpbr/benchmarks/swebench_pro.py
index ac5a447..406ed66 100644
--- a/src/mcpbr/benchmarks/swebench_pro.py
+++ b/src/mcpbr/benchmarks/swebench_pro.py
@@ -9,9 +9,17 @@
 - Multi-language test runners (Python, Go, TypeScript, JavaScript)
 - Lowercase field names (fail_to_pass instead of FAIL_TO_PASS)
 - Language metadata per task (repo_language field)
+
+Test execution uses official run scripts from scaleapi/SWE-bench_Pro-os,
+which handle per-repo test infrastructure (e.g., Redis for NodeBB,
+ansible-test for ansible, custom runners for tutanota).
 """
 
+import json
 import logging
+import subprocess
+import tempfile
+from pathlib import Path
 from typing import Any
 
 from datasets import load_dataset
@@ -19,10 +27,12 @@
 from ..docker_env import DockerEnvironmentManager, TaskEnvironment
 from ..evaluation import (
     EvaluationResult,
+    TestResults,
+    _apply_test_patch,
+    apply_patch,
     evaluate_patch,
     get_test_list_field,
     parse_test_list,
-    run_tests,
 )
 from .base import BenchmarkTask
 
@@ -40,6 +50,298 @@
 # DockerHub registry prefix for SWE-bench Pro pre-built images
 SWEBENCH_PRO_IMAGE_PREFIX = "jefzda/sweap-images"
 
+# Git URL for the official SWE-bench Pro run scripts repository
+_RUN_SCRIPTS_REPO = "https://github.com/scaleapi/SWE-bench_Pro-os.git"
+
+# Default cache directory for cloned run scripts
+_DEFAULT_CACHE_DIR = Path.home() / ".cache" / "mcpbr" / "swebench-pro-scripts"
+
+
+def _ensure_run_scripts_repo(cache_dir: Path | None = None) -> Path:
+    """Clone or update the official SWE-bench Pro run scripts repository.
+
+    Performs a shallow clone of scaleapi/SWE-bench_Pro-os into the cache
+    directory. If the repo already exists, reuses it.
+
+    Args:
+        cache_dir: Directory to clone into. Defaults to ~/.cache/mcpbr/swebench-pro-scripts/.
+
+    Returns:
+        Path to the cloned repository root.
+    """
+    repo_dir = cache_dir or _DEFAULT_CACHE_DIR
+
+    if (repo_dir / "run_scripts").is_dir():
+        logger.debug("Run scripts repo already cached at %s", repo_dir)
+        return repo_dir
+
+    repo_dir.mkdir(parents=True, exist_ok=True)
+
+    logger.info("Cloning SWE-bench Pro run scripts to %s", repo_dir)
+    subprocess.run(
+        [
+            "git",
+            "clone",
+            "--depth",
+            "1",
+            "--filter=blob:none",
+            "--sparse",
+            _RUN_SCRIPTS_REPO,
+            str(repo_dir),
+        ],
+        check=True,
+        capture_output=True,
+        text=True,
+    )
+    # Sparse checkout only the run_scripts directory
+    subprocess.run(
+        ["git", "sparse-checkout", "set", "run_scripts"],
+        cwd=str(repo_dir),
+        check=True,
+        capture_output=True,
+        text=True,
+    )
+
+    return repo_dir
+
+
+def _get_instance_scripts(repo_path: Path, instance_id: str) -> tuple[str, str]:
+    """Read the run_script.sh and parser.py for a specific instance.
+
+    Args:
+        repo_path: Path to the cloned SWE-bench_Pro-os repository.
+        instance_id: Instance ID matching the directory name in run_scripts/.
+
+    Returns:
+        Tuple of (run_script_content, parser_content).
+
+    Raises:
+        FileNotFoundError: If instance scripts don't exist.
+    """
+    instance_dir = repo_path / "run_scripts" / instance_id
+
+    run_script_path = instance_dir / "run_script.sh"
+    parser_path = instance_dir / "parser.py"
+
+    if not run_script_path.exists():
+        raise FileNotFoundError(
+            f"No run_script.sh found for instance {instance_id} at {run_script_path}"
+        )
+    if not parser_path.exists():
+        raise FileNotFoundError(f"No parser.py found for instance {instance_id} at {parser_path}")
+
+    return run_script_path.read_text(), parser_path.read_text()
+
+
+async def _run_official_tests(
+    env: TaskEnvironment,
+    task: dict[str, Any],
+    run_script: str,
+    parser_script: str,
+    timeout: int = 300,
+) -> TestResults:
+    """Run tests using the official SWE-bench Pro run scripts.
+
+    Copies run_script.sh into the container, executes it with the selected
+    test files, captures stdout/stderr, then runs parser.py locally on
+    the host to parse results.
+
+    Args:
+        env: Task environment with a running container.
+        task: SWE-bench Pro task dictionary (needs selected_test_files_to_run).
+        run_script: Content of run_script.sh.
+        parser_script: Content of parser.py.
+        timeout: Timeout for test execution in seconds.
+
+    Returns:
+        TestResults with parsed pass/fail counts.
+    """
+    eval_workdir = "/app" if env.uses_prebuilt else None
+
+    # Build test files argument from selected_test_files_to_run
+    selected_files_raw = task.get("selected_test_files_to_run", "[]")
+    try:
+        selected_files = (
+            json.loads(selected_files_raw)
+            if isinstance(selected_files_raw, str)
+            else selected_files_raw
+        )
+    except (json.JSONDecodeError, TypeError):
+        selected_files = []
+
+    if not selected_files:
+        logger.warning("No selected_test_files_to_run for %s", task.get("instance_id"))
+        return TestResults(passed=0, total=0, details=[])
+
+    # Write run_script.sh to container
+    await env.write_file("run_script.sh", run_script, workdir=eval_workdir)
+    await env.exec_command("chmod +x /app/run_script.sh", timeout=10, workdir=eval_workdir)
+
+    # Join test files as comma-separated argument
+    test_files_arg = ",".join(str(f) for f in selected_files)
+
+    # Run the official test script
+    try:
+        _exit_code, stdout, stderr = await env.exec_command(
+            f"bash /app/run_script.sh '{test_files_arg}'",
+            timeout=timeout,
+            workdir=eval_workdir,
+        )
+    except TimeoutError:
+        logger.warning("Test execution timed out for %s", task.get("instance_id"))
+        return TestResults(passed=0, total=0, details=[{"error": "Test timed out"}])
+
+    # Run parser.py locally on host to parse the test output
+    return _parse_test_output_locally(
+        parser_script, stdout, stderr, task.get("instance_id", "unknown")
+    )
+
+
+def _parse_test_output_locally(
+    parser_script: str,
+    stdout: str,
+    stderr: str,
+    instance_id: str,
+) -> TestResults:
+    """Run parser.py as a local subprocess to parse test output.
+
+    The parser runs on the host (not in the container) because Go/JS/TS
+    container images may not have Python installed.
+
+    Args:
+        parser_script: Content of parser.py.
+        stdout: Captured stdout from test execution.
+        stderr: Captured stderr from test execution.
+        instance_id: Instance ID for logging.
+
+    Returns:
+        TestResults parsed from the output.
+    """
+    with tempfile.TemporaryDirectory() as tmpdir:
+        tmp = Path(tmpdir)
+        parser_path = tmp / "parser.py"
+        stdout_path = tmp / "stdout.log"
+        stderr_path = tmp / "stderr.log"
+        output_path = tmp / "output.json"
+
+        parser_path.write_text(parser_script)
+        stdout_path.write_text(stdout or "")
+        stderr_path.write_text(stderr or "")
+
+        try:
+            result = subprocess.run(
+                [
+                    "python3",
+                    str(parser_path),
+                    str(stdout_path),
+                    str(stderr_path),
+                    str(output_path),
+                ],
+                capture_output=True,
+                text=True,
+                timeout=30,
+            )
+        except subprocess.TimeoutExpired:
+            logger.warning("Parser timed out for %s", instance_id)
+            return TestResults(passed=0, total=0, details=[{"error": "Parser timed out"}])
+
+        if result.returncode != 0:
+            logger.warning("Parser failed for %s: %s", instance_id, result.stderr[:500])
+            return TestResults(
+                passed=0,
+                total=0,
+                details=[{"error": f"Parser failed: {result.stderr[:500]}"}],
+            )
+
+        if not output_path.exists():
+            logger.warning("Parser produced no output.json for %s", instance_id)
+            return TestResults(passed=0, total=0, details=[])
+
+        try:
+            output_data = json.loads(output_path.read_text())
+        except json.JSONDecodeError:
+            logger.warning("Parser output is not valid JSON for %s", instance_id)
+            return TestResults(passed=0, total=0, details=[])
+
+        tests = output_data.get("tests", [])
+        passed = sum(1 for t in tests if t.get("status") == "PASSED")
+        total = len(tests)
+
+        details = [
+            {
+                "test": t.get("name", "unknown"),
+                "passed": t.get("status") == "PASSED",
+                "status": t.get("status", "UNKNOWN"),
+            }
+            for t in tests
+        ]
+
+        return TestResults(passed=passed, total=total, details=details)
+
+
+def _match_test_results(
+    parsed_results: TestResults,
+    fail_to_pass: list[str],
+    pass_to_pass: list[str],
+) -> tuple[TestResults, TestResults]:
+    """Match parsed test results against expected fail_to_pass and pass_to_pass lists.
+
+    The parser produces test names like "file.js | test description" or
+    "TestFoo" etc. We match these against the expected test lists from
+    the dataset.
+
+    Args:
+        parsed_results: TestResults from _run_official_tests.
+        fail_to_pass: Expected tests that should pass (were failing before fix).
+        pass_to_pass: Expected tests that should still pass (regression check).
+
+    Returns:
+        Tuple of (fail_to_pass_results, pass_to_pass_results).
+    """
+    # Build a lookup of parsed test name → status
+    parsed_status: dict[str, str] = {}
+    for detail in parsed_results.details:
+        name = detail.get("test", "")
+        status = detail.get("status", "UNKNOWN")
+        if name:
+            parsed_status[name] = status
+
+    def _check_tests(expected: list[str]) -> TestResults:
+        if not expected:
+            return TestResults(passed=0, total=0, details=[])
+
+        passed = 0
+        details = []
+        for test_name in expected:
+            # Try exact match first
+            status = parsed_status.get(test_name)
+
+            # If no exact match, try substring matching (parser may add
+            # file prefixes or slightly different formatting)
+            if status is None:
+                for parsed_name, parsed_stat in parsed_status.items():
+                    if test_name in parsed_name or parsed_name in test_name:
+                        status = parsed_stat
+                        break
+
+            test_passed = status == "PASSED"
+            if test_passed:
+                passed += 1
+            details.append(
+                {
+                    "test": test_name,
+                    "passed": test_passed,
+                    "status": status or "NOT_FOUND",
+                }
+            )
+
+        return TestResults(passed=passed, total=len(expected), details=details)
+
+    ftp_results = _check_tests(fail_to_pass)
+    ptp_results = _check_tests(pass_to_pass)
+
+    return ftp_results, ptp_results
+
 
 class SWEBenchProBenchmark:
     """SWE-bench Pro benchmark implementation.
@@ -50,13 +352,26 @@ class SWEBenchProBenchmark:
 
     name = "swe-bench-pro"
 
-    def __init__(self, dataset: str = "ScaleAI/SWE-bench_Pro"):
+    def __init__(
+        self,
+        dataset: str = "ScaleAI/SWE-bench_Pro",
+        scripts_cache_dir: Path | None = None,
+    ):
         """Initialize SWE-bench Pro benchmark.
 
         Args:
             dataset: HuggingFace dataset identifier.
+            scripts_cache_dir: Override cache dir for run scripts repo.
         """
         self.dataset = dataset
+        self._scripts_cache_dir = scripts_cache_dir
+        self._scripts_repo_path: Path | None = None
+
+    def _get_scripts_repo(self) -> Path:
+        """Lazily clone and return the run scripts repo path."""
+        if self._scripts_repo_path is None:
+            self._scripts_repo_path = _ensure_run_scripts_repo(self._scripts_cache_dir)
+        return self._scripts_repo_path
 
     def load_tasks(
         self,
@@ -180,8 +495,9 @@ async def evaluate(
     ) -> dict[str, Any]:
         """Evaluate a patch for SWE-bench Pro task.
 
-        For Python tasks, delegates to the existing evaluate_patch().
-        For Go/TypeScript/JavaScript, uses language-specific test runners.
+        Uses official run scripts from scaleapi/SWE-bench_Pro-os for all
+        languages. Falls back to the standard evaluate_patch() for Python
+        tasks when official scripts are not available.
 
         Args:
             env: Task environment.
@@ -191,37 +507,54 @@ async def evaluate(
         Returns:
             Dictionary with evaluation results including 'resolved' boolean.
         """
-        language = task.get("repo_language", "python").lower()
+        instance_id = task.get("instance_id", "")
+
+        # Try to use official run scripts
+        try:
+            scripts_repo = self._get_scripts_repo()
+            run_script, parser_script = _get_instance_scripts(scripts_repo, instance_id)
+            return await self._evaluate_with_official_scripts(
+                env, task, solution, run_script, parser_script
+            )
+        except FileNotFoundError:
+            logger.info(
+                "No official scripts for %s, falling back to standard evaluation",
+                instance_id,
+            )
 
+        # Fallback for Python tasks without official scripts
+        language = task.get("repo_language", "python").lower()
         if language == "python":
-            # Delegate Python evaluation to existing logic
             eval_result: EvaluationResult = await evaluate_patch(env, task, solution)
             return self._eval_result_to_dict(eval_result)
 
-        # For non-Python languages, use language-specific evaluation
-        return await self._evaluate_multilang(env, task, solution, language)
+        return {
+            "resolved": False,
+            "patch_applied": False,
+            "eval_error": f"No official run scripts found for {instance_id}",
+        }
 
-    async def _evaluate_multilang(
+    async def _evaluate_with_official_scripts(
         self,
         env: TaskEnvironment,
         task: dict[str, Any],
         patch: str,
-        language: str,
+        run_script: str,
+        parser_script: str,
     ) -> dict[str, Any]:
-        """Evaluate a patch using language-specific test runners.
+        """Evaluate using official SWE-bench Pro run scripts.
 
         Args:
             env: Task environment.
             task: SWE-bench Pro task dictionary.
             patch: Unified diff patch to evaluate.
-            language: Programming language (go, typescript, javascript).
+            run_script: Content of run_script.sh.
+            parser_script: Content of parser.py.
 
         Returns:
             Dictionary with evaluation results.
         """
-        from ..evaluation import _apply_test_patch, apply_patch
-
-        # SWE-bench Pro images use /app as their working directory
+        language = task.get("repo_language", "python").lower()
         eval_workdir = "/app" if env.uses_prebuilt else None
 
         applied, error = await apply_patch(env, patch, workdir=eval_workdir)
@@ -241,111 +574,40 @@ async def _evaluate_multilang(
                 workdir=eval_workdir,
             )
 
+        # Run tests using official scripts
+        parsed_results = await _run_official_tests(
+            env, task, run_script, parser_script, timeout=300
+        )
+
+        # Match against expected test lists
         fail_to_pass_str = get_test_list_field(task, "fail_to_pass")
         pass_to_pass_str = get_test_list_field(task, "pass_to_pass")
         fail_to_pass_tests = parse_test_list(fail_to_pass_str)
         pass_to_pass_tests = parse_test_list(pass_to_pass_str)
 
-        fail_to_pass_results = await self._run_lang_tests(
-            env, fail_to_pass_tests, language, workdir=eval_workdir
-        )
-        pass_to_pass_results = await self._run_lang_tests(
-            env, pass_to_pass_tests[:10], language, workdir=eval_workdir
+        ftp_results, ptp_results = _match_test_results(
+            parsed_results, fail_to_pass_tests, pass_to_pass_tests
         )
 
         resolved = (
-            fail_to_pass_results.passed == fail_to_pass_results.total
-            and fail_to_pass_results.total > 0
-            and pass_to_pass_results.passed == pass_to_pass_results.total
+            ftp_results.passed == ftp_results.total
+            and ftp_results.total > 0
+            and ptp_results.passed == ptp_results.total
         )
 
         result: dict[str, Any] = {"resolved": resolved, "patch_applied": True}
-        if fail_to_pass_results:
+        if ftp_results:
             result["fail_to_pass"] = {
-                "passed": fail_to_pass_results.passed,
-                "total": fail_to_pass_results.total,
+                "passed": ftp_results.passed,
+                "total": ftp_results.total,
             }
-        if pass_to_pass_results:
+        if ptp_results:
             result["pass_to_pass"] = {
-                "passed": pass_to_pass_results.passed,
-                "total": pass_to_pass_results.total,
+                "passed": ptp_results.passed,
+                "total": ptp_results.total,
             }
         return result
 
-    async def _run_lang_tests(
-        self,
-        env: TaskEnvironment,
-        tests: list[str],
-        language: str,
-        workdir: str | None = None,
-        timeout: int = 120,
-    ) -> Any:
-        """Run tests using language-specific commands.
-
-        Args:
-            env: Task environment.
-            tests: List of test identifiers.
-            language: Programming language.
-            workdir: Working directory.
-            timeout: Timeout per test in seconds.
-
-        Returns:
-            TestResults instance.
-        """
-        if language == "python":
-            return await run_tests(
-                env, tests, timeout=timeout, uses_prebuilt=env.uses_prebuilt, workdir=workdir
-            )
-
-        # For non-Python, build language-specific commands and run
-        from ..evaluation import TestResults
-
-        if not tests:
-            return TestResults(passed=0, total=0, details=[])
-
-        # Detect JS/TS test runner once (avoids repeated detection per test)
-        js_runner = "jest"
-        if language in ("typescript", "javascript", "ts", "js"):
-            js_runner = await _detect_js_runner(env, workdir=workdir)
-
-        results = []
-        passed = 0
-
-        for test in tests:
-            # SWE-bench Pro images don't use conda — never prepend conda activation
-            # for non-Python languages (uses_prebuilt=False disables it)
-            test_cmd = _build_pro_test_command(
-                test, language, uses_prebuilt=False, js_runner=js_runner
-            )
-            try:
-                exit_code, stdout, stderr = await env.exec_command(
-                    test_cmd, timeout=timeout, workdir=workdir
-                )
-                test_passed = exit_code == 0
-                if test_passed:
-                    passed += 1
-                results.append(
-                    {
-                        "test": test,
-                        "passed": test_passed,
-                        "exit_code": exit_code,
-                        "output": stdout[:1000] if stdout else "",
-                        "error": stderr[:1000] if stderr else "",
-                    }
-                )
-            except TimeoutError:
-                results.append(
-                    {
-                        "test": test,
-                        "passed": False,
-                        "exit_code": -1,
-                        "output": "",
-                        "error": "Test timed out",
-                    }
-                )
-
-        return TestResults(passed=passed, total=len(tests), details=results)
-
     def _eval_result_to_dict(self, eval_result: EvaluationResult) -> dict[str, Any]:
         """Convert EvaluationResult to dictionary format."""
         result: dict[str, Any] = {
@@ -403,192 +665,3 @@ def get_prompt_template(self) -> str:
     def get_default_sandbox_level(self) -> str | None:
         """Get default sandbox level for SWE-bench Pro."""
         return None
-
-
-_KNOWN_RUNNERS = ("jest", "mocha", "vitest", "ospec", "ava")
-
-
-async def _detect_js_runner(env: "TaskEnvironment", workdir: str | None = None) -> str:
-    """Detect the JavaScript/TypeScript test runner installed in a container.
-
-    Detection strategy:
-    1. Check node_modules/.bin/ for known runner binaries
-    2. Parse package.json scripts.test for runner hints
-    3. Fall back to "npm" (runs npm test) if nothing is detected
-
-    Args:
-        env: Task environment with exec_command.
-        workdir: Working directory inside container.
-
-    Returns:
-        Runner name: "jest", "mocha", "vitest", "ospec", "ava", or "npm".
-    """
-    # Check for runner binaries in node_modules
-    detect_cmd = (
-        "if [ -f node_modules/.bin/jest ]; then echo jest; "
-        "elif [ -f node_modules/.bin/mocha ]; then echo mocha; "
-        "elif [ -f node_modules/.bin/vitest ]; then echo vitest; "
-        "elif [ -f node_modules/.bin/ospec ]; then echo ospec; "
-        "elif [ -f node_modules/.bin/ava ]; then echo ava; "
-        "else echo none; fi"
-    )
-    try:
-        exit_code, stdout, _ = await env.exec_command(detect_cmd, timeout=10, workdir=workdir)
-        if exit_code == 0 and stdout:
-            runner = stdout.strip().split("\n")[-1].strip()
-            if runner in _KNOWN_RUNNERS:
-                return runner
-    except Exception:
-        logger.debug("Failed to detect JS test runner from node_modules")
-
-    # Fallback: parse package.json scripts.test for runner hints
-    pkg_cmd = (
-        "node -e \"try{const p=require('./package.json');"
-        "console.log(p.scripts&&p.scripts.test||'')}catch(e){console.log('')}\" 2>/dev/null"
-    )
-    try:
-        exit_code, stdout, _ = await env.exec_command(pkg_cmd, timeout=10, workdir=workdir)
-        if exit_code == 0 and stdout:
-            test_script = stdout.strip().split("\n")[-1].strip().lower()
-            for runner in _KNOWN_RUNNERS:
-                if runner in test_script:
-                    return runner
-    except Exception:
-        logger.debug("Failed to detect JS test runner from package.json")
-
-    # Ultimate fallback: use npm test
-    return "npm"
-
-
-def _build_pro_test_command(
-    test: str,
-    language: str,
-    uses_prebuilt: bool = False,
-    js_runner: str = "jest",
-) -> str:
-    """Build a language-specific test command for SWE-bench Pro.
-
-    Test ID formats by language:
-        Go: "TestFoo", "TestFoo/subtest", "TestFoo/#00"
-        JS/TS: "file.js | test description", "file.ts | suite name"
-        Python: "tests/test_foo.py::TestClass::test_method"
-
-    Args:
-        test: Test identifier.
-        language: Programming language (python, go, typescript, javascript, js, ts).
-        uses_prebuilt: Whether a pre-built image is being used (adds conda activation).
-        js_runner: JavaScript test runner ("jest", "mocha", or "vitest").
-
-    Returns:
-        Shell command string to run the test.
-    """
-    import shlex
-
-    from ..evaluation import _build_test_command, _normalize_test_id
-
-    if language == "python":
-        return _build_test_command(test, uses_prebuilt)
-
-    test = _normalize_test_id(test)
-
-    if uses_prebuilt:
-        activate = "source /opt/miniconda3/etc/profile.d/conda.sh && conda activate testbed && "
-    else:
-        activate = ""
-
-    if language == "go":
-        # Go test IDs are always function names, optionally with subtests via /
-        # e.g., "TestFoo", "TestFoo/subtest", "TestFoo/#00", "TestFoo//api/v1"
-        # Always use -run with the top-level test name and ./... to search all packages
-        if "/" in test:
-            # Extract top-level test name (before first /)
-            top_level = test.split("/", 1)[0]
-            return f"{activate}go test -v -count=1 -run {shlex.quote(top_level)} ./... 2>&1"
-        else:
-            return f"{activate}go test -v -count=1 -run {shlex.quote(test)} ./... 2>&1"
-
-    if language in ("typescript", "javascript", "ts", "js"):
-        return _build_js_test_command(test, js_runner, activate)
-
-    # Fallback: try running as-is
-    return f"{activate}{test} 2>&1"
-
-
-def _build_js_test_command(test: str, runner: str, activate: str = "") -> str:
-    """Build a JS/TS test command for the detected runner.
-
-    Args:
-        test: Test identifier in "file | description" format.
-        runner: Test runner name ("jest", "mocha", "vitest", "ospec", "ava", "npm").
-        activate: Optional conda activation prefix.
-
-    Returns:
-        Shell command string.
-    """
-    import shlex
-
-    # Parse "file | description" format
-    file_path = ""
-    test_name = ""
-    if " | " in test:
-        parts = test.split(" | ", 1)
-        file_path = parts[0].strip()
-        test_name = parts[1].strip()
-    elif "/" in test or test.endswith((".ts", ".js", ".tsx", ".jsx")):
-        file_path = test
-    else:
-        test_name = test
-
-    if runner == "mocha":
-        cmd = f"{activate}npx mocha"
-        if file_path:
-            cmd += f" {shlex.quote(file_path)}"
-        if test_name and test_name != "test suite":
-            cmd += f" --grep {shlex.quote(test_name)}"
-        cmd += " --timeout 30000 2>&1"
-        return cmd
-
-    if runner == "vitest":
-        cmd = f"{activate}npx vitest run"
-        if file_path:
-            cmd += f" {shlex.quote(file_path)}"
-        if test_name and test_name != "test suite":
-            cmd += f" -t {shlex.quote(test_name)}"
-        cmd += " 2>&1"
-        return cmd
-
-    if runner == "ospec":
-        # ospec: run file directly with node (ospec tests are self-executing)
-        if file_path:
-            cmd = f"{activate}node {shlex.quote(file_path)} 2>&1"
-        else:
-            cmd = f"{activate}npx ospec 2>&1"
-        return cmd
-
-    if runner == "ava":
-        cmd = f"{activate}npx ava"
-        if file_path:
-            cmd += f" {shlex.quote(file_path)}"
-        if test_name and test_name != "test suite":
-            cmd += f" -m {shlex.quote(test_name)}"
-        cmd += " 2>&1"
-        return cmd
-
-    if runner == "npm":
-        # Fallback: use npm test, passing file as argument if possible
-        if file_path:
-            cmd = f"{activate}npm test -- {shlex.quote(file_path)} 2>&1"
-        elif test_name:
-            cmd = f"{activate}npm test 2>&1"
-        else:
-            cmd = f"{activate}npm test 2>&1"
-        return cmd
-
-    # Default: jest
-    cmd = f"{activate}npx jest"
-    if file_path:
-        cmd += f" {shlex.quote(file_path)}"
-    if test_name and test_name != "test suite":
-        cmd += f" -t {shlex.quote(test_name)}"
-    cmd += " --verbose --no-cache 2>&1"
-    return cmd
diff --git a/tests/test_swebench_pro.py b/tests/test_swebench_pro.py
index 551d6a4..5c265e1 100644
--- a/tests/test_swebench_pro.py
+++ b/tests/test_swebench_pro.py
@@ -1,13 +1,20 @@
 """Tests for SWE-bench Pro benchmark implementation."""
 
-from unittest.mock import MagicMock, patch
+import json
+from pathlib import Path
+from unittest.mock import AsyncMock, MagicMock, patch
+
+import pytest
 
 from mcpbr.benchmarks.swebench_pro import (
     PRO_LANGUAGES,
     SWEBENCH_PRO_IMAGE_PREFIX,
     SWEBenchProBenchmark,
-    _build_pro_test_command,
+    _get_instance_scripts,
+    _match_test_results,
+    _parse_test_output_locally,
 )
+from mcpbr.evaluation import TestResults
 
 
 class TestSWEBenchProInit:
@@ -25,6 +32,10 @@ def test_name(self) -> None:
         benchmark = SWEBenchProBenchmark()
         assert benchmark.name == "swe-bench-pro"
 
+    def test_custom_scripts_cache_dir(self) -> None:
+        benchmark = SWEBenchProBenchmark(scripts_cache_dir=Path("/tmp/test-scripts"))
+        assert benchmark._scripts_cache_dir == Path("/tmp/test-scripts")
+
 
 class TestSWEBenchProNormalizeTask:
     """Tests for task normalization."""
@@ -90,122 +101,318 @@ def test_normalize_go_task(self) -> None:
         assert bt.metadata["repo_language"] == "go"
 
 
-class TestBuildProTestCommand:
-    """Tests for language-specific test command building."""
-
-    def test_python_delegates(self) -> None:
-        """Python should delegate to existing _build_test_command."""
-        cmd = _build_pro_test_command("tests/test_foo.py::test_bar", "python")
-        assert "pytest" in cmd or "test_foo" in cmd
-
-    def test_go_function_name(self) -> None:
-        cmd = _build_pro_test_command("TestRouteMatching", "go")
-        assert "go test" in cmd
-        assert "-run" in cmd
-        assert "TestRouteMatching" in cmd
-        assert "./..." in cmd
-
-    def test_go_subtest(self) -> None:
-        """Go subtests (TestFoo/#00, TestFoo/subtest) use top-level name with -run."""
-        cmd = _build_pro_test_command("TestParseResourcePath/#00", "go")
-        assert "go test" in cmd
-        assert "-run" in cmd
-        assert "TestParseResourcePath" in cmd
-        assert "./..." in cmd
-
-    def test_typescript_file(self) -> None:
-        cmd = _build_pro_test_command("src/__tests__/parser.test.ts", "typescript")
-        assert "npx jest" in cmd
-        assert "parser.test.ts" in cmd
-
-    def test_typescript_pattern(self) -> None:
-        cmd = _build_pro_test_command("should parse tokens", "typescript")
-        assert "npx jest" in cmd
-        assert "-t" in cmd
-
-    def test_javascript_file(self) -> None:
-        cmd = _build_pro_test_command("test/index.test.js", "javascript")
-        assert "npx jest" in cmd
-        assert "index.test.js" in cmd
-
-    def test_javascript_pattern(self) -> None:
-        cmd = _build_pro_test_command("handles edge case", "javascript")
-        assert "npx jest" in cmd
-
-    def test_js_pipe_format_jest(self) -> None:
-        """SWE-bench Pro JS format with jest runner."""
-        cmd = _build_pro_test_command(
-            "test/database.js | Test database key methods", "js", js_runner="jest"
+class TestGetInstanceScripts:
+    """Tests for _get_instance_scripts."""
+
+    def test_reads_scripts_from_directory(self, tmp_path: Path) -> None:
+        instance_id = "instance_test__repo-abc123"
+        instance_dir = tmp_path / "run_scripts" / instance_id
+        instance_dir.mkdir(parents=True)
+
+        run_script_content = "#!/bin/bash\necho 'test'"
+        parser_content = "import sys\nprint('parser')"
+
+        (instance_dir / "run_script.sh").write_text(run_script_content)
+        (instance_dir / "parser.py").write_text(parser_content)
+
+        run_script, parser = _get_instance_scripts(tmp_path, instance_id)
+        assert run_script == run_script_content
+        assert parser == parser_content
+
+    def test_raises_on_missing_run_script(self, tmp_path: Path) -> None:
+        instance_id = "instance_missing__repo-abc123"
+        instance_dir = tmp_path / "run_scripts" / instance_id
+        instance_dir.mkdir(parents=True)
+        (instance_dir / "parser.py").write_text("parser")
+
+        with pytest.raises(FileNotFoundError, match=r"run_script\.sh"):
+            _get_instance_scripts(tmp_path, instance_id)
+
+    def test_raises_on_missing_parser(self, tmp_path: Path) -> None:
+        instance_id = "instance_missing__repo-abc123"
+        instance_dir = tmp_path / "run_scripts" / instance_id
+        instance_dir.mkdir(parents=True)
+        (instance_dir / "run_script.sh").write_text("script")
+
+        with pytest.raises(FileNotFoundError, match=r"parser\.py"):
+            _get_instance_scripts(tmp_path, instance_id)
+
+    def test_raises_on_missing_directory(self, tmp_path: Path) -> None:
+        (tmp_path / "run_scripts").mkdir(parents=True)
+        with pytest.raises(FileNotFoundError):
+            _get_instance_scripts(tmp_path, "nonexistent_instance")
+
+
+class TestMatchTestResults:
+    """Tests for _match_test_results."""
+
+    def test_all_tests_pass(self) -> None:
+        parsed = TestResults(
+            passed=2,
+            total=2,
+            details=[
+                {"test": "TestFoo", "passed": True, "status": "PASSED"},
+                {"test": "TestBar", "passed": True, "status": "PASSED"},
+            ],
         )
-        assert "npx jest" in cmd
-        assert "test/database.js" in cmd
-        assert "-t" in cmd
-        assert "Test database key methods" in cmd
-
-    def test_js_pipe_format_mocha(self) -> None:
-        """SWE-bench Pro JS format with mocha runner."""
-        cmd = _build_pro_test_command(
-            "test/database.js | Test database key methods", "js", js_runner="mocha"
+        ftp, ptp = _match_test_results(parsed, ["TestFoo"], ["TestBar"])
+        assert ftp.passed == 1
+        assert ftp.total == 1
+        assert ptp.passed == 1
+        assert ptp.total == 1
+
+    def test_fail_to_pass_fails(self) -> None:
+        parsed = TestResults(
+            passed=1,
+            total=2,
+            details=[
+                {"test": "TestFoo", "passed": False, "status": "FAILED"},
+                {"test": "TestBar", "passed": True, "status": "PASSED"},
+            ],
         )
-        assert "npx mocha" in cmd
-        assert "test/database.js" in cmd
-        assert "--grep" in cmd
-        assert "Test database key methods" in cmd
-
-    def test_ts_test_suite_format(self) -> None:
-        """TS 'test suite' format runs the whole file without -t filter."""
-        cmd = _build_pro_test_command(
-            "test/tests/LoginFacadeTest.js | test suite", "ts", js_runner="jest"
+        ftp, ptp = _match_test_results(parsed, ["TestFoo"], ["TestBar"])
+        assert ftp.passed == 0
+        assert ftp.total == 1
+        assert ptp.passed == 1
+        assert ptp.total == 1
+
+    def test_substring_matching(self) -> None:
+        """Tests that weren't found by exact match fall back to substring."""
+        parsed = TestResults(
+            passed=1,
+            total=1,
+            details=[
+                {
+                    "test": "test/database.js | Test database key methods",
+                    "passed": True,
+                    "status": "PASSED",
+                },
+            ],
         )
-        assert "npx jest" in cmd
-        assert "test/tests/LoginFacadeTest.js" in cmd
-        assert "-t" not in cmd
-
-    def test_mocha_test_suite_format(self) -> None:
-        """Mocha 'test suite' runs whole file without --grep."""
-        cmd = _build_pro_test_command(
-            "test/tests/LoginFacadeTest.js | test suite", "js", js_runner="mocha"
+        ftp, _ptp = _match_test_results(
+            parsed,
+            ["test/database.js | Test database key methods"],
+            [],
         )
-        assert "npx mocha" in cmd
-        assert "test/tests/LoginFacadeTest.js" in cmd
-        assert "--grep" not in cmd
-
-    def test_ospec_runner_file(self) -> None:
-        """ospec runs test files directly with node."""
-        cmd = _build_pro_test_command(
-            "test/tests/LoginFacadeTest.js | test suite", "ts", js_runner="ospec"
+        assert ftp.passed == 1
+        assert ftp.total == 1
+
+    def test_empty_lists(self) -> None:
+        parsed = TestResults(passed=0, total=0, details=[])
+        ftp, ptp = _match_test_results(parsed, [], [])
+        assert ftp.total == 0
+        assert ptp.total == 0
+
+    def test_test_not_found(self) -> None:
+        parsed = TestResults(
+            passed=1,
+            total=1,
+            details=[
+                {"test": "TestFoo", "passed": True, "status": "PASSED"},
+            ],
         )
-        assert "node" in cmd
-        assert "test/tests/LoginFacadeTest.js" in cmd
-
-    def test_ava_runner(self) -> None:
-        """ava runner uses -m for test name matching."""
-        cmd = _build_pro_test_command("test/database.js | Test db methods", "js", js_runner="ava")
-        assert "npx ava" in cmd
-        assert "test/database.js" in cmd
-        assert "-m" in cmd
-        assert "Test db methods" in cmd
-
-    def test_npm_fallback_with_file(self) -> None:
-        """npm fallback passes file via -- to npm test."""
-        cmd = _build_pro_test_command(
-            "test/tests/LoginFacadeTest.js | test suite", "ts", js_runner="npm"
+        ftp, _ptp = _match_test_results(parsed, ["TestMissing"], [])
+        assert ftp.passed == 0
+        assert ftp.total == 1
+        assert ftp.details[0]["status"] == "NOT_FOUND"
+
+    def test_multiple_fail_to_pass(self) -> None:
+        parsed = TestResults(
+            passed=2,
+            total=3,
+            details=[
+                {"test": "TestA", "passed": True, "status": "PASSED"},
+                {"test": "TestB", "passed": True, "status": "PASSED"},
+                {"test": "TestC", "passed": False, "status": "FAILED"},
+            ],
         )
-        assert "npm test" in cmd
-        assert "test/tests/LoginFacadeTest.js" in cmd
+        ftp, _ptp = _match_test_results(parsed, ["TestA", "TestB", "TestC"], [])
+        assert ftp.passed == 2
+        assert ftp.total == 3
 
-    def test_npm_fallback_no_file(self) -> None:
-        """npm fallback with no file runs plain npm test."""
-        cmd = _build_pro_test_command("should work", "js", js_runner="npm")
-        assert "npm test" in cmd
 
-    def test_prebuilt_conda_activation(self) -> None:
-        cmd = _build_pro_test_command("TestFoo", "go", uses_prebuilt=True)
-        assert "conda activate testbed" in cmd
+class TestParseTestOutputLocally:
+    """Tests for _parse_test_output_locally."""
+
+    def test_parses_mocha_json(self) -> None:
+        """Test parsing mocha JSON output (NodeBB style)."""
+        mocha_output = json.dumps(
+            {
+                "passes": [
+                    {"file": "test/database.js", "fullTitle": "Test db key methods"},
+                    {"file": "test/meta.js", "fullTitle": "Meta functions"},
+                ],
+                "failures": [
+                    {"file": "test/translator.js", "fullTitle": "Translator shim"},
+                ],
+                "pending": [],
+            }
+        )
 
-    def test_unknown_language_fallback(self) -> None:
-        cmd = _build_pro_test_command("test_something", "rust")
-        assert "test_something" in cmd
+        # Create a minimal parser.py that handles mocha JSON
+        parser_script = """
+import json
+import sys
+import dataclasses
+from enum import Enum
+from pathlib import Path
+from typing import List
+
+class TestStatus(Enum):
+    PASSED = 1
+    FAILED = 2
+    SKIPPED = 3
+    ERROR = 4
+
+@dataclasses.dataclass
+class TestResult:
+    name: str
+    status: TestStatus
+
+def parse_test_output(stdout_content, stderr_content):
+    results = []
+    try:
+        data = json.loads(stdout_content)
+        for t in data.get("passes", []):
+            results.append(TestResult(name=t.get("fullTitle", ""), status=TestStatus.PASSED))
+        for t in data.get("failures", []):
+            results.append(TestResult(name=t.get("fullTitle", ""), status=TestStatus.FAILED))
+    except json.JSONDecodeError:
+        pass
+    return results
+
+def export_to_json(results, output_path):
+    json_results = {
+        "tests": [
+            {"name": r.name, "status": r.status.name} for r in results
+        ]
+    }
+    with open(output_path, "w") as f:
+        json.dump(json_results, f)
+
+def main(stdout_path, stderr_path, output_path):
+    with open(stdout_path) as f:
+        stdout_content = f.read()
+    with open(stderr_path) as f:
+        stderr_content = f.read()
+    results = parse_test_output(stdout_content, stderr_content)
+    export_to_json(results, output_path)
+
+if __name__ == "__main__":
+    main(Path(sys.argv[1]), Path(sys.argv[2]), Path(sys.argv[3]))
+"""
+
+        result = _parse_test_output_locally(parser_script, mocha_output, "", "test-instance")
+        assert result.total == 3
+        assert result.passed == 2
+
+    def test_handles_parser_error(self) -> None:
+        """Test that parser errors are handled gracefully."""
+        bad_parser = "raise ValueError('broken')"
+        result = _parse_test_output_locally(bad_parser, "output", "err", "test")
+        assert result.total == 0
+        assert result.passed == 0
+
+    def test_handles_empty_output(self) -> None:
+        """Test parsing with no test output."""
+        parser_script = """
+import json
+import sys
+from pathlib import Path
+
+def main(stdout_path, stderr_path, output_path):
+    with open(output_path, "w") as f:
+        json.dump({"tests": []}, f)
+
+if __name__ == "__main__":
+    main(Path(sys.argv[1]), Path(sys.argv[2]), Path(sys.argv[3]))
+"""
+        result = _parse_test_output_locally(parser_script, "", "", "test")
+        assert result.total == 0
+        assert result.passed == 0
+
+
+class TestRunOfficialTests:
+    """Tests for _run_official_tests orchestration."""
+
+    @pytest.mark.asyncio
+    async def test_runs_script_in_container(self) -> None:
+        """Test that run_script.sh is written to container and executed."""
+        env = MagicMock()
+        env.uses_prebuilt = True
+        env.write_file = AsyncMock()
+        env.exec_command = AsyncMock(return_value=(0, '{"tests":[]}', ""))
+
+        task = {
+            "instance_id": "test-instance",
+            "selected_test_files_to_run": '["test/foo.js", "test/bar.js"]',
+        }
+
+        # Use a simple parser that outputs empty test list
+        parser = """
+import json
+import sys
+from pathlib import Path
+
+def main(stdout_path, stderr_path, output_path):
+    with open(output_path, "w") as f:
+        json.dump({"tests": []}, f)
+
+if __name__ == "__main__":
+    main(Path(sys.argv[1]), Path(sys.argv[2]), Path(sys.argv[3]))
+"""
+        from mcpbr.benchmarks.swebench_pro import _run_official_tests
+
+        await _run_official_tests(env, task, "#!/bin/bash\necho test", parser)
+
+        # Verify run_script.sh was written to container
+        env.write_file.assert_called_once_with(
+            "run_script.sh", "#!/bin/bash\necho test", workdir="/app"
+        )
+
+        # Verify exec_command was called (chmod + the actual script run)
+        assert env.exec_command.call_count == 2
+
+    @pytest.mark.asyncio
+    async def test_handles_no_selected_files(self) -> None:
+        """Test graceful handling when no test files are specified."""
+        env = MagicMock()
+        env.uses_prebuilt = True
+
+        task = {
+            "instance_id": "test-instance",
+            "selected_test_files_to_run": "[]",
+        }
+
+        from mcpbr.benchmarks.swebench_pro import _run_official_tests
+
+        result = await _run_official_tests(env, task, "#!/bin/bash", "parser")
+        assert result.total == 0
+        assert result.passed == 0
+
+    @pytest.mark.asyncio
+    async def test_handles_timeout(self) -> None:
+        """Test graceful handling when test execution times out."""
+        env = MagicMock()
+        env.uses_prebuilt = True
+        env.write_file = AsyncMock()
+        env.exec_command = AsyncMock(
+            side_effect=[
+                (0, "", ""),  # chmod succeeds
+                TimeoutError("timed out"),  # script times out
+            ]
+        )
+
+        task = {
+            "instance_id": "test-instance",
+            "selected_test_files_to_run": '["test/foo.js"]',
+        }
+
+        from mcpbr.benchmarks.swebench_pro import _run_official_tests
+
+        result = await _run_official_tests(env, task, "#!/bin/bash", "parser")
+        assert result.total == 0
+        assert result.passed == 0
 
 
 class TestSWEBenchProDockerImage:
@@ -422,7 +629,7 @@ class TestSWEBenchProEvalResultToDict:
     """Tests for _eval_result_to_dict helper."""
 
     def test_basic_conversion(self) -> None:
-        from mcpbr.evaluation import EvaluationResult, TestResults
+        from mcpbr.evaluation import EvaluationResult
 
         benchmark = SWEBenchProBenchmark()
         result = EvaluationResult(

From 8e6beec4aa68291bc53c4c4ef2792659deae2cbf Mon Sep 17 00:00:00 2001
From: Grey Newell <greyshipscode@gmail.com>
Date: Thu, 26 Feb 2026 11:44:04 -0500
Subject: [PATCH 10/14] fix: run before_repo_set_cmd to restore test files
 after patch application

The official SWE-bench Pro evaluation harness runs before_repo_set_cmd
(last line) between patch application and test execution. This typically
restores specific test files from the fix commit, e.g.:
  git checkout <commit> -- test/tests/SomeTest.ts

Without this, 2 tutanota instances fail because their test files end up
in a broken state after patch + test_patch application.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 src/mcpbr/benchmark_preflight.py     |  4 +++
 src/mcpbr/benchmarks/swebench_pro.py | 52 ++++++++++++++++++++++++++++
 2 files changed, 56 insertions(+)

diff --git a/src/mcpbr/benchmark_preflight.py b/src/mcpbr/benchmark_preflight.py
index a1a9af9..2675fcc 100644
--- a/src/mcpbr/benchmark_preflight.py
+++ b/src/mcpbr/benchmark_preflight.py
@@ -15,6 +15,7 @@
     _ensure_run_scripts_repo,
     _get_instance_scripts,
     _match_test_results,
+    _run_before_repo_set_cmd,
     _run_official_tests,
 )
 from .docker_env import DockerEnvironmentManager, TaskEnvironment
@@ -148,6 +149,9 @@ async def _check_single_instance(
         if test_patch:
             await _apply_test_patch(env, test_patch, workdir=eval_workdir)
 
+        # Run before_repo_set_cmd (restores specific test files from fix commit)
+        await _run_before_repo_set_cmd(env, task, workdir=eval_workdir)
+
         # Reinstall package in editable mode so patched code is used.
         # SWE-bench Pro images install the package into site-packages;
         # without this step, tests would import the old (unpatched) code.
diff --git a/src/mcpbr/benchmarks/swebench_pro.py b/src/mcpbr/benchmarks/swebench_pro.py
index 406ed66..348b881 100644
--- a/src/mcpbr/benchmarks/swebench_pro.py
+++ b/src/mcpbr/benchmarks/swebench_pro.py
@@ -133,6 +133,53 @@ def _get_instance_scripts(repo_path: Path, instance_id: str) -> tuple[str, str]:
     return run_script_path.read_text(), parser_path.read_text()
 
 
+async def _run_before_repo_set_cmd(
+    env: TaskEnvironment,
+    task: dict[str, Any],
+    workdir: str | None = None,
+) -> None:
+    """Run the before_repo_set_cmd from the dataset after patch application.
+
+    The official SWE-bench Pro evaluation harness runs the last line of
+    before_repo_set_cmd between applying the patch and running tests.
+    This typically restores specific test files from the fix commit, e.g.:
+        git checkout <commit> -- test/tests/SomeTest.ts
+
+    The earlier lines (git reset, git clean, git checkout <base>) are
+    redundant because our apply_patch() already handles that.
+
+    Args:
+        env: Task environment.
+        task: SWE-bench Pro task dictionary.
+        workdir: Working directory inside container.
+    """
+    before_cmd = task.get("before_repo_set_cmd", "")
+    if not before_cmd or not before_cmd.strip():
+        return
+
+    # The official harness only uses the last line
+    last_line = before_cmd.strip().split("\n")[-1].strip()
+    if not last_line:
+        return
+
+    # Skip if it's just a git reset/clean/checkout <hash> (already done by apply_patch)
+    # We only care about "git checkout <hash> -- <file>" which restores specific files
+    if last_line.startswith("git checkout") and " -- " not in last_line:
+        return
+    if last_line.startswith(("git reset", "git clean")):
+        return
+
+    logger.debug("Running before_repo_set_cmd for %s: %s", task.get("instance_id"), last_line)
+    try:
+        await env.exec_command(last_line, timeout=60, workdir=workdir)
+    except Exception:
+        logger.warning(
+            "before_repo_set_cmd failed for %s: %s",
+            task.get("instance_id"),
+            last_line,
+        )
+
+
 async def _run_official_tests(
     env: TaskEnvironment,
     task: dict[str, Any],
@@ -565,6 +612,11 @@ async def _evaluate_with_official_scripts(
         if test_patch:
             await _apply_test_patch(env, test_patch, workdir=eval_workdir)
 
+        # Run before_repo_set_cmd (last line only, matching official harness).
+        # This typically restores specific test files from the fix commit,
+        # e.g., "git checkout <commit> -- test/file.ts"
+        await _run_before_repo_set_cmd(env, task, workdir=eval_workdir)
+
         # Reinstall package so patched code is active (SWE-bench Pro images
         # install into site-packages, not editable mode)
         if eval_workdir and language == "python":

From e05964c99b377a51a2542d4bc6da7a196f727f62 Mon Sep 17 00:00:00 2001
From: Grey Newell <greyshipscode@gmail.com>
Date: Thu, 26 Feb 2026 15:06:48 -0500
Subject: [PATCH 11/14] feat: add --shard-index/--shard-total to preflight CLI

---
 src/mcpbr/cli.py | 28 ++++++++++++++++++++++++++++
 1 file changed, 28 insertions(+)

diff --git a/src/mcpbr/cli.py b/src/mcpbr/cli.py
index a41cdc9..17b33f0 100644
--- a/src/mcpbr/cli.py
+++ b/src/mcpbr/cli.py
@@ -1588,6 +1588,18 @@ def benchmarks() -> None:
     default=300,
     help="Timeout per test in seconds (default: 300).",
 )
+@click.option(
+    "--shard-index",
+    type=int,
+    default=None,
+    help="Shard index for parallel runs (0-based).",
+)
+@click.option(
+    "--shard-total",
+    type=int,
+    default=None,
+    help="Total number of shards for parallel runs.",
+)
 def preflight(
     config_path: str | None,
     benchmark_name: str,
@@ -1597,6 +1609,8 @@ def preflight(
     fail_fast: bool,
     filter_category: tuple[str, ...],
     timeout: int,
+    shard_index: int | None,
+    shard_total: int | None,
 ) -> None:
     """Validate golden patches pass all tests before evaluation.
 
@@ -1635,6 +1649,20 @@ def preflight(
         console.print("[yellow]No tasks found matching the criteria.[/yellow]")
         return
 
+    # Apply sharding if requested
+    if shard_index is not None and shard_total is not None:
+        if shard_index < 0 or shard_index >= shard_total:
+            console.print(
+                f"[red]Invalid shard-index {shard_index} for shard-total {shard_total}[/red]"
+            )
+            sys.exit(1)
+        tasks = tasks[shard_index::shard_total]
+        console.print(f"Shard {shard_index + 1}/{shard_total}: {len(tasks)} instance(s)\n")
+
+    if not tasks:
+        console.print("[yellow]No tasks in this shard.[/yellow]")
+        return
+
     console.print(f"Validating {len(tasks)} instance(s)...\n")
 
     # Create Docker manager

From 9bd511359839eea5a00d4f8aff0572b487e46545 Mon Sep 17 00:00:00 2001
From: Grey Newell <greyshipscode@gmail.com>
Date: Thu, 26 Feb 2026 16:48:56 -0500
Subject: [PATCH 12/14] fix: use docker system prune for aggressive disk
 cleanup

Protonmail/webclients images are ~4.8GB compressed. docker image prune
alone doesn't clear build cache and volumes. docker system prune -af
--volumes reclaims all reclaimable space between instances.
---
 src/mcpbr/benchmark_preflight.py | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/src/mcpbr/benchmark_preflight.py b/src/mcpbr/benchmark_preflight.py
index 2675fcc..df65e22 100644
--- a/src/mcpbr/benchmark_preflight.py
+++ b/src/mcpbr/benchmark_preflight.py
@@ -62,25 +62,27 @@ def success_rate(self) -> float:
         return (self.passed / self.total) * 100.0
 
 
-async def _prune_docker_images() -> None:
-    """Remove unused Docker images to free disk space.
+async def _prune_docker() -> None:
+    """Remove unused Docker images, build cache, and volumes to free disk space.
 
     Called after each preflight instance to prevent disk exhaustion.
-    Each SWE-bench Pro image is ~1.5GB and each instance uses a unique image,
-    so pruning after cleanup is critical for processing many instances.
+    Each SWE-bench Pro image is ~1-5GB (protonmail is ~4.8GB compressed)
+    and each instance uses a unique image, so aggressive pruning after
+    cleanup is critical for processing many instances.
     """
     try:
         proc = await asyncio.create_subprocess_exec(
             "docker",
-            "image",
+            "system",
             "prune",
             "-af",
+            "--volumes",
             stdout=subprocess.DEVNULL,
             stderr=subprocess.DEVNULL,
         )
         await proc.wait()
     except Exception:
-        logger.debug("Failed to prune Docker images")
+        logger.debug("Failed to prune Docker system")
 
 
 async def _check_single_instance(
@@ -187,7 +189,7 @@ async def _check_single_instance(
             except Exception:
                 logger.warning(f"Failed to clean up container for {instance_id}")
         # Prune unused images to free disk space (each image is ~1.5GB)
-        await _prune_docker_images()
+        await _prune_docker()
 
 
 async def _check_with_official_scripts(

From ef15f255c69e041e686c5e3a4d0ac76ea88ef101 Mon Sep 17 00:00:00 2001
From: Grey Newell <greyshipscode@gmail.com>
Date: Sun, 15 Mar 2026 11:27:28 -0400
Subject: [PATCH 13/14] docs: remove dead links to greynewell.com blog post

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 README.md            | 3 +--
 site/pages/about.njk | 4 ++--
 2 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/README.md b/README.md
index 3a71ecd..99353ee 100644
--- a/README.md
+++ b/README.md
@@ -55,7 +55,6 @@ mcpbr runs controlled experiments: same model, same tasks, same environment - th
 - **Real GitHub issues** from SWE-bench (not toy examples)
 - **Reproducible results** via Docker containers with pinned dependencies
 
-> Read the full origin story: **[Why I Built mcpbr](https://greynewell.com/blog/why-i-built-mcpbr/)** — the problem, the approach, and where the project is headed.
 
 ## Research Paper
 
@@ -1536,4 +1535,4 @@ MIT - see [LICENSE](LICENSE) for details.
 
 ---
 
-Built by [Grey Newell](https://greynewell.com) | [Why I Built mcpbr](https://greynewell.com/blog/why-i-built-mcpbr/) | [About](https://mcpbr.org/about/)
+Built by [Grey Newell](https://greynewell.com) | [About](https://mcpbr.org/about/)
diff --git a/site/pages/about.njk b/site/pages/about.njk
index eb7df5c..5f039a0 100644
--- a/site/pages/about.njk
+++ b/site/pages/about.njk
@@ -49,7 +49,7 @@ headExtra: |
 <p>mcpbr was created by <a href="https://greynewell.com">Grey Newell</a> after identifying a critical gap in the MCP ecosystem: <strong>no tool existed to measure whether an MCP server actually made an AI agent better at its job.</strong></p>
 <p>Existing coding benchmarks like SWE-bench measured raw language model capabilities. MCP server developers relied on anecdotal evidence and demo videos. There was no way to answer the fundamental question: <em>does adding this MCP server to an agent improve its performance on real tasks?</em></p>
 <p>mcpbr was built to answer that question with hard data.</p>
-<blockquote><p>"No available tool allowed users to easily measure the performance improvement of introducing their MCP server to an agent."</p><p>&mdash; <a href="https://greynewell.com/blog/why-i-built-mcpbr/">Grey Newell, "Why I Built mcpbr"</a></p></blockquote>
+<blockquote><p>"No available tool allowed users to easily measure the performance improvement of introducing their MCP server to an agent."</p><p>&mdash; Grey Newell</p></blockquote>
 
 <h2>The Problem mcpbr Solves</h2>
 <p>Before mcpbr, MCP server evaluation looked like this:</p>
@@ -84,7 +84,7 @@ headExtra: |
   <tr><td>GitHub</td><td><a href="https://github.com/supermodeltools/mcpbr">github.com/greynewell/mcpbr</a></td></tr>
   <tr><td>PyPI</td><td><a href="https://pypi.org/project/mcpbr/">pypi.org/project/mcpbr</a></td></tr>
   <tr><td>npm</td><td><a href="https://www.npmjs.com/package/mcpbr-cli">npmjs.com/package/mcpbr-cli</a></td></tr>
-  <tr><td>Blog Post</td><td><a href="https://greynewell.com/blog/why-i-built-mcpbr/">Why I Built mcpbr</a></td></tr>
+
   <tr><td>Creator</td><td><a href="https://greynewell.com">greynewell.com</a></td></tr>
   <tr><td>SchemaFlux</td><td><a href="https://schemaflux.dev">schemaflux.dev</a></td></tr>
   <tr><td>License</td><td><a href="https://github.com/supermodeltools/mcpbr/blob/main/LICENSE">MIT</a></td></tr>

From 8b8be831ed19d2fedad5b8a3ff32e181d0ba3510 Mon Sep 17 00:00:00 2001
From: Grey Newell <greyshipscode@gmail.com>
Date: Sun, 15 Mar 2026 11:33:44 -0400
Subject: [PATCH 14/14] docs: add blog section with SWE-bench posts

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 README.md | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/README.md b/README.md
index 99353ee..0bde867 100644
--- a/README.md
+++ b/README.md
@@ -55,6 +55,10 @@ mcpbr runs controlled experiments: same model, same tasks, same environment - th
 - **Real GitHub issues** from SWE-bench (not toy examples)
 - **Reproducible results** via Docker containers with pinned dependencies
 
+## Blog
+
+- [SWE-bench Verified Is Broken: 5 Things I Found in the Source Code](https://greynewell.com/blog/swe-bench-verified-broken-5-things-source-code/)
+- [SWE-bench Tests Run 6x Faster on ARM64 with Native Containers](https://greynewell.com/blog/swe-bench-arm64-native-containers-6x-faster/)
 
 ## Research Paper