supermodeltools · greynewell · Mar 24, 2026 · Mar 24, 2026 · Mar 24, 2026 · Mar 24, 2026
diff --git a/Dockerfile b/Dockerfile
@@ -1,5 +1,5 @@
 # Base image for SWE-bench task environments
-FROM python:3.11-slim
+FROM mcr.microsoft.com/mirror/docker/library/python:3.11-slim
 
 # Install comprehensive system dependencies for SWE-bench tasks
 RUN apt-get update && apt-get install -y --no-install-recommends \

diff --git a/pyproject.toml b/pyproject.toml
@@ -135,6 +135,8 @@ ignore = [
 "tests/**/*.py" = ["S", "T20"]
 "infrastructure/**/*.py" = ["S603", "S607"]
 "src/mcpbr/infrastructure/**/*.py" = ["S603", "S607", "S108"]
+"src/mcpbr/benchmarks/deadcode.py" = ["S608"]
+"src/mcpbr/benchmarks/supermodel/benchmark.py" = ["S608"]
 "scripts/**/*.py" = ["T20", "S"]
 
 [tool.pytest.ini_options]

diff --git a/src/mcpbr/__init__.py b/src/mcpbr/__init__.py
@@ -3,7 +3,7 @@
 A benchmark runner for evaluating MCP servers against SWE-bench tasks.
 """
 
-__version__ = "0.14.0"
+__version__ = "0.14.1"
 
 from .sdk import (
     BenchmarkResult,

diff --git a/src/mcpbr/benchmarks/__init__.py b/src/mcpbr/benchmarks/__init__.py
@@ -15,6 +15,7 @@
 from .codereval import CoderEvalBenchmark
 from .custom import CustomBenchmark
 from .cybergym import CyberGymBenchmark
+from .deadcode import DeadCodeBenchmark
 from .gaia import GAIABenchmark
 from .gsm8k import GSM8KBenchmark
 from .hellaswag import HellaSwagBenchmark
@@ -28,6 +29,7 @@
 from .mlagentbench import MLAgentBenchBenchmark
 from .mmmu import MMMUBenchmark
 from .repoqa import RepoQABenchmark
+from .supermodel.benchmark import SupermodelBenchmark
 from .swebench import SWEBenchmark
 from .terminalbench import TerminalBenchBenchmark
 from .toolbench import ToolBenchBenchmark
@@ -50,6 +52,7 @@
     "CoderEvalBenchmark",
     "CustomBenchmark",
     "CyberGymBenchmark",
+    "DeadCodeBenchmark",
     "GAIABenchmark",
     "GSM8KBenchmark",
     "HellaSwagBenchmark",
@@ -64,6 +67,7 @@
     "MMMUBenchmark",
     "RepoQABenchmark",
     "SWEBenchmark",
+    "SupermodelBenchmark",
     "TerminalBenchBenchmark",
     "ToolBenchBenchmark",
     "TruthfulQABenchmark",
@@ -106,6 +110,8 @@
     "mmmu": MMMUBenchmark,
     "longbench": LongBenchBenchmark,
     "adversarial": AdversarialBenchmark,
+    "dead-code": DeadCodeBenchmark,  # type: ignore[dict-item]
+    "supermodel": SupermodelBenchmark,  # type: ignore[dict-item]
 }
 
 

diff --git a/src/mcpbr/benchmarks/_bench_utils.py b/src/mcpbr/benchmarks/_bench_utils.py
@@ -0,0 +1,122 @@
+"""Shared utilities for benchmark implementations."""
+
+import json
+import logging
+import subprocess
+from pathlib import Path
+from typing import Any
+
+logger = logging.getLogger("mcpbr.benchmarks")
+
+
+def extract_findings_from_text(text: str, findings_key: str = "dead_code") -> list[dict[str, Any]]:
+    """Extract the findings array that follows a JSON key in free-form text.
+
+    Each occurrence of the quoted key (e.g. "dead_code") is tried in order: the
+    first "[" after it is paired with its closing "]" by string-aware bracket
+    matching and that span is parsed as JSON. An occurrence whose array is
+    unterminated or invalid (e.g. the key quoted in prose) is skipped.
+
+    Args:
+        text: Raw text that may contain a JSON object with the findings key.
+        findings_key: The JSON key whose array value to extract.
+
+    Returns:
+        The first array that parses (elements unvalidated), or [] if none does.
+    """
+    marker = f'"{findings_key}"'
+    start = text.find(marker)
+    while start != -1:
+        arr_start = text.find("[", start + len(marker))
+        if arr_start == -1:
+            break  # no "[" after this occurrence means none after later ones
+        arr_end = _matching_bracket(text, arr_start)
+        if arr_end != -1:
+            try:
+                return json.loads(text[arr_start : arr_end + 1])
+            except ValueError:  # json.JSONDecodeError is a ValueError subclass
+                pass  # not valid JSON here; fall through to the next occurrence
+        start = text.find(marker, start + len(marker))
+    return []
+
+
+def _matching_bracket(text: str, open_idx: int) -> int:
+    """Return the index of the "]" closing the "[" at open_idx, or -1 if none."""
+    depth = 0
+    in_string = False
+    escape_next = False
+    for i, c in enumerate(text[open_idx:], open_idx):
+        if escape_next:
+            escape_next = False
+        elif c == "\\":
+            escape_next = in_string  # a backslash only escapes inside a string
+        elif c == '"':
+            in_string = not in_string
+        elif in_string:
+            continue  # brackets inside string literals do not nest
+        elif c == "[":
+            depth += 1
+        elif c == "]":
+            depth -= 1
+            if depth == 0:
+                return i
+    return -1
+
+
+def init_git_workdir(host_workdir: str, timeout: int = 30) -> None:
+    """Initialize a git repo in a workdir so the harness can track modifications.
+
+    Runs ``git init``, sets user.email / user.name with ``git config``, stages
+    everything with ``git add -A`` and creates an "Initial" commit, in that
+    order. Every step is best-effort: a non-zero exit is logged as a warning
+    and the remaining steps still run.
+
+    Args:
+        host_workdir: Path to the working directory.
+        timeout: Timeout in seconds for each git command.
+
+    Raises:
+        subprocess.TimeoutExpired: If a git command runs longer than timeout.
+    """
+    # Order matters: identity must be configured before the commit is created.
+    commands = (
+        ["git", "init"],
+        ["git", "config", "user.email", "mcpbr@test.com"],
+        ["git", "config", "user.name", "MCPBR"],
+        ["git", "add", "-A"],
+        ["git", "commit", "-m", "Initial"],
+    )
+    for cmd in commands:
+        result = subprocess.run(
+            cmd, cwd=host_workdir, capture_output=True, check=False, timeout=timeout
+        )
+        if result.returncode == 0:
+            continue
+        # check=False keeps this best-effort, but report the failure rather than
+        # discarding it so a workdir that git could not set up is diagnosable.
+        logger.warning(
+            "git step %r failed in %s (exit code %d): %s",
+            " ".join(cmd),
+            host_workdir,
+            result.returncode,
+            result.stderr.decode(errors="replace").strip(),
+        )
+
+
+def safe_write_file(host_workdir: str, file_path: str, content: str) -> None:
+    """Write a UTF-8 text file within host_workdir, refusing paths that escape it.
+
+    Args:
+        host_workdir: Root directory that all writes must stay within.
+        file_path: Path of the file to write, interpreted relative to the root.
+        content: File content.
+
+    Raises:
+        ValueError: If the resolved path is outside host_workdir.
+    """
+    root = Path(host_workdir).resolve()
+    full_path = (root / file_path).resolve()  # collapses ".." and follows symlinks
+    if not full_path.is_relative_to(root):
+        raise ValueError(f"Path traversal detected: {file_path!r} escapes {host_workdir!r}")
+    full_path.parent.mkdir(parents=True, exist_ok=True)
+    full_path.write_text(content, encoding="utf-8")  # don't depend on the locale encoding
diff --git a/src/mcpbr/benchmarks/codegraph.py b/src/mcpbr/benchmarks/codegraph.py
@@ -15,7 +15,7 @@
 import json
 import logging
 import re
-from datetime import datetime, timezone
+from datetime import UTC, datetime
 from typing import Any
 
 from datasets import load_dataset
@@ -266,7 +266,7 @@ async def _setup_environment(self, env: TaskEnvironment, task: dict[str, Any]) -
             "version": 1,
             "repoName": cache_name,
             "commitHash": None,
-            "savedAt": datetime.now(timezone.utc).isoformat(),
+            "savedAt": datetime.now(UTC).isoformat(),
             "raw": result,
         }
 
@@ -382,7 +382,9 @@ def _count_steps(self, text: str) -> int:
             return 1
 
         # Count tool call patterns in the output
-        tool_calls = len(re.findall(r"(?:tool_use|tool_call|<tool>|Tool:|Calling)", text, re.IGNORECASE))
+        tool_calls = len(
+            re.findall(r"(?:tool_use|tool_call|<tool>|Tool:|Calling)", text, re.IGNORECASE)
+        )
         return max(tool_calls, 1)
 
     def get_prebuilt_image(self, task: dict[str, Any]) -> str | None: