Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
57 changes: 19 additions & 38 deletions src/mcpbr/benchmarks/supermodel/benchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,9 @@ class SupermodelBenchmark:

name = "supermodel"
evaluate_without_patch = True # Uses REPORT.json, not git diff
suppress_mcp_suffix = (
True # enhanced_prompt_v2 provides its own guidance; generic MCP suffix conflicts
)

def __init__(
self,
Expand Down Expand Up @@ -475,50 +478,18 @@ async def create_environment(
{k: v for k, v in ep.items() if k in ep_keep} for ep in entry_points[:200]
]

# Chunk candidates into files of max 200 each (~150 chars/entry
# with reason+confidence = ~30K chars = ~7.5K tokens per chunk).
# Must stay under 10K token read limit.
max_per_file = 200
total = len(slimmed)

base_name = self._endpoint.analysis_filename.replace(".json", "")
chunk_refs = []
for i in range(0, max(total, 1), max_per_file):
chunk_num = i // max_per_file + 1
chunk = slimmed[i : i + max_per_file]
if not chunk:
break
chunk_name = f"{base_name}_chunk_{chunk_num:03d}.json"
chunk_path = Path(host_workdir) / chunk_name

# Per-chunk metadata
chunk_reasons = Counter(c.get("reason", "") for c in chunk)
chunk_data = {
"chunk": chunk_num,
"candidateCount": len(chunk),
"reasonBreakdown": dict(chunk_reasons.most_common()),
"deadCodeCandidates": chunk,
}
chunk_path.write_text(json.dumps(chunk_data, separators=(",", ":")))
chunk_refs.append(
{
"file": chunk_name,
"candidateCount": len(chunk),
}
)

# Write the index file (what the agent reads first)
index_data = {
# Write all candidates directly into a single analysis file.
analysis_data = {
"metadataSummary": metadata_summary,
"chunkFiles": chunk_refs,
"deadCodeCandidates": slimmed,
"entryPoints": slim_entry_points,
}
index_path = Path(host_workdir) / self._endpoint.analysis_filename
index_path.write_text(json.dumps(index_data, indent=2))
index_path.write_text(json.dumps(analysis_data, indent=2))

logger.info(
f"Placed analysis for {instance_id}: {total} candidates "
f"in {len(chunk_refs)} chunks, {len(slim_entry_points)} entry points "
f"Placed analysis for {instance_id}: {len(slimmed)} candidates, "
f"{len(slim_entry_points)} entry points "
f"(filtered: {type_filtered} types, {ep_filtered} entry points)"
)
except Exception as e:
Expand All @@ -528,6 +499,16 @@ async def create_environment(
file=sys.stderr,
flush=True,
)
# Write an empty analysis file so the agent can still complete (with 0 results).
# Without this, enhanced_prompt_v2 fails immediately and the task shows as ERROR
# rather than FAIL with 0% recall, which obscures whether the code path is working.
empty_analysis = {
"metadataSummary": {"totalCandidates": 0, "includedCandidates": 0},
"deadCodeCandidates": [],
"entryPoints": [],
}
index_path = Path(host_workdir) / self._endpoint.analysis_filename
index_path.write_text(json.dumps(empty_analysis, indent=2))

# Start Docker container
container_name = f"mcpbr-{docker_manager._session_id}-{instance_id}"
Expand Down
56 changes: 17 additions & 39 deletions src/mcpbr/benchmarks/supermodel/endpoints/dead_code.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,79 +121,57 @@ def enhanced_prompt(self) -> str:
@property
def enhanced_prompt_v2(self) -> str:
return """You are an expert software architect. A static analyzer has pre-computed dead code
candidates for this codebase. Your job is to FILTER them using the metadata provided.
candidates for this codebase. Your job is to run a filter script and produce REPORT.json.

STEP 1: Read `supermodel_dead_code_analysis.json`. It contains:
The file `supermodel_dead_code_analysis.json` in your working directory contains:
- `metadataSummary`: totalCandidates, rootFilesCount, reasonBreakdown, confidenceBreakdown
- `chunkFiles`: list of chunk files with candidate details
- `deadCodeCandidates`: all candidates (may be large — do NOT read the whole file manually)
- `entryPoints`: symbols confirmed alive — any candidate matching an entry point is a false positive
Comment on lines +126 to 129
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🔴 Critical

Prompt/schema mismatch will zero out results on normal runs.

This script reads only analysis["deadCodeCandidates"], but the generated supermodel_dead_code_analysis.json index is chunked (chunkFiles) in src/mcpbr/benchmarks/supermodel/benchmark.py (Lines 514-518). Result: the script writes an empty dead_code list despite real candidates existing in the chunk files.

🔧 Suggested prompt-script update (support both schemas)
-# Filter candidates
-dead_code = []
-for c in analysis.get("deadCodeCandidates", []):
+# Collect candidates (direct list or chunked index format)
+candidates = analysis.get("deadCodeCandidates")
+if candidates is None:
+    candidates = []
+    for ref in analysis.get("chunkFiles", []):
+        chunk_file = ref.get("file")
+        if not chunk_file:
+            continue
+        with open(chunk_file) as cf:
+            chunk_json = json.load(cf)
+        candidates.extend(chunk_json.get("deadCodeCandidates", []))
+
+# Filter candidates
+dead_code = []
+for c in candidates:

Also applies to: 147-148, 171-174

🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@src/mcpbr/benchmarks/supermodel/endpoints/dead_code.py` around lines 126 -
129, The script only reads analysis["deadCodeCandidates"] and misses chunked
schemas; update the load logic in dead_code.py to support both formats by
checking if analysis contains "chunkFiles" and, if so, iterating over and
loading each referenced chunk file to aggregate candidates, otherwise fall back
to analysis["deadCodeCandidates"]; preserve existing filtering against
analysis["entryPoints"] (so entryPoints still remove false positives) and ensure
the final dead_code output is written from the aggregated candidate list.


If there are chunk files, read ALL of them.

STEP 2: Understand the analysis quality.
- Check `rootFilesCount` — if it's much higher than expected (>20), the import
resolver likely failed on many files, meaning "file never imported" candidates
have a high false positive rate for framework-wired code.
- Check `reasonBreakdown` to understand where candidates come from.

STEP 3: Write a script to filter candidates and produce REPORT.json:
STEP 1: Run this Python script with Bash:

```python
import json, glob
import json

with open("supermodel_dead_code_analysis.json") as f:
index = json.load(f)
analysis = json.load(f)

# Load entry points as a whitelist
entry_set = set()
for ep in index.get("entryPoints", []):
entry_set.add((ep.get("file", ""), ep.get("name", "")))
summary = analysis.get("metadataSummary", {})
print(f"Total candidates: {summary.get('totalCandidates', '?')}, included: {summary.get('includedCandidates', '?')}")

# Load all candidates from chunk files
candidates = []
for chunk_ref in index.get("chunkFiles", []):
with open(chunk_ref["file"]) as f:
chunk = json.load(f)
candidates.extend(chunk.get("deadCodeCandidates", []))
# Build entry point whitelist
entry_set = {(ep.get("file", ""), ep.get("name", "")) for ep in analysis.get("entryPoints", [])}

# Filter
# Filter candidates
dead_code = []
for c in candidates:
for c in analysis.get("deadCodeCandidates", []):
key = (c.get("file", ""), c.get("name", ""))
reason = c.get("reason", "")
confidence = c.get("confidence", "")

# Drop entry points
if key in entry_set:
continue

# Drop pure type/interface candidates (high FP rate from structural typing)
if "Type/interface" in reason:
continue

# Keep everything else — the graph already did import/call analysis
dead_code.append({
"file": c.get("file", ""),
"name": c.get("name", ""),
"type": c.get("type", "function"),
"reason": reason
"reason": reason,
})

with open("REPORT.json", "w") as f:
json.dump({"dead_code": dead_code, "analysis_complete": True}, f, indent=2)
print(f"Wrote {len(dead_code)} candidates to REPORT.json")
```

STEP 4: Run the script, then read REPORT.json to confirm it was written correctly.
STEP 2: Verify REPORT.json was written by running: `python3 -c "import json; d=json.load(open('REPORT.json')); print(len(d['dead_code']), 'items written')"`

RULES:
- Do NOT grep the codebase to verify candidates. The static analyzer already
performed call graph and dependency analysis — grep produces false negatives
when symbol names appear in comments, strings, or type-only imports.
- Trust the graph. Filter only using the metadata (reason, confidence, entryPoints).
- Do NOT read supermodel_dead_code_analysis.json manually — it may be very large.
- Do NOT grep or explore the codebase. Trust the pre-computed analysis.
- Run the script exactly as shown. Do not modify it.
- Type should be one of: function, class, method, const, interface, variable.
- When in doubt about a candidate, INCLUDE it — missing real dead code is worse
than a false positive.
"""

def parse_api_response(self, response: dict) -> dict:
Expand Down
72 changes: 69 additions & 3 deletions src/mcpbr/benchmarks/supermodel/git_utils.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,41 @@
"""Git utilities for cloning repos and creating zip archives."""

import asyncio
import fnmatch
import logging
import os
import zipfile

logger = logging.getLogger("mcpbr.supermodel")

# Binary / media assets that are irrelevant for code analysis.
# Applied to every zip regardless of per-task zip_exclude config.
BINARY_EXCLUDE_PATTERNS = [
"*.mp4",
"*.mov",
"*.avi",
"*.webm",
"*.png",
"*.jpg",
"*.jpeg",
"*.gif",
"*.svg",
"*.ico",
"*.webp",
"*.woff",
"*.woff2",
"*.ttf",
"*.eot",
"*.otf",
"*.pdf",
"*.zip",
"*.tar",
"*.gz",
"*.mp3",
"*.wav",
"*.ogg",
]


async def clone_repo_at_commit(repo: str, commit: str, dest: str) -> None:
"""Clone a repo and checkout a specific commit.
Expand Down Expand Up @@ -117,18 +148,25 @@ async def zip_repo(

is_git = os.path.isdir(os.path.join(repo_dir, ".git"))

all_excludes = BINARY_EXCLUDE_PATTERNS + (exclude_patterns or [])

if is_git:
return await _zip_repo_git_archive(repo_dir, output_zip, scope_prefix)
return await _zip_repo_git_archive(repo_dir, output_zip, scope_prefix, all_excludes)
else:
return await _zip_repo_fallback(repo_dir, output_zip, scope_prefix, exclude_patterns)
return await _zip_repo_fallback(repo_dir, output_zip, scope_prefix, all_excludes)


async def _zip_repo_git_archive(
repo_dir: str,
output_zip: str,
scope_prefix: str | None = None,
exclude_patterns: list[str] | None = None,
) -> str:
"""Create zip using ``git archive`` — only includes tracked files."""
"""Create zip using ``git archive`` — only includes tracked files.

If exclude_patterns are provided, rewrites the zip to strip matching entries
(git archive has no native exclude support).
"""
cmd = ["git", "archive", "--format=zip", "-o", output_zip, "HEAD"]
if scope_prefix:
cmd.append(scope_prefix)
Expand All @@ -142,9 +180,37 @@ async def _zip_repo_git_archive(
_, stderr = await asyncio.wait_for(proc.communicate(), timeout=120)
if proc.returncode != 0:
raise RuntimeError(f"git archive failed: {stderr.decode()}")

if exclude_patterns:
_filter_zip_entries(output_zip, exclude_patterns)

return output_zip


def _filter_zip_entries(zip_path: str, patterns: list[str]) -> None:
"""Rewrite zip in-place, removing entries whose basename matches any glob pattern."""
tmp_path = zip_path + ".tmp"
removed = 0
try:
with (
zipfile.ZipFile(zip_path, "r") as zin,
zipfile.ZipFile(tmp_path, "w", compression=zipfile.ZIP_DEFLATED) as zout,
):
for item in zin.infolist():
basename = os.path.basename(item.filename)
if any(fnmatch.fnmatch(basename, pat) for pat in patterns):
removed += 1
continue
Comment on lines +199 to +203
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

Path-based excludes are silently ignored in git-archive filtering.

_filter_zip_entries only matches os.path.basename(item.filename), so excludes like loc/* / lib/* (documented in zip_repo) never match. This makes caller zip_exclude ineffective on the git-archive path.

💡 Proposed fix
 def _filter_zip_entries(zip_path: str, patterns: list[str]) -> None:
     """Rewrite zip in-place, removing entries whose basename matches any glob pattern."""
@@
             for item in zin.infolist():
-                basename = os.path.basename(item.filename)
-                if any(fnmatch.fnmatch(basename, pat) for pat in patterns):
+                rel_path = item.filename.lstrip("./")
+                basename = os.path.basename(rel_path)
+                if any(
+                    fnmatch.fnmatch(rel_path, pat) or fnmatch.fnmatch(basename, pat)
+                    for pat in patterns
+                ):
                     removed += 1
                     continue
                 zout.writestr(item, zin.read(item.filename))
🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@src/mcpbr/benchmarks/supermodel/git_utils.py` around lines 199 - 203,
_filter_zip_entries currently matches only os.path.basename(item.filename), so
path-based patterns like "loc/*" or "lib/*" never match; update the filtering to
match against the full (normalized) entry path instead of basename (e.g., use
the archive entry path from item.filename after normalizing separators) and
apply fnmatch.fnmatch to that full path so zip_exclude/zip_repo path patterns
work; ensure pattern normalization/leading "./" differences are handled
consistently with how zip_exclude patterns are specified.

zout.writestr(item, zin.read(item.filename))
os.replace(tmp_path, zip_path)
except Exception:
if os.path.exists(tmp_path):
os.unlink(tmp_path)
raise
if removed:
logger.info(f"Filtered {removed} binary entries from {os.path.basename(zip_path)}")


async def _zip_repo_fallback(
repo_dir: str,
output_zip: str,
Expand Down
12 changes: 11 additions & 1 deletion src/mcpbr/harness.py
Original file line number Diff line number Diff line change
Expand Up @@ -306,6 +306,10 @@ def _create_mcp_agent(
# Use explicit server config if provided, otherwise fall back to config.mcp_server
server_config = mcp_server_config if mcp_server_config is not None else config.mcp_server

# Allow benchmarks to opt out of MCP_PROMPT_SUFFIX (e.g. when the problem_statement
# already contains its own MCP guidance that conflicts with the generic suffix).
suppress_mcp_suffix = getattr(benchmark, "suppress_mcp_suffix", False)

return create_harness(
config.agent_harness,
model=config.model,
Expand All @@ -317,6 +321,7 @@ def _create_mcp_agent(
mcp_logs_dir=mcp_logs_dir,
thinking_budget=config.thinking_budget,
claude_code_version=config.claude_code_version,
suppress_mcp_suffix=suppress_mcp_suffix,
)


Expand Down Expand Up @@ -628,9 +633,14 @@ async def _run_mcp_evaluation(
instance_id = task.get(
"instance_id", f"{task.get('project', 'unknown')}_{task.get('bug_id', 'unknown')}"
)
# For MCP/enhanced condition, swap in the enhanced problem statement if available.
# create_environment makes a local copy with this swap, but agent.solve needs it too.
task_for_agent = {**task}
if "problem_statement_enhanced" in task_for_agent:
task_for_agent["problem_statement"] = task_for_agent["problem_statement_enhanced"]
agent_result = await asyncio.wait_for(
agent.solve(
task,
task_for_agent,
env.host_workdir,
timeout=config.timeout_seconds,
verbose=verbose,
Expand Down
8 changes: 7 additions & 1 deletion src/mcpbr/harnesses.py
Original file line number Diff line number Diff line change
Expand Up @@ -534,6 +534,7 @@ def __init__(
mcp_logs_dir: Path | None = None,
thinking_budget: int | None = None,
claude_code_version: str | None = None,
suppress_mcp_suffix: bool = False,
) -> None:
"""Initialize Claude Code harness.

Expand All @@ -547,11 +548,14 @@ def __init__(
mcp_logs_dir: Directory for MCP server logs. Default: ~/.mcpbr_state/logs
thinking_budget: Extended thinking token budget. Set to enable thinking mode.
claude_code_version: Pinned Claude Code version (e.g., '2.1.37').
suppress_mcp_suffix: If True, do not append MCP_PROMPT_SUFFIX when an MCP
server is active. Use for benchmarks that provide their own MCP guidance
in the problem statement (e.g. supermodel dead-code benchmark).
"""
self.model = model
self.mcp_server = mcp_server
self.prompt_template = prompt or DEFAULT_PROMPT
if mcp_server and not mcp_server.setup_only:
if mcp_server and not mcp_server.setup_only and not suppress_mcp_suffix:
self.prompt_template += MCP_PROMPT_SUFFIX
self.max_iterations = max_iterations
self.verbosity = verbosity
Expand Down Expand Up @@ -1368,6 +1372,7 @@ def create_harness(
mcp_logs_dir: Path | None = None,
thinking_budget: int | None = None,
claude_code_version: str | None = None,
suppress_mcp_suffix: bool = False,
) -> AgentHarness:
"""Factory function to create an agent harness.

Expand Down Expand Up @@ -1405,6 +1410,7 @@ def create_harness(
mcp_logs_dir=mcp_logs_dir,
thinking_budget=thinking_budget,
claude_code_version=claude_code_version,
suppress_mcp_suffix=suppress_mcp_suffix,
)
return harness

Expand Down
Loading