Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
57 changes: 19 additions & 38 deletions src/mcpbr/benchmarks/supermodel/benchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,9 @@ class SupermodelBenchmark:

name = "supermodel"
evaluate_without_patch = True # Uses REPORT.json, not git diff
suppress_mcp_suffix = (
True # enhanced_prompt_v2 provides its own guidance; generic MCP suffix conflicts
)

def __init__(
self,
Expand Down Expand Up @@ -475,50 +478,18 @@ async def create_environment(
{k: v for k, v in ep.items() if k in ep_keep} for ep in entry_points[:200]
]

# Chunk candidates into files of max 200 each (~150 chars/entry
# with reason+confidence = ~30K chars = ~7.5K tokens per chunk).
# Must stay under 10K token read limit.
max_per_file = 200
total = len(slimmed)

base_name = self._endpoint.analysis_filename.replace(".json", "")
chunk_refs = []
for i in range(0, max(total, 1), max_per_file):
chunk_num = i // max_per_file + 1
chunk = slimmed[i : i + max_per_file]
if not chunk:
break
chunk_name = f"{base_name}_chunk_{chunk_num:03d}.json"
chunk_path = Path(host_workdir) / chunk_name

# Per-chunk metadata
chunk_reasons = Counter(c.get("reason", "") for c in chunk)
chunk_data = {
"chunk": chunk_num,
"candidateCount": len(chunk),
"reasonBreakdown": dict(chunk_reasons.most_common()),
"deadCodeCandidates": chunk,
}
chunk_path.write_text(json.dumps(chunk_data, separators=(",", ":")))
chunk_refs.append(
{
"file": chunk_name,
"candidateCount": len(chunk),
}
)

# Write the index file (what the agent reads first)
index_data = {
# Write all candidates directly into a single analysis file.
analysis_data = {
"metadataSummary": metadata_summary,
"chunkFiles": chunk_refs,
"deadCodeCandidates": slimmed,
"entryPoints": slim_entry_points,
}
index_path = Path(host_workdir) / self._endpoint.analysis_filename
index_path.write_text(json.dumps(index_data, indent=2))
index_path.write_text(json.dumps(analysis_data, indent=2))

logger.info(
f"Placed analysis for {instance_id}: {total} candidates "
f"in {len(chunk_refs)} chunks, {len(slim_entry_points)} entry points "
f"Placed analysis for {instance_id}: {len(slimmed)} candidates, "
f"{len(slim_entry_points)} entry points "
f"(filtered: {type_filtered} types, {ep_filtered} entry points)"
)
except Exception as e:
Expand All @@ -528,6 +499,16 @@ async def create_environment(
file=sys.stderr,
flush=True,
)
# Write an empty analysis file so the agent can still complete (with 0 results).
# Without this, enhanced_prompt_v2 fails immediately and the task shows as ERROR
# rather than FAIL with 0% recall, which obscures whether the code path is working.
empty_analysis = {
"metadataSummary": {"totalCandidates": 0, "includedCandidates": 0},
"deadCodeCandidates": [],
"entryPoints": [],
}
index_path = Path(host_workdir) / self._endpoint.analysis_filename
index_path.write_text(json.dumps(empty_analysis, indent=2))

# Start Docker container
container_name = f"mcpbr-{docker_manager._session_id}-{instance_id}"
Expand Down
56 changes: 17 additions & 39 deletions src/mcpbr/benchmarks/supermodel/endpoints/dead_code.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,79 +121,57 @@ def enhanced_prompt(self) -> str:
@property
def enhanced_prompt_v2(self) -> str:
return """You are an expert software architect. A static analyzer has pre-computed dead code
candidates for this codebase. Your job is to FILTER them using the metadata provided.
candidates for this codebase. Your job is to run a filter script and produce REPORT.json.

STEP 1: Read `supermodel_dead_code_analysis.json`. It contains:
The file `supermodel_dead_code_analysis.json` in your working directory contains:
- `metadataSummary`: totalCandidates, rootFilesCount, reasonBreakdown, confidenceBreakdown
- `chunkFiles`: list of chunk files with candidate details
- `deadCodeCandidates`: all candidates (may be large — do NOT read the whole file manually)
- `entryPoints`: symbols confirmed alive — any candidate matching an entry point is a false positive
Comment on lines +126 to 129
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🔴 Critical

Prompt/schema mismatch will zero out results on normal runs.

This script reads only analysis["deadCodeCandidates"], but the generated supermodel_dead_code_analysis.json index is chunked (chunkFiles) in src/mcpbr/benchmarks/supermodel/benchmark.py (Lines 514-518). Result: the script writes an empty dead_code list despite real candidates existing in the chunk files.

🔧 Suggested prompt-script update (support both schemas)
-# Filter candidates
-dead_code = []
-for c in analysis.get("deadCodeCandidates", []):
+# Collect candidates (direct list or chunked index format)
+candidates = analysis.get("deadCodeCandidates")
+if candidates is None:
+    candidates = []
+    for ref in analysis.get("chunkFiles", []):
+        chunk_file = ref.get("file")
+        if not chunk_file:
+            continue
+        with open(chunk_file) as cf:
+            chunk_json = json.load(cf)
+        candidates.extend(chunk_json.get("deadCodeCandidates", []))
+
+# Filter candidates
+dead_code = []
+for c in candidates:

Also applies to: 147-148, 171-174

🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@src/mcpbr/benchmarks/supermodel/endpoints/dead_code.py` around lines 126 -
129, The script only reads analysis["deadCodeCandidates"] and misses chunked
schemas; update the load logic in dead_code.py to support both formats by
checking if analysis contains "chunkFiles" and, if so, iterating over and
loading each referenced chunk file to aggregate candidates, otherwise fall back
to analysis["deadCodeCandidates"]; preserve existing filtering against
analysis["entryPoints"] (so entryPoints still remove false positives) and ensure
the final dead_code output is written from the aggregated candidate list.


If there are chunk files, read ALL of them.

STEP 2: Understand the analysis quality.
- Check `rootFilesCount` — if it's much higher than expected (>20), the import
resolver likely failed on many files, meaning "file never imported" candidates
have a high false positive rate for framework-wired code.
- Check `reasonBreakdown` to understand where candidates come from.

STEP 3: Write a script to filter candidates and produce REPORT.json:
STEP 1: Run this Python script with Bash:

```python
import json, glob
import json

with open("supermodel_dead_code_analysis.json") as f:
index = json.load(f)
analysis = json.load(f)

# Load entry points as a whitelist
entry_set = set()
for ep in index.get("entryPoints", []):
entry_set.add((ep.get("file", ""), ep.get("name", "")))
summary = analysis.get("metadataSummary", {})
print(f"Total candidates: {summary.get('totalCandidates', '?')}, included: {summary.get('includedCandidates', '?')}")

# Load all candidates from chunk files
candidates = []
for chunk_ref in index.get("chunkFiles", []):
with open(chunk_ref["file"]) as f:
chunk = json.load(f)
candidates.extend(chunk.get("deadCodeCandidates", []))
# Build entry point whitelist
entry_set = {(ep.get("file", ""), ep.get("name", "")) for ep in analysis.get("entryPoints", [])}

# Filter
# Filter candidates
dead_code = []
for c in candidates:
for c in analysis.get("deadCodeCandidates", []):
key = (c.get("file", ""), c.get("name", ""))
reason = c.get("reason", "")
confidence = c.get("confidence", "")

# Drop entry points
if key in entry_set:
continue

# Drop pure type/interface candidates (high FP rate from structural typing)
if "Type/interface" in reason:
continue

# Keep everything else — the graph already did import/call analysis
dead_code.append({
"file": c.get("file", ""),
"name": c.get("name", ""),
"type": c.get("type", "function"),
"reason": reason
"reason": reason,
})

with open("REPORT.json", "w") as f:
json.dump({"dead_code": dead_code, "analysis_complete": True}, f, indent=2)
print(f"Wrote {len(dead_code)} candidates to REPORT.json")
```

STEP 4: Run the script, then read REPORT.json to confirm it was written correctly.
STEP 2: Verify REPORT.json was written by running: `python3 -c "import json; d=json.load(open('REPORT.json')); print(len(d['dead_code']), 'items written')"`

RULES:
- Do NOT grep the codebase to verify candidates. The static analyzer already
performed call graph and dependency analysis — grep produces false negatives
when symbol names appear in comments, strings, or type-only imports.
- Trust the graph. Filter only using the metadata (reason, confidence, entryPoints).
- Do NOT read supermodel_dead_code_analysis.json manually — it may be very large.
- Do NOT grep or explore the codebase. Trust the pre-computed analysis.
- Run the script exactly as shown. Do not modify it.
- Type should be one of: function, class, method, const, interface, variable.
- When in doubt about a candidate, INCLUDE it — missing real dead code is worse
than a false positive.
"""

def parse_api_response(self, response: dict) -> dict:
Expand Down
72 changes: 69 additions & 3 deletions src/mcpbr/benchmarks/supermodel/git_utils.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,41 @@
"""Git utilities for cloning repos and creating zip archives."""

import asyncio
import fnmatch
import logging
import os
import zipfile

logger = logging.getLogger("mcpbr.supermodel")

# Binary / media assets that are irrelevant for code analysis.
# Applied to every zip regardless of per-task zip_exclude config.
BINARY_EXCLUDE_PATTERNS = [
"*.mp4",
"*.mov",
"*.avi",
"*.webm",
"*.png",
"*.jpg",
"*.jpeg",
"*.gif",
"*.svg",
"*.ico",
"*.webp",
"*.woff",
"*.woff2",
"*.ttf",
"*.eot",
"*.otf",
"*.pdf",
"*.zip",
"*.tar",
"*.gz",
"*.mp3",
"*.wav",
"*.ogg",
]


async def clone_repo_at_commit(repo: str, commit: str, dest: str) -> None:
"""Clone a repo and checkout a specific commit.
Expand Down Expand Up @@ -117,18 +148,25 @@ async def zip_repo(

is_git = os.path.isdir(os.path.join(repo_dir, ".git"))

all_excludes = BINARY_EXCLUDE_PATTERNS + (exclude_patterns or [])

if is_git:
return await _zip_repo_git_archive(repo_dir, output_zip, scope_prefix)
return await _zip_repo_git_archive(repo_dir, output_zip, scope_prefix, all_excludes)
else:
return await _zip_repo_fallback(repo_dir, output_zip, scope_prefix, exclude_patterns)
return await _zip_repo_fallback(repo_dir, output_zip, scope_prefix, all_excludes)


async def _zip_repo_git_archive(
repo_dir: str,
output_zip: str,
scope_prefix: str | None = None,
exclude_patterns: list[str] | None = None,
) -> str:
"""Create zip using ``git archive`` — only includes tracked files."""
"""Create zip using ``git archive`` — only includes tracked files.

If exclude_patterns are provided, rewrites the zip to strip matching entries
(git archive has no native exclude support).
"""
cmd = ["git", "archive", "--format=zip", "-o", output_zip, "HEAD"]
if scope_prefix:
cmd.append(scope_prefix)
Expand All @@ -142,9 +180,37 @@ async def _zip_repo_git_archive(
_, stderr = await asyncio.wait_for(proc.communicate(), timeout=120)
if proc.returncode != 0:
raise RuntimeError(f"git archive failed: {stderr.decode()}")

if exclude_patterns:
_filter_zip_entries(output_zip, exclude_patterns)

return output_zip


def _filter_zip_entries(zip_path: str, patterns: list[str]) -> None:
"""Rewrite zip in-place, removing entries whose basename matches any glob pattern."""
tmp_path = zip_path + ".tmp"
removed = 0
try:
with (
zipfile.ZipFile(zip_path, "r") as zin,
zipfile.ZipFile(tmp_path, "w", compression=zipfile.ZIP_DEFLATED) as zout,
):
for item in zin.infolist():
basename = os.path.basename(item.filename)
if any(fnmatch.fnmatch(basename, pat) for pat in patterns):
removed += 1
continue
Comment on lines +199 to +203
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

Path-based excludes are silently ignored in git-archive filtering.

_filter_zip_entries only matches os.path.basename(item.filename), so excludes like loc/* / lib/* (documented in zip_repo) never match. This makes caller zip_exclude ineffective on the git-archive path.

💡 Proposed fix
 def _filter_zip_entries(zip_path: str, patterns: list[str]) -> None:
     """Rewrite zip in-place, removing entries whose basename matches any glob pattern."""
@@
             for item in zin.infolist():
-                basename = os.path.basename(item.filename)
-                if any(fnmatch.fnmatch(basename, pat) for pat in patterns):
+                rel_path = item.filename.lstrip("./")
+                basename = os.path.basename(rel_path)
+                if any(
+                    fnmatch.fnmatch(rel_path, pat) or fnmatch.fnmatch(basename, pat)
+                    for pat in patterns
+                ):
                     removed += 1
                     continue
                 zout.writestr(item, zin.read(item.filename))
🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@src/mcpbr/benchmarks/supermodel/git_utils.py` around lines 199 - 203,
_filter_zip_entries currently matches only os.path.basename(item.filename), so
path-based patterns like "loc/*" or "lib/*" never match; update the filtering to
match against the full (normalized) entry path instead of basename (e.g., use
the archive entry path from item.filename after normalizing separators) and
apply fnmatch.fnmatch to that full path so zip_exclude/zip_repo path patterns
work; ensure pattern normalization/leading "./" differences are handled
consistently with how zip_exclude patterns are specified.

zout.writestr(item, zin.read(item.filename))
os.replace(tmp_path, zip_path)
except Exception:
if os.path.exists(tmp_path):
os.unlink(tmp_path)
raise
if removed:
logger.info(f"Filtered {removed} binary entries from {os.path.basename(zip_path)}")


async def _zip_repo_fallback(
repo_dir: str,
output_zip: str,
Expand Down
12 changes: 11 additions & 1 deletion src/mcpbr/harness.py
Original file line number Diff line number Diff line change
Expand Up @@ -306,6 +306,10 @@ def _create_mcp_agent(
# Use explicit server config if provided, otherwise fall back to config.mcp_server
server_config = mcp_server_config if mcp_server_config is not None else config.mcp_server

# Allow benchmarks to opt out of MCP_PROMPT_SUFFIX (e.g. when the problem_statement
# already contains its own MCP guidance that conflicts with the generic suffix).
suppress_mcp_suffix = getattr(benchmark, "suppress_mcp_suffix", False)

return create_harness(
config.agent_harness,
model=config.model,
Expand All @@ -317,6 +321,7 @@ def _create_mcp_agent(
mcp_logs_dir=mcp_logs_dir,
thinking_budget=config.thinking_budget,
claude_code_version=config.claude_code_version,
suppress_mcp_suffix=suppress_mcp_suffix,
)


Expand Down Expand Up @@ -628,9 +633,14 @@ async def _run_mcp_evaluation(
instance_id = task.get(
"instance_id", f"{task.get('project', 'unknown')}_{task.get('bug_id', 'unknown')}"
)
# For MCP/enhanced condition, swap in the enhanced problem statement if available.
# create_environment makes a local copy with this swap, but agent.solve needs it too.
task_for_agent = {**task}
if "problem_statement_enhanced" in task_for_agent:
task_for_agent["problem_statement"] = task_for_agent["problem_statement_enhanced"]
agent_result = await asyncio.wait_for(
agent.solve(
task,
task_for_agent,
env.host_workdir,
timeout=config.timeout_seconds,
verbose=verbose,
Expand Down
8 changes: 7 additions & 1 deletion src/mcpbr/harnesses.py
Original file line number Diff line number Diff line change
Expand Up @@ -534,6 +534,7 @@ def __init__(
mcp_logs_dir: Path | None = None,
thinking_budget: int | None = None,
claude_code_version: str | None = None,
suppress_mcp_suffix: bool = False,
) -> None:
"""Initialize Claude Code harness.

Expand All @@ -547,11 +548,14 @@ def __init__(
mcp_logs_dir: Directory for MCP server logs. Default: ~/.mcpbr_state/logs
thinking_budget: Extended thinking token budget. Set to enable thinking mode.
claude_code_version: Pinned Claude Code version (e.g., '2.1.37').
suppress_mcp_suffix: If True, do not append MCP_PROMPT_SUFFIX when an MCP
server is active. Use for benchmarks that provide their own MCP guidance
in the problem statement (e.g. supermodel dead-code benchmark).
"""
self.model = model
self.mcp_server = mcp_server
self.prompt_template = prompt or DEFAULT_PROMPT
if mcp_server and not mcp_server.setup_only:
if mcp_server and not mcp_server.setup_only and not suppress_mcp_suffix:
self.prompt_template += MCP_PROMPT_SUFFIX
self.max_iterations = max_iterations
self.verbosity = verbosity
Expand Down Expand Up @@ -1368,6 +1372,7 @@ def create_harness(
mcp_logs_dir: Path | None = None,
thinking_budget: int | None = None,
claude_code_version: str | None = None,
suppress_mcp_suffix: bool = False,
) -> AgentHarness:
"""Factory function to create an agent harness.

Expand Down Expand Up @@ -1405,6 +1410,7 @@ def create_harness(
mcp_logs_dir=mcp_logs_dir,
thinking_budget=thinking_budget,
claude_code_version=claude_code_version,
suppress_mcp_suffix=suppress_mcp_suffix,
)
return harness

Expand Down
Loading