Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# Base image for SWE-bench task environments
FROM python:3.11-slim
FROM mcr.microsoft.com/mirror/docker/library/python:3.11-slim

# Install comprehensive system dependencies for SWE-bench tasks
RUN apt-get update && apt-get install -y --no-install-recommends \
Expand Down
2 changes: 2 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -135,6 +135,8 @@ ignore = [
"tests/**/*.py" = ["S", "T20"]
"infrastructure/**/*.py" = ["S603", "S607"]
"src/mcpbr/infrastructure/**/*.py" = ["S603", "S607", "S108"]
"src/mcpbr/benchmarks/deadcode.py" = ["S608"]
"src/mcpbr/benchmarks/supermodel/benchmark.py" = ["S608"]
"scripts/**/*.py" = ["T20", "S"]

[tool.pytest.ini_options]
Expand Down
2 changes: 1 addition & 1 deletion src/mcpbr/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
A benchmark runner for evaluating MCP servers against SWE-bench tasks.
"""

__version__ = "0.14.0"
__version__ = "0.14.1"

from .sdk import (
BenchmarkResult,
Expand Down
6 changes: 6 additions & 0 deletions src/mcpbr/benchmarks/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
from .codereval import CoderEvalBenchmark
from .custom import CustomBenchmark
from .cybergym import CyberGymBenchmark
from .deadcode import DeadCodeBenchmark
from .gaia import GAIABenchmark
from .gsm8k import GSM8KBenchmark
from .hellaswag import HellaSwagBenchmark
Expand All @@ -28,6 +29,7 @@
from .mlagentbench import MLAgentBenchBenchmark
from .mmmu import MMMUBenchmark
from .repoqa import RepoQABenchmark
from .supermodel.benchmark import SupermodelBenchmark
from .swebench import SWEBenchmark
from .terminalbench import TerminalBenchBenchmark
from .toolbench import ToolBenchBenchmark
Expand All @@ -50,6 +52,7 @@
"CoderEvalBenchmark",
"CustomBenchmark",
"CyberGymBenchmark",
"DeadCodeBenchmark",
"GAIABenchmark",
"GSM8KBenchmark",
"HellaSwagBenchmark",
Expand All @@ -64,6 +67,7 @@
"MMMUBenchmark",
"RepoQABenchmark",
"SWEBenchmark",
"SupermodelBenchmark",
"TerminalBenchBenchmark",
"ToolBenchBenchmark",
"TruthfulQABenchmark",
Expand Down Expand Up @@ -106,6 +110,8 @@
"mmmu": MMMUBenchmark,
"longbench": LongBenchBenchmark,
"adversarial": AdversarialBenchmark,
"dead-code": DeadCodeBenchmark, # type: ignore[dict-item]
"supermodel": SupermodelBenchmark, # type: ignore[dict-item]
}


Expand Down
122 changes: 122 additions & 0 deletions src/mcpbr/benchmarks/_bench_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,122 @@
"""Shared utilities for benchmark implementations."""

import json
import logging
import subprocess
from pathlib import Path
from typing import Any

logger = logging.getLogger("mcpbr.benchmarks")


def _find_array_end(text: str, arr_start: int) -> int:
    """Return the index of the ``]`` that closes the array opened at ``arr_start``.

    Brackets inside JSON string literals are ignored, and a backslash inside a
    string escapes the following character. Returns -1 if the array never closes.
    """
    depth = 0
    in_string = False
    escape_next = False
    for i in range(arr_start, len(text)):
        c = text[i]
        if escape_next:
            escape_next = False
            continue
        if c == "\\":
            # Only meaningful inside a string; stray backslashes are skipped.
            if in_string:
                escape_next = True
            continue
        if c == '"':
            in_string = not in_string
            continue
        if in_string:
            continue
        if c == "[":
            depth += 1
        elif c == "]":
            depth -= 1
            if depth == 0:
                return i
    return -1


def _strip_diff_prefixes(arr_text: str) -> str:
    """Drop removed (``-``) lines and strip the ``+`` marker from added lines.

    The first line is kept verbatim: the slice starts at ``[``, so any diff
    marker belonging to that line lies before the slice.
    """
    lines = arr_text.split("\n")
    kept = [lines[0]]
    for line in lines[1:]:
        if line.startswith("-"):
            continue
        kept.append(line[1:] if line.startswith("+") else line)
    return "\n".join(kept)


def _parse_findings_array(arr_text: str) -> list[Any]:
    """Parse ``arr_text`` as a JSON array, retrying with diff markers removed.

    Raises:
        ValueError: If neither the raw nor the diff-stripped text is a JSON list
            (``json.JSONDecodeError`` is a ``ValueError`` subclass).
    """
    try:
        parsed = json.loads(arr_text)
    except ValueError:
        # The diff-stripped form is only tried when the raw text is not valid
        # JSON, so well-formed input is never altered.
        parsed = json.loads(_strip_diff_prefixes(arr_text))
    if not isinstance(parsed, list):
        raise ValueError("findings value is not a JSON array")
    return parsed


def extract_findings_from_text(text: str, findings_key: str = "dead_code") -> list[dict[str, Any]]:
    """Extract findings array from text/patch content by locating a JSON key.

    Searches for every occurrence of a JSON key (e.g. "dead_code") and extracts
    the array that follows it using bracket-depth matching. Brackets inside JSON
    strings are handled correctly.

    The LAST parseable array wins. In a unified diff the removed placeholder
    (``- "dead_code": [],``) appears before the added content, so taking the
    first occurrence would report an empty list even though the patch adds real
    findings. Arrays spread over several ``+``-prefixed diff lines are parsed by
    stripping the diff markers when the raw text is not valid JSON.

    Args:
        text: Raw text that may contain a JSON object with the findings key.
        findings_key: The JSON key whose array value to extract.

    Returns:
        List of finding dicts, or empty list if not found/parseable.
    """
    marker = f'"{findings_key}"'
    findings: list[dict[str, Any]] = []
    search_from = 0
    while True:
        start = text.find(marker, search_from)
        if start == -1:
            break
        # By default resume the search right after this marker.
        search_from = start + len(marker)
        arr_start = text.find("[", search_from)
        if arr_start == -1:
            # No "[" after this marker means none after any later marker either.
            break
        arr_end = _find_array_end(text, arr_start)
        if arr_end == -1:
            continue
        try:
            findings = _parse_findings_array(text[arr_start : arr_end + 1])
        except (json.JSONDecodeError, ValueError):
            # Unparseable candidate: keep any earlier result and keep looking.
            continue
        # Skip past the accepted array so a key nested inside it cannot
        # override the array that contains it.
        search_from = arr_end + 1
    return findings


def init_git_workdir(host_workdir: str, timeout: int = 30) -> None:
    """Initialize a git repo in a workdir so the harness can track modifications.

    Runs ``git init``, sets a local committer identity, stages everything and
    creates an "Initial" commit. The sequence is best-effort: a command that
    exits non-zero does not stop the remaining commands and does not raise, but
    the failure is logged as a warning instead of being silently discarded.

    Args:
        host_workdir: Path to the working directory.
        timeout: Timeout in seconds for each git command.

    Raises:
        subprocess.TimeoutExpired: Propagated from ``subprocess.run`` when a
            single git command exceeds ``timeout``.
    """
    git_commands = (
        ["git", "init"],
        ["git", "config", "user.email", "mcpbr@test.com"],
        ["git", "config", "user.name", "MCPBR"],
        ["git", "add", "-A"],
        ["git", "commit", "-m", "Initial"],
    )
    # Same named logger as this module's ``logger``; looked up here so the
    # function has no dependency on module-level state.
    log = logging.getLogger("mcpbr.benchmarks")
    for command in git_commands:
        result = subprocess.run(
            command, cwd=host_workdir, capture_output=True, check=False, timeout=timeout
        )
        if result.returncode != 0:
            # Output is captured as bytes (no text mode), so decode defensively.
            log.warning(
                "%s failed in %s (exit %d): %s",
                " ".join(command),
                host_workdir,
                result.returncode,
                result.stderr.decode(errors="replace").strip(),
            )


def safe_write_file(host_workdir: str, file_path: str, content: str) -> None:
    """Write a file within host_workdir, raising if the path escapes containment.

    Both the root and the target are resolved before comparison, so ``..``
    segments, absolute ``file_path`` values and symlinks pointing outside the
    root are all rejected. Missing parent directories are created.

    Args:
        host_workdir: Root directory that all writes must stay within.
        file_path: Relative path of the file to write.
        content: File content.

    Raises:
        ValueError: If the resolved path is outside host_workdir.
    """
    root = Path(host_workdir).resolve()
    full_path = (root / file_path).resolve()
    if not full_path.is_relative_to(root):
        raise ValueError(f"Path traversal detected: {file_path!r} escapes {host_workdir!r}")
    full_path.parent.mkdir(parents=True, exist_ok=True)
    # Pin the encoding: the locale default differs between hosts and can raise
    # UnicodeEncodeError for non-ASCII content.
    full_path.write_text(content, encoding="utf-8")
8 changes: 5 additions & 3 deletions src/mcpbr/benchmarks/codegraph.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
import json
import logging
import re
from datetime import datetime, timezone
from datetime import UTC, datetime
from typing import Any

from datasets import load_dataset
Expand Down Expand Up @@ -266,7 +266,7 @@ async def _setup_environment(self, env: TaskEnvironment, task: dict[str, Any]) -
"version": 1,
"repoName": cache_name,
"commitHash": None,
"savedAt": datetime.now(timezone.utc).isoformat(),
"savedAt": datetime.now(UTC).isoformat(),
"raw": result,
}

Expand Down Expand Up @@ -382,7 +382,9 @@ def _count_steps(self, text: str) -> int:
return 1

# Count tool call patterns in the output
tool_calls = len(re.findall(r"(?:tool_use|tool_call|<tool>|Tool:|Calling)", text, re.IGNORECASE))
tool_calls = len(
re.findall(r"(?:tool_use|tool_call|<tool>|Tool:|Calling)", text, re.IGNORECASE)
)
return max(tool_calls, 1)

def get_prebuilt_image(self, task: dict[str, Any]) -> str | None:
Expand Down
Loading
Loading