Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 4 additions & 2 deletions src/mcpbr/benchmarks/deadcode.py
Original file line number Diff line number Diff line change
Expand Up @@ -145,7 +145,7 @@ def __init__(
self,
dataset: str | Path = "",
corpus_path: str | Path | None = None,
resolved_threshold: float = 0.8,
resolved_threshold: float = 0.0,
):
"""Initialize the benchmark.

Expand Down Expand Up @@ -406,7 +406,9 @@ async def evaluate(
recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0.0

resolved = precision >= self.resolved_threshold and recall >= self.resolved_threshold
# Dead code uses continuous P/R/F1 metrics, not a binary resolved gate.
# Set resolved to None so the summary shows "-" instead of PASS/FAIL.
resolved = None

# Log results for visibility
print(f"\n{'=' * 50}")
Expand Down
7 changes: 4 additions & 3 deletions src/mcpbr/benchmarks/supermodel/benchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ def __init__(
tasks: list[dict[str, Any]] | None = None,
supermodel_api_base: str = "https://api.supermodel.dev",
supermodel_api_key: str | None = None,
resolved_threshold: float = 0.8,
resolved_threshold: float = 0.0,
ground_truth_dir: str | Path | None = None,
supermodel_api_timeout: int = 900,
**kwargs: Any,
Expand Down Expand Up @@ -628,7 +628,9 @@ async def evaluate(

precision = metrics["precision"]
recall = metrics["recall"]
resolved = recall >= self.resolved_threshold
# Dead code uses continuous P/R/F1 metrics, not a binary resolved gate.
# Set resolved to None so the summary shows "-" instead of PASS/FAIL.
resolved = None

# Log results
print(f"\n{'=' * 50}")
Expand All @@ -641,7 +643,6 @@ async def evaluate(
print(f" Precision: {precision * 100:.1f}%")
print(f" Recall: {recall * 100:.1f}%")
print(f" F1 Score: {metrics['f1_score'] * 100:.1f}%")
print(f" Resolved: {resolved}")
print(f"{'=' * 50}\n")

return {
Expand Down
27 changes: 3 additions & 24 deletions src/mcpbr/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -1148,30 +1148,9 @@ def run(
console.print("[dim]Use --reset-state or --no-incremental to re-run[/dim]")
exit_code = 3

# Check if anything was resolved (exit code 2)
# Only check this if we actually evaluated tasks
if exit_code == 0:
mcp_resolved = results.summary["mcp"]["resolved"]
baseline_resolved = results.summary["baseline"]["resolved"]
mcp_total = results.summary["mcp"]["total"]
baseline_total = results.summary["baseline"]["total"]

# Only report "no resolutions" if tasks were actually run
# If total is 0, no tasks were run (not a failure)
if (mcp_only and mcp_total > 0 and mcp_resolved == 0) or (
baseline_only and baseline_total > 0 and baseline_resolved == 0
):
console.print("\n[yellow]⚠ No tasks resolved (0% success)[/yellow]")
exit_code = 2
elif not mcp_only and not baseline_only:
# For full run, check if either had tasks and none were resolved
if (
(mcp_total > 0 or baseline_total > 0)
and mcp_resolved == 0
and baseline_resolved == 0
):
console.print("\n[yellow]⚠ No tasks resolved by either agent (0% success)[/yellow]")
exit_code = 2
# Resolution check removed, so exit code 2 ("no resolutions") is no longer
# set here. Dead code benchmarks use continuous P/R/F1 metrics, not binary
# resolved gates. Other benchmarks can still use resolved_threshold in their
# config if needed.
Comment on lines +1151 to +1153
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟡 Minor

Exit code 2 is documented but never set anymore.

Hey! So the comment here is great at explaining why the resolution check was removed — makes total sense for dead code benchmarks where hitting 80% precision/recall is unrealistic.

However, the docstring at lines 582-588 still tells users that exit code 2 means "No resolutions (evaluation ran but 0% success)". Because that code path has been removed, users reading the help output (mcpbr run --help) will expect an exit code that is never returned.

You've got a couple options:

  1. Remove exit code 2 from the docstring entirely
  2. Keep it documented but note it's benchmark-dependent (if other benchmarks might still want it)
📝 Suggested docstring fix (if removing exit code 2)
     \b
     Exit Codes:
-      0   Success (at least one task resolved)
+      0   Success
       1   Fatal error (invalid config, Docker unavailable, crash)
-      2   No resolutions (evaluation ran but 0% success)
       3   Nothing evaluated (all tasks cached/skipped)
       130 Interrupted by user (Ctrl+C)
🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@src/mcpbr/cli.py` around lines 1151 - 1153, Update the CLI help docstring
that documents exit codes (the text shown by "mcpbr run --help") to remove the
reference to exit code 2 ("No resolutions...") since that code path was removed;
ensure the remaining exit code descriptions are accurate and, if desired,
replace the removed entry with a brief note that resolution-based exit codes are
deprecated or benchmark-dependent. Locate the docstring for the run command /
module help in cli.py and edit the exit codes section accordingly.


# Exit with determined exit code
if exit_code != 0:
Expand Down
2 changes: 1 addition & 1 deletion src/mcpbr/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -901,7 +901,7 @@ def validate_thinking_budget(cls, v: int | None) -> int | None:
)

resolved_threshold: float = Field(
default=0.8,
default=0.0,
ge=0.0,
le=1.0,
description="Recall threshold to consider a task resolved (must be in [0.0, 1.0])",
Expand Down
32 changes: 12 additions & 20 deletions src/mcpbr/reporting.py
Original file line number Diff line number Diff line change
Expand Up @@ -587,20 +587,8 @@ def print_summary(results: "EvaluationResults", console: Console) -> None:
mcp = results.summary["mcp"]
baseline = results.summary["baseline"]

table.add_row(
"Resolved",
f"{mcp['resolved']}/{mcp['total']}",
f"{baseline['resolved']}/{baseline['total']}",
)
table.add_row(
"Resolution Rate",
f"{mcp['rate']:.1%}",
f"{baseline['rate']:.1%}",
)

console.print(table)
console.print()
console.print(f"[bold]Improvement:[/bold] {results.summary['improvement']}")

# Print statistical significance if available
significance = results.summary.get("significance")
Expand Down Expand Up @@ -847,19 +835,23 @@ def dict_to_stats(cls, data):
task_table.add_column("Error", style="red", max_width=50)

for task in results.tasks:
mcp_status = (
"[green]PASS[/green]" if task.mcp and task.mcp.get("resolved") else "[red]FAIL[/red]"
)
if task.mcp is None:
mcp_status = "[dim]-[/dim]"
elif task.mcp.get("resolved") is None:
mcp_status = "[dim]-[/dim]"
elif task.mcp.get("resolved"):
mcp_status = "[green]PASS[/green]"
else:
mcp_status = "[red]FAIL[/red]"

baseline_status = (
"[green]PASS[/green]"
if task.baseline and task.baseline.get("resolved")
else "[red]FAIL[/red]"
)
if task.baseline is None:
baseline_status = "[dim]-[/dim]"
elif task.baseline.get("resolved") is None:
baseline_status = "[dim]-[/dim]"
elif task.baseline.get("resolved"):
baseline_status = "[green]PASS[/green]"
else:
baseline_status = "[red]FAIL[/red]"

error_msg = ""
if task.mcp and task.mcp.get("error"):
Expand Down
Loading