Skip to content

Commit dc940a4

Browse files
fix: Remove resolved/PASS/FAIL gate from dead code benchmarks
Dead code detection uses P/R/F1 as continuous metrics. The binary resolved_threshold (defaulting to 0.8) meant every task showed FAIL, since no approach achieves 80% on both precision and recall.

Changes:
- supermodel/benchmark.py: Set resolved=None, lower the default resolved_threshold from 0.8 to 0.0, and remove the "Resolved:" output
- deadcode.py: Set resolved=None and lower the default resolved_threshold from 0.8 to 0.0
- config.py: Lower default resolved_threshold from 0.8 to 0.0
- cli.py: Remove the exit-code-2 "No tasks resolved" check
- reporting.py: Show "-" instead of PASS/FAIL when resolved is None

The resolved concept still exists for other benchmarks (SWE-bench etc.), but dead code tasks now show "-" in the summary since the real metrics are P/R/F1, not a binary gate.
1 parent e35a179 commit dc940a4

5 files changed

Lines changed: 24 additions & 38 deletions

File tree

src/mcpbr/benchmarks/deadcode.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -145,7 +145,7 @@ def __init__(
145145
self,
146146
dataset: str | Path = "",
147147
corpus_path: str | Path | None = None,
148-
resolved_threshold: float = 0.8,
148+
resolved_threshold: float = 0.0,
149149
):
150150
"""Initialize the benchmark.
151151
@@ -406,7 +406,9 @@ async def evaluate(
406406
recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
407407
f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0.0
408408

409-
resolved = precision >= self.resolved_threshold and recall >= self.resolved_threshold
409+
# Dead code uses continuous P/R/F1 metrics, not binary resolved gate.
410+
# Return None so the summary shows "-" instead of PASS/FAIL.
411+
resolved = None
410412

411413
# Log results for visibility
412414
print(f"\n{'=' * 50}")

src/mcpbr/benchmarks/supermodel/benchmark.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,7 @@ def __init__(
5050
tasks: list[dict[str, Any]] | None = None,
5151
supermodel_api_base: str = "https://api.supermodel.dev",
5252
supermodel_api_key: str | None = None,
53-
resolved_threshold: float = 0.8,
53+
resolved_threshold: float = 0.0,
5454
ground_truth_dir: str | Path | None = None,
5555
supermodel_api_timeout: int = 900,
5656
**kwargs: Any,
@@ -628,7 +628,9 @@ async def evaluate(
628628

629629
precision = metrics["precision"]
630630
recall = metrics["recall"]
631-
resolved = recall >= self.resolved_threshold
631+
# Dead code uses continuous P/R/F1 metrics, not binary resolved gate.
632+
# Return None so the summary shows "-" instead of PASS/FAIL.
633+
resolved = None
632634

633635
# Log results
634636
print(f"\n{'=' * 50}")
@@ -641,7 +643,6 @@ async def evaluate(
641643
print(f" Precision: {precision * 100:.1f}%")
642644
print(f" Recall: {recall * 100:.1f}%")
643645
print(f" F1 Score: {metrics['f1_score'] * 100:.1f}%")
644-
print(f" Resolved: {resolved}")
645646
print(f"{'=' * 50}\n")
646647

647648
return {

src/mcpbr/cli.py

Lines changed: 3 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -1148,30 +1148,9 @@ def run(
11481148
console.print("[dim]Use --reset-state or --no-incremental to re-run[/dim]")
11491149
exit_code = 3
11501150

1151-
# Check if anything was resolved (exit code 2)
1152-
# Only check this if we actually evaluated tasks
1153-
if exit_code == 0:
1154-
mcp_resolved = results.summary["mcp"]["resolved"]
1155-
baseline_resolved = results.summary["baseline"]["resolved"]
1156-
mcp_total = results.summary["mcp"]["total"]
1157-
baseline_total = results.summary["baseline"]["total"]
1158-
1159-
# Only report "no resolutions" if tasks were actually run
1160-
# If total is 0, no tasks were run (not a failure)
1161-
if (mcp_only and mcp_total > 0 and mcp_resolved == 0) or (
1162-
baseline_only and baseline_total > 0 and baseline_resolved == 0
1163-
):
1164-
console.print("\n[yellow]⚠ No tasks resolved (0% success)[/yellow]")
1165-
exit_code = 2
1166-
elif not mcp_only and not baseline_only:
1167-
# For full run, check if either had tasks and none were resolved
1168-
if (
1169-
(mcp_total > 0 or baseline_total > 0)
1170-
and mcp_resolved == 0
1171-
and baseline_resolved == 0
1172-
):
1173-
console.print("\n[yellow]⚠ No tasks resolved by either agent (0% success)[/yellow]")
1174-
exit_code = 2
1151+
# Resolution check removed — dead code benchmarks use continuous P/R/F1
1152+
# metrics, not binary resolved gates. Other benchmarks can still use
1153+
# resolved_threshold in their config if needed.
11751154

11761155
# Exit with determined exit code
11771156
if exit_code != 0:

src/mcpbr/config.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -901,7 +901,7 @@ def validate_thinking_budget(cls, v: int | None) -> int | None:
901901
)
902902

903903
resolved_threshold: float = Field(
904-
default=0.8,
904+
default=0.0,
905905
ge=0.0,
906906
le=1.0,
907907
description="Recall threshold to consider a task resolved (must be in [0.0, 1.0])",

src/mcpbr/reporting.py

Lines changed: 12 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -847,19 +847,23 @@ def dict_to_stats(cls, data):
847847
task_table.add_column("Error", style="red", max_width=50)
848848

849849
for task in results.tasks:
850-
mcp_status = (
851-
"[green]PASS[/green]" if task.mcp and task.mcp.get("resolved") else "[red]FAIL[/red]"
852-
)
853850
if task.mcp is None:
854851
mcp_status = "[dim]-[/dim]"
852+
elif task.mcp.get("resolved") is None:
853+
mcp_status = "[dim]-[/dim]"
854+
elif task.mcp.get("resolved"):
855+
mcp_status = "[green]PASS[/green]"
856+
else:
857+
mcp_status = "[red]FAIL[/red]"
855858

856-
baseline_status = (
857-
"[green]PASS[/green]"
858-
if task.baseline and task.baseline.get("resolved")
859-
else "[red]FAIL[/red]"
860-
)
861859
if task.baseline is None:
862860
baseline_status = "[dim]-[/dim]"
861+
elif task.baseline.get("resolved") is None:
862+
baseline_status = "[dim]-[/dim]"
863+
elif task.baseline.get("resolved"):
864+
baseline_status = "[green]PASS[/green]"
865+
else:
866+
baseline_status = "[red]FAIL[/red]"
863867

864868
error_msg = ""
865869
if task.mcp and task.mcp.get("error"):

0 commit comments

Comments
 (0)