supermodeltools · jonathanpopham · Mar 25, 2026 · coderabbitai · Mar 25, 2026
diff --git a/src/mcpbr/benchmarks/deadcode.py b/src/mcpbr/benchmarks/deadcode.py
@@ -145,7 +145,7 @@ def __init__(
         self,
         dataset: str | Path = "",
         corpus_path: str | Path | None = None,
-        resolved_threshold: float = 0.8,
+        resolved_threshold: float = 0.0,
     ):
         """Initialize the benchmark.
 
@@ -406,7 +406,9 @@ async def evaluate(
         recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
         f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0.0
 
-        resolved = precision >= self.resolved_threshold and recall >= self.resolved_threshold
+        # Dead code is scored by continuous P/R/F1 metrics, not a binary resolved gate.
+        # Return None so the per-task table shows "-" instead of PASS/FAIL.
+        resolved = None
 
         # Log results for visibility
         print(f"\n{'=' * 50}")

diff --git a/src/mcpbr/benchmarks/supermodel/benchmark.py b/src/mcpbr/benchmarks/supermodel/benchmark.py
@@ -50,7 +50,7 @@ def __init__(
         tasks: list[dict[str, Any]] | None = None,
         supermodel_api_base: str = "https://api.supermodel.dev",
         supermodel_api_key: str | None = None,
-        resolved_threshold: float = 0.8,
+        resolved_threshold: float = 0.0,
         ground_truth_dir: str | Path | None = None,
         supermodel_api_timeout: int = 900,
         **kwargs: Any,
@@ -628,7 +628,9 @@ async def evaluate(
 
         precision = metrics["precision"]
         recall = metrics["recall"]
-        resolved = recall >= self.resolved_threshold
+        # Dead code is scored by continuous P/R/F1 metrics, not a binary resolved gate.
+        # Return None so the per-task table shows "-" instead of PASS/FAIL.
+        resolved = None
 
         # Log results
         print(f"\n{'=' * 50}")
@@ -641,7 +643,6 @@ async def evaluate(
         print(f"  Precision: {precision * 100:.1f}%")
         print(f"  Recall: {recall * 100:.1f}%")
         print(f"  F1 Score: {metrics['f1_score'] * 100:.1f}%")
-        print(f"  Resolved: {resolved}")
         print(f"{'=' * 50}\n")
 
         return {

diff --git a/src/mcpbr/cli.py b/src/mcpbr/cli.py
@@ -1148,30 +1148,9 @@ def run(
             console.print("[dim]Use --reset-state or --no-incremental to re-run[/dim]")
             exit_code = 3
 
-    # Check if anything was resolved (exit code 2)
-    # Only check this if we actually evaluated tasks
-    if exit_code == 0:
-        mcp_resolved = results.summary["mcp"]["resolved"]
-        baseline_resolved = results.summary["baseline"]["resolved"]
-        mcp_total = results.summary["mcp"]["total"]
-        baseline_total = results.summary["baseline"]["total"]
-
-        # Only report "no resolutions" if tasks were actually run
-        # If total is 0, no tasks were run (not a failure)
-        if (mcp_only and mcp_total > 0 and mcp_resolved == 0) or (
-            baseline_only and baseline_total > 0 and baseline_resolved == 0
-        ):
-            console.print("\n[yellow]⚠ No tasks resolved (0% success)[/yellow]")
-            exit_code = 2
-        elif not mcp_only and not baseline_only:
-            # For full run, check if either had tasks and none were resolved
-            if (
-                (mcp_total > 0 or baseline_total > 0)
-                and mcp_resolved == 0
-                and baseline_resolved == 0
-            ):
-                console.print("\n[yellow]⚠ No tasks resolved by either agent (0% success)[/yellow]")
-                exit_code = 2
+    # The "no tasks resolved" check (exit code 2) was removed: dead-code benchmarks
+    # report continuous P/R/F1 metrics rather than a binary resolved gate. Other
+    # benchmarks can still use resolved_threshold in their config if needed.
 
     # Exit with determined exit code
     if exit_code != 0:

diff --git a/src/mcpbr/config.py b/src/mcpbr/config.py
@@ -901,7 +901,7 @@ def validate_thinking_budget(cls, v: int | None) -> int | None:
     )
 
     resolved_threshold: float = Field(
-        default=0.8,
+        default=0.0,
         ge=0.0,
         le=1.0,
         description="Recall threshold to consider a task resolved (must be in [0.0, 1.0])",

diff --git a/src/mcpbr/reporting.py b/src/mcpbr/reporting.py
@@ -587,20 +587,8 @@ def print_summary(results: "EvaluationResults", console: Console) -> None:
     mcp = results.summary["mcp"]
     baseline = results.summary["baseline"]
 
-    table.add_row(
-        "Resolved",
-        f"{mcp['resolved']}/{mcp['total']}",
-        f"{baseline['resolved']}/{baseline['total']}",
-    )
-    table.add_row(
-        "Resolution Rate",
-        f"{mcp['rate']:.1%}",
-        f"{baseline['rate']:.1%}",
-    )
-
     console.print(table)
     console.print()
-    console.print(f"[bold]Improvement:[/bold] {results.summary['improvement']}")
 
     # Print statistical significance if available
     significance = results.summary.get("significance")
@@ -847,19 +835,23 @@ def dict_to_stats(cls, data):
     task_table.add_column("Error", style="red", max_width=50)
 
     for task in results.tasks:
-        mcp_status = (
-            "[green]PASS[/green]" if task.mcp and task.mcp.get("resolved") else "[red]FAIL[/red]"
-        )
         if task.mcp is None:
             mcp_status = "[dim]-[/dim]"
+        elif task.mcp.get("resolved") is None:
+            mcp_status = "[dim]-[/dim]"
+        elif task.mcp.get("resolved"):
+            mcp_status = "[green]PASS[/green]"
+        else:
+            mcp_status = "[red]FAIL[/red]"
 
-        baseline_status = (
-            "[green]PASS[/green]"
-            if task.baseline and task.baseline.get("resolved")
-            else "[red]FAIL[/red]"
-        )
         if task.baseline is None:
             baseline_status = "[dim]-[/dim]"
+        elif task.baseline.get("resolved") is None:
+            baseline_status = "[dim]-[/dim]"
+        elif task.baseline.get("resolved"):
+            baseline_status = "[green]PASS[/green]"
+        else:
+            baseline_status = "[red]FAIL[/red]"
 
         error_msg = ""
         if task.mcp and task.mcp.get("error"):