fix: Lower resolved_threshold default from 0.8 to 0.0 for dead code benchmarks

jonathanpopham · jonathanpopham · commit cd8efdb20ea0 · 2026-03-25T15:48:21.000-04:00
The 80% precision AND recall gate meant every task showed "Resolved: False"
for both MCP and baseline agents. No dead code detection approach achieves
80% on both metrics simultaneously. Setting the default to 0.0 means any task with
non-zero precision and recall counts as resolved. The threshold is still configurable via the config YAML.
diff --git a/src/mcpbr/benchmarks/deadcode.py b/src/mcpbr/benchmarks/deadcode.py
@@ -145,7 +145,7 @@ def __init__(
         self,
         dataset: str | Path = "",
         corpus_path: str | Path | None = None,
-        resolved_threshold: float = 0.8,
+        resolved_threshold: float = 0.0,
     ):
         """Initialize the benchmark.
 
diff --git a/src/mcpbr/benchmarks/supermodel/benchmark.py b/src/mcpbr/benchmarks/supermodel/benchmark.py
@@ -50,7 +50,7 @@ def __init__(
         tasks: list[dict[str, Any]] | None = None,
         supermodel_api_base: str = "https://api.supermodel.dev",
         supermodel_api_key: str | None = None,
-        resolved_threshold: float = 0.8,
+        resolved_threshold: float = 0.0,
         ground_truth_dir: str | Path | None = None,
         supermodel_api_timeout: int = 900,
         **kwargs: Any,
diff --git a/src/mcpbr/config.py b/src/mcpbr/config.py
@@ -901,7 +901,7 @@ def validate_thinking_budget(cls, v: int | None) -> int | None:
     )
 
     resolved_threshold: float = Field(
-        default=0.8,
+        default=0.0,
         ge=0.0,
         le=1.0,
         description="Recall threshold to consider a task resolved (must be in [0.0, 1.0])",

Original file line number	Diff line number	Diff line change
`@@ -901,7 +901,7 @@ def validate_thinking_budget(cls, v: int \| None) -> int \| None:`
`901`	`901`	`)`
`902`	`902`
`903`	`903`	`resolved_threshold: float = Field(`
`904`		`- default=0.8,`
	`904`	`+ default=0.0,`
`905`	`905`	`ge=0.0,`
`906`	`906`	`le=1.0,`
`907`	`907`	`description="Recall threshold to consider a task resolved (must be in [0.0, 1.0])",`