File tree Expand file tree Collapse file tree
Expand file tree Collapse file tree Original file line number Diff line number Diff line change @@ -145,7 +145,7 @@ def __init__(
145145 self ,
146146 dataset : str | Path = "" ,
147147 corpus_path : str | Path | None = None ,
148- resolved_threshold : float = 0.8 ,
148+ resolved_threshold : float = 0.0 ,
149149 ):
150150 """Initialize the benchmark.
151151
@@ -406,7 +406,9 @@ async def evaluate(
406406 recall = tp / (tp + fn ) if (tp + fn ) > 0 else 0.0
407407 f1 = 2 * precision * recall / (precision + recall ) if (precision + recall ) > 0 else 0.0
408408
409- resolved = precision >= self .resolved_threshold and recall >= self .resolved_threshold
409+ # The dead-code benchmark is scored with continuous P/R/F1 metrics, not a binary resolved gate.
410+ # Return None so the summary shows "-" instead of PASS/FAIL.
411+ resolved = None
410412
411413 # Log results for visibility
412414 print (f"\n { '=' * 50 } " )
Original file line number Diff line number Diff line change @@ -50,7 +50,7 @@ def __init__(
5050 tasks : list [dict [str , Any ]] | None = None ,
5151 supermodel_api_base : str = "https://api.supermodel.dev" ,
5252 supermodel_api_key : str | None = None ,
53- resolved_threshold : float = 0.8 ,
53+ resolved_threshold : float = 0.0 ,
5454 ground_truth_dir : str | Path | None = None ,
5555 supermodel_api_timeout : int = 900 ,
5656 ** kwargs : Any ,
@@ -628,7 +628,9 @@ async def evaluate(
628628
629629 precision = metrics ["precision" ]
630630 recall = metrics ["recall" ]
631- resolved = recall >= self .resolved_threshold
631+ # The dead-code benchmark is scored with continuous P/R/F1 metrics, not a binary resolved gate.
632+ # Return None so the summary shows "-" instead of PASS/FAIL.
633+ resolved = None
632634
633635 # Log results
634636 print (f"\n { '=' * 50 } " )
@@ -641,7 +643,6 @@ async def evaluate(
641643 print (f" Precision: { precision * 100 :.1f} %" )
642644 print (f" Recall: { recall * 100 :.1f} %" )
643645 print (f" F1 Score: { metrics ['f1_score' ] * 100 :.1f} %" )
644- print (f" Resolved: { resolved } " )
645646 print (f"{ '=' * 50 } \n " )
646647
647648 return {
Original file line number Diff line number Diff line change @@ -1148,30 +1148,9 @@ def run(
11481148 console .print ("[dim]Use --reset-state or --no-incremental to re-run[/dim]" )
11491149 exit_code = 3
11501150
1151- # Check if anything was resolved (exit code 2)
1152- # Only check this if we actually evaluated tasks
1153- if exit_code == 0 :
1154- mcp_resolved = results .summary ["mcp" ]["resolved" ]
1155- baseline_resolved = results .summary ["baseline" ]["resolved" ]
1156- mcp_total = results .summary ["mcp" ]["total" ]
1157- baseline_total = results .summary ["baseline" ]["total" ]
1158-
1159- # Only report "no resolutions" if tasks were actually run
1160- # If total is 0, no tasks were run (not a failure)
1161- if (mcp_only and mcp_total > 0 and mcp_resolved == 0 ) or (
1162- baseline_only and baseline_total > 0 and baseline_resolved == 0
1163- ):
1164- console .print ("\n [yellow]⚠ No tasks resolved (0% success)[/yellow]" )
1165- exit_code = 2
1166- elif not mcp_only and not baseline_only :
1167- # For full run, check if either had tasks and none were resolved
1168- if (
1169- (mcp_total > 0 or baseline_total > 0 )
1170- and mcp_resolved == 0
1171- and baseline_resolved == 0
1172- ):
1173- console .print ("\n [yellow]⚠ No tasks resolved by either agent (0% success)[/yellow]" )
1174- exit_code = 2
1151+ # The "no tasks resolved" check (exit code 2) was removed — dead-code benchmarks
1152+ # use continuous P/R/F1 metrics, not binary resolved gates. Other benchmarks can
1153+ # still use resolved_threshold in their config if needed.
11751154
11761155 # Exit with determined exit code
11771156 if exit_code != 0 :
Original file line number Diff line number Diff line change @@ -901,7 +901,7 @@ def validate_thinking_budget(cls, v: int | None) -> int | None:
901901 )
902902
903903 resolved_threshold : float = Field (
904- default = 0.8 ,
904+ default = 0.0 ,
905905 ge = 0.0 ,
906906 le = 1.0 ,
907907 description = "Recall threshold to consider a task resolved (must be in [0.0, 1.0])" ,
Original file line number Diff line number Diff line change @@ -847,19 +847,23 @@ def dict_to_stats(cls, data):
847847 task_table .add_column ("Error" , style = "red" , max_width = 50 )
848848
849849 for task in results .tasks :
850- mcp_status = (
851- "[green]PASS[/green]" if task .mcp and task .mcp .get ("resolved" ) else "[red]FAIL[/red]"
852- )
853850 if task .mcp is None :
854851 mcp_status = "[dim]-[/dim]"
852+ elif task .mcp .get ("resolved" ) is None :
853+ mcp_status = "[dim]-[/dim]"
854+ elif task .mcp .get ("resolved" ):
855+ mcp_status = "[green]PASS[/green]"
856+ else :
857+ mcp_status = "[red]FAIL[/red]"
855858
856- baseline_status = (
857- "[green]PASS[/green]"
858- if task .baseline and task .baseline .get ("resolved" )
859- else "[red]FAIL[/red]"
860- )
861859 if task .baseline is None :
862860 baseline_status = "[dim]-[/dim]"
861+ elif task .baseline .get ("resolved" ) is None :
862+ baseline_status = "[dim]-[/dim]"
863+ elif task .baseline .get ("resolved" ):
864+ baseline_status = "[green]PASS[/green]"
865+ else :
866+ baseline_status = "[red]FAIL[/red]"
863867
864868 error_msg = ""
865869 if task .mcp and task .mcp .get ("error" ):
You can’t perform that action at this time.
0 commit comments