Skip to content

Commit 4af032e

Browse files
Fix success rate calculation to exclude skipped tests from denominator
Skipped tests should not count as failures. The success rate now only considers tests that were actually run (non-skipped tests).

Changes:
- Update success_rate calculation to use (successful / non_skipped) instead of (successful / total)
- Update console output to show correct denominator: X/Y where Y excludes skipped tests
- Update markdown report to display success rate with correct denominator
- Show skipped tests as separate count: N/total

Co-authored-by: openhands <[email protected]>
1 parent 262bd60 commit 4af032e

File tree

3 files changed

+17
-6
lines changed

3 files changed

+17
-6
lines changed

tests/integration/run_infer.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -290,9 +290,12 @@ def generate_structured_results(
290290
successful = structured_results.successful_tests
291291
skipped = structured_results.skipped_tests
292292
total = structured_results.total_tests
293-
logger.info("Success rate: %.2f%% (%d/%d)", success_rate * 100, successful, total)
293+
non_skipped = total - skipped
294+
logger.info(
295+
"Success rate: %.2f%% (%d/%d)", success_rate * 100, successful, non_skipped
296+
)
294297
if skipped > 0:
295-
logger.info("Skipped tests: %d", skipped)
298+
logger.info("Skipped tests: %d/%d", skipped, total)
296299
logger.info("Evaluation Results:")
297300
for instance in structured_results.test_instances:
298301
if instance.test_result.skipped:

tests/integration/schemas.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -88,7 +88,11 @@ def from_eval_outputs(
8888
total_tests = len(test_instances)
8989
successful_tests = sum(1 for t in test_instances if t.test_result.success)
9090
skipped_tests = sum(1 for t in test_instances if t.test_result.skipped)
91-
success_rate = successful_tests / total_tests if total_tests > 0 else 0.0
91+
# Success rate excludes skipped tests from denominator
92+
non_skipped_tests = total_tests - skipped_tests
93+
success_rate = (
94+
successful_tests / non_skipped_tests if non_skipped_tests > 0 else 0.0
95+
)
9296
total_cost = sum(t.cost for t in test_instances)
9397

9498
return cls(

tests/integration/utils/generate_markdown_report.py

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,8 @@ def generate_model_summary_table(model_results: list[ModelTestResults]) -> str:
2424

2525
for result in model_results:
2626
success_rate = f"{result.success_rate:.1%}"
27-
tests_passed = f"{result.successful_tests}/{result.total_tests}"
27+
non_skipped = result.total_tests - result.skipped_tests
28+
tests_passed = f"{result.successful_tests}/{non_skipped}"
2829
skipped = f"{result.skipped_tests}"
2930
cost = format_cost(result.total_cost)
3031

@@ -45,17 +46,20 @@ def generate_detailed_results(model_results: list[ModelTestResults]) -> str:
4546
sections = []
4647

4748
for result in model_results:
49+
non_skipped = result.total_tests - result.skipped_tests
4850
section_lines = [
4951
f"### {result.model_name}",
5052
"",
5153
f"- **Success Rate**: {result.success_rate:.1%} "
52-
f"({result.successful_tests}/{result.total_tests})",
54+
f"({result.successful_tests}/{non_skipped})",
5355
f"- **Total Cost**: {format_cost(result.total_cost)}",
5456
f"- **Run Suffix**: `{result.run_suffix}`",
5557
]
5658

5759
if result.skipped_tests > 0:
58-
section_lines.append(f"- **Skipped Tests**: {result.skipped_tests}")
60+
section_lines.append(
61+
f"- **Skipped Tests**: {result.skipped_tests}/{result.total_tests}"
62+
)
5963

6064
section_lines.append("")
6165

0 commit comments

Comments
 (0)