Fix success rate calculation to exclude skipped tests from denominator

openhands-agent · openhands-agent · commit 4af032e478ab · 2025-11-10T16:39:21.000Z
Skipped tests should not count as failures. The success rate now only
considers tests that were actually run (non-skipped tests).

Changes:
- Update success_rate calculation to use (successful / non_skipped) instead of (successful / total)
- Update console output to show correct denominator: X/Y where Y excludes skipped tests
- Update markdown report to display success rate with correct denominator
- Show skipped tests as a separate count out of all tests: N/total

Co-authored-by: openhands <openhands@all-hands.dev>
diff --git a/tests/integration/run_infer.py b/tests/integration/run_infer.py
@@ -290,9 +290,12 @@ def generate_structured_results(
     successful = structured_results.successful_tests
     skipped = structured_results.skipped_tests
     total = structured_results.total_tests
-    logger.info("Success rate: %.2f%% (%d/%d)", success_rate * 100, successful, total)
+    non_skipped = total - skipped
+    logger.info(
+        "Success rate: %.2f%% (%d/%d)", success_rate * 100, successful, non_skipped
+    )
     if skipped > 0:
-        logger.info("Skipped tests: %d", skipped)
+        logger.info("Skipped tests: %d/%d", skipped, total)
     logger.info("Evaluation Results:")
     for instance in structured_results.test_instances:
         if instance.test_result.skipped:
diff --git a/tests/integration/schemas.py b/tests/integration/schemas.py
@@ -88,7 +88,11 @@ def from_eval_outputs(
         total_tests = len(test_instances)
         successful_tests = sum(1 for t in test_instances if t.test_result.success)
         skipped_tests = sum(1 for t in test_instances if t.test_result.skipped)
-        success_rate = successful_tests / total_tests if total_tests > 0 else 0.0
+        # Success rate excludes skipped tests from denominator
+        non_skipped_tests = total_tests - skipped_tests
+        success_rate = (
+            successful_tests / non_skipped_tests if non_skipped_tests > 0 else 0.0
+        )
         total_cost = sum(t.cost for t in test_instances)
 
         return cls(
diff --git a/tests/integration/utils/generate_markdown_report.py b/tests/integration/utils/generate_markdown_report.py
@@ -24,7 +24,8 @@ def generate_model_summary_table(model_results: list[ModelTestResults]) -> str:
 
     for result in model_results:
         success_rate = f"{result.success_rate:.1%}"
-        tests_passed = f"{result.successful_tests}/{result.total_tests}"
+        non_skipped = result.total_tests - result.skipped_tests
+        tests_passed = f"{result.successful_tests}/{non_skipped}"
         skipped = f"{result.skipped_tests}"
         cost = format_cost(result.total_cost)
 
@@ -45,17 +46,20 @@ def generate_detailed_results(model_results: list[ModelTestResults]) -> str:
     sections = []
 
     for result in model_results:
+        non_skipped = result.total_tests - result.skipped_tests
         section_lines = [
             f"### {result.model_name}",
             "",
             f"- **Success Rate**: {result.success_rate:.1%} "
-            f"({result.successful_tests}/{result.total_tests})",
+            f"({result.successful_tests}/{non_skipped})",
             f"- **Total Cost**: {format_cost(result.total_cost)}",
             f"- **Run Suffix**: `{result.run_suffix}`",
         ]
 
         if result.skipped_tests > 0:
-            section_lines.append(f"- **Skipped Tests**: {result.skipped_tests}")
+            section_lines.append(
+                f"- **Skipped Tests**: {result.skipped_tests}/{result.total_tests}"
+            )
 
         section_lines.append("")