
Commit 262bd60

Implement SkipTest exception to gracefully skip integration tests
- Added SkipTest exception in base.py to signal when tests should be skipped
- Extended TestResult and TestResultData schemas with a 'skipped' field
- Updated ModelTestResults to track a skipped_tests count
- Modified run_infer.py to catch SkipTest and handle skipped tests appropriately
- Updated t08_image_file_viewing.py to raise SkipTest instead of ValueError when vision is not supported
- Enhanced console output to display skipped tests with the ⊘ symbol
- Updated the markdown report generator to show skipped tests in the summary and detailed results

This allows integration tests to gracefully skip when LLMs lack required capabilities (e.g., vision support) rather than failing.

Co-authored-by: openhands <[email protected]>
1 parent e7666de commit 262bd60

File tree

- tests/integration/base.py
- tests/integration/run_infer.py
- tests/integration/schemas.py
- tests/integration/tests/t08_image_file_viewing.py
- tests/integration/utils/generate_markdown_report.py

5 files changed: +71 −9 lines changed

tests/integration/base.py

Lines changed: 12 additions & 0 deletions
```diff
@@ -26,11 +26,23 @@
 from openhands.sdk.tool import Tool
 
 
+class SkipTest(Exception):
+    """
+    Exception raised to indicate that a test should be skipped.
+
+    This is useful for tests that require specific capabilities (e.g., vision)
+    that may not be available in all LLMs.
+    """
+
+    pass
+
+
 class TestResult(BaseModel):
     """Result of an integration test."""
 
     success: bool
     reason: str | None = None
+    skipped: bool = False
 
 
 class BaseIntegrationTest(ABC):
```
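
The contract this diff establishes is small: a test raises `SkipTest`, and the harness records the outcome as a `TestResult` with `success=False, skipped=True`. A minimal sketch of that interplay, assuming only the classes shown above (the `run_with_skip_handling` wrapper is illustrative, not the repo's actual runner):

```python
from tests.integration.base import SkipTest, TestResult


def run_with_skip_handling(test_fn) -> TestResult:
    # Illustrative wrapper: convert SkipTest into a skipped TestResult,
    # mirroring the except-clause added to run_infer.py below.
    try:
        test_fn()
        return TestResult(success=True)
    except SkipTest as e:
        return TestResult(success=False, reason=str(e), skipped=True)
```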

tests/integration/run_infer.py

Lines changed: 22 additions & 2 deletions
```diff
@@ -17,7 +17,7 @@
 from pydantic import BaseModel, ConfigDict
 
 from openhands.sdk.logger import get_logger
-from tests.integration.base import BaseIntegrationTest, TestResult
+from tests.integration.base import BaseIntegrationTest, SkipTest, TestResult
 from tests.integration.schemas import ModelTestResults
 from tests.integration.utils.format_costs import format_cost
 
@@ -171,6 +171,20 @@ def process_instance(instance: TestInstance, llm_config: dict[str, Any]) -> Eval
             log_file_path=log_file_path,
         )
 
+    except SkipTest as e:
+        # Test should be skipped (e.g., LLM doesn't support required capabilities)
+        logger.info("Test %s skipped: %s", instance.instance_id, str(e))
+        return EvalOutput(
+            instance_id=instance.instance_id,
+            test_result=TestResult(
+                success=False,
+                reason=str(e),
+                skipped=True,
+            ),
+            llm_model=llm_config.get("model", "unknown"),
+            cost=0.0,
+        )
+
     except Exception as e:
         logger.error("Error running test %s: %s", instance.instance_id, e)
         return EvalOutput(
@@ -274,11 +288,17 @@ def generate_structured_results(
     # Print summary for console output
     success_rate = structured_results.success_rate
     successful = structured_results.successful_tests
+    skipped = structured_results.skipped_tests
    total = structured_results.total_tests
     logger.info("Success rate: %.2f%% (%d/%d)", success_rate * 100, successful, total)
+    if skipped > 0:
+        logger.info("Skipped tests: %d", skipped)
     logger.info("Evaluation Results:")
     for instance in structured_results.test_instances:
-        status = "✓" if instance.test_result.success else "✗"
+        if instance.test_result.skipped:
+            status = "⊘"  # Skipped symbol
+        else:
+            status = "✓" if instance.test_result.success else "✗"
         reason = instance.test_result.reason or "N/A"
         logger.info("%s: %s - %s", instance.instance_id, status, reason)
     logger.info("Total cost: %s", format_cost(structured_results.total_cost))
```

tests/integration/schemas.py

Lines changed: 5 additions & 0 deletions
```diff
@@ -20,6 +20,7 @@ class TestResultData(BaseModel):
 
     success: bool
     reason: str | None = None
+    skipped: bool = False
 
 
 class TestInstanceResult(BaseModel):
@@ -46,6 +47,7 @@ class ModelTestResults(BaseModel):
     # Summary statistics
     total_tests: int
     successful_tests: int
+    skipped_tests: int
     success_rate: float
     total_cost: float
 
@@ -75,6 +77,7 @@ def from_eval_outputs(
                 test_result=TestResultData(
                     success=output.test_result.success,
                     reason=output.test_result.reason,
+                    skipped=output.test_result.skipped,
                 ),
                 cost=output.cost,
                 error_message=output.error_message,
@@ -84,6 +87,7 @@
 
         # Calculate summary statistics
         total_tests = len(test_instances)
         successful_tests = sum(1 for t in test_instances if t.test_result.success)
+        skipped_tests = sum(1 for t in test_instances if t.test_result.skipped)
         success_rate = successful_tests / total_tests if total_tests > 0 else 0.0
         total_cost = sum(t.cost for t in test_instances)
 
@@ -94,6 +98,7 @@
             test_instances=test_instances,
             total_tests=total_tests,
             successful_tests=successful_tests,
+            skipped_tests=skipped_tests,
             success_rate=success_rate,
             total_cost=total_cost,
             eval_note=eval_note,
```
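
One consequence of these statistics is worth noting: a skipped test carries `success=False`, so it stays in the denominator and lowers the success rate rather than being excluded. A quick sketch of the arithmetic, mirroring the expressions in `from_eval_outputs` above (the sample data is made up):

```python
from tests.integration.schemas import TestResultData

# Three illustrative results, one of them skipped.
results = [
    TestResultData(success=True),
    TestResultData(success=True),
    TestResultData(success=False, reason="no vision support", skipped=True),
]

total_tests = len(results)
successful_tests = sum(1 for t in results if t.success)
skipped_tests = sum(1 for t in results if t.skipped)
success_rate = successful_tests / total_tests if total_tests > 0 else 0.0

print(total_tests, successful_tests, skipped_tests, success_rate)  # 3 2 1 ~0.667
```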

tests/integration/tests/t08_image_file_viewing.py

Lines changed: 2 additions & 2 deletions
```diff
@@ -8,7 +8,7 @@
 from openhands.sdk.tool import Tool, register_tool
 from openhands.tools.file_editor import FileEditorTool
 from openhands.tools.terminal import TerminalTool
-from tests.integration.base import BaseIntegrationTest, TestResult
+from tests.integration.base import BaseIntegrationTest, SkipTest, TestResult
 
 
 INSTRUCTION = (
@@ -33,7 +33,7 @@ def __init__(self, *args, **kwargs):
 
         # Verify that the LLM supports vision
         if not self.llm.vision_is_active():
-            raise ValueError(
+            raise SkipTest(
                 "This test requires a vision-capable LLM model. "
                 "Please use a model that supports image input."
             )
```
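
The change to t08 is the template for gating any capability: probe the LLM in `__init__`, before the agent runs and accrues cost. A hedged sketch of the same pattern for a different capability; the `supports_audio_input` probe is hypothetical, not a real SDK method:

```python
from tests.integration.base import BaseIntegrationTest, SkipTest


class HypotheticalAudioTest(BaseIntegrationTest):
    """Illustrative only: skip at construction time if a capability is missing."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Hypothetical probe; guarded with getattr since no such method
        # exists in the SDK as of this commit.
        if not getattr(self.llm, "supports_audio_input", lambda: False)():
            raise SkipTest(
                "This test requires an audio-capable LLM model. "
                "Please use a model that supports audio input."
            )
```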

tests/integration/utils/generate_markdown_report.py

Lines changed: 30 additions & 5 deletions
```diff
@@ -18,20 +18,21 @@ def generate_model_summary_table(model_results: list[ModelTestResults]) -> str:
     """Generate a summary table for all models."""
 
     table_lines = [
-        "| Model | Success Rate | Tests Passed | Total Tests | Cost |",
-        "|-------|--------------|--------------|-------------|------|",
+        "| Model | Success Rate | Tests Passed | Skipped | Total Tests | Cost |",
+        "|-------|--------------|--------------|---------|-------------|------|",
     ]
 
     for result in model_results:
         success_rate = f"{result.success_rate:.1%}"
         tests_passed = f"{result.successful_tests}/{result.total_tests}"
+        skipped = f"{result.skipped_tests}"
         cost = format_cost(result.total_cost)
 
         model_name = result.model_name
         total_tests = result.total_tests
         row = (
             f"| {model_name} | {success_rate} | {tests_passed} | "
-            f"{total_tests} | {cost} |"
+            f"{skipped} | {total_tests} | {cost} |"
         )
         table_lines.append(row)
 
@@ -51,11 +52,35 @@ def generate_detailed_results(model_results: list[ModelTestResults]) -> str:
             f"({result.successful_tests}/{result.total_tests})",
             f"- **Total Cost**: {format_cost(result.total_cost)}",
             f"- **Run Suffix**: `{result.run_suffix}`",
-            "",
         ]
 
+        if result.skipped_tests > 0:
+            section_lines.append(f"- **Skipped Tests**: {result.skipped_tests}")
+
+        section_lines.append("")
+
+        # Add skipped tests if any
+        skipped_tests = [t for t in result.test_instances if t.test_result.skipped]
+        if skipped_tests:
+            section_lines.extend(
+                [
+                    "**Skipped Tests:**",
+                    "",
+                ]
+            )
+
+            for test in skipped_tests:
+                reason = test.test_result.reason or "No reason provided"
+                section_lines.append(f"- `{test.instance_id}`: {reason}")
+
+            section_lines.append("")
+
         # Add failed tests if any
-        failed_tests = [t for t in result.test_instances if not t.test_result.success]
+        failed_tests = [
+            t
+            for t in result.test_instances
+            if not t.test_result.success and not t.test_result.skipped
+        ]
         if failed_tests:
             section_lines.extend(
                 [
```
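
Rendered, the updated summary table comes out like this (model names and numbers are made up; the header row matches the strings in the diff):

```markdown
| Model | Success Rate | Tests Passed | Skipped | Total Tests | Cost |
|-------|--------------|--------------|---------|-------------|------|
| vision-capable-model | 100.0% | 8/8 | 0 | 8 | $0.42 |
| text-only-model | 87.5% | 7/8 | 1 | 8 | $0.31 |
```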
