
Commit 262bd60

Implement SkipTest exception to gracefully skip integration tests
- Added SkipTest exception in base.py to signal when tests should be skipped
- Extended TestResult and TestResultData schemas with a 'skipped' field
- Updated ModelTestResults to track a skipped_tests count
- Modified run_infer.py to catch SkipTest and handle skipped tests appropriately
- Updated t08_image_file_viewing.py to raise SkipTest instead of ValueError when vision is not supported
- Enhanced console output to display skipped tests with the ⊘ symbol
- Updated the markdown report generator to show skipped tests in the summary and detailed results

This allows integration tests to gracefully skip when LLMs lack required capabilities (e.g., vision support) rather than failing.

Co-authored-by: openhands <[email protected]>
1 parent e7666de commit 262bd60

File tree

- tests/integration/base.py
- tests/integration/run_infer.py
- tests/integration/schemas.py
- tests/integration/tests/t08_image_file_viewing.py
- tests/integration/utils/generate_markdown_report.py

5 files changed: +71 −9 lines changed

tests/integration/base.py

Lines changed: 12 additions & 0 deletions
```diff
@@ -26,11 +26,23 @@
 from openhands.sdk.tool import Tool
 
 
+class SkipTest(Exception):
+    """
+    Exception raised to indicate that a test should be skipped.
+
+    This is useful for tests that require specific capabilities (e.g., vision)
+    that may not be available in all LLMs.
+    """
+
+    pass
+
+
 class TestResult(BaseModel):
     """Result of an integration test."""
 
     success: bool
     reason: str | None = None
+    skipped: bool = False
 
 
 class BaseIntegrationTest(ABC):
```
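
The contract this diff establishes is small: a test raises `SkipTest`, and the harness records the outcome as a `TestResult` with `success=False, skipped=True`. A minimal sketch of that interplay, assuming only the classes shown above (the `run_with_skip_handling` wrapper is illustrative, not the repo's actual runner):

```python
from tests.integration.base import SkipTest, TestResult


def run_with_skip_handling(test_fn) -> TestResult:
    # Illustrative wrapper: convert SkipTest into a skipped TestResult,
    # mirroring the except-clause added to run_infer.py below.
    try:
        test_fn()
        return TestResult(success=True)
    except SkipTest as e:
        return TestResult(success=False, reason=str(e), skipped=True)
```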

tests/integration/run_infer.py

Lines changed: 22 additions & 2 deletions
```diff
@@ -17,7 +17,7 @@
 from pydantic import BaseModel, ConfigDict
 
 from openhands.sdk.logger import get_logger
-from tests.integration.base import BaseIntegrationTest, TestResult
+from tests.integration.base import BaseIntegrationTest, SkipTest, TestResult
 from tests.integration.schemas import ModelTestResults
 from tests.integration.utils.format_costs import format_cost
 
@@ -171,6 +171,20 @@ def process_instance(instance: TestInstance, llm_config: dict[str, Any]) -> Eval
             log_file_path=log_file_path,
         )
 
+    except SkipTest as e:
+        # Test should be skipped (e.g., LLM doesn't support required capabilities)
+        logger.info("Test %s skipped: %s", instance.instance_id, str(e))
+        return EvalOutput(
+            instance_id=instance.instance_id,
+            test_result=TestResult(
+                success=False,
+                reason=str(e),
+                skipped=True,
+            ),
+            llm_model=llm_config.get("model", "unknown"),
+            cost=0.0,
+        )
+
     except Exception as e:
         logger.error("Error running test %s: %s", instance.instance_id, e)
         return EvalOutput(
@@ -274,11 +288,17 @@ def generate_structured_results(
     # Print summary for console output
     success_rate = structured_results.success_rate
     successful = structured_results.successful_tests
+    skipped = structured_results.skipped_tests
    total = structured_results.total_tests
     logger.info("Success rate: %.2f%% (%d/%d)", success_rate * 100, successful, total)
+    if skipped > 0:
+        logger.info("Skipped tests: %d", skipped)
     logger.info("Evaluation Results:")
     for instance in structured_results.test_instances:
-        status = "✓" if instance.test_result.success else "✗"
+        if instance.test_result.skipped:
+            status = "⊘"  # Skipped symbol
+        else:
+            status = "✓" if instance.test_result.success else "✗"
         reason = instance.test_result.reason or "N/A"
         logger.info("%s: %s - %s", instance.instance_id, status, reason)
     logger.info("Total cost: %s", format_cost(structured_results.total_cost))
```

tests/integration/schemas.py

Lines changed: 5 additions & 0 deletions
```diff
@@ -20,6 +20,7 @@ class TestResultData(BaseModel):
 
     success: bool
     reason: str | None = None
+    skipped: bool = False
 
 
 class TestInstanceResult(BaseModel):
@@ -46,6 +47,7 @@ class ModelTestResults(BaseModel):
     # Summary statistics
     total_tests: int
     successful_tests: int
+    skipped_tests: int
     success_rate: float
     total_cost: float
 
@@ -75,6 +77,7 @@ def from_eval_outputs(
                 test_result=TestResultData(
                     success=output.test_result.success,
                     reason=output.test_result.reason,
+                    skipped=output.test_result.skipped,
                 ),
                 cost=output.cost,
                 error_message=output.error_message,
@@ -84,6 +87,7 @@
 
         # Calculate summary statistics
         total_tests = len(test_instances)
         successful_tests = sum(1 for t in test_instances if t.test_result.success)
+        skipped_tests = sum(1 for t in test_instances if t.test_result.skipped)
         success_rate = successful_tests / total_tests if total_tests > 0 else 0.0
         total_cost = sum(t.cost for t in test_instances)
 
@@ -94,6 +98,7 @@
             test_instances=test_instances,
             total_tests=total_tests,
             successful_tests=successful_tests,
+            skipped_tests=skipped_tests,
             success_rate=success_rate,
             total_cost=total_cost,
             eval_note=eval_note,
```
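
One consequence of these statistics is worth noting: a skipped test carries `success=False`, so it stays in the denominator and lowers the success rate rather than being excluded. A quick sketch of the arithmetic, mirroring the expressions in `from_eval_outputs` above (the sample data is made up):

```python
from tests.integration.schemas import TestResultData

# Three illustrative results, one of them skipped.
results = [
    TestResultData(success=True),
    TestResultData(success=True),
    TestResultData(success=False, reason="no vision support", skipped=True),
]

total_tests = len(results)
successful_tests = sum(1 for t in results if t.success)
skipped_tests = sum(1 for t in results if t.skipped)
success_rate = successful_tests / total_tests if total_tests > 0 else 0.0

print(total_tests, successful_tests, skipped_tests, success_rate)  # 3 2 1 ~0.667
```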

tests/integration/tests/t08_image_file_viewing.py

Lines changed: 2 additions & 2 deletions
```diff
@@ -8,7 +8,7 @@
 from openhands.sdk.tool import Tool, register_tool
 from openhands.tools.file_editor import FileEditorTool
 from openhands.tools.terminal import TerminalTool
-from tests.integration.base import BaseIntegrationTest, TestResult
+from tests.integration.base import BaseIntegrationTest, SkipTest, TestResult
 
 
 INSTRUCTION = (
@@ -33,7 +33,7 @@ def __init__(self, *args, **kwargs):
 
         # Verify that the LLM supports vision
         if not self.llm.vision_is_active():
-            raise ValueError(
+            raise SkipTest(
                 "This test requires a vision-capable LLM model. "
                 "Please use a model that supports image input."
             )
```
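
The change to t08 is the template for gating any capability: probe the LLM in `__init__`, before the agent runs and accrues cost. A hedged sketch of the same pattern for a different capability; the `supports_audio_input` probe is hypothetical, not a real SDK method:

```python
from tests.integration.base import BaseIntegrationTest, SkipTest


class HypotheticalAudioTest(BaseIntegrationTest):
    """Illustrative only: skip at construction time if a capability is missing."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Hypothetical probe; guarded with getattr since no such method
        # exists in the SDK as of this commit.
        if not getattr(self.llm, "supports_audio_input", lambda: False)():
            raise SkipTest(
                "This test requires an audio-capable LLM model. "
                "Please use a model that supports audio input."
            )
```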

tests/integration/utils/generate_markdown_report.py

Lines changed: 30 additions & 5 deletions
```diff
@@ -18,20 +18,21 @@ def generate_model_summary_table(model_results: list[ModelTestResults]) -> str:
     """Generate a summary table for all models."""
 
     table_lines = [
-        "| Model | Success Rate | Tests Passed | Total Tests | Cost |",
-        "|-------|--------------|--------------|-------------|------|",
+        "| Model | Success Rate | Tests Passed | Skipped | Total Tests | Cost |",
+        "|-------|--------------|--------------|---------|-------------|------|",
     ]
 
     for result in model_results:
         success_rate = f"{result.success_rate:.1%}"
         tests_passed = f"{result.successful_tests}/{result.total_tests}"
+        skipped = f"{result.skipped_tests}"
         cost = format_cost(result.total_cost)
 
         model_name = result.model_name
         total_tests = result.total_tests
         row = (
             f"| {model_name} | {success_rate} | {tests_passed} | "
-            f"{total_tests} | {cost} |"
+            f"{skipped} | {total_tests} | {cost} |"
         )
         table_lines.append(row)
 
@@ -51,11 +52,35 @@ def generate_detailed_results(model_results: list[ModelTestResults]) -> str:
             f"({result.successful_tests}/{result.total_tests})",
             f"- **Total Cost**: {format_cost(result.total_cost)}",
             f"- **Run Suffix**: `{result.run_suffix}`",
-            "",
         ]
 
+        if result.skipped_tests > 0:
+            section_lines.append(f"- **Skipped Tests**: {result.skipped_tests}")
+
+        section_lines.append("")
+
+        # Add skipped tests if any
+        skipped_tests = [t for t in result.test_instances if t.test_result.skipped]
+        if skipped_tests:
+            section_lines.extend(
+                [
+                    "**Skipped Tests:**",
+                    "",
+                ]
+            )
+
+            for test in skipped_tests:
+                reason = test.test_result.reason or "No reason provided"
+                section_lines.append(f"- `{test.instance_id}`: {reason}")
+
+            section_lines.append("")
+
         # Add failed tests if any
-        failed_tests = [t for t in result.test_instances if not t.test_result.success]
+        failed_tests = [
+            t
+            for t in result.test_instances
+            if not t.test_result.success and not t.test_result.skipped
+        ]
         if failed_tests:
             section_lines.extend(
                 [
```
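
Rendered, the updated summary table comes out like this (model names and numbers are made up; the header row matches the strings in the diff):

```markdown
| Model | Success Rate | Tests Passed | Skipped | Total Tests | Cost |
|-------|--------------|--------------|---------|-------------|------|
| vision-capable-model | 100.0% | 8/8 | 0 | 8 | $0.42 |
| text-only-model | 87.5% | 7/8 | 1 | 8 | $0.31 |
```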
