From 5177e4909854dea0b8d5923f7b8e5ef506dd8211 Mon Sep 17 00:00:00 2001
From: Claude
Date: Fri, 16 Jan 2026 22:25:48 +0000
Subject: [PATCH] feat: add fatal error detection with actionable suggestions
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Add classify_error() to detect configuration issues (quota, auth, model)
- Sequential runs abort early on fatal errors with clear guidance
- Parallel runs report fatal errors after completion with suggestions
- Add FATAL_ERROR_PATTERNS for common API errors
- Add 5 tests for error classification

Example output on quota error:

  āŒ Fatal error detected: insufficient_quota

  šŸ’” API quota exceeded. Add credits at https://platform.openai.com/...

  Aborting benchmark - fix the issue above and retry.

This prevents wasting time running remaining samples when the first one
fails due to a configuration issue that requires user action.
---
 benchmarks/runner.py                | 68 +++++++++++++++++++++++++++++
 tests/benchmarks/test_benchmarks.py | 48 ++++++++++++++++++++
 2 files changed, 116 insertions(+)

diff --git a/benchmarks/runner.py b/benchmarks/runner.py
index e03a258..b4eae49 100644
--- a/benchmarks/runner.py
+++ b/benchmarks/runner.py
@@ -12,6 +12,7 @@
 Includes progress tracking with ETA via tqdm or custom callbacks.
 """
 
+import sys
 import time
 from collections.abc import Callable
 from concurrent.futures import ThreadPoolExecutor, as_completed
@@ -20,6 +21,47 @@
 from benchmarks.base import Benchmark, BenchmarkResult, BenchmarkSample, SampleResult
 
+
+# Error patterns that indicate fatal configuration issues (not worth retrying)
+FATAL_ERROR_PATTERNS = [
+    # Authentication/Authorization
+    ("insufficient_quota", "API quota exceeded. Add credits at https://platform.openai.com/account/billing"),
+    ("invalid_api_key", "Invalid API key. Check your API key configuration."),
+    ("invalid api key", "Invalid API key. Check your API key configuration."),
+    ("incorrect api key", "Invalid API key. Check your API key configuration."),
+    ("authentication", "Authentication failed. Verify your API key is correct."),
+    ("unauthorized", "Unauthorized. Check your API key and permissions."),
+    ("api_key", "API key issue. Check your API key configuration."),
+    # Model issues
+    ("model_not_found", "Model not found. Verify the model name is correct."),
+    ("does not exist", "Model does not exist. Check available models for your account."),
+    ("no such model", "Model not found. Check available models for your account."),
+    # Rate limiting (without retry-after)
+    ("rate_limit", "Rate limited. Consider reducing --max-workers or adding delays."),
+]
+
+
+class FatalBenchmarkError(Exception):
+    """Raised when a benchmark encounters an unrecoverable error."""
+
+    def __init__(self, message: str, suggestion: str):
+        self.message = message
+        self.suggestion = suggestion
+        super().__init__(f"{message}\n\nSuggestion: {suggestion}")
+
+
+def classify_error(error_str: str) -> tuple[bool, str]:
+    """Classify an error as fatal or transient.
+
+    Returns:
+        (is_fatal, suggestion) - If fatal, includes actionable suggestion.
+    """
+    error_lower = error_str.lower()
+    for pattern, suggestion in FATAL_ERROR_PATTERNS:
+        if pattern in error_lower:
+            return True, suggestion
+    return False, ""
+
 # Type alias for progress callback
 ProgressCallback = Callable[[int, int, "SampleResult | None", "ProgressStats"], None]
@@ -323,6 +365,18 @@ def _run_sequential(
                 results.append(sample_result)
                 self._update_progress(sample_result, stats, progress_mode, pbar)
 
+                # Check for fatal errors on first sample - abort early
+                if sample_result.error and stats.completed == 1:
+                    is_fatal, suggestion = classify_error(sample_result.error)
+                    if is_fatal:
+                        if pbar:
+                            pbar.close()
+                            pbar = None
+                        print(f"\nāŒ Fatal error detected: {sample_result.error}", file=sys.stderr)
+                        print(f"\nšŸ’” {suggestion}", file=sys.stderr)
+                        print("\nAborting benchmark - fix the issue above and retry.", file=sys.stderr)
+                        return results
+
         finally:
             if pbar is not None:
                 pbar.close()
@@ -356,6 +410,7 @@ def _run_parallel(
         results: dict[str, SampleResult] = {}
         lock = threading.Lock()
         pbar = None
+        fatal_error_found: tuple[str, str] | None = None  # (error, suggestion)
 
         try:
             if progress_mode == "tqdm":
@@ -395,10 +450,23 @@ def _run_parallel(
                     results[sample.id] = sample_result
                     self._update_progress(sample_result, stats, progress_mode, pbar)
 
+                    # Track first fatal error (for reporting after completion)
+                    if sample_result.error and fatal_error_found is None:
+                        is_fatal, suggestion = classify_error(sample_result.error)
+                        if is_fatal:
+                            fatal_error_found = (sample_result.error, suggestion)
+
         finally:
             if pbar is not None:
                 pbar.close()
 
+        # Report fatal errors after parallel run completes
+        if fatal_error_found:
+            error, suggestion = fatal_error_found
+            print(f"\nāŒ Fatal error detected: {error}", file=sys.stderr)
+            print(f"\nšŸ’” {suggestion}", file=sys.stderr)
+            print("\nFix the issue above before running more benchmarks.", file=sys.stderr)
+
         # Return results in original sample order
         return [results[sample.id] for sample in samples]
diff --git a/tests/benchmarks/test_benchmarks.py b/tests/benchmarks/test_benchmarks.py
index 908765d..44bce14 100644
--- a/tests/benchmarks/test_benchmarks.py
+++ b/tests/benchmarks/test_benchmarks.py
@@ -564,6 +564,54 @@ def __init__(self, **kwargs):
         assert captured_kwargs.get("backend_kwargs") == {"model_name": "gpt-4o-test"}
 
 
+class TestErrorClassification:
+    """Tests for error classification and fatal error handling."""
+
+    def test_classify_quota_error(self):
+        """Test that quota errors are classified as fatal."""
+        from benchmarks.runner import classify_error
+
+        error = "Error code: 429 - {'error': {'code': 'insufficient_quota'}}"
+        is_fatal, suggestion = classify_error(error)
+        assert is_fatal
+        assert "quota" in suggestion.lower() or "billing" in suggestion.lower()
+
+    def test_classify_auth_error(self):
+        """Test that auth errors are classified as fatal."""
+        from benchmarks.runner import classify_error
+
+        error = "Invalid API key provided"
+        is_fatal, suggestion = classify_error(error)
+        assert is_fatal
+        assert "key" in suggestion.lower()
+
+    def test_classify_model_error(self):
+        """Test that model not found errors are classified as fatal."""
+        from benchmarks.runner import classify_error
+
+        error = "The model 'gpt-99' does not exist"
+        is_fatal, suggestion = classify_error(error)
+        assert is_fatal
+        assert "model" in suggestion.lower()
+
+    def test_classify_transient_error(self):
+        """Test that transient errors are not classified as fatal."""
+        from benchmarks.runner import classify_error
+
+        error = "Connection timeout after 30 seconds"
+        is_fatal, suggestion = classify_error(error)
+        assert not is_fatal
+        assert suggestion == ""
+
+    def test_classify_generic_error(self):
+        """Test that unknown errors are not classified as fatal."""
+        from benchmarks.runner import classify_error
+
+        error = "Something unexpected happened"
+        is_fatal, suggestion = classify_error(error)
+        assert not is_fatal
+
+
 class TestBenchmarkIntegration:
     """Integration tests for benchmark framework."""
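
A minimal caller-side sketch (not part of the patch): it assumes only that
classify_error is importable from benchmarks.runner and that sample results
expose the .error attribute used above; the exit_code_for helper and the
exit-code convention are hypothetical illustrations, not existing project API.

    from benchmarks.runner import classify_error

    def exit_code_for(results) -> int:
        """Return 1 when a run ended on a fatal (configuration) error, else 0."""
        # Mirrors the runner's early-abort logic: if the last completed sample
        # failed with a fatal error, the run was cut short by a configuration
        # problem and the process should signal failure to CI or shell scripts.
        if results and results[-1].error:
            is_fatal, _suggestion = classify_error(results[-1].error)
            if is_fatal:
                return 1
        return 0

    # Example usage after collecting results from a run:
    #   import sys
    #   sys.exit(exit_code_for(results))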