From 5177e4909854dea0b8d5923f7b8e5ef506dd8211 Mon Sep 17 00:00:00 2001
From: Claude
Date: Fri, 16 Jan 2026 22:25:48 +0000
Subject: [PATCH] feat: add fatal error detection with actionable suggestions
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Add classify_error() to detect configuration issues (quota, auth, model)
- Sequential runs abort early on fatal errors with clear guidance
- Parallel runs report fatal errors after completion with suggestions
- Add FATAL_ERROR_PATTERNS for common API errors
- Add 5 tests for error classification

Example output on quota error:

  āŒ Fatal error detected: insufficient_quota

  šŸ’” API quota exceeded. Add credits at https://platform.openai.com/...

  Aborting benchmark - fix the issue above and retry.

This prevents wasting time running remaining samples when the first one
fails due to a configuration issue that requires user action.
---
 benchmarks/runner.py                | 68 +++++++++++++++++++++++++++++
 tests/benchmarks/test_benchmarks.py | 48 ++++++++++++++++++++
 2 files changed, 116 insertions(+)

diff --git a/benchmarks/runner.py b/benchmarks/runner.py
index e03a258..b4eae49 100644
--- a/benchmarks/runner.py
+++ b/benchmarks/runner.py
@@ -12,6 +12,7 @@
 Includes progress tracking with ETA via tqdm or custom callbacks.
 """
 
+import sys
 import time
 from collections.abc import Callable
 from concurrent.futures import ThreadPoolExecutor, as_completed
@@ -20,6 +21,47 @@
 from benchmarks.base import Benchmark, BenchmarkResult, BenchmarkSample, SampleResult
 
+
+# Error patterns that indicate fatal configuration issues (not worth retrying)
+FATAL_ERROR_PATTERNS = [
+    # Authentication/Authorization
+    ("insufficient_quota", "API quota exceeded. Add credits at https://platform.openai.com/account/billing"),
+    ("invalid_api_key", "Invalid API key. Check your API key configuration."),
+    ("invalid api key", "Invalid API key. Check your API key configuration."),
+    ("incorrect api key", "Invalid API key. Check your API key configuration."),
+    ("authentication", "Authentication failed. Verify your API key is correct."),
+    ("unauthorized", "Unauthorized. Check your API key and permissions."),
+    ("api_key", "API key issue. Check your API key configuration."),
+    # Model issues
+    ("model_not_found", "Model not found. Verify the model name is correct."),
+    ("does not exist", "Model does not exist. Check available models for your account."),
+    ("no such model", "Model not found. Check available models for your account."),
+    # Rate limiting (without retry-after)
+    ("rate_limit", "Rate limited. Consider reducing --max-workers or adding delays."),
+]
+
+
+class FatalBenchmarkError(Exception):
+    """Raised when a benchmark encounters an unrecoverable error."""
+
+    def __init__(self, message: str, suggestion: str):
+        self.message = message
+        self.suggestion = suggestion
+        super().__init__(f"{message}\n\nSuggestion: {suggestion}")
+
+
+def classify_error(error_str: str) -> tuple[bool, str]:
+    """Classify an error as fatal or transient.
+
+    Returns:
+        (is_fatal, suggestion) - If fatal, includes actionable suggestion.
+    """
+    error_lower = error_str.lower()
+    for pattern, suggestion in FATAL_ERROR_PATTERNS:
+        if pattern in error_lower:
+            return True, suggestion
+    return False, ""
+
 # Type alias for progress callback
 ProgressCallback = Callable[[int, int, "SampleResult | None", "ProgressStats"], None]
@@ -323,6 +365,18 @@ def _run_sequential(
                 results.append(sample_result)
                 self._update_progress(sample_result, stats, progress_mode, pbar)
 
+                # Check for fatal errors on first sample - abort early
+                if sample_result.error and stats.completed == 1:
+                    is_fatal, suggestion = classify_error(sample_result.error)
+                    if is_fatal:
+                        if pbar:
+                            pbar.close()
+                            pbar = None
+                        print(f"\nāŒ Fatal error detected: {sample_result.error}", file=sys.stderr)
+                        print(f"\nšŸ’” {suggestion}", file=sys.stderr)
+                        print("\nAborting benchmark - fix the issue above and retry.", file=sys.stderr)
+                        return results
+
         finally:
             if pbar is not None:
                 pbar.close()
@@ -356,6 +410,7 @@ def _run_parallel(
         results: dict[str, SampleResult] = {}
         lock = threading.Lock()
         pbar = None
+        fatal_error_found: tuple[str, str] | None = None  # (error, suggestion)
 
         try:
             if progress_mode == "tqdm":
@@ -395,10 +450,23 @@ def _run_parallel(
                     results[sample.id] = sample_result
                     self._update_progress(sample_result, stats, progress_mode, pbar)
 
+                    # Track first fatal error (for reporting after completion)
+                    if sample_result.error and fatal_error_found is None:
+                        is_fatal, suggestion = classify_error(sample_result.error)
+                        if is_fatal:
+                            fatal_error_found = (sample_result.error, suggestion)
+
         finally:
             if pbar is not None:
                 pbar.close()
 
+        # Report fatal errors after parallel run completes
+        if fatal_error_found:
+            error, suggestion = fatal_error_found
+            print(f"\nāŒ Fatal error detected: {error}", file=sys.stderr)
+            print(f"\nšŸ’” {suggestion}", file=sys.stderr)
+            print("\nFix the issue above before running more benchmarks.", file=sys.stderr)
+
         # Return results in original sample order
         return [results[sample.id] for sample in samples]
diff --git a/tests/benchmarks/test_benchmarks.py b/tests/benchmarks/test_benchmarks.py
index 908765d..44bce14 100644
--- a/tests/benchmarks/test_benchmarks.py
+++ b/tests/benchmarks/test_benchmarks.py
@@ -564,6 +564,54 @@ def __init__(self, **kwargs):
         assert captured_kwargs.get("backend_kwargs") == {"model_name": "gpt-4o-test"}
 
 
+class TestErrorClassification:
+    """Tests for error classification and fatal error handling."""
+
+    def test_classify_quota_error(self):
+        """Test that quota errors are classified as fatal."""
+        from benchmarks.runner import classify_error
+
+        error = "Error code: 429 - {'error': {'code': 'insufficient_quota'}}"
+        is_fatal, suggestion = classify_error(error)
+        assert is_fatal
+        assert "quota" in suggestion.lower() or "billing" in suggestion.lower()
+
+    def test_classify_auth_error(self):
+        """Test that auth errors are classified as fatal."""
+        from benchmarks.runner import classify_error
+
+        error = "Invalid API key provided"
+        is_fatal, suggestion = classify_error(error)
+        assert is_fatal
+        assert "key" in suggestion.lower()
+
+    def test_classify_model_error(self):
+        """Test that model not found errors are classified as fatal."""
+        from benchmarks.runner import classify_error
+
+        error = "The model 'gpt-99' does not exist"
+        is_fatal, suggestion = classify_error(error)
+        assert is_fatal
+        assert "model" in suggestion.lower()
+
+    def test_classify_transient_error(self):
+        """Test that transient errors are not classified as fatal."""
+        from benchmarks.runner import classify_error
+
+        error = "Connection timeout after 30 seconds"
+        is_fatal, suggestion = classify_error(error)
+        assert not is_fatal
+        assert suggestion == ""
+
+    def test_classify_generic_error(self):
+        """Test that unknown errors are not classified as fatal."""
+        from benchmarks.runner import classify_error
+
+        error = "Something unexpected happened"
+        is_fatal, suggestion = classify_error(error)
+        assert not is_fatal
+
+
 class TestBenchmarkIntegration:
     """Integration tests for benchmark framework."""
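
A minimal caller-side sketch (not part of the patch): it assumes only that
classify_error is importable from benchmarks.runner and that sample results
expose the .error attribute used above; the exit_code_for helper and the
exit-code convention are hypothetical illustrations, not existing project API.

    from benchmarks.runner import classify_error

    def exit_code_for(results) -> int:
        """Return 1 when a run ended on a fatal (configuration) error, else 0."""
        # Mirrors the runner's early-abort logic: if the last completed sample
        # failed with a fatal error, the run was cut short by a configuration
        # problem and the process should signal failure to CI or shell scripts.
        if results and results[-1].error:
            is_fatal, _suggestion = classify_error(results[-1].error)
            if is_fatal:
                return 1
        return 0

    # Example usage after collecting results from a run:
    #   import sys
    #   sys.exit(exit_code_for(results))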