From 72b288ebfe2754cff5b8a149741ed1385ba4fb5c Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Sat, 17 Jan 2026 03:26:44 +0000
Subject: [PATCH] feat: add sample timeout to prevent indefinite hangs

- Add --sample-timeout CLI option (default: 300s / 5 minutes)
- Samples that exceed the timeout fail with a clear error message
- Timeout errors are classified as fatal with actionable suggestion:
  "Try --sample-timeout to increase, or reduce --context-length"
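- Runs each sample in a single-worker ThreadPoolExecutor and bounds the wait
  with future.result(timeout=...); the worker thread itself is not cancelled,
  and leaving the `with` block still waits for it to finish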

This prevents benchmarks from hanging indefinitely when API calls
stall or when very large contexts cause excessive processing time.
---
 benchmarks/cli.py                   |  7 +++++++
 benchmarks/runner.py                | 20 ++++++++++++++++++--
 tests/benchmarks/test_benchmarks.py |  9 +++++++++
 3 files changed, 34 insertions(+), 2 deletions(-)

diff --git a/benchmarks/cli.py b/benchmarks/cli.py
index 11e3023..949937c 100644
--- a/benchmarks/cli.py
+++ b/benchmarks/cli.py
@@ -365,6 +365,12 @@ def main():
         help="Progress display mode: auto (uses tqdm if available), tqdm (progress bar), "
         "simple (periodic status), none (quiet). Default: auto",
     )
+    run_parser.add_argument(
+        "--sample-timeout",
+        type=int,
+        default=300,
+        help="Timeout per sample in seconds (default: 300 = 5 minutes)",
+    )
 
     # Benchmark-specific options for run
     run_parser.add_argument("--context-length", type=int, default=100_000, help="NIAH context len")
@@ -470,6 +476,7 @@ def cmd_run(args: argparse.Namespace) -> int:
             log_dir=args.log_dir,
             max_workers=args.max_workers,
             progress=args.progress,
+            sample_timeout=args.sample_timeout,
         )
 
         for benchmark in benchmarks:
diff --git a/benchmarks/runner.py b/benchmarks/runner.py
index b4eae49..fbc3934 100644
--- a/benchmarks/runner.py
+++ b/benchmarks/runner.py
@@ -38,6 +38,8 @@
     ("no such model", "Model not found. Check available models for your account."),
     # Rate limiting (without retry-after)
     ("rate_limit", "Rate limited. Consider reducing --max-workers or adding delays."),
+    # Timeout
+    ("timed out", "Sample timed out. Try --sample-timeout to increase, or reduce --context-length."),
 ]
 
 
@@ -119,6 +121,7 @@ class RunnerConfig:
     max_workers: int = 1  # Number of parallel workers (1 = sequential)
     progress: str = "auto"  # Progress display: "auto", "tqdm", "simple", "none"
     progress_callback: ProgressCallback | None = None  # Custom progress callback
+    sample_timeout: int = 300  # Timeout per sample in seconds (default: 5 minutes)
     backend_kwargs: dict[str, Any] = field(default_factory=dict)
     environment_kwargs: dict[str, Any] = field(default_factory=dict)
 
@@ -157,6 +160,7 @@ def __init__(
         max_workers: int = 1,
         progress: str = "auto",
         progress_callback: ProgressCallback | None = None,
+        sample_timeout: int = 300,
         **kwargs,
     ):
         """Initialize runner with configuration.
@@ -176,6 +180,7 @@ def __init__(
                 - "none": No progress output
             progress_callback: Custom callback for progress updates.
                 Signature: (completed, total, sample_result, stats) -> None
+            sample_timeout: Timeout per sample in seconds (default: 300 = 5 minutes).
             **kwargs: Additional backend or environment kwargs.
         """
         self.config = RunnerConfig(
@@ -188,6 +193,7 @@ def __init__(
             max_workers=max_workers,
             progress=progress,
             progress_callback=progress_callback,
+            sample_timeout=sample_timeout,
             backend_kwargs={"model_name": model, **kwargs.get("backend_kwargs", {})},
             environment_kwargs=kwargs.get("environment_kwargs", {}),
         )
@@ -615,14 +621,24 @@ def _run_sample(
         inference_fn: Callable[[BenchmarkSample], tuple[str, dict[str, Any]]],
         benchmark: Benchmark,
     ) -> SampleResult:
-        """Run a single sample and evaluate."""
+        """Run a single sample and evaluate with timeout."""
+        from concurrent.futures import TimeoutError as FuturesTimeoutError
+
         start_time = time.time()
         error = None
         prediction = ""
         metadata: dict[str, Any] = {}
+        timeout = self.config.sample_timeout
 
         try:
-            prediction, metadata = inference_fn(sample)
+            # Use a thread pool to enforce timeout
+            with ThreadPoolExecutor(max_workers=1) as executor:
+                future = executor.submit(inference_fn, sample)
+                try:
+                    prediction, metadata = future.result(timeout=timeout)
+                except FuturesTimeoutError:
+                    error = f"Sample timed out after {timeout}s. Try --sample-timeout to increase."
+                    prediction = ""
         except Exception as e:
             error = str(e)
             prediction = ""
diff --git a/tests/benchmarks/test_benchmarks.py b/tests/benchmarks/test_benchmarks.py
index 44bce14..6fd55b2 100644
--- a/tests/benchmarks/test_benchmarks.py
+++ b/tests/benchmarks/test_benchmarks.py
@@ -611,6 +611,15 @@ def test_classify_generic_error(self):
         is_fatal, suggestion = classify_error(error)
         assert not is_fatal
 
+    def test_classify_timeout_error(self):
+        """Test that timeout errors are classified as fatal with helpful suggestion."""
+        from benchmarks.runner import classify_error
+
+        error = "Sample timed out after 300s"
+        is_fatal, suggestion = classify_error(error)
+        assert is_fatal
+        assert "timeout" in suggestion.lower() or "context" in suggestion.lower()
+
 
 class TestBenchmarkIntegration:
     """Integration tests for benchmark framework."""