Fix latent gpt-5.4 reasoning-model classification + auto-resolve --timeout

igerber · claude · igerber · commit cbe3396ab80a · 2026-05-12T07:53:34.000-04:00
The reverted state (and pre-PR-404 main) misclassified `gpt-5.4` as a
non-reasoning model in `_is_reasoning_model()`. Per OpenAI's model docs,
gpt-5.4 IS a reasoning model and should hit the reasoning code path
(REASONING_MAX_TOKENS=32768, no `temperature` in payload, longer timeout).
PR #404's commit message documented this as a "latent bug fix per OpenAI
docs"; this restores that fix without re-introducing the gpt-5.5 bump or
the Mandate prompt that #416 reverts.

Concrete changes:

.claude/scripts/openai_review.py:
- Add `gpt-5.4` to `_is_reasoning_model()`'s prefix tuple
- Add `REASONING_TIMEOUT = 900` constant
- Add `_resolve_timeout(timeout, model)` helper: None -> auto-resolve
  (900s for reasoning, 300s otherwise); explicit values pass through
- `call_openai()` signature: `timeout: int | None = None` (was
  `int = DEFAULT_TIMEOUT`); calls `_resolve_timeout()` internally so
  direct callers also get model-aware defaults
- CLI `--timeout` argparse default: None (was DEFAULT_TIMEOUT); help text
  describes the dynamic default
- CLI runtime: replace the "Consider --timeout 900" advisory with
  `args.timeout = _resolve_timeout(...)` and surface the effective
  timeout in the "Sending review to ..." log line

.claude/commands/ai-review-local.md:
- --timeout description: dynamic default for reasoning models
- Reasoning-model handling section: skill no longer needs to pass
  --timeout 900 manually; gpt-5.4 added to reasoning-model list

tests/test_openai_review.py:
- Flip `test_gpt54_is_not_reasoning` -> `test_gpt54_is_reasoning` (and
  add snapshot variant)
- Add `TestResolveTimeout` (4 cases: reasoning default, non-reasoning
  default, explicit passthrough, zero-as-explicit)
- Update `test_standard_model_payload` to use `gpt-4.1` (true
  non-reasoning model) instead of `gpt-5.4`

169 tests pass.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
diff --git a/.claude/commands/ai-review-local.md b/.claude/commands/ai-review-local.md
@@ -23,7 +23,7 @@ pre-PR use. Designed for iterative review/revision cycles before submitting a PR
 - `--force-fresh`: Skip delta-diff mode, run a full fresh review even if previous state exists
 - `--full-registry`: Include the entire REGISTRY.md instead of selective sections
 - `--model <name>`: Override the OpenAI model (default: `gpt-5.4`)
-- `--timeout <seconds>`: HTTP request timeout (default: 300). Use 900 for reasoning models.
+- `--timeout <seconds>`: HTTP request timeout. If omitted, defaults to 900 for reasoning models (`gpt-5.4*`, `*-pro`, `o1`/`o3`/`o4`) and 300 otherwise.
 - `--dry-run`: Print the compiled prompt without calling the API
 
 **Reasoning models** (`gpt-5.4-pro`, `o3`, `o4-mini`, etc.): Reviews may take 10-15
@@ -334,9 +334,10 @@ python3 .claude/scripts/openai_review.py \
 Note: `--force-fresh` is a skill-only flag — it controls whether delta diffs are
 generated in Step 4 and is NOT passed to the script.
 
-**Reasoning model handling:** If the model contains `-pro` or starts with `o1`/`o3`/`o4`
-(e.g., `gpt-5.4-pro`, `o3`, `o4-mini`):
-- Pass `--timeout 900` to the script (unless the user explicitly specified `--timeout`)
+**Reasoning model handling:** If the model starts with `gpt-5.4` or `o1`/`o3`/`o4`, or
+contains `-pro` (e.g., `gpt-5.4`, `gpt-5.4-pro`, `o3`, `o4-mini`):
+- The script auto-resolves `--timeout` to 900s for reasoning models when omitted, so
+  no extra flag is required unless overriding
 - Run the Bash command with `run_in_background: true` (bypasses the 600s Bash tool timeout cap)
 - After the background command completes, continue to Step 6
 
diff --git a/.claude/scripts/openai_review.py b/.claude/scripts/openai_review.py
@@ -1117,13 +1117,26 @@ def compile_prompt(
 ENDPOINT = "https://api.openai.com/v1/responses"
 DEFAULT_MODEL = "gpt-5.4"
 DEFAULT_TIMEOUT = 300  # seconds
+REASONING_TIMEOUT = 900  # seconds
 DEFAULT_MAX_TOKENS = 16384
 REASONING_MAX_TOKENS = 32768
 
 
 def _is_reasoning_model(model: str) -> bool:
     """Return True for models that use internal chain-of-thought reasoning."""
-    return model.startswith(("o1", "o3", "o4")) or "-pro" in model
+    return model.startswith(("o1", "o3", "o4", "gpt-5.4")) or "-pro" in model
+
+
+def _resolve_timeout(timeout: "int | None", model: str) -> int:
+    """Auto-resolve omitted --timeout based on model class.
+
+    Reasoning models (o1/o3/o4/gpt-5.4/*-pro) get REASONING_TIMEOUT (900s).
+    Non-reasoning models get DEFAULT_TIMEOUT (300s).
+    Explicit values are passed through unchanged.
+    """
+    if timeout is not None:
+        return timeout
+    return REASONING_TIMEOUT if _is_reasoning_model(model) else DEFAULT_TIMEOUT
 
 
 def estimate_tokens(text: str) -> int:
@@ -1154,13 +1167,19 @@ def call_openai(
     prompt: str,
     model: str,
     api_key: str,
-    timeout: int = DEFAULT_TIMEOUT,
+    timeout: "int | None" = None,
 ) -> "tuple[str, dict]":
     """Call the OpenAI Responses API.
 
+    If ``timeout`` is None, resolves to REASONING_TIMEOUT (900s) for
+    reasoning models and DEFAULT_TIMEOUT (300s) otherwise — same logic
+    as the CLI ``--timeout`` flag. This guards future direct callers
+    against the old 300s-everywhere default.
+
     Returns (content, usage) where usage is the API response's usage dict
     containing input_tokens and output_tokens.
     """
+    timeout = _resolve_timeout(timeout, model)
     reasoning = _is_reasoning_model(model)
     max_tokens = REASONING_MAX_TOKENS if reasoning else DEFAULT_MAX_TOKENS
 
@@ -1363,8 +1382,12 @@ def main() -> None:
     parser.add_argument(
         "--timeout",
         type=int,
-        default=DEFAULT_TIMEOUT,
-        help=f"HTTP request timeout in seconds (default: {DEFAULT_TIMEOUT})",
+        default=None,
+        help=(
+            f"HTTP request timeout in seconds. If omitted, defaults to "
+            f"{REASONING_TIMEOUT} for reasoning models (gpt-5.4, *-pro, "
+            f"o1/o3/o4) and {DEFAULT_TIMEOUT} otherwise."
+        ),
     )
     parser.add_argument(
         "--delta-diff",
@@ -1634,13 +1657,8 @@ def main() -> None:
         sys.exit(0)
 
     # Call OpenAI API
-    if _is_reasoning_model(args.model) and args.timeout == DEFAULT_TIMEOUT:
-        print(
-            f"Note: {args.model} is a reasoning model. Consider --timeout 900 "
-            "for large reviews.",
-            file=sys.stderr,
-        )
-    print(f"Sending review to {args.model}...", file=sys.stderr)
+    args.timeout = _resolve_timeout(args.timeout, args.model)
+    print(f"Sending review to {args.model} (timeout={args.timeout}s)...", file=sys.stderr)
     print(f"Estimated input tokens: ~{est_tokens:,}", file=sys.stderr)
     if cost_str:
         print(f"Estimated cost: {cost_str}", file=sys.stderr)
diff --git a/tests/test_openai_review.py b/tests/test_openai_review.py
@@ -1546,8 +1546,12 @@ def test_pro_is_reasoning(self, review_mod):
     def test_pro_snapshot_is_reasoning(self, review_mod):
         assert review_mod._is_reasoning_model("gpt-5.4-pro-2026-03-05") is True
 
-    def test_gpt54_is_not_reasoning(self, review_mod):
-        assert review_mod._is_reasoning_model("gpt-5.4") is False
+    def test_gpt54_is_reasoning(self, review_mod):
+        # gpt-5.4 is a reasoning model per OpenAI docs (latent bug fix).
+        assert review_mod._is_reasoning_model("gpt-5.4") is True
+
+    def test_gpt54_snapshot_is_reasoning(self, review_mod):
+        assert review_mod._is_reasoning_model("gpt-5.4-2026-03-05") is True
 
     def test_gpt41_is_not_reasoning(self, review_mod):
         assert review_mod._is_reasoning_model("gpt-4.1") is False
@@ -1572,6 +1576,30 @@ def test_pro_snapshot_matches_pro(self, review_mod):
         assert snapshot == base
 
 
+class TestResolveTimeout:
+    """Omitted --timeout must auto-resolve to 900s for reasoning models
+    and 300s otherwise; explicit values pass through unchanged."""
+
+    def test_reasoning_model_default(self, review_mod):
+        assert review_mod._resolve_timeout(None, "gpt-5.4") == review_mod.REASONING_TIMEOUT
+        assert review_mod._resolve_timeout(None, "gpt-5.4") == 900
+        assert review_mod._resolve_timeout(None, "o3") == 900
+        assert review_mod._resolve_timeout(None, "gpt-5.4-pro") == 900
+
+    def test_non_reasoning_model_default(self, review_mod):
+        assert review_mod._resolve_timeout(None, "gpt-4.1") == review_mod.DEFAULT_TIMEOUT
+        assert review_mod._resolve_timeout(None, "gpt-4.1") == 300
+
+    def test_explicit_value_passthrough(self, review_mod):
+        assert review_mod._resolve_timeout(60, "gpt-4.1") == 60
+        assert review_mod._resolve_timeout(1200, "gpt-5.4") == 1200
+
+    def test_zero_is_explicit_value_not_default(self, review_mod):
+        # 0 is an explicit caller-supplied value and must pass through
+        # unchanged (not be mistaken for "omitted"); only None auto-resolves.
+        assert review_mod._resolve_timeout(0, "gpt-5.4") == 0
+
+
 class TestSkillDocAPIConsistency:
     """Catch doc drift between the script's API endpoint and the skill doc's
     user-facing data-transmission note."""
@@ -1795,7 +1823,9 @@ def fake_urlopen(req, timeout=None):
 
     def test_standard_model_payload(self, review_mod, mock_urlopen):
         """Standard model sends input, max_output_tokens, and temperature=0."""
-        content, usage = review_mod.call_openai("test prompt", "gpt-5.4", "fake-key")
+        # gpt-4.1 is the canonical non-reasoning model; gpt-5.4 hits the
+        # reasoning branch (different max_tokens, no temperature).
+        content, usage = review_mod.call_openai("test prompt", "gpt-4.1", "fake-key")
         payload = mock_urlopen["payload"]
         assert payload["input"] == "test prompt"
         assert payload["max_output_tokens"] == review_mod.DEFAULT_MAX_TOKENS