Skip to content

Commit cbe3396

Browse files
igerberclaude
andcommitted
Fix latent gpt-5.4 reasoning-model classification + auto-resolve --timeout
The reverted state (and pre-PR-404 main) misclassified `gpt-5.4` as a non-reasoning model in `_is_reasoning_model()`. Per OpenAI's model docs, gpt-5.4 IS a reasoning model and should hit the reasoning code path (REASONING_MAX_TOKENS=32768, no `temperature` in payload, longer timeout). PR #404's commit message documented this as a "latent bug fix per OpenAI docs"; this restores that fix without re-introducing the gpt-5.5 bump or the Mandate prompt that #416 reverts. Concrete changes: .claude/scripts/openai_review.py: - Add `gpt-5.4` to `_is_reasoning_model()`'s prefix tuple - Add `REASONING_TIMEOUT = 900` constant - Add `_resolve_timeout(timeout, model)` helper: None -> auto-resolve (900s for reasoning, 300s otherwise); explicit values pass through - `call_openai()` signature: `timeout: int | None = None` (was `int = DEFAULT_TIMEOUT`); calls `_resolve_timeout()` internally so direct callers also get model-aware defaults - CLI `--timeout` argparse default: None (was DEFAULT_TIMEOUT); help text describes the dynamic default - CLI runtime: replace the "Consider --timeout 900" advisory with `args.timeout = _resolve_timeout(...)` and surface the effective timeout in the "Sending review to ..." log line .claude/commands/ai-review-local.md: - --timeout description: dynamic default for reasoning models - Reasoning-model handling section: skill no longer needs to pass --timeout 900 manually; gpt-5.4 added to reasoning-model list tests/test_openai_review.py: - Flip `test_gpt54_is_not_reasoning` -> `test_gpt54_is_reasoning` (and add snapshot variant) - Add `TestResolveTimeout` (4 cases: reasoning default, non-reasoning default, explicit passthrough, zero-as-explicit) - Update `test_standard_model_payload` to use `gpt-4.1` (true non-reasoning model) instead of `gpt-5.4` 169 tests pass. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent 3498d3f commit cbe3396

3 files changed

Lines changed: 67 additions & 18 deletions

File tree

.claude/commands/ai-review-local.md

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ pre-PR use. Designed for iterative review/revision cycles before submitting a PR
2323
- `--force-fresh`: Skip delta-diff mode, run a full fresh review even if previous state exists
2424
- `--full-registry`: Include the entire REGISTRY.md instead of selective sections
2525
- `--model <name>`: Override the OpenAI model (default: `gpt-5.4`)
26-
- `--timeout <seconds>`: HTTP request timeout (default: 300). Use 900 for reasoning models.
26+
- `--timeout <seconds>`: HTTP request timeout. If omitted, defaults to 900 for reasoning models (gpt-5.4, *-pro, o1/o3/o4) and 300 otherwise.
2727
- `--dry-run`: Print the compiled prompt without calling the API
2828

2929
**Reasoning models** (`gpt-5.4-pro`, `o3`, `o4-mini`, etc.): Reviews may take 10-15
@@ -334,9 +334,10 @@ python3 .claude/scripts/openai_review.py \
334334
Note: `--force-fresh` is a skill-only flag — it controls whether delta diffs are
335335
generated in Step 4 and is NOT passed to the script.
336336

337-
**Reasoning model handling:** If the model contains `-pro` or starts with `o1`/`o3`/`o4`
338-
(e.g., `gpt-5.4-pro`, `o3`, `o4-mini`):
339-
- Pass `--timeout 900` to the script (unless the user explicitly specified `--timeout`)
337+
**Reasoning model handling:** If the model is `gpt-5.4`, contains `-pro`, or starts with
338+
`o1`/`o3`/`o4` (e.g., `gpt-5.4`, `gpt-5.4-pro`, `o3`, `o4-mini`):
339+
- The script auto-resolves `--timeout` to 900s for reasoning models when omitted, so
340+
no extra flag is required unless overriding
340341
- Run the Bash command with `run_in_background: true` (bypasses the 600s Bash tool timeout cap)
341342
- After the background command completes, continue to Step 6
342343

.claude/scripts/openai_review.py

Lines changed: 29 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1117,13 +1117,26 @@ def compile_prompt(
11171117
ENDPOINT = "https://api.openai.com/v1/responses"
11181118
DEFAULT_MODEL = "gpt-5.4"
11191119
DEFAULT_TIMEOUT = 300 # seconds
1120+
REASONING_TIMEOUT = 900 # seconds
11201121
DEFAULT_MAX_TOKENS = 16384
11211122
REASONING_MAX_TOKENS = 32768
11221123

11231124

11241125
def _is_reasoning_model(model: str) -> bool:
11251126
"""Return True for models that use internal chain-of-thought reasoning."""
1126-
return model.startswith(("o1", "o3", "o4")) or "-pro" in model
1127+
return model.startswith(("o1", "o3", "o4", "gpt-5.4")) or "-pro" in model
1128+
1129+
1130+
def _resolve_timeout(timeout: "int | None", model: str) -> int:
1131+
"""Auto-resolve omitted --timeout based on model class.
1132+
1133+
Reasoning models (o1/o3/o4/gpt-5.4/*-pro) get REASONING_TIMEOUT (900s).
1134+
Non-reasoning models get DEFAULT_TIMEOUT (300s).
1135+
Explicit values are passed through unchanged.
1136+
"""
1137+
if timeout is not None:
1138+
return timeout
1139+
return REASONING_TIMEOUT if _is_reasoning_model(model) else DEFAULT_TIMEOUT
11271140

11281141

11291142
def estimate_tokens(text: str) -> int:
@@ -1154,13 +1167,19 @@ def call_openai(
11541167
prompt: str,
11551168
model: str,
11561169
api_key: str,
1157-
timeout: int = DEFAULT_TIMEOUT,
1170+
timeout: "int | None" = None,
11581171
) -> "tuple[str, dict]":
11591172
"""Call the OpenAI Responses API.
11601173
1174+
If ``timeout`` is None, resolves to REASONING_TIMEOUT (900s) for
1175+
reasoning models and DEFAULT_TIMEOUT (300s) otherwise — same logic
1176+
as the CLI ``--timeout`` flag. This guards future direct callers
1177+
against the old 300s-everywhere default.
1178+
11611179
Returns (content, usage) where usage is the API response's usage dict
11621180
containing input_tokens and output_tokens.
11631181
"""
1182+
timeout = _resolve_timeout(timeout, model)
11641183
reasoning = _is_reasoning_model(model)
11651184
max_tokens = REASONING_MAX_TOKENS if reasoning else DEFAULT_MAX_TOKENS
11661185

@@ -1363,8 +1382,12 @@ def main() -> None:
13631382
parser.add_argument(
13641383
"--timeout",
13651384
type=int,
1366-
default=DEFAULT_TIMEOUT,
1367-
help=f"HTTP request timeout in seconds (default: {DEFAULT_TIMEOUT})",
1385+
default=None,
1386+
help=(
1387+
f"HTTP request timeout in seconds. If omitted, defaults to "
1388+
f"{REASONING_TIMEOUT} for reasoning models (gpt-5.4, *-pro, "
1389+
f"o1/o3/o4) and {DEFAULT_TIMEOUT} otherwise."
1390+
),
13681391
)
13691392
parser.add_argument(
13701393
"--delta-diff",
@@ -1634,13 +1657,8 @@ def main() -> None:
16341657
sys.exit(0)
16351658

16361659
# Call OpenAI API
1637-
if _is_reasoning_model(args.model) and args.timeout == DEFAULT_TIMEOUT:
1638-
print(
1639-
f"Note: {args.model} is a reasoning model. Consider --timeout 900 "
1640-
"for large reviews.",
1641-
file=sys.stderr,
1642-
)
1643-
print(f"Sending review to {args.model}...", file=sys.stderr)
1660+
args.timeout = _resolve_timeout(args.timeout, args.model)
1661+
print(f"Sending review to {args.model} (timeout={args.timeout}s)...", file=sys.stderr)
16441662
print(f"Estimated input tokens: ~{est_tokens:,}", file=sys.stderr)
16451663
if cost_str:
16461664
print(f"Estimated cost: {cost_str}", file=sys.stderr)

tests/test_openai_review.py

Lines changed: 33 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1546,8 +1546,12 @@ def test_pro_is_reasoning(self, review_mod):
15461546
def test_pro_snapshot_is_reasoning(self, review_mod):
15471547
assert review_mod._is_reasoning_model("gpt-5.4-pro-2026-03-05") is True
15481548

1549-
def test_gpt54_is_not_reasoning(self, review_mod):
1550-
assert review_mod._is_reasoning_model("gpt-5.4") is False
1549+
def test_gpt54_is_reasoning(self, review_mod):
1550+
# gpt-5.4 is a reasoning model per OpenAI docs (latent bug fix).
1551+
assert review_mod._is_reasoning_model("gpt-5.4") is True
1552+
1553+
def test_gpt54_snapshot_is_reasoning(self, review_mod):
1554+
assert review_mod._is_reasoning_model("gpt-5.4-2026-03-05") is True
15511555

15521556
def test_gpt41_is_not_reasoning(self, review_mod):
15531557
assert review_mod._is_reasoning_model("gpt-4.1") is False
@@ -1572,6 +1576,30 @@ def test_pro_snapshot_matches_pro(self, review_mod):
15721576
assert snapshot == base
15731577

15741578

1579+
class TestResolveTimeout:
1580+
"""Omitted --timeout must auto-resolve to 900s for reasoning models
1581+
and 300s otherwise; explicit values pass through unchanged."""
1582+
1583+
def test_reasoning_model_default(self, review_mod):
1584+
assert review_mod._resolve_timeout(None, "gpt-5.4") == review_mod.REASONING_TIMEOUT
1585+
assert review_mod._resolve_timeout(None, "gpt-5.4") == 900
1586+
assert review_mod._resolve_timeout(None, "o3") == 900
1587+
assert review_mod._resolve_timeout(None, "gpt-5.4-pro") == 900
1588+
1589+
def test_non_reasoning_model_default(self, review_mod):
1590+
assert review_mod._resolve_timeout(None, "gpt-4.1") == review_mod.DEFAULT_TIMEOUT
1591+
assert review_mod._resolve_timeout(None, "gpt-4.1") == 300
1592+
1593+
def test_explicit_value_passthrough(self, review_mod):
1594+
assert review_mod._resolve_timeout(60, "gpt-4.1") == 60
1595+
assert review_mod._resolve_timeout(1200, "gpt-5.4") == 1200
1596+
1597+
def test_zero_is_explicit_value_not_default(self, review_mod):
1598+
# 0 is a valid explicit value (means "no timeout"); only None triggers
1599+
# auto-resolution.
1600+
assert review_mod._resolve_timeout(0, "gpt-5.4") == 0
1601+
1602+
15751603
class TestSkillDocAPIConsistency:
15761604
"""Catch doc drift between the script's API endpoint and the skill doc's
15771605
user-facing data-transmission note."""
@@ -1795,7 +1823,9 @@ def fake_urlopen(req, timeout=None):
17951823

17961824
def test_standard_model_payload(self, review_mod, mock_urlopen):
17971825
"""Standard model sends input, max_output_tokens, and temperature=0."""
1798-
content, usage = review_mod.call_openai("test prompt", "gpt-5.4", "fake-key")
1826+
# gpt-4.1 is the canonical non-reasoning model; gpt-5.4 hits the
1827+
# reasoning branch (different max_tokens, no temperature).
1828+
content, usage = review_mod.call_openai("test prompt", "gpt-4.1", "fake-key")
17991829
payload = mock_urlopen["payload"]
18001830
assert payload["input"] == "test prompt"
18011831
assert payload["max_output_tokens"] == review_mod.DEFAULT_MAX_TOKENS

0 commit comments

Comments
 (0)