@@ -853,6 +853,8 @@ def apply_token_budget(
853853# Source: https://platform.openai.com/docs/pricing
854854# MAINTENANCE: Update when OpenAI changes pricing.
855855PRICING = {
856+ "gpt-5.5" : (5.00 , 30.00 ),
857+ "gpt-5.5-pro" : (30.00 , 180.00 ),
856858 "gpt-5.4" : (2.50 , 15.00 ),
857859 "gpt-5.4-pro" : (30.00 , 180.00 ),
858860 "gpt-4.1" : (2.00 , 8.00 ),
@@ -900,8 +902,8 @@ def estimate_cost(
900902 "code changes that have not yet been submitted as a pull request." ,
901903 ),
902904 (
903- "Review ONLY the changes introduced by this PR (diff)" ,
904- "Review ONLY the changes shown in the diff below" ,
905+ "Review the changes introduced by this PR (diff)" ,
906+ "Review the code changes shown in the diff below" ,
905907 ),
906908 (
907909 "If the PR changes an estimator" ,
@@ -933,6 +935,62 @@ def estimate_cost(
933935 "Use the branch name only to understand which "
934936 "methods/papers are intended." ,
935937 ),
938+ # Replace the CI Single-Pass Completeness Mandate with a local-mode note.
939+ # The CI Mandate instructs the reviewer to run shell greps, load sibling
940+ # files, and sweep transitive paths — none of which the local raw API
941+ # path can do. Leaving the CI wording in place would cause the model to
942+ # claim audits it cannot perform.
943+ (
944+ """## Single-Pass Completeness Mandate (Initial Review Only)
945+
946+ This is an INITIAL review. Treat this as the only chance to enumerate findings.
947+ Follow-up rounds are expensive — find ALL P0/P1/P2 issues in this pass.
948+
949+ Before finalizing, confirm you have run each of these audits on the diff:
950+
951+ 1. **Sibling-surface mirror audit**: For every fix or change in a method, schema,
952+ default-value path, or report block, identify the parallel surface in the same
953+ codebase (BR ↔ DR, schema ↔ renderer, default ↔ precomputed, summary ↔ full)
954+ and check whether the same change applies there. Flag the unmirrored side as P1.
955+
956+ 2. **Pattern-wide grep**: When you flag any anti-pattern or bug class, use `grep`
957+ on `diff_diff/**.py` to identify sibling occurrences of the same pattern and
958+ enumerate them in the SAME finding. Only LOAD a sibling file's full contents
959+ if grep returns a hit and you need surrounding context to verify the issue.
960+ Do not defer pattern-class findings to a follow-up round.
961+
962+ 3. **Reciprocal/symmetry check**: For dispatch code, validation, or guards in
963+ one direction (A-on-B), explicitly enumerate the reciprocal direction (B-on-A)
964+ and confirm coverage.
965+
966+ 4. **Transitive workflow deps**: For GH Actions workflow `paths:` or pytest
967+ selection changes, sweep transitive auto-loaded files (conftest.py,
968+ pyproject.toml, ancestor conftests) and confirm they are included.
969+
970+ 5. **Scope override (with carve-outs)**: The audits above explicitly authorize
971+ loading files outside the diff to verify completeness. This overrides the
972+ "minimum surrounding context" default in the Rules section below.
973+
974+ **DO NOT load these paths** (the workflow's diff-build deliberately excludes
975+ them; they are noise or out-of-scope):
976+ - `docs/tutorials/*.ipynb` (notebook outputs are large JSON blobs)
977+ - `benchmarks/data/real/*.json`
978+ - `benchmarks/data/real/*.csv`""" ,
979+ """## Single-Pass Completeness Audit (Local Review)
980+
981+ This is a local review running as a static-prompt API call. You do NOT have
982+ shell or file-loading access — only the prompt content below is available
983+ (diff + changed source files + first-level imports).
984+
985+ Find ALL P0/P1/P2 issues within the loaded context. Audit sibling surfaces,
986+ parallel patterns, and reciprocal directions THAT ARE VISIBLE in the loaded
987+ files.
988+
989+ Do NOT claim to have run shell greps, loaded sibling files outside the
990+ prompt, or audited paths not present here. If a relevant audit is impossible
991+ because the necessary context is not in the prompt, say so explicitly rather
992+ than asserting completeness.""" ,
993+ ),
936994]
937995
938996
@@ -1095,15 +1153,30 @@ def compile_prompt(
10951153# ---------------------------------------------------------------------------
10961154
ENDPOINT = "https://api.openai.com/v1/responses"
DEFAULT_MODEL = "gpt-5.5"

# HTTP timeouts in seconds. _resolve_timeout() picks between them when the
# caller does not pass an explicit timeout.
DEFAULT_TIMEOUT = 300  # seconds — non-reasoning models
REASONING_TIMEOUT = 900  # seconds — reasoning models can take 10-15 min

# max-token settings; call_openai() selects REASONING_MAX_TOKENS when
# _is_reasoning_model() is true for the model and DEFAULT_MAX_TOKENS otherwise.
DEFAULT_MAX_TOKENS = 16384
REASONING_MAX_TOKENS = 32768
11021161
11031162
11041163def _is_reasoning_model (model : str ) -> bool :
11051164 """Return True for models that use internal chain-of-thought reasoning."""
1106- return model .startswith (("o1" , "o3" , "o4" )) or "-pro" in model
1165+ return model .startswith (("o1" , "o3" , "o4" , "gpt-5.4" , "gpt-5.5" )) or "-pro" in model
1166+
1167+
1168+ def _resolve_timeout (timeout : "int | None" , model : str ) -> int :
1169+ """Resolve the effective HTTP timeout for an API call.
1170+
1171+ If --timeout was explicitly provided, use it. Otherwise pick
1172+ REASONING_TIMEOUT (900s) for reasoning models and DEFAULT_TIMEOUT
1173+ (300s) for standard models. This prevents reasoning-model reviews
1174+ from hitting a too-short default when the wrapper command does not
1175+ pass --timeout.
1176+ """
1177+ if timeout is not None :
1178+ return timeout
1179+ return REASONING_TIMEOUT if _is_reasoning_model (model ) else DEFAULT_TIMEOUT
11071180
11081181
11091182def estimate_tokens (text : str ) -> int :
@@ -1134,13 +1207,19 @@ def call_openai(
11341207 prompt : str ,
11351208 model : str ,
11361209 api_key : str ,
1137- timeout : int = DEFAULT_TIMEOUT ,
1210+ timeout : " int | None" = None ,
11381211) -> "tuple[str, dict]" :
11391212 """Call the OpenAI Responses API.
11401213
1214+ If ``timeout`` is None, resolves to REASONING_TIMEOUT (900s) for
1215+ reasoning models and DEFAULT_TIMEOUT (300s) otherwise — same logic
1216+ as the CLI ``--timeout`` flag. This guards future direct callers
1217+ against the old 300s-everywhere default.
1218+
11411219 Returns (content, usage) where usage is the API response's usage dict
11421220 containing input_tokens and output_tokens.
11431221 """
1222+ timeout = _resolve_timeout (timeout , model )
11441223 reasoning = _is_reasoning_model (model )
11451224 max_tokens = REASONING_MAX_TOKENS if reasoning else DEFAULT_MAX_TOKENS
11461225
@@ -1343,8 +1422,12 @@ def main() -> None:
13431422 parser .add_argument (
13441423 "--timeout" ,
13451424 type = int ,
1346- default = DEFAULT_TIMEOUT ,
1347- help = f"HTTP request timeout in seconds (default: { DEFAULT_TIMEOUT } )" ,
1425+ default = None ,
1426+ help = (
1427+ f"HTTP request timeout in seconds. If omitted, defaults to "
1428+ f"{ REASONING_TIMEOUT } for reasoning models and { DEFAULT_TIMEOUT } for "
1429+ f"standard models."
1430+ ),
13481431 )
13491432 parser .add_argument (
13501433 "--delta-diff" ,
@@ -1374,6 +1457,10 @@ def main() -> None:
13741457
13751458 args = parser .parse_args ()
13761459
1460+ # Resolve --timeout: reasoning models default to REASONING_TIMEOUT (900s),
1461+ # standard models to DEFAULT_TIMEOUT (300s). Explicit --timeout overrides.
1462+ args .timeout = _resolve_timeout (args .timeout , args .model )
1463+
13771464 # Post-parse validation
13781465 if args .context != "minimal" and not args .repo_root :
13791466 parser .error (
@@ -1614,13 +1701,7 @@ def main() -> None:
16141701 sys .exit (0 )
16151702
16161703 # Call OpenAI API
1617- if _is_reasoning_model (args .model ) and args .timeout == DEFAULT_TIMEOUT :
1618- print (
1619- f"Note: { args .model } is a reasoning model. Consider --timeout 900 "
1620- "for large reviews." ,
1621- file = sys .stderr ,
1622- )
1623- print (f"Sending review to { args .model } ..." , file = sys .stderr )
1704+ print (f"Sending review to { args .model } (timeout: { args .timeout } s)..." , file = sys .stderr )
16241705 print (f"Estimated input tokens: ~{ est_tokens :,} " , file = sys .stderr )
16251706 if cost_str :
16261707 print (f"Estimated cost: { cost_str } " , file = sys .stderr )
0 commit comments