diff --git a/packages/gateway/src/cache-warmer.ts b/packages/gateway/src/cache-warmer.ts index 000b43ef..cc2c79a6 100644 --- a/packages/gateway/src/cache-warmer.ts +++ b/packages/gateway/src/cache-warmer.ts @@ -78,12 +78,17 @@ export const BLEND_PSEUDOCOUNT = 20; export const DEAD_SESSION_THRESHOLD = 0.02; /** Minimum completed turns before warming is eligible. Filters out one-shot - * sessions and ensures the survival model has ≥2 gap observations. */ -export const MIN_TURNS_FOR_WARMING = 3; + * sessions and ensures the survival model has ≥4 gap observations. */ +export const MIN_TURNS_FOR_WARMING = 5; /** Maximum duration (ms) to keep warming during a tool call before - * falling back to normal survival analysis. 30 min ≈ 6 cycles at 5m TTL. */ -export const MAX_TOOL_CALL_WARMING_MS = 30 * 60 * 1000; + * falling back to normal survival analysis. 10 min ≈ 2 cycles at 5m TTL. */ +export const MAX_TOOL_CALL_WARMING_MS = 10 * 60 * 1000; + +/** Maximum warmup cycles during a single tool-call break. Most tool + * calls complete in <5 minutes; 2 cycles covers a 10-minute operation + * at 5m TTL, sufficient for 95%+ of tool calls. */ +export const TOOL_CALL_MAX_CYCLES = 2; /** Max uncached warmup responses before the global circuit breaker trips. */ const CIRCUIT_BREAKER_MAX_FAILURES = 3; @@ -94,12 +99,24 @@ const CIRCUIT_BREAKER_MAX_FAILURES = 3; export const BREAK_FLOOR_MS = 180_000; /** Minimum total warmups before session-level hit-rate ROI check kicks in. */ -export const MIN_WARMUPS_FOR_ROI_CHECK = 10; +export const MIN_WARMUPS_FOR_ROI_CHECK = 5; /** Minimum session-level hit rate to continue warming. Below this, - * warming is empirically unprofitable and we stop. 20% means at least - * 1 in 5 warmups must result in a confirmed user return. */ -export const MIN_SESSION_HIT_RATE = 0.20; + * warming is empirically unprofitable and we stop. 25% means at least + * 1 in 4 warmups must result in a confirmed user return. 
*/ +export const MIN_SESSION_HIT_RATE = 0.25; + +/** Minimum total input tokens (input + cache_read + cache_creation) before + * warming is eligible. Below this threshold the absolute savings per hit + * are too small to justify the risk of wasted warmups. At 50K tokens with + * Opus 5m, a hit saves ~$0.29 and a warmup costs ~$0.025. */ +export const MIN_INPUT_TOKENS_FOR_WARMING = 50_000; + +/** Minimum P(returns) floor for the initial warming commitment. The + * break-even threshold read/(write-read) is often very low (4–9%), + * which causes nearly every non-dead session to get warmed. This floor + * ensures at least 30% return probability before the first warmup. */ +export const MIN_RETURN_PROBABILITY_FLOOR = 0.30; // --------------------------------------------------------------------------- // Global circuit breaker @@ -749,11 +766,16 @@ export function shouldWarm( const maxCycles = maxProfitableCycles(cacheReadCostPerMTok, cacheMissCostPerMTok); const cyclesSpent = state.warmup?.warmupCount ?? 0; - if (cyclesSpent >= maxCycles) return false; + // Tool-call-specific cap: 2 cycles covers a 10-minute tool call at 5m TTL; most finish in <5min + const effectiveMax = Math.min(maxCycles, TOOL_CALL_MAX_CYCLES); + if (cyclesSpent >= effectiveMax) return false; // Still require some history to have a stored body worth warming if (state.messageCount < MIN_TURNS_FOR_WARMING * 2) return false; + // Context too small — absolute savings per hit don't justify risk + if ((state.lastInputTokens ?? 0) < MIN_INPUT_TOKENS_FOR_WARMING) return false; + // Only warm in the margin window of the current TTL cycle const intoWindow = elapsed % ttlMs; if (intoWindow < ttlMs - warmupMarginMs) return false; @@ -768,6 +790,9 @@ export function shouldWarm( // wasted warmup at 200K Opus tokens). if (state.messageCount < MIN_TURNS_FOR_WARMING * 2) return false; + // Context too small — absolute savings per hit don't justify risk + if ((state.lastInputTokens ?? 
0) < MIN_INPUT_TOKENS_FOR_WARMING) return false; + // Session marked dead if (state.warmup?.disabled) return false; @@ -796,9 +821,13 @@ export function shouldWarm( }); const pReturns = 1.0 - pFinished; - // Corrected cost threshold: read / (write - read) + // Corrected cost threshold: read / (write - read), with a floor to prevent + // warming sessions with trivially low return probability (8.7% for 5m TTL). + // NOTE: an explicit minReturnProbability config override intentionally + // bypasses the floor — it's a user-controlled knob for tuning. const autoThreshold = costThreshold(cacheReadCostPerMTok, cacheMissCostPerMTok); - const threshold = cfg.cache.warming.minReturnProbability ?? autoThreshold; + const threshold = cfg.cache.warming.minReturnProbability + ?? Math.max(MIN_RETURN_PROBABILITY_FLOOR, autoThreshold); // Max cycles before warming becomes unprofitable const maxCycles = maxProfitableCycles(cacheReadCostPerMTok, cacheMissCostPerMTok); @@ -841,6 +870,9 @@ export function shouldWarm( // Rising cost threshold: after k cycles, the accumulated warmup cost // means we need a higher P(returns) to justify the next one. // k=1: 8.7%, k=3: 26%, k=5: 43%, k=6: 52% for Opus 5m. + // NOTE: intentionally does NOT use MIN_RETURN_PROBABILITY_FLOOR — the + // floor only gates the initial commitment (Phase A). Once committed, + // the rising threshold handles profitability based on sunk costs. const risingThreshold = cumulativeCostThreshold( cyclesSpent + 1, // +1 because we're deciding whether to do the NEXT cycle cacheReadCostPerMTok, @@ -978,11 +1010,12 @@ export function computeWarmingSnapshot( }); const pReturns = 1.0 - pFinished; - // Corrected threshold + // Corrected threshold with floor const autoThreshold = profile ? costThreshold(profile.cacheReadCostPerMTok, profile.cacheMissCostPerMTok) : 0.1; - const thresholdVal = cfg.cache.warming.minReturnProbability ?? autoThreshold; + const thresholdVal = cfg.cache.warming.minReturnProbability + ?? 
Math.max(MIN_RETURN_PROBABILITY_FLOOR, autoThreshold); // Commitment model cost analysis const maxCyclesVal = profile @@ -1028,21 +1061,25 @@ export function computeWarmingSnapshot( notWarmingReason = "Force-keep: cooldown active"; } } - } else if (state.lastStopReason === "tool_use") { - if (state.warmup?.disabled) { - notWarmingReason = "Warming stopped (/lore:warm:stop)"; - } else if (state.messageCount < MIN_TURNS_FOR_WARMING * 2) { + } else if (state.lastStopReason === "tool_use" && !state.warmup?.disabled) { + // Mirror shouldWarm()'s tool-call entry: `toolCallActive && !disabled`. + // If disabled=true, fall through to the normal path below. + if (state.messageCount < MIN_TURNS_FOR_WARMING * 2) { notWarmingReason = `Too few turns (${state.messageCount} < ${MIN_TURNS_FOR_WARMING * 2})`; + } else if ((state.lastInputTokens ?? 0) < MIN_INPUT_TOKENS_FOR_WARMING) { + const tokK = Math.round((state.lastInputTokens ?? 0) / 1000); + notWarmingReason = `Context too small (${tokK}k < ${MIN_INPUT_TOKENS_FOR_WARMING / 1000}k tokens)`; } else if (idleMs > MAX_TOOL_CALL_WARMING_MS) { - notWarmingReason = `Tool call exceeded max duration (${Math.round(idleMs / 60_000)}min > 30min)`; + notWarmingReason = `Tool call exceeded max duration (${Math.round(idleMs / 60_000)}min > ${Math.round(MAX_TOOL_CALL_WARMING_MS / 60_000)}min)`; } else if ((state.warmup?.totalWarmups ?? 0) >= MIN_WARMUPS_FOR_ROI_CHECK && (state.warmup?.warmupHits ?? 0) / (state.warmup?.totalWarmups ?? 1) < MIN_SESSION_HIT_RATE) { const hitRate = ((state.warmup?.warmupHits ?? 0) / (state.warmup?.totalWarmups ?? 1) * 100).toFixed(0); notWarmingReason = `Tool call: session hit rate too low (${hitRate}% < ${(MIN_SESSION_HIT_RATE * 100).toFixed(0)}%)`; } else { const maxCyc = maxProfitableCycles(profile.cacheReadCostPerMTok, profile.cacheMissCostPerMTok); - if ((state.warmup?.warmupCount ?? 
0) >= maxCyc) { - notWarmingReason = "Tool call: break-even exceeded"; + const effectiveMax = Math.min(maxCyc, TOOL_CALL_MAX_CYCLES); + if ((state.warmup?.warmupCount ?? 0) >= effectiveMax) { + notWarmingReason = `Tool call: cycle cap reached (${state.warmup?.warmupCount ?? 0} >= ${effectiveMax})`; } else { const intoWindow = idleMs % ttlMs; notWarmingReason = intoWindow < ttlMs - warmupMarginMs @@ -1054,6 +1091,9 @@ export function computeWarmingSnapshot( notWarmingReason = "Already warmed in this TTL window"; } else if (state.messageCount < MIN_TURNS_FOR_WARMING * 2) { notWarmingReason = `Too few turns (${state.messageCount} < ${MIN_TURNS_FOR_WARMING * 2})`; + } else if ((state.lastInputTokens ?? 0) < MIN_INPUT_TOKENS_FOR_WARMING) { + const tokK = Math.round((state.lastInputTokens ?? 0) / 1000); + notWarmingReason = `Context too small (${tokK}k < ${MIN_INPUT_TOKENS_FOR_WARMING / 1000}k tokens)`; } else if (state.warmup?.disabled) { notWarmingReason = "Warming stopped (/lore:warm:stop)"; } else if ((state.warmup?.totalWarmups ?? 0) >= MIN_WARMUPS_FOR_ROI_CHECK && diff --git a/packages/gateway/src/idle.ts b/packages/gateway/src/idle.ts index 8f0f41c3..cd0360e2 100644 --- a/packages/gateway/src/idle.ts +++ b/packages/gateway/src/idle.ts @@ -42,6 +42,7 @@ import { loadGlobalHistograms, flushGlobalHistograms, MIN_TURNS_FOR_WARMING, + MIN_INPUT_TOKENS_FOR_WARMING, } from "./cache-warmer"; import * as Sentry from "@sentry/bun"; import { runBackground } from "./background-limiter"; @@ -109,6 +110,10 @@ export function startIdleScheduler( // work before shouldWarm() rejects them anyway. if (state.messageCount < MIN_TURNS_FOR_WARMING * 2) continue; + // Skip sessions with small context — absolute savings per hit + // don't justify the risk of wasted warmups. + if ((state.lastInputTokens ?? 
0) < MIN_INPUT_TOKENS_FOR_WARMING) continue; + // Ensure global histograms are loaded from SQLite for this project loadGlobalHistograms(state.projectPath); diff --git a/packages/gateway/test/cache-warmer.test.ts b/packages/gateway/test/cache-warmer.test.ts index bf935030..53048e14 100644 --- a/packages/gateway/test/cache-warmer.test.ts +++ b/packages/gateway/test/cache-warmer.test.ts @@ -19,6 +19,10 @@ import { MAX_TOOL_CALL_WARMING_MS, MIN_WARMUPS_FOR_ROI_CHECK, MIN_SESSION_HIT_RATE, + MIN_TURNS_FOR_WARMING, + MIN_INPUT_TOKENS_FOR_WARMING, + MIN_RETURN_PROBABILITY_FLOOR, + TOOL_CALL_MAX_CYCLES, HISTOGRAM_BINS, BREAK_FLOOR_MS, _resetForTest, @@ -52,7 +56,7 @@ function makeSessionState(overrides: Partial = {}): SessionState { fingerprint: "abc123", lastRequestTime: Date.now() - 270_000, // 4.5 min ago (inside 5m warmup window) lastUserTurnTime: Date.now() - 270_000, - messageCount: 10, + messageCount: 20, turnsSinceCuration: 2, consecutiveTextOnlyTurns: 0, recallStore: new Map(), @@ -60,6 +64,7 @@ function makeSessionState(overrides: Partial = {}): SessionState { lastModel: "claude-sonnet-4-20250514", lastProtocol: "anthropic", resolvedConversationTTL: "5m", + lastInputTokens: 100_000, // above MIN_INPUT_TOKENS_FOR_WARMING ...overrides, }; } @@ -549,7 +554,7 @@ describe("shouldWarm", () => { const now = Date.now(); const state = makeSessionState({ lastRequestTime: now - 270_000, - messageCount: 4, // 2 turns (user+assistant each) — below threshold of 3 turns (6 messages) + messageCount: 8, // 4 turns (user+assistant each) — below threshold of 5 turns (10 messages) cacheAnalytics: { ...makeCacheAnalytics(), lastRequestBody: compressBody('{"test": true}'), @@ -1670,15 +1675,16 @@ describe("shouldWarm tool-call warming", () => { expect(shouldWarm(state, profile, hist, now)).toBe(false); }); - test("works across multiple TTL windows", () => { + test("tool-call warming stops at TOOL_CALL_MAX_CYCLES even across TTL windows", () => { const now = Date.now(); // 14.5 min — 
in 3rd 5m window's warmup margin (14:15-15:00) + // With TOOL_CALL_MAX_CYCLES=2 and warmupCount=2, this should be rejected. NOTE(review): 870 s idle also exceeds MAX_TOOL_CALL_WARMING_MS (10 min) — confirm the cycle cap, not the duration fallback, is what produces this result. const state = makeToolCallState({ lastRequestTime: now - 870_000, warmup: { lastWarmupAt: now - 270_000, // past cooldown - warmupCount: 2, - totalWarmups: 2, + warmupCount: TOOL_CALL_MAX_CYCLES, + totalWarmups: TOOL_CALL_MAX_CYCLES, warmupHits: 0, disabled: false, }, @@ -1686,7 +1692,8 @@ describe("shouldWarm tool-call warming", () => { const profile = buildAnthropicProfile("claude-sonnet-4-20250514", "5m"); const hist = createHistogram(); - expect(shouldWarm(state, profile, hist, now)).toBe(true); + // Tool-call warming is now capped at TOOL_CALL_MAX_CYCLES + expect(shouldWarm(state, profile, hist, now)).toBe(false); }); test("requires minimum turns", () => { @@ -1844,7 +1851,7 @@ describe("shouldWarm session ROI guard", () => { warmup: { lastWarmupAt: 0, warmupCount: 0, - totalWarmups: 12, // >= MIN_WARMUPS_FOR_ROI_CHECK (10) + totalWarmups: 12, // >= MIN_WARMUPS_FOR_ROI_CHECK (5) warmupHits: 1, // 8.3% < MIN_SESSION_HIT_RATE (25%) disabled: false, }, @@ -1890,7 +1897,7 @@ describe("shouldWarm session ROI guard", () => { warmup: { lastWarmupAt: 0, warmupCount: 0, - totalWarmups: 5, // < MIN_WARMUPS_FOR_ROI_CHECK (10) + totalWarmups: MIN_WARMUPS_FOR_ROI_CHECK - 1, // below threshold warmupHits: 0, // 0% hit rate, but too few warmups to judge disabled: false, }, @@ -1915,7 +1922,201 @@ describe("shouldWarm session ROI guard", () => { lastWarmupAt: 0, warmupCount: 0, totalWarmups: 15, - warmupHits: 1, // 6.7% < MIN_SESSION_HIT_RATE (20%) + warmupHits: 1, // 6.7% < MIN_SESSION_HIT_RATE (25%) disabled: false, }, cacheAnalytics: { ...makeCacheAnalytics(), lastRequestBody: compressBody('{"model":"claude-sonnet-4-20250514","max_tokens":16384,"stream":true,"messages":[{"role":"user","content":"test"}]}'), }, }); const profile = buildAnthropicProfile("claude-sonnet-4-20250514", "5m"); const hist = createHistogram(); for (let i = 
0; i < 50; i++) recordGap(hist, 360_000); + + expect(shouldWarm(state, profile, hist, now)).toBe(false); + }); +}); + +// --------------------------------------------------------------------------- +// Cost optimization fixes (context size gate, threshold floor, tool-call cap) +// --------------------------------------------------------------------------- + +describe("shouldWarm cost optimization gates", () => { + beforeEach(() => { + _resetForTest(); + }); + + test("returns false when lastInputTokens < MIN_INPUT_TOKENS_FOR_WARMING", () => { + const now = Date.now(); + const state = makeSessionState({ + lastRequestTime: now - 270_000, + lastInputTokens: 30_000, // below 50K threshold + cacheAnalytics: { + ...makeCacheAnalytics(), + lastRequestBody: compressBody('{"model":"claude-sonnet-4-20250514","max_tokens":16384,"stream":true,"messages":[{"role":"user","content":"test"}]}'), + }, + }); + const profile = buildAnthropicProfile("claude-sonnet-4-20250514", "5m"); + const hist = createHistogram(); + for (let i = 0; i < 50; i++) recordGap(hist, 360_000); + + expect(shouldWarm(state, profile, hist, now)).toBe(false); + }); + + test("returns true when lastInputTokens >= MIN_INPUT_TOKENS_FOR_WARMING", () => { + const now = Date.now(); + const state = makeSessionState({ + lastRequestTime: now - 270_000, + lastInputTokens: 100_000, // above 50K threshold + cacheAnalytics: { + ...makeCacheAnalytics(), + lastRequestBody: compressBody('{"model":"claude-sonnet-4-20250514","max_tokens":16384,"stream":true,"messages":[{"role":"user","content":"test"}]}'), + }, + }); + const profile = buildAnthropicProfile("claude-sonnet-4-20250514", "5m"); + // Histogram with many breaks — high P(returns) + const hist = createHistogram(); + for (let i = 0; i < 50; i++) recordGap(hist, 360_000); + + expect(shouldWarm(state, profile, hist, now)).toBe(true); + }); + + test("context size gate applies to tool-call path", () => { + const now = Date.now(); + const state = makeSessionState({ + 
lastRequestTime: now - 270_000, + lastStopReason: "tool_use", + lastInputTokens: 20_000, // below 50K threshold + cacheAnalytics: { + ...makeCacheAnalytics(), + lastRequestBody: compressBody('{"model":"claude-sonnet-4-20250514","max_tokens":16384,"stream":true,"messages":[{"role":"user","content":"test"}]}'), + }, + }); + const profile = buildAnthropicProfile("claude-sonnet-4-20250514", "5m"); + const hist = createHistogram(); + + expect(shouldWarm(state, profile, hist, now)).toBe(false); + }); + + test("context size gate does NOT apply to forceKeepWarm path", () => { + const now = Date.now(); + const state = makeSessionState({ + lastRequestTime: now - 270_000, + lastInputTokens: 10_000, // well below 50K — but force-keep overrides + warmup: { + lastWarmupAt: 0, + warmupCount: 0, + totalWarmups: 0, + warmupHits: 0, + disabled: false, + forceKeepWarm: true, + }, + cacheAnalytics: { + ...makeCacheAnalytics(), + lastRequestBody: compressBody('{"test": true}'), + }, + }); + const profile = buildAnthropicProfile("claude-sonnet-4-20250514", "5m"); + const hist = createHistogram(); + + expect(shouldWarm(state, profile, hist, now)).toBe(true); + }); + + test("tool-call warming stops after TOOL_CALL_MAX_CYCLES", () => { + const now = Date.now(); + const state = makeSessionState({ + lastRequestTime: now - 570_000, // 9.5 min — in warmup margin of 2nd window + lastStopReason: "tool_use", + warmup: { + lastWarmupAt: now - 310_000, // past cooldown + warmupCount: TOOL_CALL_MAX_CYCLES, // already at cap + totalWarmups: TOOL_CALL_MAX_CYCLES, + warmupHits: 0, + disabled: false, + }, + cacheAnalytics: { + ...makeCacheAnalytics(), + lastRequestBody: compressBody('{"model":"claude-sonnet-4-20250514","max_tokens":16384,"stream":true,"messages":[{"role":"user","content":"test"}]}'), + }, + }); + const profile = buildAnthropicProfile("claude-sonnet-4-20250514", "5m"); + const hist = createHistogram(); + + expect(shouldWarm(state, profile, hist, now)).toBe(false); + }); + + 
test("tool-call warming allowed when below TOOL_CALL_MAX_CYCLES", () => { + const now = Date.now(); + const state = makeSessionState({ + lastRequestTime: now - 270_000, + lastStopReason: "tool_use", + warmup: { + lastWarmupAt: 0, + warmupCount: TOOL_CALL_MAX_CYCLES - 1, // one below cap + totalWarmups: TOOL_CALL_MAX_CYCLES - 1, + warmupHits: TOOL_CALL_MAX_CYCLES - 1, // good hit rate + disabled: false, + }, + cacheAnalytics: { + ...makeCacheAnalytics(), + lastRequestBody: compressBody('{"model":"claude-sonnet-4-20250514","max_tokens":16384,"stream":true,"messages":[{"role":"user","content":"test"}]}'), + }, + }); + const profile = buildAnthropicProfile("claude-sonnet-4-20250514", "5m"); + const hist = createHistogram(); + + expect(shouldWarm(state, profile, hist, now)).toBe(true); + }); + + test("initial commitment requires P(returns) >= MIN_RETURN_PROBABILITY_FLOOR", () => { + const now = Date.now(); + // Histogram with mostly short gaps and very few breaks — survival drops + // at 4.5m but not to zero. This creates P(returns) ~29%, above the old + // 8.7% threshold but below the new 30% floor. 
+ const hist = createHistogram(); + for (let i = 0; i < 98; i++) recordGap(hist, 30_000); // 30s — active + for (let i = 0; i < 2; i++) recordGap(hist, 360_000); // 6m — break + + const state = makeSessionState({ + lastRequestTime: now - 270_000, + consecutiveTextOnlyTurns: 0, + cacheAnalytics: { + ...makeCacheAnalytics(), + lastRequestBody: compressBody('{"model":"claude-sonnet-4-20250514","max_tokens":16384,"stream":true,"messages":[{"role":"user","content":"test"}]}'), + }, + }); + const profile = buildAnthropicProfile("claude-sonnet-4-20250514", "5m"); + + // Verify this session has P(returns) below the 30% floor + const survivalAtIdle = survivalFunction(hist, 270_000); + const breakFrac = breakFraction(hist); + const pFinished = pSessionFinished({ + survivalAtIdle, + consecutiveTextOnlyTurns: 0, + breakFraction: breakFrac, + totalTurns: 10, + }); + const pReturns = 1.0 - pFinished; + expect(pReturns).toBeLessThan(MIN_RETURN_PROBABILITY_FLOOR); + // But it would have passed the old break-even threshold + const oldThreshold = costThreshold(profile.cacheReadCostPerMTok, profile.cacheMissCostPerMTok); + expect(pReturns).toBeGreaterThan(oldThreshold); + + // With the new floor, warming should be rejected + expect(shouldWarm(state, profile, hist, now)).toBe(false); + }); + + test("session-level ROI check kicks in at 5 warmups", () => { + const now = Date.now(); + // Session with 5 warmups and 0 hits → hit rate 0% < 25% + const state = makeSessionState({ + lastRequestTime: now - 270_000, + warmup: { + lastWarmupAt: 0, + warmupCount: 0, + totalWarmups: MIN_WARMUPS_FOR_ROI_CHECK, // exactly at threshold + warmupHits: 0, // 0% hit rate disabled: false, }, cacheAnalytics: { @@ -1929,4 +2130,100 @@ describe("shouldWarm session ROI guard", () => { expect(shouldWarm(state, profile, hist, now)).toBe(false); }); + + test("session-level ROI check passes when hit rate is above threshold", () => { + const now = Date.now(); + // Session with 5 warmups and 2 hits → hit rate 40% > 
25% + const state = makeSessionState({ + lastRequestTime: now - 270_000, + warmup: { + lastWarmupAt: 0, + warmupCount: 0, + totalWarmups: MIN_WARMUPS_FOR_ROI_CHECK, + warmupHits: 2, // 40% hit rate > 25% + disabled: false, + }, + cacheAnalytics: { + ...makeCacheAnalytics(), + lastRequestBody: compressBody('{"model":"claude-sonnet-4-20250514","max_tokens":16384,"stream":true,"messages":[{"role":"user","content":"test"}]}'), + }, + }); + const profile = buildAnthropicProfile("claude-sonnet-4-20250514", "5m"); + const hist = createHistogram(); + for (let i = 0; i < 50; i++) recordGap(hist, 360_000); + + expect(shouldWarm(state, profile, hist, now)).toBe(true); + }); + + test("returns false when lastInputTokens is undefined (first turn)", () => { + const now = Date.now(); + const state = makeSessionState({ + lastRequestTime: now - 270_000, + lastInputTokens: undefined, // no response yet — ?? 0 < 50K + cacheAnalytics: { + ...makeCacheAnalytics(), + lastRequestBody: compressBody('{"model":"claude-sonnet-4-20250514","max_tokens":16384,"stream":true,"messages":[{"role":"user","content":"test"}]}'), + }, + }); + const profile = buildAnthropicProfile("claude-sonnet-4-20250514", "5m"); + const hist = createHistogram(); + for (let i = 0; i < 50; i++) recordGap(hist, 360_000); + + expect(shouldWarm(state, profile, hist, now)).toBe(false); + }); + + test("Phase B continuation uses rising threshold, not the floor", () => { + const now = Date.now(); + // Session has been idle for 9.5 min (past first 5m TTL window). + // In warmup margin of 2nd window: 9.5min % 5min = 4.5min > 4.25min. + // With 1 cycle already spent, risingThreshold(k=2) ≈ 17.4% for Sonnet 5m. + // We need P(returns) between 17.4% and 30% to prove Phase B does NOT + // use the 30% floor. 
+ const state = makeSessionState({ + lastRequestTime: now - 570_000, // 9.5 min ago + warmup: { + lastWarmupAt: now - 310_000, // past cooldown + warmupCount: 1, + totalWarmups: 1, + warmupHits: 0, + disabled: false, + }, + cacheAnalytics: { + ...makeCacheAnalytics(), + lastRequestBody: compressBody('{"model":"claude-sonnet-4-20250514","max_tokens":16384,"stream":true,"messages":[{"role":"user","content":"test"}]}'), + }, + }); + const profile = buildAnthropicProfile("claude-sonnet-4-20250514", "5m"); + + // Histogram: 98% short gaps, 2% long breaks. At 9.5 min idle, + // survival is ~2% — P(returns) should land ~29%, between the rising + // threshold at k=2 (~17.4%) and the 30% floor. + const hist = createHistogram(); + for (let i = 0; i < 98; i++) recordGap(hist, 30_000); // 30s + for (let i = 0; i < 2; i++) recordGap(hist, 600_000); // 10m + + // Verify P(returns) is in the interesting range: above rising threshold + // at k=2 but below the 30% floor + const survivalAtIdle = survivalFunction(hist, 570_000); + const breakFrac = breakFraction(hist); + const pFinished = pSessionFinished({ + survivalAtIdle, + consecutiveTextOnlyTurns: 0, + breakFraction: breakFrac, + totalTurns: 10, + }); + const pReturns = 1.0 - pFinished; + const risingThresh = cumulativeCostThreshold( + 2, // cyclesSpent(1) + 1 + profile.cacheReadCostPerMTok, + profile.cacheMissCostPerMTok, + ); + // P(returns) should be above the rising threshold (continuation is profitable) + expect(pReturns).toBeGreaterThan(risingThresh); + // But below the floor (Phase A would reject this) + expect(pReturns).toBeLessThan(MIN_RETURN_PROBABILITY_FLOOR); + + // Phase B should allow warming because it uses risingThreshold, not the floor + expect(shouldWarm(state, profile, hist, now)).toBe(true); + }); }); diff --git a/packages/gateway/test/helpers/idle-worker.ts b/packages/gateway/test/helpers/idle-worker.ts index e19de0dc..7dc84750 100644 --- a/packages/gateway/test/helpers/idle-worker.ts +++ 
b/packages/gateway/test/helpers/idle-worker.ts @@ -88,7 +88,8 @@ mock.module("../../src/cache-warmer", () => ({ executeWarmup: async () => ({}), loadGlobalHistograms: () => {}, flushGlobalHistograms: () => {}, - MIN_TURNS_FOR_WARMING: 3, + MIN_TURNS_FOR_WARMING: 5, + MIN_INPUT_TOKENS_FOR_WARMING: 50_000, })); mock.module("../../src/worker-model", () => ({