From 13f5068323a292340eeeb636505a12f3d57b500f Mon Sep 17 00:00:00 2001
From: sukru tikves
Date: Wed, 24 Jun 2026 22:41:28 -0700
Subject: [PATCH] Enable multi-turn KV cache reuse by removing per-turn engine.reset()

The engine's implicit prefix caching (TokenHistory.resolve) already
handles all cases: same prefix reuse, divergence rewinding, and full
re-processing. Removing the unconditional reset() lets the KV cache
persist across turns, saving re-prefill of the shared conversation
history (~1-3s per turn for typical conversations).
---
 .../LanguageModel/CoreAILanguageModel.swift | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/swift/Sources/CoreAILanguageModels/LanguageModel/CoreAILanguageModel.swift b/swift/Sources/CoreAILanguageModels/LanguageModel/CoreAILanguageModel.swift
index f970446..238900b 100644
--- a/swift/Sources/CoreAILanguageModels/LanguageModel/CoreAILanguageModel.swift
+++ b/swift/Sources/CoreAILanguageModels/LanguageModel/CoreAILanguageModel.swift
@@ -305,9 +305,6 @@ public struct CoreAILanguageModel: LanguageModel {
         let effectiveSamplingConfig = createSamplingConfig(from: request.generationOptions)
         let maxTokens = request.generationOptions.maximumResponseTokens ?? 512
 
-        // Reset engine state for new generation
-        try await engine.reset()
-
         // FoundationModels now threads entry identity itself based on event
         // ordering — we no longer mint an entryID and pass it down.