From 2b7b93accc660e220b0da4066e37c0e72f6360cc Mon Sep 17 00:00:00 2001 From: mrciffa <49000955+davide221@users.noreply.github.com> Date: Thu, 11 Jun 2026 16:18:27 +0200 Subject: [PATCH] fix(server): drop stale prefix-cache entries when a slot is reused The inline and full-compress prefix caches assign snapshot slots round-robin via next_slot_, which advances in prepare_*_snap even when the snapshot later aborts (degenerate boundary, failed generation, client disconnect). A burned step makes a later confirm wrap onto a slot that a live entry still references. From then on the entry table and the slot contents disagree: the entry's hash describes one token stream, the slot holds a snapshot of another. Consequences of such a stale entry: - follow-up prompt shorter than the slot snapshot: failed request (snapshot_longer_than_prompt) before PR #370, conservative cache miss after it; - follow-up prompt longer than the slot snapshot: the restore path attaches KV from the wrong token stream with no validation - silent context corruption. Fix the root cause: when confirm_inline_snap / confirm_full_snap commit a snapshot into a slot, erase every other entry still pointing at that slot. A slot holds exactly one snapshot, so at most one entry may describe it. Verified on RTX 3090 (Qwen3.6-27B Q4_K_M, --prefix-cache-slots 2) with the deterministic PR #370 repro (short conv -> aborted snap -> big conv wrapping onto slot 0 -> shortened follow-up): the wrap now logs '[pc] dropping stale entry for reused slot=0', the follow-up is a clean miss with correct output, and a longer same-conversation follow-up restores a valid snapshot. Greedy outputs across the sequence match the no-cache baseline; 1905 server unit assertions pass. Co-Authored-By: WOZCODE --- server/src/server/prefix_cache.cpp | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/server/src/server/prefix_cache.cpp b/server/src/server/prefix_cache.cpp index 1503bc8a8..323ae95e7 100644 --- a/server/src/server/prefix_cache.cpp +++ b/server/src/server/prefix_cache.cpp @@ -261,6 +261,21 @@ void PrefixCache::confirm_inline_snap(int slot, int target_cut, has_pending_evict_ = false; } + // The new snapshot replaces whatever this slot previously held. Drop any + // other entries still pointing at the slot: their hashes describe a + // different (or shorter) token stream than the new snapshot, and a later + // restore through them would attach mismatched KV. Stale entries arise + // when an aborted snap burns a round-robin next_slot_ step and a later + // confirm wraps onto a slot with a live entry (PR #370 repro). + for (int i = (int)entries_.size() - 1; i >= 0; --i) { + if (entries_[(size_t)i].slot == slot) { + std::fprintf(stderr, + "[pc] dropping stale entry for reused slot=%d\n", slot); + entries_.erase(entries_.begin() + i); + entries_size_count_.fetch_sub(1, std::memory_order_relaxed); + } + } + auto key = hash_prefix(prompt_ids.data(), target_cut); entries_.push_back({key, slot}); entries_size_count_.fetch_add(1, std::memory_order_relaxed); @@ -368,6 +383,15 @@ void PrefixCache::confirm_full_snap(int slot, full_has_pending_evict_ = false; } + for (int i = (int)full_entries_.size() - 1; i >= 0; --i) { + if (full_entries_[(size_t)i].entry.slot == slot) { + std::fprintf(stderr, + "[pc] dropping stale full-cache entry for reused slot=%d\n", slot); + full_entries_.erase(full_entries_.begin() + i); + full_entries_size_count_.fetch_sub(1, std::memory_order_relaxed); + } + } + auto key = hash_prefix(prompt_ids.data(), (int)prompt_ids.size()); FullCacheEntry entry; entry.slot = slot;