diff --git a/BUILD b/BUILD
index ab77eddc83..3ac2c23fba 100755
--- a/BUILD
+++ b/BUILD
@@ -45,6 +45,11 @@ config_setting(
     values = {"define": "using_cuda12_x86=true"},
 )
 
+config_setting(
+    name = "using_cuda13_x86",
+    values = {"define": "using_cuda13_x86=true"},
+)
+
 config_setting(
     name = "using_rocm",
     values = {"define": "using_rocm=true"},
diff --git a/rtp_llm/cpp/cache/BUILD b/rtp_llm/cpp/cache/BUILD
index 3294d0b918..4b8a946ed9 100644
--- a/rtp_llm/cpp/cache/BUILD
+++ b/rtp_llm/cpp/cache/BUILD
@@ -1,13 +1,34 @@
 load("//:def.bzl", "copts")
 load("@arch_config//:arch_select.bzl", "torch_deps")
 
+cc_library(
+    name = "cp_slot_mapper",
+    srcs = ["CPSlotMapper.cc"],
+    hdrs = ["CPSlotMapper.h"],
+    copts = copts(),
+    visibility = ["//visibility:public"],
+)
+
 cc_library(
     name = "cache_group_type",
     hdrs = [
-        "CacheGroupType.h",
+        "spec/CacheGroupType.h",
+    ],
+    copts = copts(),
+    visibility = ["//visibility:public"],
+)
+
+cc_library(
+    name = "kv_cache_spec_desc_types",
+    hdrs = [
+        "spec/KVCacheSpecDescTypes.h",
     ],
     copts = copts(),
     visibility = ["//visibility:public"],
+    deps = [
+        ":cache_group_type",
+        "//rtp_llm/models_py/bindings/core:types",
+    ],
 )
 
 cc_library(
@@ -19,17 +40,49 @@ cc_library(
     visibility = ["//visibility:public"],
 )
 
+cc_library(
+    name = "kv_cache_specs",
+    hdrs = [
+        "spec/KVCacheSpec.h",
+        "spec/KVCacheSpecBase.h",
+        "spec/KVCacheSpecDesc.h",
+        "spec/KVCacheSpecDescTypes.h",
+        "spec/MHAKVCacheSpec.h",
+        "spec/LinearKVCacheSpec.h",
+        "spec/MLAKVCacheSpec.h",
+        "spec/OpaqueKVCacheSpec.h",
+        "Types.h",
+    ],
+    copts = copts(),
+    visibility = ["//visibility:public"],
+    deps = [
+        ":block_info",
+        ":batch_kv_cache_resource",
+        ":cache_group_type",
+        ":cp_slot_mapper",
+        "//rtp_llm/cpp/config:config_modules",
+        "//rtp_llm/cpp/model_utils:model_utils",
+        "//rtp_llm/models_py/bindings/core:types",
+    ],
+)
+
 cc_library(
     name = "cache_types",
+    srcs = [
+        "Types.cc",
+    ],
     hdrs = [
         "BufferTypes.h",
         "CacheConfig.h",
-        "CacheGroupType.h",
-        "KVCacheSpec.h",
-        "KVCacheSpecBase.h",
-        "MHAKVCacheSpec.h",
-        "LinearKVCacheSpec.h",
-        "MLAKVCacheSpec.h",
+        "spec/CacheGroupType.h",
+        "spec/KVCacheSpec.h",
+        "spec/KVCacheSpecBase.h",
+        "spec/KVCacheSpecDesc.h",
+        "spec/KVCacheSpecDescTypes.h",
+        "spec/MHAKVCacheSpec.h",
+        "spec/LinearKVCacheSpec.h",
+        "spec/MLAKVCacheSpec.h",
+        "spec/OpaqueKVCacheSpec.h",
         "Types.h",
         "WarmUpResult.h",
     ],
@@ -39,8 +92,11 @@ cc_library(
         ":block_info",
         ":batch_kv_cache_resource",
         ":cache_group_type",
+        ":cp_slot_mapper",
         "//:rtp_compute_ops",
         "//rtp_llm/cpp/config:config_modules",
+        "//rtp_llm/cpp/engine_base/stream:complete_token_ids",
+        "//rtp_llm/cpp/model_utils:model_utils",
         "//rtp_llm/models_py/bindings/core:types",
     ] + torch_deps(),
 )
@@ -51,6 +107,7 @@ cc_library(
         "BlockCache.cc",
         "BlockPool.cc",
         "MemoryLayoutStrategy.cc",
+        "SharedBlockCache.cc",
     ],
     hdrs = [
         "BlockCache.h",
@@ -60,55 +117,100 @@ cc_library(
         "BlockRefCounter.h",
         "MemoryLayoutStrategy.h",
         "MemoryLayoutConfig.h",
+        "SharedBlockCache.h",
     ],
     copts = copts(),
     visibility = ["//visibility:public"],
     deps = [
         ":cache_types",
-        "//rtp_llm/models_py/bindings/core:exec_ops_hdr",
-        "//rtp_llm/cpp/disaggregate/cache_store",
-        "//rtp_llm/cpp/engine_base/stream:complete_token_ids",
+        "//rtp_llm/cpp/disaggregate/cache_store:cache_store_interface",
         "//rtp_llm/cpp/utils:kv_cache_utils",
         "//rtp_llm/cpp/utils:lru_cache",
         "//rtp_llm/cpp/utils:profiling_scope",
+        "//rtp_llm/models_py/bindings/core:type_convert",
+    ] + select({
+        "@//:using_cuda": [
+            "//rtp_llm/models_py/bindings/cuda:cuda_host_utils",
+            "@local_config_cuda//cuda:cuda_headers",
+            "@local_config_cuda//cuda:cudart",
+        ],
+        "//conditions:default": [],
+    }),
+)
+
+cc_library(
+    name = "kv_cache_transfer_planner",
+    srcs = [
+        "KVCacheTransferPlanner.cc",
+    ],
+    hdrs = [
+        "KVCacheTransferPlanner.h",
+    ],
+    copts = copts(),
+    visibility = ["//visibility:public"],
+    deps = [
+        ":cache_group_type",
     ],
 )
 
 cc_library(
     name = "kv_cache_group",
     srcs = [
-        "FullKVCacheGroup.cc",
-        "KVCacheGroup.cc",
-        "LinearKVCacheGroup.cc",
+        "group/FullKVCacheGroup.cc",
+        "group/KVCacheGroup.cc",
+        "group/LinearKVCacheGroup.cc",
+        "group/SWAKVCacheGroup.cc",
     ],
     hdrs = [
-        "FullKVCacheGroup.h",
-        "KVCacheGroup.h",
-        "LinearKVCacheGroup.h",
+        "group/FullKVCacheGroup.h",
+        "group/KVCacheGroup.h",
+        "group/LinearKVCacheGroup.h",
+        "group/SWAKVCacheGroup.h",
     ],
     copts = copts(),
     visibility = ["//visibility:public"],
     deps = [
         ":block_pool",
+        ":cache_types",
     ],
 )
 
+cc_library(
+    name = "kv_cache_allocator_hdr",
+    hdrs = [
+        "allocator/KVCacheAllocator.h",
+    ],
+    copts = copts(),
+    visibility = ["//visibility:public"],
+    deps = [
+        ":block_pool",
+        ":cache_types",
+        "//rtp_llm/cpp/metrics:metrics",
+    ] + torch_deps(),
+)
+
 cc_library(
     name = "kv_cache_allocator",
     srcs = [
-        "HybridTypeKVCacheAllocator.cc",
-        "KVCacheAllocator.cc",
-        "SingleTypeKVCacheAllocator.cc",
+        "allocator/HybridKVCacheAllocator.cc",
+        "allocator/HybridPoolKVCacheAllocator.cc",
+        "allocator/HybridTypeKVCacheAllocator.cc",
+        "allocator/KVCacheAllocator.cc",
+        "allocator/SingleTypeKVCacheAllocator.cc",
     ],
     hdrs = [
-        "HybridTypeKVCacheAllocator.h",
-        "KVCacheAllocator.h",
-        "SingleTypeKVCacheAllocator.h",
+        "allocator/HybridKVCacheAllocator.h",
+        "allocator/HybridPoolKVCacheAllocator.h",
+        "allocator/HybridTypeKVCacheAllocator.h",
+        "allocator/KVCacheAllocator.h",
+        "allocator/SingleTypeKVCacheAllocator.h",
     ],
     copts = copts(),
     visibility = ["//visibility:public"],
     deps = [
+        ":kv_cache_allocator_hdr",
         ":kv_cache_group",
+        "//rtp_llm/cpp/engine_base/stream:complete_token_ids",
         "//rtp_llm/models_py/bindings/core:exec_ops_hdr",
     ],
 )
@@ -124,25 +226,27 @@ cc_library(
     visibility = ["//visibility:public"],
     deps = [
         "//rtp_llm/cpp/utils:core_utils",
-        "//rtp_llm/cpp/cache:cache_group_type",
+        ":cache_group_type",
     ],
 )
 
 cc_library(
     name = "cache_core",
     srcs = [
-        "CacheConfigCreator.cc",
-        "HybridConfigCreator.cc",
+        "config_creator/CacheConfigCreator.cc",
+        "config_creator/HybridConfigCreator.cc",
+        "config_creator/HybridPoolConfigCreator.cc",
         "KVCacheHashUtil.cc",
-        "MemoryEvaluationHelper.cc",
-        "SingleConfigCreator.cc",
+        "config_creator/MemoryEvaluationHelper.cc",
+        "config_creator/SingleConfigCreator.cc",
     ],
     hdrs = [
-        "CacheConfigCreator.h",
-        "HybridConfigCreator.h",
+        "config_creator/CacheConfigCreator.h",
+        "config_creator/HybridConfigCreator.h",
+        "config_creator/HybridPoolConfigCreator.h",
         "KVCacheHashUtil.h",
-        "MemoryEvaluationHelper.h",
-        "SingleConfigCreator.h",
+        "config_creator/MemoryEvaluationHelper.h",
+        "config_creator/SingleConfigCreator.h",
     ],
     copts = copts(),
     visibility = ["//visibility:public"],
@@ -151,6 +255,7 @@ cc_library(
         ":cache_types",
         ":kv_cache_allocator",
         "//rtp_llm/cpp/config:model_config",
+        "//rtp_llm/cpp/engine_base/stream:complete_token_ids",
         "//rtp_llm/models_py/bindings/core:exec_ops_hdr",
         "//rtp_llm/models_py/bindings/core:type_convert",
         "//rtp_llm/cpp/disaggregate/cache_store",
diff --git a/rtp_llm/cpp/cache/BatchKVCacheResource.h b/rtp_llm/cpp/cache/BatchKVCacheResource.h
index 2f51f8f377..0435185d7e 100644
--- a/rtp_llm/cpp/cache/BatchKVCacheResource.h
+++ b/rtp_llm/cpp/cache/BatchKVCacheResource.h
@@ -21,13 +21,13 @@ class BatchKVCacheResource {
         batch_resource.resize(batch_size);
     }
 
-    void initGroups(int                                group_nums,
-                    int                                layer_num,
-                    const std::vector<int>&            layer_to_group_id          = {},
-                    size_t                             kernel_blocks_per_kv_block = 1,
-                    const std::vector<CacheGroupType>& group_types                = {}) {
+    void initGroups(int                                  group_nums,
+                    int                                  layer_num,
+                    const std::vector<std::vector<int>>& layer_group_ids            = {},
+                    size_t                               kernel_blocks_per_kv_block = 1,
+                    const std::vector<CacheGroupType>&   group_types                = {}) {
         for (auto& batch : batch_resource) {
-            batch.initGroups(group_nums, layer_num, layer_to_group_id, kernel_blocks_per_kv_block, group_types);
+            batch.initGroups(group_nums, layer_num, layer_group_ids, kernel_blocks_per_kv_block, group_types);
         }
     }
 
@@ -67,16 +67,36 @@ class BatchKVCacheResource {
         return batch_resource[batch_id].blocks(group_id);
     }
 
+    const BlockIndicesType& blocks(int batch_id, int layer_id, int group_id) const {
+        RTP_LLM_CHECK(batch_id >= 0 && static_cast<size_t>(batch_id) < batch_resource.size());
+        return batch_resource[batch_id].blocks(layer_id, group_id);
+    }
+
     const BlockIndicesType& kernelBlocks(int batch_id, int group_id = 0) const {
         RTP_LLM_CHECK(batch_id >= 0 && static_cast<size_t>(batch_id) < batch_resource.size());
         return batch_resource[batch_id].kernelBlocks(group_id);
     }
 
+    const BlockIndicesType& kernelBlocks(int batch_id, int layer_id, int group_id) const {
+        RTP_LLM_CHECK(batch_id >= 0 && static_cast<size_t>(batch_id) < batch_resource.size());
+        return batch_resource[batch_id].kernelBlocks(layer_id, group_id);
+    }
+
+    int groupId(int batch_id, int layer_id, int group_id) const {
+        RTP_LLM_CHECK(batch_id >= 0 && static_cast<size_t>(batch_id) < batch_resource.size());
+        return batch_resource[batch_id].groupId(layer_id, group_id);
+    }
+
     BlockIds& mutableBlockIds(int batch_id, int group_id = 0) {
         RTP_LLM_CHECK(batch_id >= 0 && static_cast<size_t>(batch_id) < batch_resource.size());
         return batch_resource[batch_id].mutableBlockIds(group_id);
     }
 
+    int groupId(int layer_id, int group_id) const {
+        RTP_LLM_CHECK(!batch_resource.empty());
+        return batch_resource[0].groupId(layer_id, group_id);
+    }
+
     const GroupBlockIds& groupBlocks(int batch_id = 0) const {
         RTP_LLM_CHECK(batch_id >= 0 && static_cast<size_t>(batch_id) < batch_resource.size());
         return batch_resource[batch_id].groupBlocks();
@@ -106,6 +126,7 @@ class BatchKVCacheResource {
         auto& keys = batch_resource[batch_id].cacheKeys();
         if (!keys.empty()) {
             keys.pop_back();
+            batch_resource[batch_id].rebuildLinearBlockDependencies();
         }
     }
 
@@ -114,6 +135,7 @@ class BatchKVCacheResource {
             auto& keys = resource.cacheKeys();
             if (!keys.empty()) {
                 keys.pop_back();
+                resource.rebuildLinearBlockDependencies();
             }
         }
     }
@@ -121,22 +143,36 @@ class BatchKVCacheResource {
     void clearCacheKeys(int batch_id = 0) {
         RTP_LLM_CHECK(batch_id >= 0 && static_cast<size_t>(batch_id) < batch_resource.size());
         batch_resource[batch_id].cacheKeys().clear();
+        batch_resource[batch_id].blockDependencies().clear();
     }
 
     void pushBackCacheKey(int batch_id, CacheKeyType key) {
         RTP_LLM_CHECK(batch_id >= 0 && static_cast<size_t>(batch_id) < batch_resource.size());
-        batch_resource[batch_id].cacheKeys().push_back(key);
+        auto& resource = batch_resource[batch_id];
+        auto& keys     = resource.cacheKeys();
+        auto& deps     = resource.blockDependencies();
+        BlockDependency dependency;
+        dependency.ordinal = static_cast<uint32_t>(keys.size());
+        if (!keys.empty()) {
+            dependency.has_parent = true;
+            dependency.parent_key = keys.back();
+        }
+        keys.push_back(key);
+        deps.push_back(dependency);
     }
 
-    void initBatchGroups(int                                batch_id,
-                         int                                group_nums,
-                         int                                layer_num,
-                         const std::vector<int>&            layer_to_group_id          = {},
-                         size_t                             kernel_blocks_per_kv_block = 1,
-                         const std::vector<CacheGroupType>& group_types                = {}) {
+    void initBatchGroups(int                                  batch_id,
+                         int                                  group_nums,
+                         int                                  layer_num,
+                         const std::vector<std::vector<int>>& layer_group_ids            = {},
+                         size_t                               kernel_blocks_per_kv_block = 1,
+                         const std::vector<CacheGroupType>&   group_types                = {}) {
         RTP_LLM_CHECK(batch_id >= 0 && static_cast<size_t>(batch_id) < batch_resource.size());
-        batch_resource[batch_id].initGroups(
-            group_nums, layer_num, layer_to_group_id, kernel_blocks_per_kv_block, group_types);
+        batch_resource[batch_id].initGroups(group_nums,
+                                            layer_num,
+                                            layer_group_ids,
+                                            kernel_blocks_per_kv_block,
+                                            group_types);
     }
 
     void setBatchBlocks(int batch_id, int group_id, const BlockIndicesType& blocks) {
@@ -146,7 +182,7 @@ class BatchKVCacheResource {
 
     void setBatchCacheKeys(int batch_id, const CacheKeysType& keys) {
         RTP_LLM_CHECK(batch_id >= 0 && static_cast<size_t>(batch_id) < batch_resource.size());
-        batch_resource[batch_id].cacheKeys() = keys;
+        batch_resource[batch_id].setCacheKeys(keys);
     }
 
     void check() const {
diff --git a/rtp_llm/cpp/cache/BlockPool.cc b/rtp_llm/cpp/cache/BlockPool.cc
index c7a94322ea..267e6d1441 100644
--- a/rtp_llm/cpp/cache/BlockPool.cc
+++ b/rtp_llm/cpp/cache/BlockPool.cc
@@ -1,16 +1,127 @@
 #include "rtp_llm/cpp/cache/BlockPool.h"
-#include "rtp_llm/models_py/bindings/core/ExecOps.h"
 #include "rtp_llm/cpp/cache/MemoryLayoutStrategy.h"
+#include "rtp_llm/cpp/utils/Logger.h"
 #include "rtp_llm/cpp/utils/TimeUtil.h"
 #include "rtp_llm/cpp/utils/KVCacheUtils.h"
+#include "rtp_llm/cpp/disaggregate/cache_store/CacheStore.h"
 #include "rtp_llm/cpp/disaggregate/cache_store/MemoryUtil.h"
-#include "rtp_llm/cpp/disaggregate/cache_store/NormalCacheStore.h"
 #include "rtp_llm/cpp/utils/ProfilingScope.h"
 
+#include <cstdlib>
+#include <cerrno>
+#include <cstdint>
+#include <cstring>
+#include <exception>
+#include <string>
+#include <utility>
+
+#include <sys/mman.h>
+#include <unistd.h>
+
+#if USING_CUDA
+#include <cuda_runtime.h>
+#endif
+
 namespace rtp_llm {
 
-BlockPool::BlockPool(const BlockPoolConfig& config, AllocationType allocation_type):
-    config_(config), allocation_type_(allocation_type) {}
+namespace {
+
+bool shouldPinHostBlockPool();
+
+const char* allocationTypeName(AllocationType allocation_type) {
+    switch (allocation_type) {
+        case AllocationType::HOST:
+            return "HOST";
+        case AllocationType::DEVICE:
+            return "DEVICE";
+    }
+    return "UNKNOWN";
+}
+
+const char* memoryTypeName(MemoryType memory_type) {
+    switch (memory_type) {
+        case MemoryType::MEMORY_CPU:
+            return "CPU";
+        case MemoryType::MEMORY_CPU_PINNED:
+            return "CPU_PINNED";
+        case MemoryType::MEMORY_GPU:
+            return "GPU";
+    }
+    return "UNKNOWN";
+}
+
+const char*
+requestedBackingName(AllocationType allocation_type, bool use_pinned_cpu_backing, bool use_cuda_malloc_backing) {
+    if (allocation_type == AllocationType::HOST) {
+        return shouldPinHostBlockPool() ? "CPU_PINNED_OR_CPU_FALLBACK" : "CPU";
+    }
+    if (use_cuda_malloc_backing) {
+        return "GPU_CUDA_MALLOC";
+    }
+    return use_pinned_cpu_backing ? "CPU_PINNED" : "GPU";
+}
+
+bool shouldPinHostBlockPool() {
+    const char* value = std::getenv("RTP_LLM_PIN_HOST_BLOCK_POOL");
+    if (value == nullptr) {
+        return true;
+    }
+    const std::string flag(value);
+    return flag != "0" && flag != "false" && flag != "FALSE" && flag != "off" && flag != "OFF";
+}
+
+void markHostBlockPoolDontDump(const char* pool_name, void* ptr, size_t size) {
+#ifdef MADV_DONTDUMP
+    if (ptr == nullptr || size == 0) {
+        return;
+    }
+
+    long page_size = sysconf(_SC_PAGESIZE);
+    if (page_size <= 0) {
+        page_size = 4096;
+    }
+
+    const auto begin         = reinterpret_cast<uintptr_t>(ptr);
+    const auto page_mask     = static_cast<uintptr_t>(page_size - 1);
+    const auto aligned_begin = begin & ~page_mask;
+    const auto aligned_end   = (begin + size + page_mask) & ~page_mask;
+    const auto aligned_size  = static_cast<size_t>(aligned_end - aligned_begin);
+
+    if (madvise(reinterpret_cast<void*>(aligned_begin), aligned_size, MADV_DONTDUMP) != 0) {
+        RTP_LLM_LOG_WARNING("madvise MADV_DONTDUMP failed for host block pool, pool_name=%s ptr=%p, size=%zu, "
+                            "error=%s",
+                            pool_name,
+                            ptr,
+                            size,
+                            std::strerror(errno));
+    } else {
+        RTP_LLM_LOG_INFO("madvise MADV_DONTDUMP success for host block pool, pool_name=%s ptr=%p, size=%zu, "
+                         "aligned_ptr=%p, aligned_size=%zu",
+                         pool_name,
+                         ptr,
+                         size,
+                         reinterpret_cast<void*>(aligned_begin),
+                         aligned_size);
+    }
+#else
+    RTP_LLM_LOG_WARNING(
+        "MADV_DONTDUMP is not defined, host block pool may be included in coredump, pool_name=%s ptr=%p, size=%zu",
+        pool_name,
+        ptr,
+        size);
+#endif
+}
+
+}  // namespace
+
+BlockPool::BlockPool(const BlockPoolConfig& config,
+                     AllocationType         allocation_type,
+                     bool                   use_pinned_cpu_backing,
+                     bool                   use_cuda_malloc_backing):
+    config_(config),
+    allocation_type_(allocation_type),
+    use_pinned_cpu_backing_(use_pinned_cpu_backing),
+    use_cuda_malloc_backing_(use_cuda_malloc_backing) {}
 
 BlockPool::~BlockPool() {
     cache_aligned_buffer_ = torch::Tensor();
@@ -38,15 +149,123 @@ void BlockPool::validateConfig() const {
 
 void BlockPool::initializeCacheBuffer() {
     if (allocation_type_ == AllocationType::HOST) {
-        cache_aligned_buffer_ = torch::empty({static_cast<int64_t>(config_.total_size_bytes)},
-                                             torch::TensorOptions().dtype(torch::kUInt8).device(torch::kCPU))
-                                    .pin_memory();
+        auto cpu_buffer = torch::empty({static_cast<int64_t>(config_.total_size_bytes)},
+                                       torch::TensorOptions().dtype(torch::kUInt8).device(torch::kCPU));
+        if (shouldPinHostBlockPool()) {
+            try {
+                cache_aligned_buffer_ = cpu_buffer.pin_memory();
+            } catch (const std::exception& e) {
+                RTP_LLM_LOG_WARNING("pin host block pool failed, fallback to pageable CPU memory, pool_name=%s "
+                                    "total_size=%zu bytes, error=%s",
+                                    config_.pool_name.c_str(),
+                                    config_.total_size_bytes,
+                                    e.what());
+                cache_aligned_buffer_ = std::move(cpu_buffer);
+            }
+        } else {
+            RTP_LLM_LOG_INFO("host block pool uses pageable CPU memory, pool_name=%s total_size=%zu bytes",
+                             config_.pool_name.c_str(),
+                             config_.total_size_bytes);
+            cache_aligned_buffer_ = std::move(cpu_buffer);
+        }
+        RTP_LLM_LOG_INFO("mark host block pool dont dump, pool_name=%s ptr=%p, size=%zu",
+                         config_.pool_name.c_str(),
+                         cache_aligned_buffer_.data_ptr(),
+                         config_.total_size_bytes);
+        markHostBlockPoolDontDump(
+            config_.pool_name.c_str(), cache_aligned_buffer_.data_ptr(), config_.total_size_bytes);
+    } else if (use_pinned_cpu_backing_) {
+        initializePinnedCpuBuffer("device block pool pinned CPU backing");
+    } else if (use_cuda_malloc_backing_) {
+        initializeCudaMallocBuffer();
     } else {
         cache_aligned_buffer_ = torch::empty({static_cast<int64_t>(config_.total_size_bytes)},
                                              torch::TensorOptions().dtype(torch::kUInt8).device(torch::kCUDA));
     }
     cache_base_ptr_ = cache_aligned_buffer_.data_ptr();
     RTP_LLM_CHECK_WITH_INFO(cache_base_ptr_ != nullptr, "block pool allocate cache aligned buffer is null");
+    const bool is_cuda   = cache_aligned_buffer_.is_cuda();
+    const bool is_pinned = !is_cuda && cache_aligned_buffer_.is_pinned();
+    static constexpr double kBytesPerMB = 1024.0 * 1024.0;
+    RTP_LLM_LOG_INFO("BlockPool backing selected: pool_name=%s allocation_type=%s requested_backing=%s "
+                     "actual_backing=%s is_cuda=%d is_pinned=%d ptr=%p total_size=%zu bytes total_size_mb=%.2f "
+                     "block_num=%u memory_layouts=%zu",
+                     config_.pool_name.c_str(),
+                     allocationTypeName(allocation_type_),
+                     requestedBackingName(allocation_type_, use_pinned_cpu_backing_, use_cuda_malloc_backing_),
+                     memoryTypeName(where()),
+                     is_cuda,
+                     is_pinned,
+                     cache_base_ptr_,
+                     config_.total_size_bytes,
+                     static_cast<double>(config_.total_size_bytes) / kBytesPerMB,
+                     config_.block_num,
+                     config_.memory_layouts.size());
+}
+
+void BlockPool::initializePinnedCpuBuffer(const char* log_context) {
+    RTP_LLM_LOG_WARNING(
+        "%s, pool_name=%s, total_size=%zu bytes", log_context, config_.pool_name.c_str(), config_.total_size_bytes);
+    auto cpu_buffer = torch::empty({static_cast<int64_t>(config_.total_size_bytes)},
+                                   torch::TensorOptions().dtype(torch::kUInt8).device(torch::kCPU));
+    try {
+        cache_aligned_buffer_ = cpu_buffer.pin_memory();
+    } catch (const std::exception& e) {
+        RTP_LLM_FAIL("%s pin failed, pool_name=%s total_size=%zu bytes, error=%s",
+                     log_context,
+                     config_.pool_name.c_str(),
+                     config_.total_size_bytes,
+                     e.what());
+    }
+}
+
+void BlockPool::initializeCudaMallocBuffer() {
+#if USING_CUDA
+    RTP_LLM_CHECK_WITH_INFO(allocation_type_ == AllocationType::DEVICE,
+                            "cudaMalloc block pool backing requires DEVICE allocation");
+    RTP_LLM_CHECK_WITH_INFO(config_.total_size_bytes > 0, "cudaMalloc block pool total_size_bytes must be > 0");
+
+    int  device_id  = -1;
+    auto device_err = cudaGetDevice(&device_id);
+    RTP_LLM_CHECK_WITH_INFO(device_err == cudaSuccess,
+                            "cudaGetDevice failed before cudaMalloc block pool allocation, error=%s",
+                            cudaGetErrorString(device_err));
+
+    void*      ptr = nullptr;
+    const auto err = cudaMalloc(&ptr, config_.total_size_bytes);
+    RTP_LLM_CHECK_WITH_INFO(err == cudaSuccess,
+                            "cudaMalloc block pool failed, pool_name=%s, total_size=%zu bytes, error=%s",
+                            config_.pool_name.c_str(),
+                            config_.total_size_bytes,
+                            cudaGetErrorString(err));
+
+    auto deleter = [device_id](void* p) {
+        if (p == nullptr) {
+            return;
+        }
+        int current_device = -1;
+        if (cudaGetDevice(&current_device) == cudaSuccess && current_device != device_id) {
+            (void)cudaSetDevice(device_id);
+            (void)cudaFree(p);
+            (void)cudaSetDevice(current_device);
+            return;
+        }
+        (void)cudaFree(p);
+    };
+    cache_aligned_buffer_ =
+        torch::from_blob(ptr,
+                         {static_cast<int64_t>(config_.total_size_bytes)},
+                         std::move(deleter),
+                         torch::TensorOptions().dtype(torch::kUInt8).device(torch::Device(torch::kCUDA, device_id)));
+    RTP_LLM_LOG_INFO("cudaMalloc block pool backing allocated, pool_name=%s, ptr=%p, total_size=%zu bytes, device=%d",
+                     config_.pool_name.c_str(),
+                     ptr,
+                     config_.total_size_bytes,
+                     device_id);
+#else
+    RTP_LLM_FAIL("cudaMalloc block pool backing requested but this binary was not built with CUDA, pool_name=%s",
+                 config_.pool_name.c_str());
+#endif
 }
 
 void BlockPool::initializeLayerMappings() {
@@ -98,15 +317,16 @@ void BlockPool::processMemoryLayout(size_t layout_idx, const torch::Tensor& full
     processLayerTensors(layout_idx, layout_cfg, global_layer_begin);
 
     // 记录初始化信息
-    RTP_LLM_LOG_INFO(
-        "MemoryLayout[%zu] initialized: layer_num=%u block_num=%u kv_off=%zu kv_bytes=%zu scale_off=%zu scale_bytes=%zu",
-        layout_idx,
-        layout_cfg.layer_num,
-        layout_cfg.block_num,
-        layout_cfg.kv_cache_offset_bytes,
-        layout_cfg.kv_block_pool_size_bytes,
-        layout_cfg.kv_scale_offset_bytes,
-        layout_cfg.kv_scale_pool_size_bytes);
+    RTP_LLM_LOG_INFO("MemoryLayout[%zu] initialized: pool_name=%s layer_num=%u block_num=%u kv_off=%zu kv_bytes=%zu "
+                     "scale_off=%zu scale_bytes=%zu",
+                     layout_idx,
+                     config_.pool_name.c_str(),
+                     layout_cfg.layer_num,
+                     layout_cfg.block_num,
+                     layout_cfg.kv_cache_offset_bytes,
+                     layout_cfg.kv_block_pool_size_bytes,
+                     layout_cfg.kv_scale_offset_bytes,
+                     layout_cfg.kv_scale_pool_size_bytes);
 }
 
 torch::Tensor BlockPool::createTensor(
@@ -180,17 +400,14 @@ bool BlockPool::init() {
     initializeLayoutStrategies();
     initFreeBlocks();
 
-    RTP_LLM_LOG_INFO("BlockPool init success: memory_layouts=%zu, total_layers=%zu, total_size=%zu bytes",
+    RTP_LLM_LOG_INFO("BlockPool init success: pool_name=%s memory_layouts=%zu, total_layers=%zu, total_size=%zu bytes",
+                     config_.pool_name.c_str(),
                      config_.memory_layouts.size(),
                      global_layer_to_local_.size(),
                      config_.total_size_bytes);
     return true;
 }
 
-BlockCachePtr BlockPool::blockCache() {
-    return block_cache_;
-}
-
 void BlockPool::initFreeBlocks() {
     // block 0 is reserved
     for (BlockIdxType i = 1; i < static_cast<BlockIdxType>(config_.block_num); ++i) {
@@ -201,7 +418,6 @@ void BlockPool::initFreeBlocks() {
     req_con_ref_counter_.init(config_.block_num);
     block_cache_ref_counter_.init(config_.block_num);
     req_cache_ref_counter_.init(config_.block_num);
-    block_cache_ = std::make_shared<BlockCache>();
 }
 
 std::vector<torch::Tensor> BlockPool::allLayerCacheBase() const {
@@ -223,8 +439,10 @@ BlockIndicesType BlockPool::malloc(int num_blocks) {
     {
         std::scoped_lock lock(ref_mu_, free_mu_);
         if (free_block_ids_.size() < static_cast<size_t>(num_blocks)) {
-            RTP_LLM_LOG_WARNING(
-                "Block pool only has %zu free blocks, cannot allocate %d blocks", free_block_ids_.size(), num_blocks);
+            RTP_LLM_LOG_WARNING("Block pool only has %zu free blocks, cannot allocate %d blocks, pool_name=%s",
+                                free_block_ids_.size(),
+                                num_blocks,
+                                config_.pool_name.c_str());
             return {};
         }
         auto first = free_block_ids_.begin();
@@ -341,8 +559,9 @@ void BlockPool::regUserMr(size_t model_id, std::shared_ptr<CacheStore> cache_sto
         cache_store_ = std::move(cache_store);
     }
     if (cache_store_ && !kvcache_reg_mr_) {
-        RTP_LLM_LOG_INFO("start to register user mr");
-        auto memory_util = std::static_pointer_cast<NormalCacheStore>(cache_store_)->getMemoryUtil();
+        RTP_LLM_LOG_INFO("start to register user mr, pool_name=%s", config_.pool_name.c_str());
+        auto       memory_util = cache_store_->getMemoryUtil();
+        const bool gpu         = where() == MemoryType::MEMORY_GPU;
 
         for (size_t layout_idx = 0; layout_idx < config_.memory_layouts.size(); ++layout_idx) {
             const auto& layout_cfg = config_.memory_layouts[layout_idx];
@@ -353,6 +572,7 @@ void BlockPool::regUserMr(size_t model_id, std::shared_ptr<CacheStore> cache_sto
                                     layout_cfg.kv_cache_offset_bytes,
                                     layout_cfg.kv_block_pool_size_bytes,
                                     layout_cfg.kv_block_stride_bytes,
+                                    gpu,
                                     "kv");
 
             // Register scale buffer if present
@@ -362,6 +582,7 @@ void BlockPool::regUserMr(size_t model_id, std::shared_ptr<CacheStore> cache_sto
                                         layout_cfg.kv_scale_offset_bytes,
                                         layout_cfg.kv_scale_pool_size_bytes,
                                         layout_cfg.kv_scale_stride_bytes,
+                                        gpu,
                                         "scale");
             }
         }
@@ -372,22 +593,23 @@ void BlockPool::regUserMr(size_t model_id, std::shared_ptr<CacheStore> cache_sto
 
 void BlockPool::deregUserMr() {
     if (kvcache_reg_mr_ && cache_store_) {
-        RTP_LLM_LOG_INFO("start to deregister user mr");
-        auto memory_util = std::static_pointer_cast<NormalCacheStore>(cache_store_)->getMemoryUtil();
+        RTP_LLM_LOG_INFO("start to deregister user mr, pool_name=%s", config_.pool_name.c_str());
+        auto       memory_util = cache_store_->getMemoryUtil();
+        const bool gpu         = where() == MemoryType::MEMORY_GPU;
 
         for (size_t layout_idx = 0; layout_idx < config_.memory_layouts.size(); ++layout_idx) {
             const auto& layout_cfg = config_.memory_layouts[layout_idx];
 
             // Deregister KV buffer
-            deregisterUserMrForBuffer(memory_util, layout_idx, layout_cfg.kv_cache_offset_bytes, "kv");
+            deregisterUserMrForBuffer(memory_util, layout_idx, layout_cfg.kv_cache_offset_bytes, gpu, "kv");
 
             // Deregister scale buffer if present
             if (layout_cfg.hasScale()) {
-                deregisterUserMrForBuffer(memory_util, layout_idx, layout_cfg.kv_scale_offset_bytes, "scale");
+                deregisterUserMrForBuffer(memory_util, layout_idx, layout_cfg.kv_scale_offset_bytes, gpu, "scale");
             }
         }
 
-        RTP_LLM_LOG_INFO("deregister user mr for block pool success");
+        RTP_LLM_LOG_INFO("deregister user mr for block pool success, pool_name=%s", config_.pool_name.c_str());
         kvcache_reg_mr_ = false;
     }
 }
@@ -397,18 +619,23 @@ void BlockPool::registerUserMrForBuffer(std::shared_ptr<rtp_llm::MemoryUtil> mem
                                         size_t                               offset_bytes,
                                         size_t                               bytes,
                                         size_t                               stride_bytes,
+                                        bool                                 gpu,
                                         const std::string&                   buffer_type) {
     void* base_ptr = static_cast<void*>(static_cast<char*>(cache_base_ptr_) + static_cast<ptrdiff_t>(offset_bytes));
     auto  start_us = currentTimeUs();
 
-    if (!memory_util->regUserMr(base_ptr, bytes, true, stride_bytes)) {
-        RTP_LLM_FAIL("register user mr for block pool layout[%zu] %s buffer failed", layout_idx, buffer_type.c_str());
+    if (!memory_util->regUserMr(base_ptr, bytes, gpu, stride_bytes)) {
+        RTP_LLM_FAIL("register user mr for block pool layout[%zu] %s buffer failed, pool_name=%s",
+                     layout_idx,
+                     buffer_type.c_str(),
+                     config_.pool_name.c_str());
     }
 
     auto cost_ms = (currentTimeUs() - start_us) / 1000;
     mr_cost_time_ms_ += cost_ms;
 
-    RTP_LLM_LOG_INFO("register user mr success: layout[%zu] %s base=%p len=%zu aligned=%zu cost=%ld ms",
+    RTP_LLM_LOG_INFO("register user mr success: pool_name=%s layout[%zu] %s base=%p len=%zu aligned=%zu cost=%ld ms",
+                     config_.pool_name.c_str(),
                      layout_idx,
                      buffer_type.c_str(),
                      base_ptr,
@@ -420,11 +647,15 @@ void BlockPool::registerUserMrForBuffer(std::shared_ptr<rtp_llm::MemoryUtil> mem
 void BlockPool::deregisterUserMrForBuffer(std::shared_ptr<rtp_llm::MemoryUtil> memory_util,
                                           size_t                               layout_idx,
                                           size_t                               offset_bytes,
+                                          bool                                 gpu,
                                           const std::string&                   buffer_type) {
     void* base_ptr = static_cast<void*>(static_cast<char*>(cache_base_ptr_) + static_cast<ptrdiff_t>(offset_bytes));
 
-    if (!memory_util->deregUserMr(base_ptr, true)) {
-        RTP_LLM_FAIL("deregister user mr for block pool layout[%zu] %s buffer failed", layout_idx, buffer_type.c_str());
+    if (!memory_util->deregUserMr(base_ptr, gpu)) {
+        RTP_LLM_FAIL("deregister user mr for block pool layout[%zu] %s buffer failed, pool_name=%s",
+                     layout_idx,
+                     buffer_type.c_str(),
+                     config_.pool_name.c_str());
     }
 }
 
@@ -470,8 +701,10 @@ size_t BlockPool::notInUseBlocksNum() const {
 // Returns {layout_index, local_layer_id}. layout_index is the index in BlockPoolConfig.memory_layouts.
 std::pair<int, int> BlockPool::mapGlobalLayerIdToLocal(int global_layer_id) const {
     if (global_layer_id < 0 || static_cast<size_t>(global_layer_id) >= global_layer_to_local_.size()) {
-        RTP_LLM_LOG_ERROR(
-            "Global layer_id %d out of range (total layers: %zu)", global_layer_id, global_layer_to_local_.size());
+        RTP_LLM_LOG_ERROR("Global layer_id %d out of range (total layers: %zu), pool_name=%s",
+                          global_layer_id,
+                          global_layer_to_local_.size(),
+                          config_.pool_name.c_str());
         return {-1, -1};
     }
 
@@ -500,7 +733,10 @@ BlockPool::convertIndexToBuffer(int layer_id, int block_id, int partition_count,
 }
 
 MemoryType BlockPool::where() const {
-    return cache_aligned_buffer_.is_cuda() ? MemoryType::MEMORY_GPU : MemoryType::MEMORY_CPU;
+    if (cache_aligned_buffer_.is_cuda()) {
+        return MemoryType::MEMORY_GPU;
+    }
+    return cache_aligned_buffer_.is_pinned() ? MemoryType::MEMORY_CPU_PINNED : MemoryType::MEMORY_CPU;
 }
 
 void BlockPool::checkLayoutValidity(int layout_id) const {
diff --git a/rtp_llm/cpp/cache/BlockPool.h b/rtp_llm/cpp/cache/BlockPool.h
index 30a4ce1e85..3e8b1291a4 100644
--- a/rtp_llm/cpp/cache/BlockPool.h
+++ b/rtp_llm/cpp/cache/BlockPool.h
@@ -12,7 +12,6 @@
 #include "rtp_llm/cpp/cache/BlockRefCounter.h"
 #include "rtp_llm/cpp/cache/Types.h"
 #include "rtp_llm/cpp/cache/BufferTypes.h"
-#include "rtp_llm/cpp/cache/BlockCache.h"
 #include "rtp_llm/cpp/cache/MemoryLayoutStrategy.h"
 #include "rtp_llm/cpp/cache/BlockPoolConfig.h"
 #include "rtp_llm/cpp/disaggregate/cache_store/MemoryUtil.h"
@@ -23,13 +22,14 @@ class CacheStore;
 
 class BlockPool {
 public:
-    BlockPool(const BlockPoolConfig& config, AllocationType allocation_type = AllocationType::DEVICE);
+    BlockPool(const BlockPoolConfig& config,
+              AllocationType         allocation_type         = AllocationType::DEVICE,
+              bool                   use_pinned_cpu_backing  = false,
+              bool                   use_cuda_malloc_backing = false);
     ~BlockPool();
 
     bool init();
 
-    BlockCachePtr blockCache();
-
     MemoryType                 where() const;
     std::vector<torch::Tensor> allLayerCacheBase() const;
     std::vector<torch::Tensor> allLayerScaleCacheBase() const;
@@ -74,6 +74,9 @@ class BlockPool {
     size_t getTotalSizeBytes() const {
         return config_.total_size_bytes;
     }
+    const std::string& poolName() const {
+        return config_.pool_name;
+    }
 
 private:
     void initFreeBlocks();
@@ -85,6 +88,8 @@ class BlockPool {
     // Helper functions for init()
     void validateConfig() const;
     void initializeCacheBuffer();
+    void initializePinnedCpuBuffer(const char* log_context);
+    void initializeCudaMallocBuffer();
     void initializeLayerMappings();
     void initializeLayoutStrategies();
 
@@ -107,10 +112,12 @@ class BlockPool {
                                  size_t                               offset_bytes,
                                  size_t                               bytes,
                                  size_t                               stride_bytes,
+                                 bool                                 gpu,
                                  const std::string&                   buffer_type);
     void deregisterUserMrForBuffer(std::shared_ptr<rtp_llm::MemoryUtil> memory_util,
                                    size_t                               layout_idx,
                                    size_t                               offset_bytes,
+                                   bool                                 gpu,
                                    const std::string&                   buffer_type);
 
 private:
@@ -126,8 +133,8 @@ class BlockPool {
     BlockRefCounter        req_cache_ref_counter_;
 
     AllocationType allocation_type_;
-
-    BlockCachePtr block_cache_;
+    bool           use_pinned_cpu_backing_;
+    bool           use_cuda_malloc_backing_;
 
     torch::Tensor               cache_aligned_buffer_;
     void*                       cache_base_ptr_  = nullptr;
diff --git a/rtp_llm/cpp/cache/BlockPoolConfig.h b/rtp_llm/cpp/cache/BlockPoolConfig.h
index 09ff401cb8..67bc62fc87 100644
--- a/rtp_llm/cpp/cache/BlockPoolConfig.h
+++ b/rtp_llm/cpp/cache/BlockPoolConfig.h
@@ -1,5 +1,6 @@
 #pragma once
 
+#include <string>
 #include <vector>
 
 #include "rtp_llm/cpp/cache/MemoryLayoutConfig.h"
@@ -7,6 +8,8 @@
 namespace rtp_llm {
 
 struct BlockPoolConfig {
+    std::string pool_name = "unnamed";
+
     // all memory layouts share the same block id space
     uint32_t block_num = 0;
 
@@ -15,4 +18,4 @@ struct BlockPoolConfig {
     std::vector<MemoryLayoutConfig> memory_layouts;
 };
 
-}  // namespace rtp_llm
\ No newline at end of file
+}  // namespace rtp_llm
diff --git a/rtp_llm/cpp/cache/BlockPoolConfigHelper.h b/rtp_llm/cpp/cache/BlockPoolConfigHelper.h
index deac75c410..bab8f95ee7 100644
--- a/rtp_llm/cpp/cache/BlockPoolConfigHelper.h
+++ b/rtp_llm/cpp/cache/BlockPoolConfigHelper.h
@@ -3,6 +3,8 @@
 #include "rtp_llm/cpp/cache/CacheConfig.h"
 #include "rtp_llm/cpp/cache/BlockPoolConfig.h"
 
+#include <string>
+
 namespace rtp_llm {
 
 class BlockPoolConfigHelper {
@@ -16,12 +18,13 @@ class BlockPoolConfigHelper {
      * @param cache_config The CacheConfig containing main model and optional MTP modules
      */
     static BlockPoolConfig createConfig(const CacheConfig& cache_config) {
-        RTP_LLM_CHECK_WITH_INFO(!cache_config.cache_specs.empty(), "cache_specs must not be empty");
+        RTP_LLM_CHECK_WITH_INFO(cache_config.groupNums() > 0, "cache groups must not be empty");
         BlockPoolConfig config;
+        config.pool_name      = "default";
         config.block_num      = cache_config.block_num;
         const bool  is_hybrid = cache_config.groupNums() > 1;
         auto        layer_num = is_hybrid ? cache_config.group_layer_num : cache_config.layer_num;
-        const auto& main_spec = cache_config.cache_specs[0];
+        const auto& main_spec = cache_config.specForGroup(0);
         // linear block size is same with full block block size
         MemoryLayoutConfig main_layout = createMemoryLayoutConfig(is_hybrid,
                                                                   layer_num,
@@ -42,13 +45,23 @@ class BlockPoolConfigHelper {
         for (size_t i = 0; i < cache_config.mtp_sub_configs.size(); ++i) {
             const auto& mtp_sub_config = cache_config.mtp_sub_configs[i];
             RTP_LLM_CHECK_WITH_INFO(mtp_sub_config != nullptr, "mtp_sub_configs[%zu] is null", i);
-            RTP_LLM_CHECK_WITH_INFO(
-                !mtp_sub_config->cache_specs.empty(), "MTP module %zu cache_specs must not be empty", i);
+            RTP_LLM_CHECK_WITH_INFO(mtp_sub_config->groupNums() > 0,
+                                    "MTP module %zu cache groups must not be empty",
+                                    i);
 
             const auto mtp_layer_num = mtp_sub_config->layer_num;
 
-            const auto& mtp_spec = mtp_sub_config->cache_specs[0];
-            // mtp block size is not same with main model block size
+            size_t real_mtp_gid = 0;
+            for (size_t gid = 0; gid < static_cast<size_t>(mtp_sub_config->groupNums()); ++gid) {
+                if (!mtp_sub_config->layerIdsForGroup(gid).empty()) {
+                    real_mtp_gid = gid;
+                    break;
+                }
+            }
+            const auto& mtp_spec = mtp_sub_config->specForGroup(real_mtp_gid);
+            // mtp block size is not same with main model block size.  MTP
+            // sub-configs may keep target-aligned placeholder groups, so use
+            // the first group that owns a real MTP layer instead of gid 0.
             MemoryLayoutConfig mtp_layout = createMemoryLayoutConfig(false,
                                                                      mtp_layer_num,
                                                                      mtp_spec->block_size_bytes(),
@@ -79,10 +92,61 @@ class BlockPoolConfigHelper {
         return config;
     }
 
+    static BlockPoolConfig createConfigForGroup(const CacheConfig& cache_config, size_t group_id) {
+        RTP_LLM_CHECK_WITH_INFO(group_id < static_cast<size_t>(cache_config.groupNums()),
+                                "group_id %zu out of range, groupNums=%d",
+                                group_id,
+                                cache_config.groupNums());
+        const auto& spec = cache_config.specForGroup(group_id);
+        RTP_LLM_CHECK_WITH_INFO(spec != nullptr, "cache_specs[%zu] is null", group_id);
+
+        BlockPoolConfig config;
+        config.pool_name = "group_" + std::to_string(group_id);
+        const auto& tag = cache_config.tagForGroup(group_id);
+        if (!tag.empty()) {
+            config.pool_name = tag;
+        }
+        config.block_num = cache_config.blockNumForGroup(group_id);
+        const bool has_group_blocks = config.block_num != cache_config.block_num;
+        RTP_LLM_LOG_INFO("createConfigForGroup: pool_name=%s gid=%zu block_num=%d (has_group_blocks=%d, "
+                         "groupNums=%d, global_block_num=%d)",
+                         config.pool_name.c_str(),
+                         group_id,
+                         config.block_num,
+                         has_group_blocks,
+                         cache_config.groupNums(),
+                         cache_config.block_num);
+
+        const uint32_t layer_num = static_cast<uint32_t>(cache_config.layerIdsForGroup(group_id).size());
+        RTP_LLM_CHECK_WITH_INFO(layer_num > 0, "group %zu has no layers", group_id);
+
+        const size_t kv_stride    = cache_config.kvBlockStrideBytesForGroup(group_id);
+        const size_t scale_stride = cache_config.kvScaleStrideBytesForGroup(group_id);
+
+        CacheConfig group_cache_config = cache_config;
+        group_cache_config.block_num   = config.block_num;
+        if (group_id < cache_config.group_seq_size_per_block.size()
+            && cache_config.group_seq_size_per_block[group_id] > 0) {
+            group_cache_config.seq_size_per_block = cache_config.group_seq_size_per_block[group_id];
+        }
+
+        MemoryLayoutConfig layout =
+            createMemoryLayoutConfig(false, layer_num, kv_stride, scale_stride, spec, group_cache_config);
+        const bool is_full_group          = cache_config.typeForGroup(group_id) == CacheGroupType::FULL;
+        layout.kernel_blocks_per_kv_block = is_full_group ? cache_config.kernelBlocksPerKvBlock() : 1;
+        layout.kv_cache_offset_bytes      = 0;
+        layout.kv_scale_offset_bytes      = layout.kv_cache_offset_bytes + layout.kv_block_pool_size_bytes;
+
+        config.memory_layouts.push_back(layout);
+        config.total_size_bytes = layout.kv_block_pool_size_bytes + layout.kv_scale_pool_size_bytes;
+        return config;
+    }
+
     // for memory connector
     static BlockPoolConfig
     createConfig(uint32_t layer_num, uint32_t block_num, size_t block_stride_bytes, rtp_llm::DataType dtype) {
         BlockPoolConfig config;
+        config.pool_name = "memory_connector";
         config.block_num = block_num;
 
         MemoryLayoutConfig layout_cfg;
@@ -122,7 +186,7 @@ class BlockPoolConfigHelper {
         cfg.v_scale_stride_bytes  = spec->v_scale_block_size_bytes();
 
         cfg.enable_kv_scale         = cfg.kv_scale_stride_bytes > 0;
-        cfg.dtype                   = cache_config.dtype;
+        cfg.dtype                   = spec->dtype;
         cfg.local_head_num_kv       = spec->local_head_num_kv;
         cfg.enable_hybrid_attention = enable_hybrid_attention;
         // Scale 3D layout for MLA and indexer; KV 3D only for MLA (concat_and_cache_mla)
diff --git a/rtp_llm/cpp/cache/BufferTypes.h b/rtp_llm/cpp/cache/BufferTypes.h
index 743f6182f8..3e0da5279f 100644
--- a/rtp_llm/cpp/cache/BufferTypes.h
+++ b/rtp_llm/cpp/cache/BufferTypes.h
@@ -1,9 +1,11 @@
 #pragma once
 
+#include <map>
+#include <string>
 #include <vector>
 
 #include <torch/extension.h>
-#include "rtp_llm/cpp/cache/CacheGroupType.h"
+#include "rtp_llm/cpp/cache/spec/CacheGroupType.h"
 
 namespace rtp_llm {
 
@@ -13,11 +15,16 @@ struct BlockBufferPtrInfo {
 };
 
 struct CacheLayerLayout {
-    std::vector<int>            layer_to_groups;
-    std::vector<CacheGroupType> group_types;
-    std::vector<CacheGroupType> layer_attn_types;
-    std::vector<torch::Tensor>  layers_to_kv_buffer_ptrs;
-    std::vector<torch::Tensor>  layers_to_scale_buffer_ptrs;
+    std::vector<std::vector<int>> layer_to_group_ids;
+    std::vector<CacheGroupType>   group_types;
+    std::vector<std::string>        group_tags;
+    std::vector<std::map<std::string, int>> layer_tag_to_group_id;
+    std::vector<size_t>             group_seq_size_per_block;
+    std::vector<CacheGroupType>   layer_group_types;
+    std::vector<torch::Tensor>              layers_to_kv_buffer_ptrs;
+    std::vector<torch::Tensor>              layers_to_scale_buffer_ptrs;
+    std::vector<std::vector<torch::Tensor>> layers_to_kv_buffer_ptrs_by_group;
+    std::vector<std::vector<torch::Tensor>> layers_to_scale_buffer_ptrs_by_group;
 };
 
 struct KVCacheBuffer {
diff --git a/rtp_llm/cpp/cache/CPSlotMapper.cc b/rtp_llm/cpp/cache/CPSlotMapper.cc
new file mode 100644
index 0000000000..382f34ced9
--- /dev/null
+++ b/rtp_llm/cpp/cache/CPSlotMapper.cc
@@ -0,0 +1,40 @@
+#include "rtp_llm/cpp/cache/CPSlotMapper.h"
+
+#include <stdexcept>
+
+namespace rtp_llm {
+
+CPSlotMapper::CPSlotMapper(): cp_rank_(0), cp_size_(1), block_size_(1), virtual_block_size_(1) {}
+
+CPSlotMapper::CPSlotMapper(int cp_rank, int cp_size, int block_size):
+    cp_rank_(cp_rank), cp_size_(cp_size), block_size_(block_size), virtual_block_size_(block_size * cp_size) {
+    if (cp_size <= 0) {
+        throw std::invalid_argument("CPSlotMapper cp_size must be positive");
+    }
+    if (block_size <= 0) {
+        throw std::invalid_argument("CPSlotMapper block_size must be positive");
+    }
+    if (cp_rank < 0 || cp_rank >= cp_size) {
+        throw std::invalid_argument("CPSlotMapper cp_rank out of range");
+    }
+}
+
+int CPSlotMapper::localBlockCount(int seq_len) const {
+    if (seq_len <= 0) {
+        return 0;
+    }
+    // All CP ranks keep the same block count = ceil(total_blocks / cp_size).
+    // rank0 is the controller: it allocates blocks and broadcasts block_ids
+    // to all ranks.  Using a uniform count simplifies KV cache management —
+    // ranks with fewer "real" data blocks simply have unused trailing blocks.
+    int total_blocks = (seq_len + block_size_ - 1) / block_size_;
+    return (total_blocks + cp_size_ - 1) / cp_size_;
+}
+
+int CPSlotMapper::effectiveSeqLenForAlloc(int actual_seq_len) const {
+    // Translate to a seq_len that, when the allocator divides by block_size,
+    // yields localBlockCount(actual_seq_len).
+    return localBlockCount(actual_seq_len) * block_size_;
+}
+
+}  // namespace rtp_llm
diff --git a/rtp_llm/cpp/cache/CPSlotMapper.h b/rtp_llm/cpp/cache/CPSlotMapper.h
new file mode 100644
index 0000000000..804bb31a4a
--- /dev/null
+++ b/rtp_llm/cpp/cache/CPSlotMapper.h
@@ -0,0 +1,48 @@
+#pragma once
+
+namespace rtp_llm {
+
+/// Page-level virtual block sharding for Context Parallelism.
+///
+/// Entire blocks are assigned to ranks round-robin: block_idx % cp_size == cp_rank.
+/// Virtual block size is block_size * cp_size (used for cache key grouping).
+///
+/// Sharded when cp_size > 1.  The default constructor (cp_size=1) gives
+/// passthrough behaviour identical to "no CP".
+class CPSlotMapper {
+public:
+    CPSlotMapper();
+    CPSlotMapper(int cp_rank, int cp_size, int block_size);
+
+    bool isSharded() const {
+        return cp_size_ > 1;
+    }
+
+    int cpRank() const {
+        return cp_rank_;
+    }
+    int cpSize() const {
+        return cp_size_;
+    }
+    int blockSize() const {
+        return block_size_;
+    }
+    int virtualBlockSize() const {
+        return virtual_block_size_;
+    }
+
+    int localBlockCount(int seq_len) const;
+
+    // Translate actual seq_len to an effective value that, when divided by
+    // block_size, yields localBlockCount(actual_seq_len).  Use this when
+    // feeding seq_len into an allocator that divides by block_size internally.
+    int effectiveSeqLenForAlloc(int actual_seq_len) const;
+
+private:
+    int cp_rank_            = 0;
+    int cp_size_            = 1;
+    int block_size_         = 1;
+    int virtual_block_size_ = 1;
+};
+
+}  // namespace rtp_llm
diff --git a/rtp_llm/cpp/cache/CacheConfig.h b/rtp_llm/cpp/cache/CacheConfig.h
index 75f80b6d32..0875ce0277 100644
--- a/rtp_llm/cpp/cache/CacheConfig.h
+++ b/rtp_llm/cpp/cache/CacheConfig.h
@@ -1,28 +1,50 @@
 #pragma once
 
+#include <algorithm>
+#include <cstdint>
+#include <map>
 #include <memory>
 #include <sstream>
 #include <string>
+#include <unordered_map>
 #include <vector>
 
-#include "rtp_llm/cpp/cache/CacheGroupType.h"
-#include "rtp_llm/cpp/cache/KVCacheSpec.h"
+#include "rtp_llm/cpp/cache/spec/CacheGroupType.h"
+#include "rtp_llm/cpp/cache/spec/KVCacheSpec.h"
+#include "rtp_llm/cpp/config/ConfigModules.h"
+#include "rtp_llm/cpp/utils/AssertUtils.h"
 #include "rtp_llm/models_py/bindings/core/Types.h"
 #include "rtp_llm/cpp/utils/StringUtil.h"
 
 namespace rtp_llm {
 
+struct GroupBase {
+    KVCacheSpecPtr   spec;
+    CacheGroupPolicy policy;
+    std::vector<int> layer_ids;
+    uint32_t         block_num             = 0;
+    size_t           kv_block_stride_bytes = 0;
+    size_t           kv_scale_stride_bytes = 0;
+};
+
+struct LayerBase {
+    std::vector<int>           group_ids;
+    std::map<std::string, int> tag_to_gid;
+};
+
 struct CacheConfig {
-    // Cache specification and layer mapping
-    std::vector<KVCacheSpecPtr>   cache_specs;
-    std::vector<std::vector<int>> global_layer_ids;  // including mtp module layers
-    std::vector<std::vector<int>> layer_ids;
-    std::vector<std::vector<int>> linear_groups;  // for hybrid attention
-    std::vector<std::vector<int>> full_groups;    // for hybrid attention
-    std::vector<CacheGroupType>   group_types;    // for hybrid attention
-    std::vector<CacheGroupType>   layer_attn_types;
-    std::vector<int>              layer_to_group_id;
-    std::vector<int>              layer_to_block_stride_bytes;
+    std::vector<GroupBase>          groups;
+    std::vector<LayerBase>          layers;
+    std::unordered_map<std::string, int> tag_to_gid;
+
+    // Cache specification and layer mapping are owned by groups/layers above.
+    std::vector<int>               layer_to_block_stride_bytes;
+    std::vector<size_t>            group_seq_size_per_block;
+    bool                           group_block_layout_initialized          = false;
+    bool                           use_independent_block_pools              = false;
+    bool                           use_typed_cache_regions                  = false;
+    bool                           use_opaque_kv_cache_store                = false;
+    bool                           disable_decode_first_malloc_device_reuse = false;
 
     // Model configuration
     rtp_llm::DataType dtype;
@@ -46,19 +68,22 @@ struct CacheConfig {
 
     // Block sizing information
     // ---- Per-block sizes (all layers) ----
-    size_t kv_block_size_bytes = 0;
-    size_t kv_scale_size_bytes = 0;
-    size_t block_size_bytes    = 0;  // (kv + scales together)
+    size_t kv_block_size_bytes  = 0;
+    size_t kv_scale_size_bytes  = 0;
+    size_t block_size_bytes     = 0;  // (kv + scales together)
 
     // ---- Per-block strides (one layer) ----
     size_t kv_block_stride_bytes = 0;
     size_t kv_scale_stride_bytes = 0;
 
+    // Bytes pre-reserved for explicitly-sized pools.
+    // CacheConfigCreator deducts this from kv_cache_mem_size before computing the
+    // paged block_num, so paged pools don't overcommit HBM. 0 means no reservation.
+    size_t explicitly_sized_pool_reserve_bytes = 0;
+
     // Attention-specific configuration
-    int linear_step      = 1;  // For Linear attention: keep one cache block every `linear_step` blocks
+    int linear_step = 1;  // For Linear attention: keep one cache block every `linear_step` blocks
     int group_layer_num  = 1;  // Number of layers per group for hybrid attention
-    int linear_group_num = 0;  // Number of linear attention groups
-    int full_group_num   = 0;  // Number of full attention groups
 
     // mtp-model configurations
     std::vector<std::shared_ptr<CacheConfig>> mtp_sub_configs;
@@ -66,7 +91,601 @@ struct CacheConfig {
     CacheConfig() {}
 
     int groupNums() const {
-        return std::max<int>(1, static_cast<int>(cache_specs.size()));
+        return static_cast<int>(groups.size());
+    }
+
+    const KVCacheSpecPtr& specForGroup(size_t gid) const {
+        RTP_LLM_CHECK_WITH_INFO(gid < groups.size(), "CacheConfig::specForGroup invalid gid=%zu size=%zu", gid, groups.size());
+        RTP_LLM_CHECK_WITH_INFO(groups[gid].spec != nullptr, "CacheConfig::specForGroup null spec gid=%zu", gid);
+        return groups[gid].spec;
+    }
+
+    CacheGroupType typeForGroup(size_t gid) const {
+        RTP_LLM_CHECK_WITH_INFO(gid < groups.size(), "CacheConfig::typeForGroup invalid gid=%zu size=%zu", gid, groups.size());
+        return groups[gid].policy.group_type;
+    }
+
+    const std::string& tagForGroup(size_t gid) const {
+        return specForGroup(gid)->tag;
+    }
+
+    int groupIdForTag(const std::string& tag) const {
+        const auto it = tag_to_gid.find(tag);
+        RTP_LLM_CHECK_WITH_INFO(it != tag_to_gid.end(), "CacheConfig::groupIdForTag missing tag=%s", tag.c_str());
+        return it->second;
+    }
+
+    const std::vector<int>& layerIdsForGroup(size_t gid) const {
+        RTP_LLM_CHECK_WITH_INFO(gid < groups.size(), "CacheConfig::layerIdsForGroup invalid gid=%zu size=%zu", gid, groups.size());
+        return groups[gid].layer_ids;
+    }
+
+    std::vector<CacheGroupType> groupTypesSnapshot() const {
+        std::vector<CacheGroupType> types;
+        types.reserve(groups.size());
+        for (const auto& group : groups) {
+            types.push_back(group.policy.group_type);
+        }
+        return types;
+    }
+
+    std::vector<std::string> groupTagsSnapshot() const {
+        std::vector<std::string> tags;
+        tags.reserve(groups.size());
+        for (const auto& group : groups) {
+            RTP_LLM_CHECK_WITH_INFO(group.spec != nullptr, "CacheConfig::groupTagsSnapshot null spec");
+            tags.push_back(group.spec->tag);
+        }
+        return tags;
+    }
+
+    std::vector<CacheGroupPolicy> groupPoliciesSnapshot() const {
+        std::vector<CacheGroupPolicy> policies;
+        policies.reserve(groups.size());
+        for (const auto& group : groups) {
+            policies.push_back(group.policy);
+        }
+        return policies;
+    }
+
+    std::vector<uint32_t> groupBlockNumsSnapshot() const {
+        if (!group_block_layout_initialized) {
+            return {};
+        }
+        std::vector<uint32_t> block_nums;
+        block_nums.reserve(groups.size());
+        for (const auto& group : groups) {
+            block_nums.push_back(group.block_num);
+        }
+        return block_nums;
+    }
+
+    std::vector<size_t> groupBlockSizeBytesSnapshot() const {
+        std::vector<size_t> result;
+        result.reserve(static_cast<size_t>(groupNums()));
+        for (size_t gid = 0; gid < static_cast<size_t>(groupNums()); ++gid) {
+            result.push_back(blockSizeBytesForGroup(gid));
+        }
+        return result;
+    }
+
+    std::vector<size_t> groupKvBlockStrideBytesSnapshot() const {
+        if (!group_block_layout_initialized) {
+            return {};
+        }
+        std::vector<size_t> strides;
+        strides.reserve(groups.size());
+        for (const auto& group : groups) {
+            strides.push_back(group.kv_block_stride_bytes);
+        }
+        return strides;
+    }
+
+    std::vector<size_t> groupKvScaleStrideBytesSnapshot() const {
+        if (!group_block_layout_initialized) {
+            return {};
+        }
+        std::vector<size_t> strides;
+        strides.reserve(groups.size());
+        for (const auto& group : groups) {
+            strides.push_back(group.kv_scale_stride_bytes);
+        }
+        return strides;
+    }
+
+    std::vector<std::vector<int>> layerGroupIdsSnapshot() const {
+        std::vector<std::vector<int>> result;
+        result.reserve(layers.size());
+        for (const auto& layer : layers) {
+            result.push_back(layer.group_ids);
+        }
+        return result;
+    }
+
+    // Compatibility: flat layer→first-group-id mapping (old: layer_to_group_id).
+    std::vector<int> flatLayerToGroupId() const {
+        std::vector<int> result;
+        result.reserve(layers.size());
+        for (const auto& layer : layers) {
+            result.push_back(layer.group_ids.empty() ? -1 : layer.group_ids[0]);
+        }
+        return result;
+    }
+
+    // Compatibility: per-group global layer IDs (old: global_layer_ids).
+    std::vector<std::vector<int>> globalLayerIdsSnapshot() const {
+        std::vector<std::vector<int>> result;
+        result.reserve(groups.size());
+        for (const auto& group : groups) {
+            result.push_back(group.layer_ids);
+        }
+        return result;
+    }
+
+    std::vector<std::map<std::string, int>> layerTagToGroupIdSnapshot() const {
+        std::vector<std::map<std::string, int>> result;
+        result.reserve(layers.size());
+        for (const auto& layer : layers) {
+            result.push_back(layer.tag_to_gid);
+        }
+        return result;
+    }
+
+    uint32_t blockNumForGroup(size_t gid) const {
+        RTP_LLM_CHECK_WITH_INFO(gid < groups.size(), "CacheConfig::blockNumForGroup invalid gid=%zu size=%zu", gid, groups.size());
+        if (group_block_layout_initialized && groups[gid].block_num > 0) {
+            return groups[gid].block_num;
+        }
+        return block_num;
+    }
+
+    size_t kvBlockStrideBytesForGroup(size_t gid) const {
+        RTP_LLM_CHECK_WITH_INFO(gid < groups.size(), "CacheConfig::kvBlockStrideBytesForGroup invalid gid=%zu size=%zu", gid, groups.size());
+        if (group_block_layout_initialized) {
+            return groups[gid].kv_block_stride_bytes;
+        }
+        return specForGroup(gid)->block_size_bytes();
+    }
+
+    size_t kvScaleStrideBytesForGroup(size_t gid) const {
+        RTP_LLM_CHECK_WITH_INFO(gid < groups.size(), "CacheConfig::kvScaleStrideBytesForGroup invalid gid=%zu size=%zu", gid, groups.size());
+        if (group_block_layout_initialized) {
+            return groups[gid].kv_scale_stride_bytes;
+        }
+        return specForGroup(gid)->scale_block_size_bytes();
+    }
+
+    size_t blockSizeBytesForGroup(size_t gid) const {
+        return layerIdsForGroup(gid).size() * (kvBlockStrideBytesForGroup(gid) + kvScaleStrideBytesForGroup(gid));
+    }
+
+    void setGroupPolicies(const std::vector<CacheGroupPolicy>& policies) {
+        RTP_LLM_CHECK_WITH_INFO(policies.size() == groups.size(),
+                                "CacheConfig::setGroupPolicies size %zu != group size %zu",
+                                policies.size(),
+                                groups.size());
+        for (size_t gid = 0; gid < policies.size(); ++gid) {
+            groups[gid].policy = policies[gid];
+        }
+    }
+
+    void setGroupBlockLayout(const std::vector<uint32_t>& block_nums,
+                             const std::vector<size_t>&   kv_block_stride_bytes,
+                             const std::vector<size_t>&   kv_scale_stride_bytes) {
+        const size_t group_num = static_cast<size_t>(groupNums());
+        RTP_LLM_CHECK_WITH_INFO(block_nums.size() == group_num,
+                                "CacheConfig::setGroupBlockLayout block_nums size %zu != group size %zu",
+                                block_nums.size(),
+                                group_num);
+        RTP_LLM_CHECK_WITH_INFO(kv_block_stride_bytes.size() == group_num,
+                                "CacheConfig::setGroupBlockLayout kv stride size %zu != group size %zu",
+                                kv_block_stride_bytes.size(),
+                                group_num);
+        RTP_LLM_CHECK_WITH_INFO(kv_scale_stride_bytes.size() == group_num,
+                                "CacheConfig::setGroupBlockLayout scale stride size %zu != group size %zu",
+                                kv_scale_stride_bytes.size(),
+                                group_num);
+        for (size_t gid = 0; gid < group_num; ++gid) {
+            groups[gid].block_num             = block_nums[gid];
+            groups[gid].kv_block_stride_bytes = kv_block_stride_bytes[gid];
+            groups[gid].kv_scale_stride_bytes = kv_scale_stride_bytes[gid];
+        }
+        group_block_layout_initialized = true;
+    }
+
+    void resizeLayerRoutes(size_t layer_count) {
+        layers.resize(layer_count);
+    }
+
+    void setLayerIdsForGroup(size_t gid, const std::vector<int>& layer_ids) {
+        RTP_LLM_CHECK_WITH_INFO(gid < groups.size(),
+                                "CacheConfig::setLayerIdsForGroup invalid gid=%zu size=%zu",
+                                gid,
+                                groups.size());
+        groups[gid].layer_ids = layer_ids;
+        if (groups[gid].spec != nullptr) {
+            groups[gid].spec->layers = layer_ids;
+        }
+    }
+
+    void appendLayerToGroup(size_t gid, int layer_id, const std::string& tag) {
+        RTP_LLM_CHECK_WITH_INFO(gid < groups.size(),
+                                "CacheConfig::appendLayerToGroup invalid gid=%zu size=%zu",
+                                gid,
+                                groups.size());
+        RTP_LLM_CHECK_WITH_INFO(layer_id >= 0, "CacheConfig::appendLayerToGroup invalid layer_id=%d", layer_id);
+        const auto layer = static_cast<size_t>(layer_id);
+        if (layer >= layers.size()) {
+            layers.resize(layer + 1);
+        }
+        groups[gid].layer_ids.push_back(layer_id);
+        if (groups[gid].spec != nullptr) {
+            groups[gid].spec->layers = groups[gid].layer_ids;
+        }
+        layers[layer].group_ids.push_back(static_cast<int>(gid));
+        if (!tag.empty()) {
+            layers[layer].tag_to_gid[tag] = static_cast<int>(gid);
+        }
+    }
+
+    size_t fullGroupId() const {
+        for (size_t gid = 0; gid < static_cast<size_t>(groupNums()); ++gid) {
+            if (typeForGroup(gid) == CacheGroupType::FULL) {
+                return gid;
+            }
+        }
+        return 0;
+    }
+
+    std::shared_ptr<CacheConfig> mergeMTPModule(const CacheConfig& propose_config,
+                                                int                module_index,
+                                                uint32_t           main_layer_num) {
+        RTP_LLM_CHECK_WITH_INFO(!groups.empty(), "CacheConfig::mergeMTPModule requires destination topology views");
+        RTP_LLM_CHECK_WITH_INFO(!propose_config.groups.empty(),
+                                "CacheConfig::mergeMTPModule requires propose topology views");
+        RTP_LLM_CHECK_WITH_INFO(module_index >= 0, "CacheConfig::mergeMTPModule invalid module_index=%d", module_index);
+
+        auto sub_cfg           = std::make_shared<CacheConfig>(propose_config);
+        sub_cfg->block_num     = block_num;
+        sub_cfg->layer_all_num = sub_cfg->layer_num;
+
+        const auto mtp_layer_num = propose_config.layer_num;
+        const auto total_layers =
+            static_cast<size_t>(main_layer_num) + static_cast<size_t>(module_index + 1) * mtp_layer_num;
+        resizeLayerRoutes(total_layers);
+        if (layer_to_block_stride_bytes.size() < total_layers) {
+            layer_to_block_stride_bytes.resize(total_layers, 0);
+        }
+
+        // MTP currently relies on target and draft models sharing the same group-index
+        // namespace: model inputs, CUDA graph metadata, and Python attention inputs pass
+        // block tables by gid without a draft-local remap.  Therefore the sub-config
+        // keeps every target group in first-seen order. Groups not used by the propose
+        // model are placeholders with an empty layer list.
+        const auto target_group_num = static_cast<size_t>(groupNums());
+        std::unordered_map<std::string, size_t> propose_gid_by_tag;
+        for (size_t gid = 0; gid < static_cast<size_t>(propose_config.groupNums()); ++gid) {
+            propose_gid_by_tag.emplace(propose_config.tagForGroup(gid), gid);
+        }
+
+        std::vector<GroupBase> sub_groups;
+        std::vector<LayerBase> sub_layers(static_cast<size_t>(mtp_layer_num));
+        std::vector<size_t>    sub_group_seq_size_per_block;
+        sub_groups.reserve(target_group_num);
+        sub_group_seq_size_per_block.reserve(target_group_num);
+
+        for (size_t target_gid = 0; target_gid < target_group_num; ++target_gid) {
+            const auto& tag = tagForGroup(target_gid);
+            const auto  propose_it = propose_gid_by_tag.find(tag);
+            const bool  has_propose_group = propose_it != propose_gid_by_tag.end();
+            const size_t source_gid = has_propose_group ? propose_it->second : target_gid;
+            const auto&  source_config = has_propose_group ? propose_config : *this;
+            const auto&  source_group  = source_config.groups[source_gid];
+
+            GroupBase sub_group;
+            sub_group.spec                  = source_group.spec->clone();
+            sub_group.policy                = source_group.policy;
+            sub_group.block_num             = source_group.block_num;
+            sub_group.kv_block_stride_bytes = source_group.kv_block_stride_bytes;
+            sub_group.kv_scale_stride_bytes = source_group.kv_scale_stride_bytes;
+
+            if (has_propose_group) {
+                for (int local_layer_id : propose_config.layerIdsForGroup(source_gid)) {
+                    if (local_layer_id < 0 || local_layer_id >= static_cast<int>(mtp_layer_num)) {
+                        continue;
+                    }
+                    const auto global_layer_id = static_cast<int>(main_layer_num)
+                                                 + module_index * static_cast<int>(mtp_layer_num) + local_layer_id;
+                    const auto global_layer    = static_cast<size_t>(global_layer_id);
+
+                    sub_group.layer_ids.push_back(global_layer_id);
+                    auto& sub_layer = sub_layers[static_cast<size_t>(local_layer_id)];
+                    sub_layer.group_ids.push_back(static_cast<int>(target_gid));
+                    sub_layer.tag_to_gid[tag] = static_cast<int>(target_gid);
+
+                    appendLayerToGroup(target_gid, global_layer_id, tag);
+
+                    RTP_LLM_CHECK_WITH_INFO(static_cast<size_t>(local_layer_id)
+                                                < sub_cfg->layer_to_block_stride_bytes.size(),
+                                            "CacheConfig::mergeMTPModule local layer stride missing layer=%d size=%zu",
+                                            local_layer_id,
+                                            sub_cfg->layer_to_block_stride_bytes.size());
+                    layer_to_block_stride_bytes[global_layer] =
+                        sub_cfg->layer_to_block_stride_bytes[static_cast<size_t>(local_layer_id)];
+                }
+            }
+            sub_group.spec->layers = sub_group.layer_ids;
+            sub_groups.push_back(std::move(sub_group));
+
+            const auto& source_seq = source_config.group_seq_size_per_block;
+            sub_group_seq_size_per_block.push_back(source_gid < source_seq.size() ? source_seq[source_gid] : 0);
+        }
+
+        for (size_t layer_id = 0; layer_id < sub_layers.size(); ++layer_id) {
+            RTP_LLM_CHECK_WITH_INFO(!sub_layers[layer_id].group_ids.empty(),
+                                    "CacheConfig::mergeMTPModule missing group mapping for sub layer %zu",
+                                    layer_id);
+        }
+
+        sub_cfg->groups                         = std::move(sub_groups);
+        sub_cfg->layers                         = std::move(sub_layers);
+        sub_cfg->tag_to_gid.clear();
+        for (size_t gid = 0; gid < sub_cfg->groups.size(); ++gid) {
+            sub_cfg->tag_to_gid.emplace(sub_cfg->groups[gid].spec->tag, static_cast<int>(gid));
+        }
+        sub_cfg->group_seq_size_per_block       = std::move(sub_group_seq_size_per_block);
+        sub_cfg->group_block_layout_initialized = group_block_layout_initialized;
+        return sub_cfg;
+    }
+
+    uint32_t explicitIndependentBlocks(size_t gid) const {
+        return policyForGroup(gid).explicit_block_num;
+    }
+
+    bool usesExplicitIndependentBlocks(size_t gid) const {
+        return explicitIndependentBlocks(gid) > 0;
+    }
+
+    CacheGroupPolicy policyForGroup(size_t gid) const {
+        RTP_LLM_CHECK_WITH_INFO(gid < groups.size(), "CacheConfig::policyForGroup invalid gid=%zu size=%zu", gid, groups.size());
+        return groups[gid].policy;
+    }
+
+    int groupIdForLayerTag(int layer_id, const std::string& tag) const {
+        RTP_LLM_CHECK_WITH_INFO(layer_id >= 0 && static_cast<size_t>(layer_id) < layers.size(),
+                                "CacheConfig::groupIdForLayerTag invalid layer_id=%d size=%zu",
+                                layer_id,
+                                layers.size());
+        const auto& tag_to_group = layers[static_cast<size_t>(layer_id)].tag_to_gid;
+        const auto  it           = tag_to_group.find(tag);
+        RTP_LLM_CHECK_WITH_INFO(it != tag_to_group.end(),
+                                "CacheConfig::groupIdForLayerTag missing tag=%s for layer_id=%d",
+                                tag.c_str(),
+                                layer_id);
+        return it->second;
+    }
+
+    int groupIdFor(int layer_id) const {
+        RTP_LLM_CHECK_WITH_INFO(layer_id >= 0 && static_cast<size_t>(layer_id) < layers.size(),
+                                "CacheConfig::groupIdFor invalid layer_id=%d size=%zu",
+                                layer_id,
+                                layers.size());
+        const auto& gids = layers[static_cast<size_t>(layer_id)].group_ids;
+        RTP_LLM_CHECK_WITH_INFO(gids.size() == 1,
+                                "CacheConfig::groupIdFor requires exactly one cache tag for layer_id=%d, got %zu",
+                                layer_id,
+                                gids.size());
+        return gids.front();
+    }
+
+    const std::vector<int>& groupIdsForLayer(int layer_id) const {
+        RTP_LLM_CHECK_WITH_INFO(layer_id >= 0 && static_cast<size_t>(layer_id) < layers.size(),
+                                "CacheConfig::groupIdsForLayer invalid layer_id=%d size=%zu",
+                                layer_id,
+                                layers.size());
+        const auto& gids = layers[static_cast<size_t>(layer_id)].group_ids;
+        RTP_LLM_CHECK_WITH_INFO(!gids.empty(), "CacheConfig::groupIdsForLayer missing layer_id=%d", layer_id);
+        return gids;
+    }
+
+    static CacheGroupPolicy cacheGroupPolicyForSpec(const KVCacheSpecPtr& spec, CacheGroupType group_type) {
+        CacheGroupPolicy policy = defaultCacheGroupPolicy(group_type);
+        if (spec && spec->is_state_cache) {
+            policy.evict_policy = CacheEvictPolicy::INDEPENDENT;
+        }
+        if (spec && spec->skip_prefix_reuse) {
+            policy.reuse_policy         = CacheReusePolicy::NON_REUSABLE;
+            policy.active_tail_blocks   = 1;
+            policy.validate_tail_blocks = false;
+        }
+        return policy;
+    }
+
+    static bool samePolicy(const CacheGroupPolicy& lhs, const CacheGroupPolicy& rhs) {
+        return lhs.reuse_policy == rhs.reuse_policy && lhs.evict_policy == rhs.evict_policy
+               && lhs.active_tail_blocks == rhs.active_tail_blocks
+               && lhs.validate_tail_blocks == rhs.validate_tail_blocks
+               && lhs.explicit_block_num == rhs.explicit_block_num
+               && lhs.reserve_from_paged_budget == rhs.reserve_from_paged_budget
+               && lhs.prefix_reusable == rhs.prefix_reusable
+               && lhs.uses_pinned_cpu_backing == rhs.uses_pinned_cpu_backing
+               && lhs.is_cp_shardable == rhs.is_cp_shardable
+               && lhs.has_sparse_slots == rhs.has_sparse_slots
+               && lhs.has_kernel_block_subdiv == rhs.has_kernel_block_subdiv
+               && lhs.cp_compact_tail_blocks == rhs.cp_compact_tail_blocks
+               && lhs.is_reservable == rhs.is_reservable
+               && lhs.group_type == rhs.group_type;
+    }
+
+    void setTopology(std::vector<GroupBase> new_groups, std::vector<LayerBase> new_layers) {
+        RTP_LLM_CHECK_WITH_INFO(!new_groups.empty(), "CacheConfig::setTopology requires at least one cache group");
+        RTP_LLM_CHECK_WITH_INFO(layer_num > 0, "CacheConfig::setTopology requires positive layer_num");
+        RTP_LLM_CHECK_WITH_INFO(new_layers.size() == static_cast<size_t>(layer_num),
+                                "CacheConfig::setTopology layer count %zu != layer_num %u",
+                                new_layers.size(),
+                                layer_num);
+
+        std::unordered_map<std::string, int> new_tag_to_gid;
+        for (size_t gid = 0; gid < new_groups.size(); ++gid) {
+            auto& group = new_groups[gid];
+            RTP_LLM_CHECK_WITH_INFO(group.spec != nullptr, "CacheConfig::setTopology got null spec at group %zu", gid);
+            RTP_LLM_CHECK_WITH_INFO(!group.spec->tag.empty(),
+                                    "CacheConfig::setTopology requires non-empty tag for group %zu",
+                                    gid);
+            new_tag_to_gid.emplace(group.spec->tag, static_cast<int>(gid));
+            group.spec         = group.spec->clone();
+            group.spec->layers = group.layer_ids;
+        }
+
+        std::vector<std::vector<bool>> group_has_layer(
+            new_groups.size(), std::vector<bool>(static_cast<size_t>(layer_num), false));
+        for (size_t gid = 0; gid < new_groups.size(); ++gid) {
+            for (int layer_id : new_groups[gid].layer_ids) {
+                RTP_LLM_CHECK_WITH_INFO(layer_id >= 0 && static_cast<size_t>(layer_id) < new_layers.size(),
+                                        "CacheConfig::setTopology tag=%s has invalid layer id %d for layer_num=%u",
+                                        new_groups[gid].spec->tag.c_str(),
+                                        layer_id,
+                                        layer_num);
+                const auto layer_index = static_cast<size_t>(layer_id);
+                RTP_LLM_CHECK_WITH_INFO(!group_has_layer[gid][layer_index],
+                                        "CacheConfig::setTopology tag=%s has duplicate layer id %d",
+                                        new_groups[gid].spec->tag.c_str(),
+                                        layer_id);
+                group_has_layer[gid][layer_index] = true;
+            }
+        }
+
+        for (size_t layer_id = 0; layer_id < new_layers.size(); ++layer_id) {
+            auto& layer = new_layers[layer_id];
+            RTP_LLM_CHECK_WITH_INFO(!layer.group_ids.empty(),
+                                    "CacheConfig::setTopology missing group mapping for layer %zu",
+                                    layer_id);
+            std::map<int, bool> seen_gids;
+            for (int gid : layer.group_ids) {
+                RTP_LLM_CHECK_WITH_INFO(gid >= 0 && static_cast<size_t>(gid) < new_groups.size(),
+                                        "CacheConfig::setTopology layer %zu has invalid gid %d",
+                                        layer_id,
+                                        gid);
+                RTP_LLM_CHECK_WITH_INFO(seen_gids.emplace(gid, true).second,
+                                        "CacheConfig::setTopology layer %zu has duplicate gid %d",
+                                        layer_id,
+                                        gid);
+                RTP_LLM_CHECK_WITH_INFO(group_has_layer[static_cast<size_t>(gid)][layer_id],
+                                        "CacheConfig::setTopology layer %zu gid %d is missing reverse group layer id",
+                                        layer_id,
+                                        gid);
+            }
+
+            for (const auto& [tag, gid] : layer.tag_to_gid) {
+                RTP_LLM_CHECK_WITH_INFO(gid >= 0 && static_cast<size_t>(gid) < new_groups.size(),
+                                        "CacheConfig::setTopology layer %zu tag=%s has invalid gid %d",
+                                        layer_id,
+                                        tag.c_str(),
+                                        gid);
+                RTP_LLM_CHECK_WITH_INFO(tag == new_groups[static_cast<size_t>(gid)].spec->tag,
+                                        "CacheConfig::setTopology layer %zu tag=%s does not match gid %d tag=%s",
+                                        layer_id,
+                                        tag.c_str(),
+                                        gid,
+                                        new_groups[static_cast<size_t>(gid)].spec->tag.c_str());
+                RTP_LLM_CHECK_WITH_INFO(std::find(layer.group_ids.begin(), layer.group_ids.end(), gid)
+                                            != layer.group_ids.end(),
+                                        "CacheConfig::setTopology layer %zu tag=%s gid %d is not in layer groups",
+                                        layer_id,
+                                        tag.c_str(),
+                                        gid);
+            }
+        }
+
+        groups                         = std::move(new_groups);
+        layers                         = std::move(new_layers);
+        tag_to_gid                     = std::move(new_tag_to_gid);
+        group_block_layout_initialized = false;
+    }
+
+    void fromGroupedSpecs(const std::vector<KVCacheSpecPtr>&    specs,
+                          const std::vector<std::vector<int>>& layers_by_group,
+                          const std::vector<CacheGroupType>&   types,
+                          const std::vector<std::string>&      tags = {}) {
+        const size_t group_num = specs.size();
+        RTP_LLM_CHECK_WITH_INFO(group_num > 0, "CacheConfig::fromGroupedSpecs requires at least one cache spec");
+        RTP_LLM_CHECK_WITH_INFO(layers_by_group.size() == group_num,
+                                "CacheConfig::fromGroupedSpecs layer group count %zu != spec count %zu",
+                                layers_by_group.size(),
+                                group_num);
+        RTP_LLM_CHECK_WITH_INFO(types.size() == group_num,
+                                "CacheConfig::fromGroupedSpecs group type count %zu != spec count %zu",
+                                types.size(),
+                                group_num);
+        RTP_LLM_CHECK_WITH_INFO(tags.empty() || tags.size() == group_num,
+                                "CacheConfig::fromGroupedSpecs tag count %zu != spec count %zu",
+                                tags.size(),
+                                group_num);
+        RTP_LLM_CHECK_WITH_INFO(layer_num > 0, "CacheConfig::fromGroupedSpecs requires positive layer_num");
+
+        std::vector<GroupBase> new_groups;
+        std::vector<LayerBase> new_layers(static_cast<size_t>(layer_num));
+        new_groups.reserve(group_num);
+
+        for (size_t gid = 0; gid < group_num; ++gid) {
+            const auto& spec = specs[gid];
+            RTP_LLM_CHECK_WITH_INFO(spec != nullptr, "CacheConfig::fromGroupedSpecs got null spec at group %zu", gid);
+            std::string tag = tags.empty() ? spec->tag : tags[gid];
+            if (tag.empty() && group_num == 1) {
+                tag = "default";
+            }
+            RTP_LLM_CHECK_WITH_INFO(!tag.empty(),
+                                    "CacheConfig::fromGroupedSpecs requires non-empty tag for cache spec %zu",
+                                    gid);
+            auto stored_spec = spec->clone();
+            stored_spec->tag = tag;
+
+            GroupBase group;
+            group.spec      = stored_spec;
+            group.policy    = cacheGroupPolicyForSpec(stored_spec, types[gid]);
+            group.layer_ids = layers_by_group[gid];
+            new_groups.push_back(group);
+
+            for (int layer_id : layers_by_group[gid]) {
+                RTP_LLM_CHECK_WITH_INFO(layer_id >= 0 && static_cast<size_t>(layer_id) < new_layers.size(),
+                                        "CacheConfig::fromGroupedSpecs tag=%s has invalid layer id %d for layer_num=%u",
+                                        tag.c_str(),
+                                        layer_id,
+                                        layer_num);
+                auto& layer = new_layers[static_cast<size_t>(layer_id)];
+                layer.group_ids.push_back(static_cast<int>(gid));
+                const auto [it, inserted] = layer.tag_to_gid.emplace(tag, static_cast<int>(gid));
+                RTP_LLM_CHECK_WITH_INFO(inserted || it->second == static_cast<int>(gid),
+                                        "CacheConfig::fromGroupedSpecs layer %d tag %s maps to both group %d and %zu",
+                                        layer_id,
+                                        tag.c_str(),
+                                        inserted ? static_cast<int>(gid) : it->second,
+                                        gid);
+            }
+        }
+
+        setTopology(std::move(new_groups), std::move(new_layers));
+    }
+
+    void finalizeBlockNums(uint32_t global_block_num, const RuntimeConfig& runtime_config) {
+        (void)runtime_config;
+        if (!use_independent_block_pools || !group_block_layout_initialized || groups.empty()) {
+            explicitly_sized_pool_reserve_bytes = 0;
+            return;
+        }
+
+        size_t reserve = 0;
+        for (size_t gid = 0; gid < groups.size(); ++gid) {
+            const auto explicit_independent_blocks = explicitIndependentBlocks(gid);
+            const auto rule_blocks = explicit_independent_blocks > 0 ? explicit_independent_blocks : global_block_num;
+            groups[gid].block_num = rule_blocks;
+
+            // Explicit independent pools are allocated outside the paged pool budget.
+            if (explicit_independent_blocks > 0) {
+                reserve += static_cast<size_t>(rule_blocks) * blockSizeBytesForGroup(gid);
+            }
+        }
+        explicitly_sized_pool_reserve_bytes = reserve;
     }
 
     std::string debugString(size_t indent = 0) const {
@@ -107,19 +726,41 @@ struct CacheConfig {
         OUTPUT_FIELD(kv_scale_stride_bytes);
         os << "\n";
 
+        const auto group_policies      = groupPoliciesSnapshot();
+        const auto group_block_nums    = groupBlockNumsSnapshot();
+        const auto group_layer_ids     = layerGroupIdsSnapshot();
+        const auto group_tags          = groupTagsSnapshot();
+        std::vector<std::vector<int>> layers_by_group;
+        layers_by_group.reserve(groups.size());
+        for (const auto& group : groups) {
+            layers_by_group.push_back(group.layer_ids);
+        }
+
         // Attention-specific configuration section
         os << indent1 << "# Attention Configuration:\n";
         OUTPUT_FIELD(linear_step);
         OUTPUT_FIELD(group_layer_num);
-        OUTPUT_FIELD(linear_group_num);
-        OUTPUT_FIELD(full_group_num);
+        OUTPUT_FIELD_EXPR("full_group_num",
+                          std::count_if(group_policies.begin(), group_policies.end(),
+                                        [](const CacheGroupPolicy& p) { return p.group_type == CacheGroupType::FULL; }));
+        OUTPUT_FIELD_EXPR("linear_group_num",
+                          std::count_if(group_policies.begin(), group_policies.end(),
+                                        [](const CacheGroupPolicy& p) { return p.group_type == CacheGroupType::LINEAR; }));
+        OUTPUT_FIELD_EXPR("swa_group_num",
+                          std::count_if(group_policies.begin(), group_policies.end(),
+                                        [](const CacheGroupPolicy& p) { return p.group_type == CacheGroupType::SWA; }));
+        OUTPUT_FIELD(use_independent_block_pools);
+        OUTPUT_FIELD(use_typed_cache_regions);
+        OUTPUT_FIELD(use_opaque_kv_cache_store);
+        OUTPUT_FIELD(disable_decode_first_malloc_device_reuse);
+        os << indent1 << "group_block_nums=" << rtp_llm::vectorToString(group_block_nums) << "\n";
         os << "\n";
 
         // Cache specification section
         os << indent1 << "# Cache Specifications:\n";
-        OUTPUT_FIELD_EXPR("cache_specs.size()", cache_specs.size());
-        for (size_t i = 0; i < cache_specs.size(); ++i) {
-            const auto& spec = cache_specs[i];
+        OUTPUT_FIELD_EXPR("cache_specs.size()", groups.size());
+        for (size_t i = 0; i < groups.size(); ++i) {
+            const auto& spec = groups[i].spec;
             if (!spec) {
                 os << indent1 << "cache_specs[" << i << "]=null\n";
                 continue;
@@ -133,28 +774,28 @@ struct CacheConfig {
 
         // Layer mapping section
         os << indent1 << "# Layer Mapping:\n";
-        OUTPUT_FIELD_EXPR("global_layer_ids.size()", global_layer_ids.size());
-        os << indent1 << "global_layer_ids=" << rtp_llm::vectorsToString(global_layer_ids) << "\n";
-        OUTPUT_FIELD_EXPR("layer_ids.size()", layer_ids.size());
-        os << indent1 << "layer_ids=" << rtp_llm::vectorsToString(layer_ids) << "\n";
-        OUTPUT_FIELD_EXPR("group_types.size()", group_types.size());
+        OUTPUT_FIELD_EXPR("layer_ids.size()", layers_by_group.size());
+        os << indent1 << "layer_ids=" << rtp_llm::vectorsToString(layers_by_group) << "\n";
+        OUTPUT_FIELD_EXPR("group_types.size()", group_policies.size());
         os << indent1 << "group_types=[";
-        for (size_t i = 0; i < group_types.size(); ++i) {
-            os << static_cast<int>(group_types[i]);
-            if (i + 1 < group_types.size()) {
+        for (size_t i = 0; i < group_policies.size(); ++i) {
+            os << static_cast<int>(group_policies[i].group_type);
+            if (i + 1 < group_policies.size()) {
                 os << ",";
             }
         }
         os << "]\n";
-        OUTPUT_FIELD_EXPR("layer_attn_types.size()", layer_attn_types.size());
-        os << indent1 << "layer_attn_types=[";
-        for (size_t i = 0; i < layer_attn_types.size(); ++i) {
-            os << static_cast<int>(layer_attn_types[i]);
-            if (i + 1 < layer_attn_types.size()) {
+        OUTPUT_FIELD_EXPR("group_tags.size()", group_tags.size());
+        os << indent1 << "group_tags=[";
+        for (size_t i = 0; i < group_tags.size(); ++i) {
+            os << group_tags[i];
+            if (i + 1 < group_tags.size()) {
                 os << ",";
             }
         }
         os << "]\n";
+        OUTPUT_FIELD_EXPR("layer_to_group_ids.size()", group_layer_ids.size());
+        os << indent1 << "layer_to_group_ids=" << rtp_llm::vectorsToString(group_layer_ids) << "\n";
         os << "\n";
 
         // mtp configurations section
diff --git a/rtp_llm/cpp/cache/CacheConfigCreator.cc b/rtp_llm/cpp/cache/CacheConfigCreator.cc
deleted file mode 100644
index bd3985dbfb..0000000000
--- a/rtp_llm/cpp/cache/CacheConfigCreator.cc
+++ /dev/null
@@ -1,225 +0,0 @@
-#include "rtp_llm/cpp/cache/CacheConfigCreator.h"
-
-#include <numeric>
-
-#include "rtp_llm/cpp/cache/HybridConfigCreator.h"
-#include "rtp_llm/cpp/cache/MemoryEvaluationHelper.h"
-#include "rtp_llm/cpp/cache/SingleConfigCreator.h"
-#include "rtp_llm/cpp/utils/Logger.h"
-#include "rtp_llm/cpp/utils/AssertUtils.h"
-
-namespace rtp_llm {
-
-CacheConfig CacheConfigCreator::createBasicConfig(const ModelConfig&       model_config,
-                                                  const ParallelismConfig& parallelism_config,
-                                                  bool                     is_mtp) {
-    if (model_config.hybrid_attention_config.enable_hybrid_attention) {
-        return HybridConfigCreator::createHybridConfig(model_config, parallelism_config, is_mtp);
-    } else {
-        return SingleConfigCreator::createSingleConfig(model_config, parallelism_config, is_mtp);
-    }
-}
-
-CacheConfig CacheConfigCreator::createConfig(const ModelConfig&                               model_config,
-                                             const ParallelismConfig&                         parallelism_config,
-                                             const RuntimeConfig&                             runtime_config,
-                                             const KVCacheConfig&                             kv_cache_config,
-                                             const std::optional<WarmUpResult>&               warm_up_result,
-                                             const std::optional<SpeculativeExecutionConfig>& sp_config) {
-    CacheConfig config    = CacheConfigCreator::createBasicConfig(model_config, parallelism_config);
-    uint32_t    block_num = 0;
-
-    config.linear_step = kv_cache_config.linear_step;
-    if (kv_cache_config.kernel_seq_size_per_block > 0) {
-        RTP_LLM_CHECK_WITH_INFO(kv_cache_config.seq_size_per_block % kv_cache_config.kernel_seq_size_per_block == 0,
-                                "seq_size_per_block(%d) must be divisible by kernel_seq_size_per_block(%d)",
-                                kv_cache_config.seq_size_per_block,
-                                kv_cache_config.kernel_seq_size_per_block);
-        config.kernel_seq_size_per_block = static_cast<size_t>(kv_cache_config.kernel_seq_size_per_block);
-    } else {
-        // Default: kernel block size == physical block size (no split).
-        config.kernel_seq_size_per_block = config.seq_size_per_block;
-    }
-
-    if (kv_cache_config.test_block_num > 0) {
-        RTP_LLM_LOG_INFO("KVCacheConfig explicitly specified kv cache block num %d", kv_cache_config.test_block_num);
-        block_num = kv_cache_config.test_block_num;
-    } else {
-        const auto kv_cache_mem_size = MemoryEvaluationHelper::getKVCacheMemorySize(
-            runtime_config, kv_cache_config, model_config, parallelism_config, warm_up_result, sp_config);
-        block_num = kv_cache_mem_size / config.block_size_bytes;
-    }
-    RTP_LLM_CHECK_WITH_INFO(block_num > 0,
-                            "kv cache needs at least 1 block but %ld, each block needs %ld MiB memory",
-                            block_num,
-                            static_cast<long>(config.block_size_bytes / 1024 / 1024));
-
-    const auto kv_cache_seq_len = static_cast<size_t>(block_num) * config.seq_size_per_block;
-    config.block_num            = static_cast<int>(block_num);
-    RTP_LLM_LOG_INFO("kv cache block nums is %u, allows storing %ld tokens", block_num, kv_cache_seq_len);
-    if (kv_cache_seq_len < model_config.max_seq_len) {
-        RTP_LLM_LOG_WARNING("kv cache block nums %u can only store %ld tokens, less than max_seq_len %ld, "
-                            "this is dangerous, consider decrease max_seq_len",
-                            block_num,
-                            kv_cache_seq_len,
-                            model_config.max_seq_len);
-    }
-    return config;
-}
-
-CacheConfig CacheConfigCreator::createSpConfig(const ModelConfig&                 score_model_config,
-                                               const ModelConfig&                 propose_model_config,
-                                               const ParallelismConfig&           parallelism_config,
-                                               const RuntimeConfig&               runtime_config,
-                                               const KVCacheConfig&               kv_cache_config,
-                                               const SpeculativeExecutionConfig&  sp_config,
-                                               const std::optional<WarmUpResult>& warm_up_result,
-                                               bool                               is_mtp,
-                                               bool                               is_eagle) {
-    CacheConfig score_config = CacheConfigCreator::createBasicConfig(score_model_config, parallelism_config, false);
-    CacheConfig propose_config =
-        CacheConfigCreator::createBasicConfig(propose_model_config, parallelism_config, is_mtp);
-
-    if (kv_cache_config.kernel_seq_size_per_block > 0) {
-        const size_t kernel_seq_size_per_block = static_cast<size_t>(kv_cache_config.kernel_seq_size_per_block);
-        RTP_LLM_CHECK_WITH_INFO(score_config.seq_size_per_block % kernel_seq_size_per_block == 0,
-                                "score seq_size_per_block(%zu) must be divisible by kernel_seq_size_per_block(%zu)",
-                                score_config.seq_size_per_block,
-                                kernel_seq_size_per_block);
-        RTP_LLM_CHECK_WITH_INFO(propose_config.seq_size_per_block % kernel_seq_size_per_block == 0,
-                                "propose seq_size_per_block(%zu) must be divisible by kernel_seq_size_per_block(%zu)",
-                                propose_config.seq_size_per_block,
-                                kernel_seq_size_per_block);
-        score_config.kernel_seq_size_per_block   = kernel_seq_size_per_block;
-        propose_config.kernel_seq_size_per_block = kernel_seq_size_per_block;
-    } else {
-        // Default: kernel block size == physical block size (no split).
-        score_config.kernel_seq_size_per_block   = score_config.seq_size_per_block;
-        propose_config.kernel_seq_size_per_block = propose_config.seq_size_per_block;
-    }
-
-    int num_mtp_modules = 1;
-    if (is_mtp) {
-        num_mtp_modules = sp_config.gen_num_per_cycle;
-        if (is_eagle) {
-            num_mtp_modules = 1;
-        }
-    }
-
-    uint32_t total_layer_num = score_config.layer_num;
-    for (int i = 0; i < num_mtp_modules; ++i) {
-        total_layer_num += propose_config.layer_num;
-    }
-
-    size_t total_block_size_bytes = score_config.block_size_bytes;
-    for (int i = 0; i < num_mtp_modules; ++i) {
-        total_block_size_bytes += propose_config.block_size_bytes;
-    }
-
-    size_t block_num = 0;
-    if (kv_cache_config.test_block_num > 0) {
-        block_num = kv_cache_config.test_block_num;
-    } else {
-        const auto kv_cache_mem_size = MemoryEvaluationHelper::getKVCacheMemorySize(
-            runtime_config, kv_cache_config, score_model_config, parallelism_config, warm_up_result, sp_config);
-
-        block_num = kv_cache_mem_size
-                    / (static_cast<size_t>(score_config.block_size_bytes)
-                       + static_cast<size_t>(propose_config.block_size_bytes) * static_cast<size_t>(num_mtp_modules));
-    }
-
-    RTP_LLM_CHECK_WITH_INFO(block_num > 0, "kv cache needs at least 1 block but %zu", block_num);
-
-    CacheConfig config      = score_config;
-    config.linear_step      = std::max(1, kv_cache_config.linear_step);
-    config.layer_all_num    = total_layer_num;
-    config.block_size_bytes = total_block_size_bytes;
-    // config.block_size       = config.block_size_bytes / rtp_llm::getTypeSize(config.dtype);
-    config.block_num = block_num;
-
-    const uint32_t main_layer_num = score_config.layer_num;
-    const uint32_t mtp_layer_num  = propose_config.layer_num;
-
-    size_t full_gid = 0;
-    if (config.group_types.size() > 1) {
-        for (size_t gid = 0; gid < config.group_types.size(); ++gid) {
-            if (config.group_types[gid] == CacheGroupType::FULL) {
-                full_gid = gid;
-                break;
-            }
-        }
-    }
-
-    // Each sub-model needs an independent CacheConfig because global_layer_ids differs per module.
-    config.mtp_sub_configs.clear();
-    config.mtp_sub_configs.reserve(num_mtp_modules);
-    config.layer_to_group_id.resize(total_layer_num, 0);
-    config.layer_attn_types.resize(total_layer_num, CacheGroupType::FULL);
-    config.layer_to_block_stride_bytes.assign(static_cast<size_t>(total_layer_num), 0);
-
-    // Main(score) model per-layer stride (kv + scale).
-    // This is expected to be fully populated by createBasicConfig() (Single/Hybrid creators).
-    const size_t score_layers = static_cast<size_t>(main_layer_num);
-    RTP_LLM_CHECK_WITH_INFO(score_config.layer_to_block_stride_bytes.size() == score_layers,
-                            "score_config.layer_to_block_stride_bytes size mismatch, got=%zu need=%zu",
-                            score_config.layer_to_block_stride_bytes.size(),
-                            score_layers);
-    for (size_t l = 0; l < score_layers; ++l) {
-        config.layer_to_block_stride_bytes[l] = score_config.layer_to_block_stride_bytes[l];
-        if (l < score_config.layer_attn_types.size()) {
-            config.layer_attn_types[l] = score_config.layer_attn_types[l];
-        }
-    }
-
-    for (int m = 0; m < num_mtp_modules; ++m) {
-        auto sub_cfg           = std::make_shared<CacheConfig>(propose_config);
-        sub_cfg->block_num     = block_num;
-        sub_cfg->layer_all_num = sub_cfg->layer_num;
-
-        sub_cfg->global_layer_ids.clear();
-        sub_cfg->global_layer_ids.resize(1);
-        sub_cfg->global_layer_ids[0].resize(mtp_layer_num);
-        RTP_LLM_CHECK_WITH_INFO(sub_cfg->layer_to_block_stride_bytes.size() == static_cast<size_t>(mtp_layer_num),
-                                "sub_cfg.layer_to_block_stride_bytes size mismatch, got=%zu need=%u",
-                                sub_cfg->layer_to_block_stride_bytes.size(),
-                                mtp_layer_num);
-        for (size_t l = 0; l < mtp_layer_num; ++l) {
-            int global_layer_id                       = main_layer_num + m * mtp_layer_num + l;
-            sub_cfg->global_layer_ids[0][l]           = global_layer_id;
-            config.layer_to_group_id[global_layer_id] = static_cast<int>(full_gid);
-            config.global_layer_ids[full_gid].push_back(global_layer_id);
-
-            const int stride_bytes = sub_cfg->layer_to_block_stride_bytes[static_cast<size_t>(l)];
-            config.layer_to_block_stride_bytes[static_cast<size_t>(global_layer_id)] = stride_bytes;
-            if (l < sub_cfg->layer_attn_types.size()) {
-                config.layer_attn_types[static_cast<size_t>(global_layer_id)] = sub_cfg->layer_attn_types[l];
-            }
-        }
-
-        sub_cfg->layer_to_group_id.assign(static_cast<size_t>(sub_cfg->layer_num), static_cast<int>(full_gid));
-        config.mtp_sub_configs.push_back(sub_cfg);
-    }
-
-    const auto kv_cache_seq_len = static_cast<size_t>(block_num) * config.seq_size_per_block;
-    RTP_LLM_LOG_INFO("CacheConfig created: is_mtp=%d, total_layers=%u, num_mtp_modules=%d, block_num=%zu, "
-                     "allows storing %zu tokens, total_block_size=%zu bytes (main=%zu + %d*propose=%zu)",
-                     is_mtp,
-                     total_layer_num,
-                     num_mtp_modules,
-                     block_num,
-                     kv_cache_seq_len,
-                     total_block_size_bytes,
-                     score_config.block_size_bytes,
-                     num_mtp_modules,
-                     propose_config.block_size_bytes);
-
-    RTP_LLM_LOG_INFO("CacheConfig debugString(main_score_model):\n%s", score_config.debugString().c_str());
-    for (size_t i = 0; i < config.mtp_sub_configs.size(); ++i) {
-        const auto& sub = config.mtp_sub_configs[i];
-        RTP_LLM_LOG_INFO("CacheConfig debugString(sub_propose_model[%zu]):\n%s", i, sub->debugString().c_str());
-    }
-
-    return config;
-}
-
-}  // namespace rtp_llm
diff --git a/rtp_llm/cpp/cache/CacheGroupType.h b/rtp_llm/cpp/cache/CacheGroupType.h
deleted file mode 100644
index aae75d2b5d..0000000000
--- a/rtp_llm/cpp/cache/CacheGroupType.h
+++ /dev/null
@@ -1,15 +0,0 @@
-#pragma once
-
-#include <cstdint>
-
-namespace rtp_llm {
-
-// Cache group type for hybrid KV-cache:
-// - LINEAR: linear attention group (only last block is needed for cache-store transfer)
-// - FULL: full attention group (all blocks are needed for cache-store transfer)
-enum class CacheGroupType : int8_t {
-    LINEAR = 0,
-    FULL   = 1,
-};
-
-}  // namespace rtp_llm
diff --git a/rtp_llm/cpp/cache/HybridConfigCreator.cc b/rtp_llm/cpp/cache/HybridConfigCreator.cc
deleted file mode 100644
index c9a3306fc5..0000000000
--- a/rtp_llm/cpp/cache/HybridConfigCreator.cc
+++ /dev/null
@@ -1,238 +0,0 @@
-#include "rtp_llm/cpp/cache/HybridConfigCreator.h"
-
-#include <numeric>
-
-#include "rtp_llm/cpp/cache/KVCacheSpec.h"
-#include "rtp_llm/cpp/cache/MemoryEvaluationHelper.h"
-
-namespace rtp_llm {
-
-std::vector<std::vector<int>> HybridConfigCreator::splitIntoGroups(const std::vector<int>& ids, int group_layer_num) {
-    std::vector<std::vector<int>> groups;
-    if (ids.empty()) {
-        return groups;
-    }
-    const int n = static_cast<int>(ids.size());
-    const int s = std::max(group_layer_num, 1);
-    groups.reserve((n + s - 1) / s);
-    for (int i = 0; i < n; i += s) {
-        const int end = std::min(i + s, n);
-        groups.emplace_back(ids.begin() + i, ids.begin() + end);
-    }
-    return groups;
-}
-
-int HybridConfigCreator::calculateGroupLayerNum(int linear_layer_count, int full_layer_count) {
-    // All full attention layers must reside in one cache group (full_group_num <= 1).
-    // prepare_fmha_impl binds the block table of group 0 once; it is not re-bound per layer.
-    // group_layer_num must be >= full_layer_count to satisfy this.
-    // When gcd is already sufficient it works directly; the fallback handles all other cases
-    // (coprime gcd==1, or gcd>1 but still smaller than full_layer_count).
-    int group_layer_num = 0;
-    if (linear_layer_count > 0 && full_layer_count > 0) {
-        group_layer_num = std::gcd(linear_layer_count, full_layer_count);
-        // Fallback: when gcd < full_layer_count, force group_layer_num = full_layer_count
-        // to guarantee all full layers fit in one group.
-        // e.g. Kimi Linear 20:7 -> gcd=1 < 7 -> group_layer_num=7, linear groups=[7,7,6],
-        // last group wastes 1 layer slot per block, negligible.
-        if (group_layer_num < full_layer_count) {
-            group_layer_num = full_layer_count;
-        }
-    } else {
-        group_layer_num = std::max(linear_layer_count, full_layer_count);
-    }
-    group_layer_num = std::max(group_layer_num, 1);
-    return group_layer_num;
-}
-
-std::pair<std::vector<int>, std::vector<int>>
-HybridConfigCreator::splitLayersByAttentionType(const ModelConfig& model_config) {
-    int64_t layer_num = model_config.num_layers;
-    RTP_LLM_CHECK_WITH_INFO(layer_num > 0, "invalid model_config.num_layers=%ld", layer_num);
-
-    std::vector<int> linear_layers;
-    std::vector<int> full_layers;
-    linear_layers.reserve(layer_num);
-    full_layers.reserve(layer_num);
-
-    const auto& types = model_config.hybrid_attention_config.hybrid_attention_types;
-    for (int i = 0; i < static_cast<int>(layer_num); ++i) {
-        if (types[static_cast<size_t>(i)] == HybridAttentionType::LINEAR) {
-            linear_layers.push_back(i);
-        } else {
-            full_layers.push_back(i);
-        }
-    }
-
-    return std::make_pair(std::move(linear_layers), std::move(full_layers));
-}
-
-CacheConfig HybridConfigCreator::initializeConfig(const ModelConfig&      model_config,
-                                                  const std::vector<int>& linear_layers,
-                                                  const std::vector<int>& full_layers,
-                                                  rtp_llm::DataType       dtype) {
-    int64_t layer_num = model_config.num_layers;
-
-    CacheConfig config;
-    config.layer_num          = static_cast<uint32_t>(layer_num);
-    config.layer_all_num      = static_cast<uint32_t>(layer_num);
-    config.block_num          = 0;
-    config.seq_size_per_block = static_cast<uint32_t>(model_config.attn_config.tokens_per_block);
-    config.use_mla            = model_config.attn_config.use_mla;
-    config.dtype              = dtype;
-    config.linear_step        = 1;
-
-    config.global_layer_ids.push_back(linear_layers);
-    config.global_layer_ids.push_back(full_layers);
-    config.layer_ids.push_back(linear_layers);
-    config.layer_ids.push_back(full_layers);
-
-    return config;
-}
-
-KVCacheSpecPtr HybridConfigCreator::createFullAttentionSpec(const ModelConfig&       model_config,
-                                                            const ParallelismConfig& parallelism_config,
-                                                            rtp_llm::DataType        dtype) {
-    KVCacheSpecPtr full_spec;
-    if (model_config.attn_config.use_mla && model_config.mla_ops_type != rtp_llm::MlaOpsType::MHA) {
-        full_spec = std::make_shared<MLAKVCacheSpec>(model_config.attn_config, parallelism_config);
-    } else {
-        full_spec = std::make_shared<MHAKVCacheSpec>(model_config.attn_config, parallelism_config);
-    }
-    full_spec->dtype = dtype;
-    return full_spec;
-}
-
-KVCacheSpecPtr HybridConfigCreator::createLinearAttentionSpec(const ModelConfig&       model_config,
-                                                              const ParallelismConfig& parallelism_config,
-                                                              rtp_llm::DataType        dtype) {
-    auto linear_spec = std::make_shared<LinearKVCacheSpec>(
-        model_config.attn_config, parallelism_config, model_config.linear_attention_config);
-    linear_spec->dtype = dtype;
-    return linear_spec;
-}
-
-std::pair<std::vector<std::vector<int>>, std::vector<std::vector<int>>> HybridConfigCreator::createLayerGroups(
-    const std::vector<int>& linear_layers, const std::vector<int>& full_layers, int& group_layer_num) {
-    const int linear_cnt = static_cast<int>(linear_layers.size());
-    const int full_cnt   = static_cast<int>(full_layers.size());
-    group_layer_num      = HybridConfigCreator::calculateGroupLayerNum(linear_cnt, full_cnt);
-
-    const auto linear_groups = HybridConfigCreator::splitIntoGroups(linear_layers, group_layer_num);
-    const auto full_groups   = HybridConfigCreator::splitIntoGroups(full_layers, group_layer_num);
-
-    return std::make_pair(std::move(linear_groups), std::move(full_groups));
-}
-
-void HybridConfigCreator::setupCacheConfigSpecs(CacheConfig&                         config,
-                                                const std::vector<std::vector<int>>& linear_groups,
-                                                const std::vector<std::vector<int>>& full_groups,
-                                                const KVCacheSpecPtr&                linear_spec,
-                                                const KVCacheSpecPtr&                full_spec) {
-    config.global_layer_ids.clear();
-    config.layer_ids.clear();
-    config.cache_specs.clear();
-    config.group_types.clear();
-
-    // Keep order: all full groups first, then linear groups.
-    for (const auto& g : full_groups) {
-        config.global_layer_ids.push_back(g);
-        config.layer_ids.push_back(g);
-        config.cache_specs.push_back(full_spec);
-        config.group_types.push_back(CacheGroupType::FULL);
-    }
-    for (const auto& g : linear_groups) {
-        config.global_layer_ids.push_back(g);
-        config.layer_ids.push_back(g);
-        config.cache_specs.push_back(linear_spec);
-        config.group_types.push_back(CacheGroupType::LINEAR);
-    }
-    config.linear_group_num = static_cast<int>(linear_groups.size());
-    config.full_group_num   = static_cast<int>(full_groups.size());
-}
-
-void HybridConfigCreator::setupPhysicalSizes(CacheConfig&          config,
-                                             const KVCacheSpecPtr& full_spec,
-                                             const KVCacheSpecPtr& linear_spec) {
-    // Decide the physical KV block/scale sizes by taking max between full and linear specs.
-    const size_t full_kv_block_stride_bytes   = full_spec->block_size_bytes();
-    const size_t linear_kv_block_stride_bytes = linear_spec->block_size_bytes();
-
-    // now we only support that linear attention block have padding
-    RTP_LLM_CHECK_WITH_INFO(full_kv_block_stride_bytes >= linear_kv_block_stride_bytes,
-                            "not support full attention with padding now");
-
-    config.kv_block_stride_bytes = full_kv_block_stride_bytes;
-    config.kv_block_size_bytes   = static_cast<size_t>(config.group_layer_num) * config.kv_block_stride_bytes;
-    config.kv_scale_stride_bytes = full_spec->scale_block_size_bytes();
-    config.kv_scale_size_bytes   = static_cast<size_t>(config.group_layer_num) * config.kv_scale_stride_bytes;
-    config.block_size_bytes      = config.kv_block_size_bytes + config.kv_scale_size_bytes;
-}
-
-void HybridConfigCreator::setupLayerToGroupMapping(CacheConfig& config) {
-    config.layer_to_group_id.assign(config.layer_num, 0);
-    for (size_t gid = 0; gid < config.layer_ids.size(); ++gid) {
-        for (int layer_id : config.layer_ids[gid]) {
-            if (layer_id >= 0 && static_cast<size_t>(layer_id) < config.layer_num) {
-                config.layer_to_group_id[static_cast<size_t>(layer_id)] = static_cast<int32_t>(gid);
-            }
-        }
-    }
-}
-
-CacheConfig HybridConfigCreator::createHybridConfig(const ModelConfig&       model_config,
-                                                    const ParallelismConfig& parallelism_config,
-                                                    bool                     is_mtp) {
-    auto dtype = MemoryEvaluationHelper::getDataTypeForCache(model_config);
-
-    // Split layers by attention type
-    auto [linear_layers, full_layers] = HybridConfigCreator::splitLayersByAttentionType(model_config);
-
-    // Initialize config
-    CacheConfig config = HybridConfigCreator::initializeConfig(model_config, linear_layers, full_layers, dtype);
-
-    // Create attention specs
-    auto full_spec   = HybridConfigCreator::createFullAttentionSpec(model_config, parallelism_config, dtype);
-    auto linear_spec = HybridConfigCreator::createLinearAttentionSpec(model_config, parallelism_config, dtype);
-
-    // Create layer groups and calculate group layer number
-    int group_layer_num = 0;
-    auto [linear_groups, full_groups] =
-        HybridConfigCreator::createLayerGroups(linear_layers, full_layers, group_layer_num);
-    config.group_layer_num = group_layer_num;
-
-    // Setup cache config specs
-    HybridConfigCreator::setupCacheConfigSpecs(config, linear_groups, full_groups, linear_spec, full_spec);
-
-    // Hard check: current only supports a single full attention group.
-    RTP_LLM_CHECK_WITH_INFO(
-        config.full_group_num <= 1,
-        "Multiple full attention groups (%d) are not supported in hybrid mode. "
-        "prepare_fmha_impl is called once before the layer loop, binding the block table from group 0. "
-        "To support multiple full groups, implement per-group fmha preparation.",
-        config.full_group_num);
-
-    // Setup physical sizes
-    HybridConfigCreator::setupPhysicalSizes(config, full_spec, linear_spec);
-
-    // Setup layer to group mapping
-    HybridConfigCreator::setupLayerToGroupMapping(config);
-
-    config.layer_attn_types.assign(config.layer_num, CacheGroupType::FULL);
-    for (size_t layer_id = 0; layer_id < config.layer_to_group_id.size(); ++layer_id) {
-        const int gid = config.layer_to_group_id[layer_id];
-        if (gid >= 0 && static_cast<size_t>(gid) < config.group_types.size()) {
-            config.layer_attn_types[layer_id] = config.group_types[static_cast<size_t>(gid)];
-        }
-    }
-
-    // Per-layer block stride (kv + scale).
-    // For hybrid attention, the physical per-layer stride follows the selected physical layout stride.
-    const size_t per_layer_stride_bytes = config.kv_block_stride_bytes + config.kv_scale_stride_bytes;
-    config.layer_to_block_stride_bytes.assign(static_cast<size_t>(config.layer_all_num),
-                                              static_cast<int>(per_layer_stride_bytes));
-
-    return config;
-}
-
-}  // namespace rtp_llm
\ No newline at end of file
diff --git a/rtp_llm/cpp/cache/HybridTypeKVCacheAllocator.cc b/rtp_llm/cpp/cache/HybridTypeKVCacheAllocator.cc
deleted file mode 100644
index 3423a3ca64..0000000000
--- a/rtp_llm/cpp/cache/HybridTypeKVCacheAllocator.cc
+++ /dev/null
@@ -1,536 +0,0 @@
-#include "rtp_llm/cpp/cache/HybridTypeKVCacheAllocator.h"
-
-#include <algorithm>
-#include <cstdlib>
-#include <sstream>
-#include <unordered_map>
-#include <unordered_set>
-
-#include "rtp_llm/cpp/cache/BlockPoolConfigHelper.h"
-#include "rtp_llm/cpp/utils/Logger.h"
-#include "rtp_llm/cpp/utils/TimeUtil.h"
-#include "rtp_llm/cpp/engine_base/stream/CompleteTokenIds.h"
-
-namespace rtp_llm {
-HybridTypeKVCacheAllocator::HybridTypeKVCacheAllocator(const CacheConfig&                 config,
-                                                       AllocationType                     allocation_type,
-                                                       const kmonitor::MetricsReporterPtr metrics_reporter,
-                                                       int64_t                            reserve_block_ratio):
-    KVCacheAllocator(config, allocation_type, metrics_reporter, reserve_block_ratio) {}
-
-bool HybridTypeKVCacheAllocator::doInit() {
-    RTP_LLM_CHECK_WITH_INFO(!config_.cache_specs.empty(), "no cache_specs found in CacheConfig");
-
-    auto pool_config = BlockPoolConfigHelper::createConfig(config_);
-    block_pool_      = std::make_shared<BlockPool>(pool_config, allocation_type_);
-    RTP_LLM_CHECK_WITH_INFO(block_pool_->init(), "Failed to initialize block pool for HybridTypeKVCacheAllocator");
-
-    const auto& layer_groups = config_.global_layer_ids;
-    const int   group_nums   = static_cast<int>(layer_groups.size());
-    kv_cache_groups_.reserve(group_nums);
-
-    // global layer id -> group id mapping (for address lookup APIs)
-    layer_to_group_id_ = config_.layer_to_group_id;
-
-    for (int gid = 0; gid < group_nums; ++gid) {
-        KVCacheSpecPtr spec = config_.cache_specs[static_cast<size_t>(gid)];
-        const auto&    ids  = layer_groups[static_cast<size_t>(gid)];
-
-        KVCacheGroupPtr group;
-        if (spec && spec->type == KVCacheSpecType::LinearAttention) {
-            group = std::make_shared<LinearKVCacheGroup>(ids, spec, block_pool_, gid, config_.linear_step);
-            linear_group_ids_.push_back(gid);
-        } else {
-            group = std::make_shared<FullKVCacheGroup>(ids, spec, block_pool_, gid);
-            full_group_ids_.push_back(gid);
-        }
-
-        RTP_LLM_CHECK_WITH_INFO(group->init(), "Failed to initialize KVCacheGroup gid %d", gid);
-        kv_cache_groups_.push_back(group);
-    }
-
-    global_layer_to_local_id_.assign(static_cast<size_t>(config_.layer_all_num), -1);
-    for (const auto& cur_group_layers : layer_groups) {
-        for (size_t local_layer_idx = 0; local_layer_idx < cur_group_layers.size(); ++local_layer_idx) {
-            const int global_layer_idx = cur_group_layers[local_layer_idx];
-            if (global_layer_idx >= 0 && static_cast<size_t>(global_layer_idx) < global_layer_to_local_id_.size()) {
-                global_layer_to_local_id_[static_cast<size_t>(global_layer_idx)] = static_cast<int>(local_layer_idx);
-            }
-        }
-    }
-
-    RTP_LLM_LOG_INFO("HybridTypeKVCacheAllocator init success");
-    return true;
-}
-
-void HybridTypeKVCacheAllocator::referenceValidBlocks(const BlockIndicesType& blocks) const {
-    BlockIndicesType valid;
-    valid.reserve(blocks.size());
-    for (auto b : blocks) {
-        if (!isNullBlockIdx(b)) {
-            valid.push_back(b);
-        }
-    }
-    if (!valid.empty()) {
-        block_pool_->requestReference(valid);
-    }
-}
-
-int HybridTypeKVCacheAllocator::reuseCache(const CacheKeysType& cache_keys, BatchKVCacheResource& kv_resource) {
-    // 1) Prefix match on all full-attn groups, take the shortest prefix.
-    int                           min_full_reuse_blocks = static_cast<int>(cache_keys.size());
-    std::vector<BlockIndicesType> full_matched_blocks(kv_cache_groups_.size());
-
-    for (int gid : full_group_ids_) {
-        auto match_result     = kv_cache_groups_[static_cast<size_t>(gid)]->match(cache_keys);
-        min_full_reuse_blocks = std::min(min_full_reuse_blocks, static_cast<int>(match_result.reuse_blocks));
-        full_matched_blocks[static_cast<size_t>(gid)] = std::move(match_result.block_indices);
-    }
-
-    // 2) Right-to-left joint check for all linear groups (single-key match).
-    int                       pos = min_full_reuse_blocks - 1;
-    std::vector<BlockIdxType> linear_tail_blocks;  // per linear group
-    linear_tail_blocks.resize(linear_group_ids_.size(), NULL_BLOCK_IDX);
-
-    for (; pos >= 0; --pos) {
-        bool all_linear_matched = true;
-        for (size_t i = 0; i < linear_group_ids_.size(); ++i) {
-            const int gid      = linear_group_ids_[i];
-            auto* linear_group = dynamic_cast<LinearKVCacheGroup*>(kv_cache_groups_[static_cast<size_t>(gid)].get());
-            auto  result       = linear_group->matchSingleKey(cache_keys[static_cast<size_t>(pos)]);
-            if (result.block_indices.empty()) {
-                all_linear_matched = false;
-                break;
-            }
-            linear_tail_blocks[i] = result.block_indices[0];
-        }
-        if (all_linear_matched) {
-            break;
-        }
-    }
-
-    const int reuse_blocks_len = std::max(pos + 1, 0);
-    if (reuse_blocks_len <= 0) {
-        return 0;
-    }
-
-    // Write matched blocks into batch 0 blocks, per group.
-    // NOTE: for linear groups we only reuse the tail block; other slots are set to NULL_BLOCK_IDX.
-    for (int gid = 0; gid < static_cast<int>(kv_cache_groups_.size()); ++gid) {
-        kv_resource.mutableBlockIds(0, gid).assign(
-            BlockIndicesType(static_cast<size_t>(reuse_blocks_len), NULL_BLOCK_IDX));
-    }
-
-    for (int gid : full_group_ids_) {
-        BlockIndicesType full_blocks = full_matched_blocks[static_cast<size_t>(gid)];
-        if (static_cast<int>(full_blocks.size()) > reuse_blocks_len) {
-            full_blocks.resize(static_cast<size_t>(reuse_blocks_len));
-        }
-        kv_resource.mutableBlockIds(0, gid).assign(std::move(full_blocks));
-    }
-
-    for (size_t i = 0; i < linear_group_ids_.size(); ++i) {
-        const int gid = linear_group_ids_[i];
-        kv_resource.mutableBlockIds(0, gid).setAt(static_cast<size_t>(reuse_blocks_len - 1), linear_tail_blocks[i]);
-    }
-
-    return reuse_blocks_len;
-}
-
-MallocResult HybridTypeKVCacheAllocator::incrMalloc(const MallocInfo& malloc_info) {
-    auto&     kv_resource  = malloc_info.batch_kv_cache_resource;
-    const int batch_size   = kv_resource->batchSize();
-    const int seq_len      = malloc_info.complete_token_ids->seqLength();
-    const int reserve_step = malloc_info.complete_token_ids->getReserveStep();
-
-    // Record original sizes for rollback in case any subsequent allocation fails
-    std::vector<std::vector<size_t>> original_sizes(batch_size);
-    for (int b = 0; b < batch_size; ++b) {
-        original_sizes[b].resize(static_cast<size_t>(kv_resource->groupNums()));
-        for (int gid = 0; gid < kv_resource->groupNums(); ++gid) {
-            original_sizes[b][static_cast<size_t>(gid)] = kv_resource->blocksNum(b, gid);
-        }
-    }
-
-    bool all_success  = true;
-    int  failed_batch = -1;
-    int  failed_group = -1;
-
-    for (int b = 0; b < batch_size; ++b) {
-        for (int gid = 0; gid < kv_resource->groupNums(); ++gid) {
-            auto& block_ids = kv_resource->mutableBlockIds(b, gid);
-
-            if (!kv_cache_groups_[static_cast<size_t>(gid)]->malloc(
-                    block_ids, seq_len, malloc_info.reuse_cache, reserve_step)) {
-                all_success  = false;
-                failed_batch = b;
-                failed_group = gid;
-                break;
-            }
-        }
-        if (!all_success) {
-            break;
-        }
-    }
-
-    if (all_success) {
-        // Sparse cleanup is only safe for incremental allocation. Prefill init keeps
-        // reused prefix slots intact because causal_conv1d still reads them by prefix_length.
-        if (!malloc_info.enable_remove_skipped_blocks) {
-            return {true, 0};
-        }
-        // Decode-time memory saving for linear groups (apply after we know allocations succeeded).
-        for (int b = 0; b < batch_size; ++b) {
-            for (int gid = 0; gid < kv_resource->groupNums(); ++gid) {
-                kv_cache_groups_[static_cast<size_t>(gid)]->removeSkippedBlocks(
-                    kv_resource->mutableBlockIds(b, gid), malloc_info.reuse_cache, reserve_step);
-            }
-        }
-        return {true, 0};
-    }
-
-    // rollback kvcache blocks
-    BlockIndicesType blocks_to_free;
-
-    for (int b = 0; b < batch_size; ++b) {
-        for (int gid = 0; gid < kv_resource->groupNums(); ++gid) {
-            auto&  block_ids    = kv_resource->mutableBlockIds(b, gid);
-            size_t original_num = original_sizes[b][static_cast<size_t>(gid)];
-            if (block_ids.blocksNum() > original_num) {
-                const auto& blk = block_ids.blocks();
-                for (size_t i = original_num; i < blk.size(); ++i) {
-                    if (!isNullBlockIdx(blk[i])) {
-                        blocks_to_free.push_back(blk[i]);
-                    }
-                }
-                block_ids.resize(original_num);
-            }
-        }
-        if (b > failed_batch) {
-            break;
-        }
-    }
-    if (!blocks_to_free.empty()) {
-        // All groups share the same block pool; free directly.
-        block_pool_->requestFree(blocks_to_free);
-    }
-    RTP_LLM_LOG_WARNING("Hybrid incrMalloc failed at batch=%d group=%d", failed_batch, failed_group);
-    return {false, 0};
-}
-
-MallocResult HybridTypeKVCacheAllocator::initMallocForCommonLen(const MallocInfo& malloc_info) {
-    auto&     kv_resource = malloc_info.batch_kv_cache_resource;
-    const int batch_size  = kv_resource->batchSize();
-
-    const int seq_len        = malloc_info.complete_token_ids->seqLength();
-    const int common_seq_len = std::min(malloc_info.complete_token_ids->commonSeqLength(), seq_len);
-
-    const auto&  cache_keys         = kv_resource->cacheKeys(0);
-    int64_t      match_cost_time_us = 0;
-    const size_t reserve_blocks     = reserveBlockNum();
-    int          reuse_blocks       = 0;
-
-    if (malloc_info.enable_device_cache) {
-        // Drop last key of partial block (same rationale as SingleType).
-        CacheKeysType match_keys(cache_keys.begin(), cache_keys.empty() ? cache_keys.end() : cache_keys.end() - 1);
-        auto          begin_us = currentTimeUs();
-        reuse_blocks           = reuseCache(match_keys, *kv_resource);
-        match_cost_time_us     = currentTimeUs() - begin_us;
-
-        // Reference reused blocks in batch 0 (filter NULL_BLOCK_IDX).
-        for (int gid = 0; gid < kv_resource->groupNums(); ++gid) {
-            const auto& blocks = kv_resource->blocks(0, gid);
-            referenceValidBlocks(blocks);
-        }
-        kv_resource->cacheResource(0).setDeviceReuseBlockNum(reuse_blocks);
-    }
-
-    const int need_blocks = (reserve_blocks > 0) ? getNeedBlocks(malloc_info) : 0;
-    // Reserve blocks check (best-effort, similar to SingleType).
-    if (reserve_blocks > 0 && need_blocks > 0) {
-        const size_t available_blocks = availableBlocksNum();
-        if (available_blocks < static_cast<size_t>(need_blocks) + reserve_blocks) {
-            if (malloc_info.verbose) {
-                RTP_LLM_LOG_INFO("Hybrid initMalloc rejected by reserve blocks: request_id=%ld "
-                                 "need_blocks=%d available_blocks=%zu "
-                                 "reserve_blocks=%zu",
-                                 malloc_info.request_id,
-                                 need_blocks,
-                                 available_blocks,
-                                 reserve_blocks);
-            }
-            return {false, 0};
-        }
-    }
-
-    // Allocate common blocks on batch 0.
-    for (int gid = 0; gid < kv_resource->groupNums(); ++gid) {
-        auto& block_ids_0 = kv_resource->mutableBlockIds(0, gid);
-
-        // Common blocks are shared across batches; reserve_step is per-batch extra and will be handled in incrMalloc.
-        if (!kv_cache_groups_[static_cast<size_t>(gid)]->malloc(
-                block_ids_0, common_seq_len, malloc_info.reuse_cache, 0)) {
-            return {false, 0};
-        }
-    }
-
-    // Other batches reference batch 0's common blocks.
-    for (int b = 1; b < batch_size; ++b) {
-        for (int gid = 0; gid < kv_resource->groupNums(); ++gid) {
-            kv_cache_groups_[static_cast<size_t>(gid)]->reference(kv_resource->mutableBlockIds(b, gid),
-                                                                  kv_resource->blocks(0, gid));
-        }
-    }
-
-    return {true, reuse_blocks * seqSizePerBlock(), match_cost_time_us};
-}
-
-void HybridTypeKVCacheAllocator::free(const FreeInfo& free_info) {
-    auto& kv_cache_resource = free_info.batch_kv_cache_resource;
-
-    if (kv_cache_resource->curBlocksNum() == 0) {
-        return;
-    }
-
-    for (int batch_id = 0; batch_id < kv_cache_resource->batchSize(); ++batch_id) {
-        for (int gid = 0; gid < kv_cache_resource->groupNums(); ++gid) {
-            kv_cache_groups_[static_cast<size_t>(gid)]->free(kv_cache_resource->blocks(batch_id, gid));
-        }
-    }
-    kv_cache_resource->clearBlocks();
-}
-
-void HybridTypeKVCacheAllocator::insertIntoCache(const InsertInfo& insert_info) {
-    auto& kv_cache_resource = insert_info.batch_kv_cache_resource;
-    RTP_LLM_CHECK(kv_cache_resource != nullptr);
-
-    int batch_size         = kv_cache_resource->batchSize();
-    int seq_size_per_block = seqSizePerBlock();
-
-    for (int batch_id = 0; batch_id < batch_size; ++batch_id) {
-        const auto& cache_keys = kv_cache_resource->cacheKeys(batch_id);
-
-        auto token_ids = insert_info.complete_token_ids->completeTokenIdsVec(batch_id);
-        if (token_ids.size() <= 1 || cache_keys.empty()) {
-            continue;
-        }
-
-        // Only insert full blocks.
-        const size_t token_len       = token_ids.size() - 1;
-        const size_t full_blocks_num = token_len / static_cast<size_t>(seq_size_per_block);
-        const size_t n               = std::min(cache_keys.size(), full_blocks_num);
-        if (n == 0) {
-            continue;
-        }
-
-        CacheKeysType put_cache_keys(cache_keys.begin(), cache_keys.begin() + n);
-        for (int gid = 0; gid < kv_cache_resource->groupNums(); ++gid) {
-            const auto&      blocks = kv_cache_resource->blocks(batch_id, gid);
-            BlockIndicesType put_blocks;
-            put_blocks.reserve(n);
-            for (size_t i = 0; i < n && i < blocks.size(); ++i) {
-                put_blocks.push_back(blocks[i]);
-            }
-            kv_cache_groups_[static_cast<size_t>(gid)]->insertIntoCache(
-                put_cache_keys, put_blocks, insert_info.is_resident);
-        }
-    }
-}
-
-CacheLayerLayout HybridTypeKVCacheAllocator::allLayerCacheBase() const {
-    CacheLayerLayout layout;
-    const auto       layer_tensors = block_pool_->allLayerCacheBase();
-    const auto       scale_tensors = block_pool_->allLayerScaleCacheBase();
-
-    layout.layer_to_groups = layer_to_group_id_;
-    layout.layers_to_kv_buffer_ptrs.resize(config_.layer_all_num);
-    layout.layers_to_scale_buffer_ptrs.resize(config_.layer_all_num);
-
-    for (size_t layer_id = 0; layer_id < static_cast<size_t>(config_.layer_all_num); ++layer_id) {
-        int32_t      local     = global_layer_to_local_id_[layer_id];
-        const size_t local_idx = static_cast<size_t>(local);
-
-        if (local_idx < layer_tensors.size() && layer_tensors[local_idx].defined()
-            && layer_tensors[local_idx].numel() > 0) {
-            layout.layers_to_kv_buffer_ptrs[layer_id] = layer_tensors[local_idx];
-        }
-
-        if (!scale_tensors.empty() && local_idx < scale_tensors.size() && scale_tensors[local_idx].defined()
-            && scale_tensors[local_idx].numel() > 0) {
-            layout.layers_to_scale_buffer_ptrs[layer_id] = scale_tensors[local_idx];
-        }
-    }
-    return layout;
-}
-
-BlockAddrInfo HybridTypeKVCacheAllocator::convertIndexToAddr(int layer_id, int block_id) const {
-    if (layer_id < 0 || layer_id >= static_cast<int>(layer_to_group_id_.size())) {
-        RTP_LLM_FAIL("convertIndexToAddr invalid layer_id=%d", layer_id);
-    }
-    const int gid = layer_to_group_id_[static_cast<size_t>(layer_id)];
-    RTP_LLM_CHECK_WITH_INFO(gid >= 0 && gid < static_cast<int>(kv_cache_groups_.size()), "invalid group id mapping");
-    return kv_cache_groups_[static_cast<size_t>(gid)]->convertIndexToAddr(layer_id, block_id);
-}
-
-std::vector<BlockInfo> HybridTypeKVCacheAllocator::convertIndexToBuffer(int layer_id, int block_id) const {
-    if (layer_id < 0 || layer_id >= static_cast<int>(layer_to_group_id_.size())) {
-        RTP_LLM_FAIL("convertIndexToBuffer invalid layer_id=%d", layer_id);
-    }
-    const int gid = layer_to_group_id_[static_cast<size_t>(layer_id)];
-    RTP_LLM_CHECK_WITH_INFO(gid >= 0 && gid < static_cast<int>(kv_cache_groups_.size()), "invalid group id mapping");
-    return kv_cache_groups_[static_cast<size_t>(gid)]->convertIndexToBuffer(layer_id, block_id);
-}
-
-std::vector<BlockInfo> HybridTypeKVCacheAllocator::convertIndexToBuffer(int layer_id,
-                                                                        int block_id,
-                                                                        int partition_count,
-                                                                        int partition_id) const {
-    if (layer_id < 0 || layer_id >= static_cast<int>(layer_to_group_id_.size())) {
-        RTP_LLM_FAIL("convertIndexToBuffer(partition) invalid layer_id=%d", layer_id);
-    }
-    const int gid = layer_to_group_id_[static_cast<size_t>(layer_id)];
-    RTP_LLM_CHECK_WITH_INFO(gid >= 0 && gid < static_cast<int>(kv_cache_groups_.size()), "invalid group id mapping");
-    return kv_cache_groups_[static_cast<size_t>(gid)]->convertIndexToBuffer(
-        layer_id, block_id, partition_count, partition_id);
-}
-
-std::shared_ptr<KVCacheResource> HybridTypeKVCacheAllocator::incrKVCacheRef(const KVCacheResource& kvcache_resource,
-                                                                            const CacheKeysType&   cache_keys,
-                                                                            bool                   is_connector) {
-    if (cache_keys.empty()) {
-        return nullptr;
-    }
-
-    const int group_nums = kvcache_resource.groupNums();
-    if (group_nums <= 0) {
-        return nullptr;
-    }
-
-    std::unordered_map<CacheKeyType, size_t> key_to_pos;
-    const auto&                              resource_keys = kvcache_resource.cacheKeys();
-    key_to_pos.reserve(resource_keys.size());
-    for (size_t i = 0; i < resource_keys.size(); ++i) {
-        key_to_pos.emplace(resource_keys[i], i);
-    }
-
-    auto selected_resource_ptr = new KVCacheResource(kvcache_resource);
-    auto deleter               = [self = shared_from_this(), is_connector](KVCacheResource* resource) {
-        self->decrKVCacheRef(*resource, is_connector);
-        delete resource;
-    };
-    std::shared_ptr<KVCacheResource> selected_resource(selected_resource_ptr, deleter);
-    selected_resource->initGroups(group_nums,
-                                  static_cast<int>(config_.layer_all_num),
-                                  config_.layer_to_group_id,
-                                  config_.kernelBlocksPerKvBlock(),
-                                  config_.group_types);
-
-    CacheKeysType&                selected_keys = selected_resource->cacheKeys();
-    std::vector<BlockIndicesType> selected_blocks(static_cast<size_t>(group_nums));
-
-    BlockIndicesType blocks_to_reference;
-    blocks_to_reference.reserve(cache_keys.size());
-
-    for (auto key : cache_keys) {
-        auto it = key_to_pos.find(key);
-        if (it == key_to_pos.end()) {
-            continue;
-        }
-        const size_t pos = it->second;
-        for (int gid = 0; gid < group_nums; ++gid) {
-            auto& src_blocks = kvcache_resource.blocks(gid);
-            if (pos >= src_blocks.size()) {
-                continue;
-            }
-            const auto block = src_blocks[pos];
-            selected_blocks[static_cast<size_t>(gid)].push_back(block);
-            if (!isNullBlockIdx(block) && block > 0) {
-                blocks_to_reference.push_back(block);
-            }
-        }
-    }
-
-    selected_keys.assign(cache_keys.begin(), cache_keys.end());
-    if (is_connector) {
-        block_pool_->connectorReference(blocks_to_reference);
-    } else {
-        block_pool_->requestReference(blocks_to_reference);
-    }
-
-    for (int gid = 0; gid < group_nums; ++gid) {
-        selected_resource->mutableBlockIds(gid).assign(std::move(selected_blocks[static_cast<size_t>(gid)]));
-    }
-
-    return selected_resource;
-}
-
-void HybridTypeKVCacheAllocator::decrKVCacheRef(const KVCacheResource& kvcache_resource, bool is_connector) {
-    const int        group_nums = kvcache_resource.groupNums();
-    std::vector<int> blocks_to_free;
-    for (int gid = 0; gid < group_nums; ++gid) {
-        const auto& blocks = kvcache_resource.blocks(gid);
-        for (auto b : blocks) {
-            if (!isNullBlockIdx(b) && b > 0) {
-                blocks_to_free.push_back(b);
-            }
-        }
-    }
-    if (is_connector) {
-        block_pool_->connectorFree(blocks_to_free);
-    } else {
-        block_pool_->requestFree(blocks_to_free);
-    }
-}
-
-int HybridTypeKVCacheAllocator::seqSizePerBlock() const {
-    return static_cast<int>(config_.seq_size_per_block);
-}
-
-bool HybridTypeKVCacheAllocator::updateKVBlock(const BatchKVCacheResourcePtr& batch_kv_cache_resource,
-                                               const std::vector<int>&        block_src_batch,
-                                               bool                           copy_last_block,
-                                               std::vector<BlockIdPair>&      block_update_mapping) {
-    // TODO(chanyin): may be implemented in Base class in future
-    return true;
-}
-
-int HybridTypeKVCacheAllocator::getNeedBlocks(const MallocInfo& malloc_info) const {
-    if (!malloc_info.batch_kv_cache_resource || !malloc_info.complete_token_ids) {
-        return 0;
-    }
-    const int batch_size     = malloc_info.batch_kv_cache_resource->batchSize();
-    const int total_seq_len  = malloc_info.complete_token_ids->totalSeqLength();
-    const int common_seq_len = std::min(malloc_info.complete_token_ids->commonSeqLength(), total_seq_len);
-
-    const int seq_len      = malloc_info.complete_token_ids->seqLength();
-    const int reserve_step = malloc_info.complete_token_ids->getReserveStep();
-
-    const bool reuse_enabled    = malloc_info.reuse_cache;
-    const int  reuse_blocks_len = reuse_enabled ? malloc_info.batch_kv_cache_resource->curBlocksNum() : 0;
-
-    int common_blocks_total = 0;
-    int extra_blocks_total  = 0;
-
-    for (int gid = 0; gid < static_cast<int>(kv_cache_groups_.size()); ++gid) {
-        const auto need = kv_cache_groups_[static_cast<size_t>(gid)]->getNeedBlocks(
-            common_seq_len, seq_len, reserve_step, reuse_blocks_len, reuse_enabled);
-        common_blocks_total += need.common_blocks;
-        extra_blocks_total += need.extra_blocks;
-    }
-
-    return common_blocks_total + batch_size * extra_blocks_total;
-}
-
-int HybridTypeKVCacheAllocator::singleBatchNeedBlocks(const BatchKVCacheResourcePtr& batch_kv_cache_resource,
-                                                      int                            seq_len,
-                                                      int                            reserve_step) const {
-    int need_blocks = 0;
-    for (int gid = 0; gid < batch_kv_cache_resource->groupNums(); ++gid) {
-        const int cur_blocks = batch_kv_cache_resource->blocksNum(0, gid);
-        need_blocks += kv_cache_groups_[static_cast<size_t>(gid)]->needBlocksNum(seq_len, cur_blocks, reserve_step);
-    }
-
-    return need_blocks;
-}
-
-}  // namespace rtp_llm
diff --git a/rtp_llm/cpp/cache/HybridTypeKVCacheAllocator.h b/rtp_llm/cpp/cache/HybridTypeKVCacheAllocator.h
deleted file mode 100644
index 1db81d9e81..0000000000
--- a/rtp_llm/cpp/cache/HybridTypeKVCacheAllocator.h
+++ /dev/null
@@ -1,68 +0,0 @@
-#pragma once
-
-#include <memory>
-#include <unordered_map>
-#include <vector>
-
-#include "rtp_llm/cpp/cache/KVCacheAllocator.h"
-#include "rtp_llm/cpp/cache/FullKVCacheGroup.h"
-#include "rtp_llm/cpp/cache/LinearKVCacheGroup.h"
-
-namespace rtp_llm {
-
-class HybridTypeKVCacheAllocator:
-    public KVCacheAllocator,
-    public std::enable_shared_from_this<HybridTypeKVCacheAllocator> {
-public:
-    HybridTypeKVCacheAllocator(const CacheConfig&                 config,
-                               AllocationType                     allocation_type     = AllocationType::DEVICE,
-                               const kmonitor::MetricsReporterPtr metrics_reporter    = nullptr,
-                               int64_t                            reserve_block_ratio = 0);
-
-    void                   free(const FreeInfo& free_info) override;
-    void                   insertIntoCache(const InsertInfo& insert_info) override;
-    BlockAddrInfo          convertIndexToAddr(int layer_id, int block_id) const override;
-    std::vector<BlockInfo> convertIndexToBuffer(int layer_id, int block_id) const override;
-    std::vector<BlockInfo>
-    convertIndexToBuffer(int layer_id, int block_id, int partition_count, int partition_id) const override;
-    std::shared_ptr<KVCacheResource> incrKVCacheRef(const KVCacheResource& kvcache_resource,
-                                                    const CacheKeysType&   cache_keys,
-                                                    bool                   is_connector = false) override;
-    CacheLayerLayout                 allLayerCacheBase() const override;
-
-    bool updateKVBlock(const BatchKVCacheResourcePtr& batch_kv_cache_resource,
-                       const std::vector<int>&        block_src_batch,
-                       bool                           copy_last_block,
-                       std::vector<BlockIdPair>&      block_update_mapping) override;
-
-    int seqSizePerBlock() const override;
-    int singleBatchNeedBlocks(const BatchKVCacheResourcePtr& batch_kv_cache_resource,
-                              int                            seq_len,
-                              int                            reserve_step) const override;
-
-private:
-    bool         doInit() override;
-    MallocResult incrMalloc(const MallocInfo& malloc_info) override;
-    MallocResult initMallocForCommonLen(const MallocInfo& malloc_info) override;
-    int          getNeedBlocks(const MallocInfo& malloc_info) const override;
-    void         decrKVCacheRef(const KVCacheResource& kvcache_resource, bool is_connector = false) override;
-
-    // Joint match across groups. Returns reuse_blocks decided by full groups + linear groups.
-    int  reuseCache(const CacheKeysType& cache_keys, BatchKVCacheResource& kv_resource);
-    void referenceValidBlocks(const BlockIndicesType& blocks) const;
-
-private:
-    std::vector<KVCacheGroupPtr> kv_cache_groups_;
-
-    std::vector<int> full_group_ids_;
-    std::vector<int> linear_group_ids_;
-
-    // global layer id -> group id
-    std::vector<int> layer_to_group_id_;
-    // global layer id -> local layer id
-    std::vector<int> global_layer_to_local_id_;
-};
-
-using HybridTypeKVCacheAllocatorPtr = std::shared_ptr<HybridTypeKVCacheAllocator>;
-
-}  // namespace rtp_llm
diff --git a/rtp_llm/cpp/cache/KVCacheHashUtil.cc b/rtp_llm/cpp/cache/KVCacheHashUtil.cc
index fdc30f85ab..924adcfae3 100644
--- a/rtp_llm/cpp/cache/KVCacheHashUtil.cc
+++ b/rtp_llm/cpp/cache/KVCacheHashUtil.cc
@@ -29,6 +29,9 @@ void initCacheKeys(BatchKVCacheResourcePtr batch_kv_cache_resource,
     }
 
     batch_kv_cache_resource->setLastBlockAligned(seq_len % seq_size_per_block == 0);
+    for (int i = 0; i < batch_size; ++i) {
+        batch_kv_cache_resource->cacheResource(i).ensureLinearBlockDependencies();
+    }
 }
 
 void updateCacheKeys(BatchKVCacheResourcePtr batch_kv_cache_resource,
@@ -60,6 +63,9 @@ void updateCacheKeys(BatchKVCacheResourcePtr batch_kv_cache_resource,
 
     // After incremental update we guarantee all existing keys are for full blocks.
     batch_kv_cache_resource->setLastBlockAligned(true);
+    for (int i = 0; i < batch_size; ++i) {
+        batch_kv_cache_resource->cacheResource(i).ensureLinearBlockDependencies();
+    }
 }
 
 void dropLastPartialBlock(BatchKVCacheResourcePtr batch_kv_cache_resource) {
diff --git a/rtp_llm/cpp/cache/KVCacheManager.cc b/rtp_llm/cpp/cache/KVCacheManager.cc
index ce9b4840c6..04383fa612 100644
--- a/rtp_llm/cpp/cache/KVCacheManager.cc
+++ b/rtp_llm/cpp/cache/KVCacheManager.cc
@@ -4,9 +4,12 @@
 #include <chrono>
 #include <unordered_set>
 
-#include "rtp_llm/cpp/cache/SingleTypeKVCacheAllocator.h"
-#include "rtp_llm/cpp/cache/HybridTypeKVCacheAllocator.h"
 #include "rtp_llm/cpp/cache/BatchKVCacheResource.h"
+#include "rtp_llm/cpp/cache/spec/CacheGroupType.h"
+#include "rtp_llm/cpp/cache/allocator/HybridPoolKVCacheAllocator.h"
+#include "rtp_llm/cpp/cache/allocator/HybridTypeKVCacheAllocator.h"
+#include "rtp_llm/cpp/cache/allocator/SingleTypeKVCacheAllocator.h"
+#include "rtp_llm/cpp/cache/SharedBlockCache.h"
 #include "rtp_llm/cpp/cache/connector/KVCacheConnectorCoordinator.h"
 #include "rtp_llm/cpp/cache/KVCacheHashUtil.h"
 #include "rtp_llm/cpp/metrics/RtpLLMMetrics.h"
@@ -17,6 +20,84 @@
 
 namespace rtp_llm {
 
+namespace {
+
+struct GlobalCacheMetricsSnapshot {
+    RtpLLMCacheMetricsCollector collector;
+    size_t                      total_blocks         = 0;
+    size_t                      available_blocks     = 0;
+    size_t                      request_ref_blocks   = 0;
+    size_t                      connector_ref_blocks = 0;
+};
+
+GlobalCacheMetricsSnapshot collectGlobalCacheMetrics(const KVCacheAllocatorPtr& allocator) {
+    GlobalCacheMetricsSnapshot snapshot;
+    auto                       shared_cache = allocator->sharedBlockCache();
+
+    snapshot.total_blocks         = allocator->totalBlocksNum();
+    snapshot.available_blocks     = allocator->availableBlocksNum();
+    snapshot.request_ref_blocks   = allocator->requestRefBlocksNum();
+    snapshot.connector_ref_blocks = allocator->connectorRefBlocksNum();
+
+    auto& collector = snapshot.collector;
+    collector.kv_cache_item_num             = shared_cache ? static_cast<int64_t>(shared_cache->size()) : 0;
+    collector.kv_cache_left_seq             = static_cast<int64_t>(allocator->availableTokensNum());
+    collector.kv_cache_available_blocks     = static_cast<int64_t>(snapshot.available_blocks);
+    collector.kv_cache_request_ref_blocks   = static_cast<int64_t>(snapshot.request_ref_blocks);
+    collector.kv_cache_connector_ref_blocks = static_cast<int64_t>(snapshot.connector_ref_blocks);
+    collector.kv_cache_free_blocks          = static_cast<int64_t>(allocator->freeBlocksNum());
+    collector.kv_cache_used_ratio =
+        (snapshot.total_blocks == 0) ?
+            0.0f :
+            static_cast<float>(100.0 * (snapshot.total_blocks - snapshot.available_blocks)
+                               / static_cast<double>(snapshot.total_blocks));
+    collector.mr_cost_time_ms = allocator->getMrCostTimeMs();
+
+    return snapshot;
+}
+
+void logGlobalCacheMetrics(const GlobalCacheMetricsSnapshot& snapshot) {
+    RTP_LLM_LOG_INFO("kvc raw global: total=%zu avail=%zu req_ref=%zu con_ref=%zu free=%zu items=%ld ratio=%.4f%%",
+                     snapshot.total_blocks,
+                     snapshot.available_blocks,
+                     snapshot.request_ref_blocks,
+                     snapshot.connector_ref_blocks,
+                     static_cast<size_t>(snapshot.collector.kv_cache_free_blocks),
+                     static_cast<long>(snapshot.collector.kv_cache_item_num),
+                     snapshot.collector.kv_cache_used_ratio);
+}
+
+void reportPoolCacheMetrics(const kmonitor::MetricsReporterPtr& metrics_reporter,
+                            const KVCachePoolMetricsSnapshot&   pool_snapshot,
+                            bool                                should_log) {
+    if (should_log) {
+        RTP_LLM_LOG_INFO("kvc raw pool[%s]: total=%zu avail=%zu req_ref=%zu con_ref=%zu free=%zu reserve=%zu "
+                         "ratio=%.4f%%",
+                         pool_snapshot.pool_name.c_str(),
+                         pool_snapshot.total_blocks,
+                         pool_snapshot.available_blocks,
+                         pool_snapshot.request_ref_blocks,
+                         pool_snapshot.connector_ref_blocks,
+                         pool_snapshot.free_blocks,
+                         pool_snapshot.reserve_blocks,
+                         pool_snapshot.used_ratio);
+    }
+
+    RtpLLMCachePoolMetricsCollector pool_collector;
+    pool_collector.free_blocks          = static_cast<int64_t>(pool_snapshot.free_blocks);
+    pool_collector.available_blocks     = static_cast<int64_t>(pool_snapshot.available_blocks);
+    pool_collector.request_ref_blocks   = static_cast<int64_t>(pool_snapshot.request_ref_blocks);
+    pool_collector.connector_ref_blocks = static_cast<int64_t>(pool_snapshot.connector_ref_blocks);
+    pool_collector.total_blocks         = static_cast<int64_t>(pool_snapshot.total_blocks);
+    pool_collector.reserve_blocks       = static_cast<int64_t>(pool_snapshot.reserve_blocks);
+    pool_collector.used_ratio           = pool_snapshot.used_ratio;
+
+    kmonitor::MetricsTags pool_tags("pool_name", pool_snapshot.pool_name);
+    metrics_reporter->report<RtpLLMCachePoolMetrics, RtpLLMCachePoolMetricsCollector>(&pool_tags, &pool_collector);
+}
+
+}  // namespace
+
 KVCacheManager::KVCacheManager(const CacheConfig&                 config,
                                bool                               warmup,
                                const kmonitor::MetricsReporterPtr metrics_reporter,
@@ -25,7 +106,8 @@ KVCacheManager::KVCacheManager(const CacheConfig&                 config,
                                const RuntimeConfig&               runtime_config,
                                const SpeculativeExecutionConfig&  sp_config,
                                const PDSepConfig&                 pd_sep_config,
-                               const CacheStoreConfig&            cache_store_config):
+                               const CacheStoreConfig&            cache_store_config,
+                               bool                               use_cuda_malloc_block_pool):
     config_(config),
     metrics_reporter_(metrics_reporter),
     kv_cache_config_(kv_cache_config),
@@ -33,14 +115,32 @@ KVCacheManager::KVCacheManager(const CacheConfig&                 config,
     runtime_config_(runtime_config),
     sp_config_(sp_config),
     pd_sep_config_(pd_sep_config),
-    cache_store_config_(cache_store_config) {
+    cache_store_config_(cache_store_config),
+    use_cuda_malloc_block_pool_(use_cuda_malloc_block_pool) {
     if (warmup) {
         config_.block_num = 1;
     } else {
         allocateAndSync();
     }
 
-    RTP_LLM_LOG_INFO("cache config: layer_num=%d, block_num=%d, block_size=%dB, seq_size_per_block=%zu",
+    // Page-level RR sharding context: one CPSlotMapper for the lifetime of the
+    // manager and allocator. When kv_cache_sharded=false (or tp_size==1),
+    // cp_slot_mapper_ stays nullptr and every call site stays bit-equal to the
+    // pre-RR behaviour.
+    const auto& cp_cfg = parallelism_config_.prefill_cp_config;
+    if (cp_cfg.kv_cache_sharded && parallelism_config_.tp_size > 1) {
+        cp_slot_mapper_ = std::make_shared<CPSlotMapper>(static_cast<int>(parallelism_config_.tp_rank),
+                                                         static_cast<int>(parallelism_config_.tp_size),
+                                                         static_cast<int>(config_.seq_size_per_block));
+        RTP_LLM_LOG_INFO("CP sharded KV cache enabled: tp_rank=%d, tp_size=%d, block_size=%zu, "
+                         "virtual_block_size=%d",
+                         (int)parallelism_config_.tp_rank,
+                         (int)parallelism_config_.tp_size,
+                         config_.seq_size_per_block,
+                         cp_slot_mapper_->virtualBlockSize());
+    }
+
+    RTP_LLM_LOG_INFO("cache config: layer_num=%d, block_num=%d, block_size=%zuB, seq_size_per_block=%zu",
                      config_.layer_num,
                      config_.block_num,
                      config_.block_size_bytes,
@@ -59,19 +159,42 @@ KVCacheManager::~KVCacheManager() {
 // 初始化和配置相关
 
 bool KVCacheManager::init() {
-    RTP_LLM_CHECK_WITH_INFO(!config_.cache_specs.empty(), "cache specs must not be empty");
+    RTP_LLM_CHECK_WITH_INFO(!allocator_ && !coordinator_ && !metrics_reporter_thread_.joinable(),
+                            "KVCacheManager::init called more than once");
+    RTP_LLM_CHECK_WITH_INFO(config_.groupNums() > 0, "cache specs must not be empty");
+
+    auto shared_cache = std::make_shared<SharedBlockCache>();
+    shared_cache->setPrefixTreeEnabled(kv_cache_config_.enable_gpu_prefix_tree);
+    const bool enable_independent_group_eviction = kv_cache_config_.enable_memory_cache
+                                                  && kv_cache_config_.enable_prefix_tree_memory_cache
+                                                  && kv_cache_config_.enable_independent_group_eviction;
 
     const bool is_hybrid = config_.groupNums() > 1;
-    if (is_hybrid) {
+    if (config_.use_independent_block_pools) {
+        allocator_ = std::make_shared<rtp_llm::HybridPoolKVCacheAllocator>(config_,
+                                                                           AllocationType::DEVICE,
+                                                                           metrics_reporter_,
+                                                                           kv_cache_config_.reserve_block_ratio,
+                                                                           pd_sep_config_.role_type);
+    } else if (is_hybrid) {
         allocator_ = std::make_shared<rtp_llm::HybridTypeKVCacheAllocator>(
             config_, AllocationType::DEVICE, metrics_reporter_, kv_cache_config_.reserve_block_ratio);
-        RTP_LLM_CHECK_WITH_INFO(allocator_->init(), "HybridTypeKVCacheAllocator init failed");
     } else {
         allocator_ = std::make_shared<rtp_llm::SingleTypeKVCacheAllocator>(
             config_, AllocationType::DEVICE, metrics_reporter_, kv_cache_config_.reserve_block_ratio);
-        RTP_LLM_CHECK_WITH_INFO(allocator_->init(), "SingleTypeKVCacheAllocator init failed");
     }
 
+    if (use_cuda_malloc_block_pool_) {
+        RTP_LLM_LOG_INFO("RDMA cache store enabled for PD role, use cudaMalloc KV cache block-pool backing");
+        allocator_->setUseCudaMallocBlockPool(true);
+    }
+
+    allocator_->setCPSlotMapper(cp_slot_mapper_);
+    allocator_->setSharedBlockCache(shared_cache);
+    RTP_LLM_CHECK_WITH_INFO(allocator_->init(), "KVCacheAllocator init failed");
+    shared_cache->setIndependentGroupEviction(enable_independent_group_eviction,
+                                              allocator_->independentEvictionGroupIds());
+
     if (metrics_reporter_) {
         stop_.store(false, std::memory_order_relaxed);
         metrics_reporter_thread_ = std::thread(&KVCacheManager::reportMetricsLoop, this);
@@ -86,6 +209,13 @@ const CacheConfig& KVCacheManager::cacheConfig() const {
 }
 
 const CacheConfig& KVCacheManager::getMTPModuleCacheConfig(int mtp_module_id) const {
+    RTP_LLM_CHECK_WITH_INFO(mtp_module_id >= 0 && static_cast<size_t>(mtp_module_id) < config_.mtp_sub_configs.size(),
+                            "Invalid mtp_module_id: %d, must be in range [0, %zu)",
+                            mtp_module_id,
+                            config_.mtp_sub_configs.size());
+    RTP_LLM_CHECK_WITH_INFO(config_.mtp_sub_configs[mtp_module_id] != nullptr,
+                            "mtp_sub_configs[%d] is null",
+                            mtp_module_id);
     return *config_.mtp_sub_configs[mtp_module_id];
 }
 
@@ -95,6 +225,8 @@ MallocResult KVCacheManager::malloc(const MallocInfo& malloc_info) {
     RTP_LLM_PROFILE_FUNCTION();
     RTP_LLM_CHECK(malloc_info.batch_kv_cache_resource && malloc_info.complete_token_ids);
 
+    // Cache-key computation is identical for CP and non-CP — we always have
+    // the full sequence's token ids; rolling hash is at block_size granularity.
     const int seq_size_per_block = config_.seq_size_per_block;
     if (!malloc_info.batch_kv_cache_resource->curBlocksNum()) {
         initCacheKeys(malloc_info.batch_kv_cache_resource, malloc_info.complete_token_ids, seq_size_per_block);
@@ -120,6 +252,7 @@ void KVCacheManager::insertIntoCache(const InsertInfo& insert_info) {
 int KVCacheManager::singleBatchNeedBlocks(const BatchKVCacheResourcePtr& batch_kv_cache_resource,
                                           int                            seq_len,
                                           int                            reserve_step) const {
+    RTP_LLM_CHECK_WITH_INFO(allocator_ != nullptr, "singleBatchNeedBlocks called before KVCacheManager initialized");
     return allocator_->singleBatchNeedBlocks(batch_kv_cache_resource, seq_len, reserve_step);
 }
 
@@ -149,83 +282,6 @@ bool KVCacheManager::updateKVBlock(const BatchKVCacheResourcePtr& batch_kv_cache
     return allocator_->updateKVBlock(batch_kv_cache_resource, block_src_batch, copy_last_block, block_update_mapping);
 }
 
-// Write one KV block (optionally per-layer) from host/device tensors for test
-bool KVCacheManager::setKVBlockValue(int                  block_index,
-                                     int                  layer_id,
-                                     const torch::Tensor& k_buffer,
-                                     const torch::Tensor& v_buffer) {
-    // Basic size/type validation to prevent out-of-bounds copy
-    auto&  spec             = config_.cache_specs[0];
-    size_t expected_k_bytes = spec->k_block_size_bytes();
-    size_t expected_v_bytes = spec->v_block_size_bytes();
-    size_t src_k_bytes      = k_buffer.nbytes();
-    size_t src_v_bytes      = v_buffer.nbytes();
-    if (src_k_bytes < expected_k_bytes || src_v_bytes < expected_v_bytes) {
-        RTP_LLM_LOG_ERROR("setKVBlockValue src bytes too small: k[%zu]<[%zu] or v[%zu]<[%zu]",
-                          src_k_bytes,
-                          expected_k_bytes,
-                          src_v_bytes,
-                          expected_v_bytes);
-        return false;
-    }
-
-    auto dst = allocator_->convertIndexToBuffer(layer_id, block_index);
-    RTP_LLM_CHECK_WITH_INFO(
-        !dst.empty(), "convertIndexToBuffer returned empty for layer %d, block %d", layer_id, block_index);
-    if (!dst[0].addr) {
-        RTP_LLM_LOG_ERROR("convertIndexToBuffer returned null for layer %d, block %d", layer_id, block_index);
-        return false;
-    }
-
-    auto copyFunc = [&](const torch::Tensor& src_tensor, const BlockInfo& dst_block, size_t dst_byte_offset) -> bool {
-        const size_t dst_bytes = dst_block.size_bytes;
-        const size_t src_bytes = src_tensor.nbytes();
-        if (dst_bytes < dst_byte_offset + src_bytes) {
-            RTP_LLM_LOG_ERROR("dst block bytes[%zu] < dst_offset[%zu] + src bytes[%zu] in setKVBlockValue(layer=%d)",
-                              dst_bytes,
-                              dst_byte_offset,
-                              src_bytes,
-                              layer_id);
-            return false;
-        }
-
-        auto* dst_ptr    = static_cast<char*>(dst_block.addr) + dst_byte_offset;
-        auto  dst_device = dst_block.is_cuda ? torch::kCUDA : torch::kCPU;
-        auto  src_device = src_tensor.is_cuda() ? torch::kCUDA : torch::kCPU;
-        auto  dst_t      = torch::from_blob(
-            dst_ptr, {(int64_t)src_bytes}, torch::TensorOptions().dtype(torch::kUInt8).device(dst_device));
-        auto src_t = torch::from_blob(src_tensor.data_ptr(),
-                                      {(int64_t)src_bytes},
-                                      torch::TensorOptions().dtype(torch::kUInt8).device(src_device));
-        dst_t.copy_(src_t);
-        return true;
-    };
-
-    if (!copyFunc(k_buffer, dst[0], 0)) {
-        return false;
-    }
-
-    if (!copyFunc(v_buffer, dst[0], expected_k_bytes)) {
-        return false;
-    }
-
-    cudaSyncAndCheck();
-    return true;
-}
-
-bool KVCacheManager::setKVBlockValue(int block_index, const torch::Tensor& k_buffer, const torch::Tensor& v_buffer) {
-    if (block_index < 0 || block_index >= config_.block_num) {
-        RTP_LLM_LOG_WARNING("Invalid block_index: %d, valid range: [0, %d)", block_index, config_.block_num);
-        return false;
-    }
-
-    bool all_success = true;
-    for (int layer_id = 0; layer_id < config_.layer_num; ++layer_id) {
-        all_success = setKVBlockValue(block_index, layer_id, k_buffer, v_buffer) && all_success;
-    }
-    return all_success;
-}
-
 // 地址转换和缓冲区访问
 
 BlockAddrInfo KVCacheManager::convertIndexToAddr(int block_index, int layer_id) const {
@@ -241,6 +297,34 @@ KVCacheManager::convertIndexToBuffer(int block_index, int layer_id, int partitio
     return allocator_->convertIndexToBuffer(layer_id, block_index, partition_count, partition_id);
 }
 
+BlockAddrInfo KVCacheManager::convertIndexToAddr(int block_index, int layer_id, int group_id) const {
+    return allocator_->convertIndexToAddr(layer_id, group_id, block_index);
+}
+
+std::vector<BlockInfo>
+KVCacheManager::convertIndexToBuffer(int block_index, int layer_id, int group_id) const {
+    return allocator_->convertIndexToBuffer(layer_id, group_id, block_index);
+}
+
+std::vector<BlockInfo> KVCacheManager::convertIndexToBuffer(
+    int block_index, int layer_id, int group_id, int partition_count, int partition_id) const {
+    return allocator_->convertIndexToBuffer(layer_id, group_id, block_index, partition_count, partition_id);
+}
+
+BlockAddrInfo KVCacheManager::convertIndexToAddrByTag(int block_index, int layer_id, const std::string& tag) const {
+    return allocator_->convertIndexToAddrByTag(layer_id, tag, block_index);
+}
+
+std::vector<BlockInfo>
+KVCacheManager::convertIndexToBufferByTag(int block_index, int layer_id, const std::string& tag) const {
+    return allocator_->convertIndexToBufferByTag(layer_id, tag, block_index);
+}
+
+std::vector<BlockInfo> KVCacheManager::convertIndexToBufferByTag(
+    int block_index, int layer_id, const std::string& tag, int partition_count, int partition_id) const {
+    return allocator_->convertIndexToBufferByTag(layer_id, tag, block_index, partition_count, partition_id);
+}
+
 CacheLayerLayout KVCacheManager::allLayerCacheBase() const {
     return allocator_->allLayerCacheBase();
 }
@@ -252,15 +336,23 @@ CacheLayerLayout KVCacheManager::getMainModelCacheLayerLayout() const {
     auto& all_layer_tensors = all_layout.layers_to_kv_buffer_ptrs;
     auto& all_scale_tensors = all_layout.layers_to_scale_buffer_ptrs;
 
+    layout.layer_to_group_ids.resize(config_.layer_num);
     layout.layers_to_kv_buffer_ptrs.resize(config_.layer_num);
     if (!all_scale_tensors.empty()) {
         layout.layers_to_scale_buffer_ptrs.resize(config_.layer_num);
     }
 
-    layout.layer_to_groups = config_.layer_to_group_id;
-    layout.group_types     = config_.group_types;
-    layout.layer_to_groups.resize(config_.layer_num);
-    layout.layer_attn_types.resize(config_.layer_num, CacheGroupType::FULL);
+    const auto layer_group_ids      = config_.layerGroupIdsSnapshot();
+    const auto layer_tag_to_gid     = config_.layerTagToGroupIdSnapshot();
+    layout.group_types              = config_.groupTypesSnapshot();
+    layout.group_tags               = config_.groupTagsSnapshot();
+    layout.layer_tag_to_group_id.resize(config_.layer_num);
+    layout.group_seq_size_per_block = config_.group_seq_size_per_block;
+    layout.layer_group_types.resize(config_.layer_num, CacheGroupType::FULL);
+    layout.layers_to_kv_buffer_ptrs_by_group.resize(config_.layer_num);
+    if (!all_layout.layers_to_scale_buffer_ptrs_by_group.empty()) {
+        layout.layers_to_scale_buffer_ptrs_by_group.resize(config_.layer_num);
+    }
 
     RTP_LLM_CHECK_WITH_INFO(config_.layer_num <= all_layer_tensors.size(),
                             "config_.layer_num[%d] > all_layer_tensors.size()[%ld]",
@@ -269,7 +361,6 @@ CacheLayerLayout KVCacheManager::getMainModelCacheLayerLayout() const {
 
     for (int layer_id = 0; layer_id < static_cast<int>(config_.layer_num); ++layer_id) {
         if (static_cast<size_t>(layer_id) < all_layer_tensors.size()) {
-            layout.layer_to_groups[layer_id]          = all_layout.layer_to_groups[layer_id];
             layout.layers_to_kv_buffer_ptrs[layer_id] = all_layer_tensors[layer_id];
         } else {
             RTP_LLM_CHECK(false);
@@ -282,8 +373,23 @@ CacheLayerLayout KVCacheManager::getMainModelCacheLayerLayout() const {
                 RTP_LLM_CHECK(false);
             }
         }
-        if (static_cast<size_t>(layer_id) < config_.layer_attn_types.size()) {
-            layout.layer_attn_types[layer_id] = config_.layer_attn_types[static_cast<size_t>(layer_id)];
+        if (static_cast<size_t>(layer_id) < layer_group_ids.size()) {
+            layout.layer_to_group_ids[layer_id] = layer_group_ids[static_cast<size_t>(layer_id)];
+            if (!layout.layer_to_group_ids[layer_id].empty()) {
+                layout.layer_group_types[layer_id] =
+                    config_.typeForGroup(static_cast<size_t>(layout.layer_to_group_ids[layer_id].front()));
+            }
+        }
+        if (static_cast<size_t>(layer_id) < layer_tag_to_gid.size()) {
+            layout.layer_tag_to_group_id[layer_id] = layer_tag_to_gid[static_cast<size_t>(layer_id)];
+        }
+        if (static_cast<size_t>(layer_id) < all_layout.layers_to_kv_buffer_ptrs_by_group.size()) {
+            layout.layers_to_kv_buffer_ptrs_by_group[layer_id] =
+                all_layout.layers_to_kv_buffer_ptrs_by_group[static_cast<size_t>(layer_id)];
+        }
+        if (static_cast<size_t>(layer_id) < all_layout.layers_to_scale_buffer_ptrs_by_group.size()) {
+            layout.layers_to_scale_buffer_ptrs_by_group[layer_id] =
+                all_layout.layers_to_scale_buffer_ptrs_by_group[static_cast<size_t>(layer_id)];
         }
     }
 
@@ -300,49 +406,92 @@ CacheLayerLayout KVCacheManager::getMTPModuleCacheLayerLayout(int mtp_module_id)
 
     const auto& mtp_sub_config = config_.mtp_sub_configs[mtp_module_id];
     RTP_LLM_CHECK_WITH_INFO(mtp_sub_config != nullptr, "mtp_sub_configs[%d] is null", mtp_module_id);
-    RTP_LLM_CHECK_WITH_INFO(
-        !mtp_sub_config->global_layer_ids.empty(), "mtp_sub_configs[%d]->global_layer_ids is empty", mtp_module_id);
-    RTP_LLM_CHECK_WITH_INFO(!mtp_sub_config->global_layer_ids[0].empty(),
-                            "mtp_sub_configs[%d]->global_layer_ids[0] is empty",
-                            mtp_module_id);
-
-    const auto&    mtp_global_layer_ids = mtp_sub_config->global_layer_ids[0];
-    const uint32_t mtp_layer_num        = mtp_sub_config->layer_num;
+    const uint32_t mtp_layer_num = mtp_sub_config->layer_num;
+    const int      mtp_global_layer_base = static_cast<int>(config_.layer_num)
+                                      + mtp_module_id * static_cast<int>(mtp_layer_num);
+    std::vector<int> global_layer_for_local(mtp_layer_num, -1);
+    for (size_t local_gid = 0; local_gid < static_cast<size_t>(mtp_sub_config->groupNums()); ++local_gid) {
+        for (int global_layer_id : mtp_sub_config->layerIdsForGroup(local_gid)) {
+            const int local_layer_id = global_layer_id - mtp_global_layer_base;
+            RTP_LLM_CHECK_WITH_INFO(local_layer_id >= 0 && local_layer_id < static_cast<int>(mtp_layer_num),
+                                    "mtp_sub_configs[%d] global layer %d is outside local range [%d, %d)",
+                                    mtp_module_id,
+                                    global_layer_id,
+                                    mtp_global_layer_base,
+                                    mtp_global_layer_base + static_cast<int>(mtp_layer_num));
+            global_layer_for_local[static_cast<size_t>(local_layer_id)] = global_layer_id;
+        }
+    }
+    for (uint32_t local_layer_id = 0; local_layer_id < mtp_layer_num; ++local_layer_id) {
+        RTP_LLM_CHECK_WITH_INFO(global_layer_for_local[local_layer_id] >= 0,
+                                "mtp_sub_configs[%d] has no global layer for local layer %u",
+                                mtp_module_id,
+                                local_layer_id);
+    }
 
     auto  all_layout        = allocator_->allLayerCacheBase();
     auto& all_layer_tensors = all_layout.layers_to_kv_buffer_ptrs;
     auto& all_scale_tensors = all_layout.layers_to_scale_buffer_ptrs;
 
-    layout.layer_to_groups.resize(mtp_layer_num);
     layout.layers_to_kv_buffer_ptrs.resize(mtp_layer_num);
     if (!all_scale_tensors.empty()) {
         layout.layers_to_scale_buffer_ptrs.resize(mtp_layer_num);
     }
-    layout.layer_attn_types.resize(mtp_layer_num, CacheGroupType::FULL);
+    layout.layer_group_types.resize(mtp_layer_num, CacheGroupType::FULL);
+    layout.group_tags               = mtp_sub_config->groupTagsSnapshot();
+    layout.group_types              = mtp_sub_config->groupTypesSnapshot();
+    layout.group_seq_size_per_block = mtp_sub_config->group_seq_size_per_block;
+
+    const size_t group_count = layout.group_tags.size();
+    layout.layers_to_kv_buffer_ptrs_by_group.assign(mtp_layer_num, std::vector<torch::Tensor>(group_count));
+    layout.layers_to_scale_buffer_ptrs_by_group.assign(mtp_layer_num, std::vector<torch::Tensor>(group_count));
+    layout.layer_to_group_ids.resize(mtp_layer_num);
+    layout.layer_tag_to_group_id.resize(mtp_layer_num);
 
     for (uint32_t local_layer_id = 0; local_layer_id < mtp_layer_num; ++local_layer_id) {
-        if (local_layer_id < mtp_global_layer_ids.size()) {
-            const int global_layer_id = mtp_global_layer_ids[local_layer_id];
+        const int global_layer_id = global_layer_for_local[local_layer_id];
+
+        if (global_layer_id >= 0 && static_cast<size_t>(global_layer_id) < all_layer_tensors.size()) {
+            layout.layers_to_kv_buffer_ptrs[local_layer_id] = all_layer_tensors[global_layer_id];
+        } else {
+            RTP_LLM_CHECK(false);
+        }
 
-            if (global_layer_id >= 0 && static_cast<size_t>(global_layer_id) < all_layer_tensors.size()) {
-                layout.layer_to_groups[local_layer_id]          = all_layout.layer_to_groups[global_layer_id];
-                layout.layers_to_kv_buffer_ptrs[local_layer_id] = all_layer_tensors[global_layer_id];
+        if (!all_scale_tensors.empty()) {
+            if (global_layer_id >= 0 && static_cast<size_t>(global_layer_id) < all_scale_tensors.size()) {
+                layout.layers_to_scale_buffer_ptrs[local_layer_id] = all_scale_tensors[global_layer_id];
             } else {
                 RTP_LLM_CHECK(false);
             }
+        }
+
+        for (size_t local_gid = 0; local_gid < group_count; ++local_gid) {
+            const auto& tag        = mtp_sub_config->tagForGroup(local_gid);
+            const int   global_gid = config_.groupIdForTag(tag);
+            const auto& group_layers = mtp_sub_config->layerIdsForGroup(local_gid);
+            if (std::find(group_layers.begin(), group_layers.end(), global_layer_id) == group_layers.end()) {
+                continue;
+            }
 
-            if (!all_scale_tensors.empty()) {
-                if (global_layer_id >= 0 && static_cast<size_t>(global_layer_id) < all_scale_tensors.size()) {
-                    layout.layers_to_scale_buffer_ptrs[local_layer_id] = all_scale_tensors[global_layer_id];
-                } else {
-                    RTP_LLM_CHECK(false);
+            layout.layer_to_group_ids[local_layer_id].push_back(static_cast<int>(local_gid));
+            layout.layer_tag_to_group_id[local_layer_id][tag] = static_cast<int>(local_gid);
+            layout.layer_group_types[local_layer_id]          = mtp_sub_config->typeForGroup(local_gid);
+
+            if (static_cast<size_t>(global_layer_id) < all_layout.layers_to_kv_buffer_ptrs_by_group.size()) {
+                const auto& src_kv = all_layout.layers_to_kv_buffer_ptrs_by_group[static_cast<size_t>(global_layer_id)];
+                if (global_gid >= 0 && static_cast<size_t>(global_gid) < src_kv.size()) {
+                    layout.layers_to_kv_buffer_ptrs_by_group[local_layer_id][local_gid] =
+                        src_kv[static_cast<size_t>(global_gid)];
                 }
             }
-            if (local_layer_id < mtp_sub_config->layer_attn_types.size()) {
-                layout.layer_attn_types[local_layer_id] = mtp_sub_config->layer_attn_types[local_layer_id];
+            if (static_cast<size_t>(global_layer_id) < all_layout.layers_to_scale_buffer_ptrs_by_group.size()) {
+                const auto& src_scale =
+                    all_layout.layers_to_scale_buffer_ptrs_by_group[static_cast<size_t>(global_layer_id)];
+                if (global_gid >= 0 && static_cast<size_t>(global_gid) < src_scale.size()) {
+                    layout.layers_to_scale_buffer_ptrs_by_group[local_layer_id][local_gid] =
+                        src_scale[static_cast<size_t>(global_gid)];
+                }
             }
-        } else {
-            RTP_LLM_CHECK(false);
         }
     }
 
@@ -385,37 +534,40 @@ size_t KVCacheManager::maxAvailableTokensNum() const {
 
 KVCacheInfo KVCacheManager::getKVCacheInfo(int64_t latest_version, bool need_cache_keys) const {
     KVCacheInfo info;
+    info.version = latest_version;
 
     if (!allocator_) {
         RTP_LLM_LOG_ERROR("getKVCacheInfo called before KVCacheManager initialized");
-        info.version = latest_version;
         return info;
     }
 
     if (need_cache_keys) {
         std::unordered_set<CacheKeyType> all_keys;
         // device cache keys
-        auto block_cache = allocator_->getBlockPool()->blockCache();
-        auto snapshot    = block_cache->cacheSnapshot(latest_version);
-        for (const auto& cacheItem : snapshot.values) {
-            all_keys.insert(cacheItem.cache_key);
+        std::vector<CacheKeyType> device_cache_keys;
+        auto                      shared_cache = allocator_->sharedBlockCache();
+        if (shared_cache) {
+            device_cache_keys = shared_cache->allCacheKeys();
+            all_keys.insert(device_cache_keys.begin(), device_cache_keys.end());
+            info.version = shared_cache->version();
         }
         // memory cache keys
+        RTP_LLM_CHECK_WITH_INFO(coordinator_ != nullptr,
+                                "getKVCacheInfo called before KVCacheManager coordinator initialized");
         const auto mem_cache_keys = coordinator_->memoryCacheKeys();
         all_keys.insert(mem_cache_keys.begin(), mem_cache_keys.end());
 
         info.cached_keys.assign(all_keys.begin(), all_keys.end());
-        info.version = snapshot.version;
     }
 
-    const size_t block_size_tokens = config_.seq_size_per_block;
-    const size_t total_blocks      = allocator_->totalBlocksNum();
-    const size_t available_blocks  = allocator_->availableBlocksNum();
+    const size_t block_size_tokens = cp_slot_mapper_ && cp_slot_mapper_->isSharded() ?
+                                         cp_slot_mapper_->virtualBlockSize() :
+                                         config_.seq_size_per_block;
 
+    const auto capacity     = allocator_->tokenCapacity(block_size_tokens);
     info.block_size         = block_size_tokens;
-    info.total_kv_cache     = total_blocks * block_size_tokens;
-    info.available_kv_cache = available_blocks * block_size_tokens;
-    // cached_keys left empty for now; can be populated when distributed cache is wired up.
+    info.total_kv_cache     = capacity.total_tokens;
+    info.available_kv_cache = capacity.available_tokens;
 
     return info;
 }
@@ -455,16 +607,19 @@ bool KVCacheManager::hasP2PConnector() const {
 std::shared_ptr<AsyncContext>
 KVCacheManager::asyncLoadCache(const std::shared_ptr<KVCacheConnectorReadWriteContext>& connector_context) {
     RTP_LLM_PROFILE_FUNCTION();
+    RTP_LLM_CHECK_WITH_INFO(coordinator_ != nullptr, "asyncLoadCache called before KVCacheManager initialized");
     return coordinator_->asyncRead(connector_context);
 }
 
 std::shared_ptr<AsyncContext>
 KVCacheManager::asyncStoreCache(const std::shared_ptr<KVCacheConnectorReadWriteContext>& connector_context) {
     RTP_LLM_PROFILE_FUNCTION();
+    RTP_LLM_CHECK_WITH_INFO(coordinator_ != nullptr, "asyncStoreCache called before KVCacheManager initialized");
     return coordinator_->asyncWrite(connector_context);
 }
 
 bool KVCacheManager::executeFunction(const FunctionRequestPB& request, FunctionResponsePB& response) {
+    RTP_LLM_CHECK_WITH_INFO(coordinator_ != nullptr, "executeFunction called before KVCacheManager initialized");
     return coordinator_->executeFunction(request, response);
 }
 
@@ -489,6 +644,7 @@ void KVCacheManager::initConnectorCoordinator() {
 }
 
 void KVCacheManager::allocateAndSync() {
+    RTP_LLM_LOG_INFO("allocateAndSync start, block_num=%d", config_.block_num);
     size_t world_size = parallelism_config_.tp_size * parallelism_config_.dp_size;
     if (world_size > 1) {
         size_t local_rank    = parallelism_config_.tp_size * parallelism_config_.dp_rank + parallelism_config_.tp_rank;
@@ -505,41 +661,37 @@ void KVCacheManager::allocateAndSync() {
             config_.block_num = *std::min_element(block_num_ptr, block_num_ptr + world_size);
         }
     }
+    if (config_.use_independent_block_pools) {
+        config_.finalizeBlockNums(static_cast<uint32_t>(config_.block_num), runtime_config_);
+    }
     RTP_LLM_LOG_INFO("block_num is %d after tp sync", config_.block_num);
 }
 
 void KVCacheManager::reportMetricsLoop() {
     RTP_LLM_PROFILE_FUNCTION();
     kmonitor::MetricsTags tags;
+    constexpr auto kLogInterval  = std::chrono::minutes(1);
+    auto           last_log_time = std::chrono::steady_clock::now() - kLogInterval;
     while (!stop_.load(std::memory_order_relaxed)) {
         if (!metrics_reporter_ || !allocator_) {
             std::this_thread::sleep_for(std::chrono::seconds(1));
             continue;
         }
 
-        RtpLLMCacheMetricsCollector collector;
-
-        auto block_pool  = allocator_->getBlockPool();
-        auto block_cache = block_pool ? block_pool->blockCache() : nullptr;
-
-        const auto total_blocks         = allocator_->totalBlocksNum();
-        const auto available_blocks     = allocator_->availableBlocksNum();
-        const auto request_ref_blocks   = allocator_->requestRefBlocksNum();
-        const auto connector_ref_blocks = allocator_->connectorRefBlocksNum();
-
-        collector.kv_cache_item_num             = block_cache ? static_cast<int64_t>(block_cache->size()) : 0;
-        collector.kv_cache_left_seq             = static_cast<int64_t>(available_blocks * config_.seq_size_per_block);
-        collector.kv_cache_available_blocks     = static_cast<int64_t>(available_blocks);
-        collector.kv_cache_request_ref_blocks   = static_cast<int64_t>(request_ref_blocks);
-        collector.kv_cache_connector_ref_blocks = static_cast<int64_t>(connector_ref_blocks);
-        collector.kv_cache_free_blocks          = static_cast<int64_t>(allocator_->freeBlocksNum());
-        collector.kv_cache_used_ratio =
-            (total_blocks == 0) ?
-                0.0f :
-                static_cast<float>(100.0 * (total_blocks - available_blocks) / static_cast<double>(total_blocks));
-        collector.mr_cost_time_ms = allocator_->getMrCostTimeMs();
-
-        metrics_reporter_->report<RtpLLMCacheMetrics, RtpLLMCacheMetricsCollector>(&tags, &collector);
+        auto global_metrics = collectGlobalCacheMetrics(allocator_);
+        metrics_reporter_->report<RtpLLMCacheMetrics, RtpLLMCacheMetricsCollector>(&tags, &global_metrics.collector);
+
+        const auto now        = std::chrono::steady_clock::now();
+        const bool should_log = (now - last_log_time) >= kLogInterval;
+        if (should_log) {
+            last_log_time = now;
+            logGlobalCacheMetrics(global_metrics);
+        }
+
+        for (const auto& pool_snapshot : allocator_->poolMetricsSnapshots()) {
+            reportPoolCacheMetrics(metrics_reporter_, pool_snapshot, should_log);
+        }
+
         std::this_thread::sleep_for(std::chrono::seconds(1));  // 1s
     }
 }
@@ -547,9 +699,87 @@ void KVCacheManager::reportMetricsLoop() {
 void KVCacheManager::handleRead(const P2PConnectorStartLoadRequestPB& request,
                                 P2PConnectorStartLoadResponsePB&      response,
                                 std::function<bool()>                 is_cancelled) {
-    if (coordinator_) {
-        coordinator_->handleRead(request, response, is_cancelled);
+    RTP_LLM_CHECK_WITH_INFO(coordinator_ != nullptr, "handleRead called before KVCacheManager initialized");
+    coordinator_->handleRead(request, response, is_cancelled);
+}
+
+// Write one KV block (optionally per-layer) from host/device tensors for test
+bool KVCacheManager::writeKVBlockForTest(int                  block_index,
+                                          int                  layer_id,
+                                          const torch::Tensor& k_buffer,
+                                          const torch::Tensor& v_buffer) {
+    // Basic size/type validation to prevent out-of-bounds copy
+    auto&  spec             = config_.specForGroup(0);
+    size_t expected_k_bytes = spec->k_block_size_bytes();
+    size_t expected_v_bytes = spec->v_block_size_bytes();
+    size_t src_k_bytes      = k_buffer.nbytes();
+    size_t src_v_bytes      = v_buffer.nbytes();
+    if (src_k_bytes < expected_k_bytes || src_v_bytes < expected_v_bytes) {
+        RTP_LLM_LOG_ERROR("writeKVBlockForTest src bytes too small: k[%zu]<[%zu] or v[%zu]<[%zu]",
+                          src_k_bytes,
+                          expected_k_bytes,
+                          src_v_bytes,
+                          expected_v_bytes);
+        return false;
     }
+
+    auto dst = allocator_->convertIndexToBuffer(layer_id, block_index);
+    RTP_LLM_CHECK_WITH_INFO(
+        !dst.empty(), "convertIndexToBuffer returned empty for layer %d, block %d", layer_id, block_index);
+    if (!dst[0].addr) {
+        RTP_LLM_LOG_ERROR("convertIndexToBuffer returned null for layer %d, block %d", layer_id, block_index);
+        return false;
+    }
+
+    auto copyFunc = [&](const torch::Tensor& src_tensor,
+                        const BlockInfo&     dst_block,
+                        size_t               dst_byte_offset,
+                        size_t               copy_bytes) -> bool {
+        const size_t dst_bytes = dst_block.size_bytes;
+        if (dst_bytes < dst_byte_offset + copy_bytes) {
+            RTP_LLM_LOG_ERROR("dst block bytes[%zu] < dst_offset[%zu] + copy bytes[%zu] in writeKVBlockForTest(layer=%d)",
+                              dst_bytes,
+                              dst_byte_offset,
+                              copy_bytes,
+                              layer_id);
+            return false;
+        }
+
+        auto* dst_ptr    = static_cast<char*>(dst_block.addr) + dst_byte_offset;
+        auto  dst_device = dst_block.is_cuda ? torch::kCUDA : torch::kCPU;
+        auto  src_device = src_tensor.is_cuda() ? torch::kCUDA : torch::kCPU;
+        auto  dst_t      = torch::from_blob(
+            dst_ptr, {(int64_t)copy_bytes}, torch::TensorOptions().dtype(torch::kUInt8).device(dst_device));
+        auto src_t = torch::from_blob(src_tensor.data_ptr(),
+                                      {(int64_t)copy_bytes},
+                                      torch::TensorOptions().dtype(torch::kUInt8).device(src_device));
+        dst_t.copy_(src_t);
+        return true;
+    };
+
+    if (!copyFunc(k_buffer, dst[0], 0, expected_k_bytes)) {
+        return false;
+    }
+
+    if (!copyFunc(v_buffer, dst[0], expected_k_bytes, expected_v_bytes)) {
+        return false;
+    }
+
+    cudaSyncAndCheck();
+    return true;
+}
+
+bool KVCacheManager::writeKVBlockForTest(int block_index, const torch::Tensor& k_buffer, const torch::Tensor& v_buffer) {
+    if (block_index < 0 || block_index >= config_.block_num) {
+        RTP_LLM_LOG_WARNING("Invalid block_index: %d, valid range: [0, %d)", block_index, config_.block_num);
+        return false;
+    }
+
+    bool all_success = true;
+    for (int layer_id = 0; layer_id < config_.layer_num; ++layer_id) {
+        all_success = writeKVBlockForTest(block_index, layer_id, k_buffer, v_buffer) && all_success;
+    }
+    return all_success;
 }
 
 }  // namespace rtp_llm
diff --git a/rtp_llm/cpp/cache/KVCacheManager.h b/rtp_llm/cpp/cache/KVCacheManager.h
index 7d84ed9f7f..69bc543ae2 100644
--- a/rtp_llm/cpp/cache/KVCacheManager.h
+++ b/rtp_llm/cpp/cache/KVCacheManager.h
@@ -3,6 +3,8 @@
 #include <atomic>
 #include <cassert>
 #include <functional>
+#include <mutex>
+#include <string>
 #include <thread>
 #include <vector>
 
@@ -10,7 +12,7 @@
 #include "rtp_llm/cpp/cache/BufferTypes.h"
 #include "rtp_llm/cpp/cache/CacheConfig.h"
 #include "rtp_llm/cpp/cache/connector/AsyncContext.h"
-#include "rtp_llm/cpp/cache/KVCacheAllocator.h"
+#include "rtp_llm/cpp/cache/allocator/KVCacheAllocator.h"
 #include "rtp_llm/cpp/config/ConfigModules.h"
 #include "rtp_llm/cpp/cache/connector/KVCacheConnector.h"
 #include "rtp_llm/cpp/model_rpc/proto/model_rpc_service.grpc.pb.h"
@@ -25,14 +27,15 @@ class KVCacheConnectorReadWriteContext;
 class KVCacheManager {
 public:
     KVCacheManager(const CacheConfig&                 config,
-                   bool                               warmup             = false,
-                   const kmonitor::MetricsReporterPtr metrics_reporter   = nullptr,
-                   const KVCacheConfig&               kv_cache_config    = KVCacheConfig{},
-                   const ParallelismConfig&           parallelism_config = ParallelismConfig{},
-                   const RuntimeConfig&               runtime_config     = RuntimeConfig{},
-                   const SpeculativeExecutionConfig&  sp_config          = SpeculativeExecutionConfig{},
-                   const PDSepConfig&                 pd_sep_config      = PDSepConfig{},
-                   const CacheStoreConfig&            cache_store_config = CacheStoreConfig{});
+                   bool                               warmup                     = false,
+                   const kmonitor::MetricsReporterPtr metrics_reporter           = nullptr,
+                   const KVCacheConfig&               kv_cache_config            = KVCacheConfig{},
+                   const ParallelismConfig&           parallelism_config         = ParallelismConfig{},
+                   const RuntimeConfig&               runtime_config             = RuntimeConfig{},
+                   const SpeculativeExecutionConfig&  sp_config                  = SpeculativeExecutionConfig{},
+                   const PDSepConfig&                 pd_sep_config              = PDSepConfig{},
+                   const CacheStoreConfig&            cache_store_config         = CacheStoreConfig{},
+                   bool                               use_cuda_malloc_block_pool = false);
     ~KVCacheManager();
 
     // 初始化和配置相关
@@ -60,16 +63,19 @@ class KVCacheManager {
                        bool                           copy_last_block,
                        std::vector<BlockIdPair>&      block_update_mapping);
 
-    // Write one KV block (optionally per-layer) from host/device tensors for test
-    virtual bool
-    setKVBlockValue(int block_index, int layer_id, const torch::Tensor& k_buffer, const torch::Tensor& v_buffer);
-    virtual bool setKVBlockValue(int block_index, const torch::Tensor& k_buffer, const torch::Tensor& v_buffer);
-
     // 地址转换和缓冲区访问
     BlockAddrInfo          convertIndexToAddr(int block_index, int layer_id) const;
     std::vector<BlockInfo> convertIndexToBuffer(int block_index, int layer_id) const;
     std::vector<BlockInfo>
-    convertIndexToBuffer(int block_index, int layer_id, int partition_count, int partition_id) const;
+                  convertIndexToBuffer(int block_index, int layer_id, int partition_count, int partition_id) const;
+    BlockAddrInfo          convertIndexToAddr(int block_index, int layer_id, int group_id) const;
+    std::vector<BlockInfo> convertIndexToBuffer(int block_index, int layer_id, int group_id) const;
+    std::vector<BlockInfo>
+    convertIndexToBuffer(int block_index, int layer_id, int group_id, int partition_count, int partition_id) const;
+    BlockAddrInfo          convertIndexToAddrByTag(int block_index, int layer_id, const std::string& tag) const;
+    std::vector<BlockInfo> convertIndexToBufferByTag(int block_index, int layer_id, const std::string& tag) const;
+    std::vector<BlockInfo> convertIndexToBufferByTag(
+        int block_index, int layer_id, const std::string& tag, int partition_count, int partition_id) const;
 
     CacheLayerLayout allLayerCacheBase() const;
 
@@ -124,6 +130,18 @@ class KVCacheManager {
     std::shared_ptr<KVCacheResource>
     incrKVCacheRef(const KVCacheResource& resource, const CacheKeysType& cache_keys, bool is_connector = true);
 
+    // CP page-level RR sharding context. Returns nullptr when sharding is not active
+    // (single-rank or kv_cache_sharded=false).  Used by connector / cache_store to
+    // remap cacheKeys -> last-rank-key namespace.
+    std::shared_ptr<CPSlotMapper> cpSlotMapper() const {
+        return cp_slot_mapper_;
+    }
+
+    // Write one KV block (optionally per-layer) from host/device tensors for test
+    virtual bool
+    writeKVBlockForTest(int block_index, int layer_id, const torch::Tensor& k_buffer, const torch::Tensor& v_buffer);
+    virtual bool writeKVBlockForTest(int block_index, const torch::Tensor& k_buffer, const torch::Tensor& v_buffer);
+
 private:
     void initConnectorCoordinator();
     void allocateAndSync();
@@ -140,6 +158,9 @@ class KVCacheManager {
     const SpeculativeExecutionConfig   sp_config_;
     const PDSepConfig                  pd_sep_config_;
     const CacheStoreConfig             cache_store_config_;
+    const bool                         use_cuda_malloc_block_pool_;
+
+    std::shared_ptr<CPSlotMapper> cp_slot_mapper_;
 
     std::atomic<bool> stop_{false};
     std::thread       metrics_reporter_thread_;
@@ -150,4 +171,4 @@ class KVCacheManager {
     std::shared_ptr<CacheStore> cache_store_;
 };
 
-}  // namespace rtp_llm
\ No newline at end of file
+}  // namespace rtp_llm
diff --git a/rtp_llm/cpp/cache/KVCacheResource.cc b/rtp_llm/cpp/cache/KVCacheResource.cc
index fda4bbbcb3..e7b72b36b6 100644
--- a/rtp_llm/cpp/cache/KVCacheResource.cc
+++ b/rtp_llm/cpp/cache/KVCacheResource.cc
@@ -1,5 +1,8 @@
 #include "rtp_llm/cpp/cache/KVCacheResource.h"
 
+#include <algorithm>
+
+
 namespace rtp_llm {
 
 size_t BlockIds::blocksNum() const {
@@ -44,8 +47,18 @@ void BlockIds::remove(const std::vector<size_t>& indices) {
 }
 
 void BlockIds::swap(size_t pos_a, size_t pos_b) {
-    RTP_LLM_CHECK(pos_a < block_indices.size());
-    RTP_LLM_CHECK(pos_b < block_indices.size());
+    if (pos_a >= block_indices.size() || pos_b >= block_indices.size()) {
+        RTP_LLM_LOG_ERROR("BlockIds::swap: pos_a=%d or pos_b=%d is out of range, block_indices.size()=%d",
+                          pos_a,
+                          pos_b,
+                          block_indices.size());
+        RTP_LLM_CHECK_WITH_INFO(false,
+                                "BlockIds::swap: pos_a=%d or pos_b=%d is out of range, block_indices.size()=%d",
+                                pos_a,
+                                pos_b,
+                                block_indices.size());
+    }
+
     if (pos_a == pos_b) {
         return;
     }
@@ -109,36 +122,54 @@ void BlockIds::syncKernelBlocks() {
     }
 }
 
-void KVCacheResource::initGroups(int                                group_num,
-                                 int                                layer_num,
-                                 const std::vector<int>&            layer_to_group_id,
-                                 size_t                             kernel_blocks_per_kv_block,
-                                 const std::vector<CacheGroupType>& group_types) {
+void KVCacheResource::initGroups(int                                  group_num,
+                                 int                                  layer_num,
+                                 const std::vector<std::vector<int>>& layer_group_ids,
+                                 size_t                               kernel_blocks_per_kv_block,
+                                 const std::vector<CacheGroupType>&   group_types) {
     group_block_ids.clear();
     layer_block_ids.clear();
+    layer_group_block_ids.clear();
+
+    if (!group_types.empty()) {
+        RTP_LLM_CHECK_WITH_INFO(group_types.size() >= static_cast<size_t>(group_num),
+                                "KVCacheResource::initGroups: group_types size %zu < group_num %d",
+                                group_types.size(),
+                                group_num);
+    }
 
     group_block_ids.reserve(static_cast<size_t>(group_num));
     for (int i = 0; i < group_num; i++) {
-        const bool   is_full = group_types.empty() || group_types[static_cast<size_t>(i)] == CacheGroupType::FULL;
-        const size_t group_kernel_blocks_per_kv_block = is_full ? kernel_blocks_per_kv_block : 1;
-        auto         bid                              = std::make_shared<BlockIds>(group_kernel_blocks_per_kv_block);
+        const bool   is_full_group = group_types.empty() || group_types[static_cast<size_t>(i)] == CacheGroupType::FULL;
+        const size_t bpk           = is_full_group ? std::max<size_t>(1, kernel_blocks_per_kv_block) : 1;
+        auto         bid           = std::make_shared<BlockIds>(bpk);
         group_block_ids.push_back(std::move(bid));
     }
 
     if (!group_block_ids.empty()) {
-        RTP_LLM_CHECK_WITH_INFO(layer_to_group_id.empty() || layer_to_group_id.size() >= static_cast<size_t>(layer_num),
-                                "KVCacheResource::initGroups: layer_to_group_id size %zu < layer_num %d",
-                                layer_to_group_id.size(),
+        RTP_LLM_CHECK_WITH_INFO(layer_group_ids.size() >= static_cast<size_t>(layer_num),
+                                "KVCacheResource::initGroups: layer_group_ids size %zu < layer_num %d",
+                                layer_group_ids.size(),
                                 layer_num);
-        layer_block_ids.resize(layer_num);
-        for (int i = 0; i < layer_num; ++i) {
-            int gid = layer_to_group_id.empty() ? 0 : layer_to_group_id[i];
-            RTP_LLM_CHECK_WITH_INFO(gid >= 0 && gid < group_num,
-                                    "KVCacheResource::initGroups: invalid group id %d for layer %d (group_num=%d)",
-                                    gid,
-                                    i,
-                                    group_num);
-            layer_block_ids[i] = group_block_ids[gid];
+        layer_block_ids.resize(static_cast<size_t>(layer_num));
+        layer_group_block_ids.resize(static_cast<size_t>(layer_num));
+        for (int layer = 0; layer < layer_num; ++layer) {
+            auto& group_blocks = layer_group_block_ids[static_cast<size_t>(layer)];
+            group_blocks.assign(static_cast<size_t>(group_num), nullptr);
+
+            const auto& gids = layer_group_ids[static_cast<size_t>(layer)];
+            for (int gid : gids) {
+                RTP_LLM_CHECK_WITH_INFO(
+                    gid >= 0 && gid < group_num,
+                    "KVCacheResource::initGroups: invalid group id %d for layer %d (group_num=%d)",
+                    gid,
+                    layer,
+                    group_num);
+                group_blocks[static_cast<size_t>(gid)] = group_block_ids[static_cast<size_t>(gid)];
+            }
+            if (gids.size() == 1) {
+                layer_block_ids[static_cast<size_t>(layer)] = group_block_ids[static_cast<size_t>(gids.front())];
+            }
         }
     }
 }
@@ -159,16 +190,33 @@ const BlockIndicesType& KVCacheResource::blocks(int group_id) const {
     return group_block_ids[group_id]->blocks();
 }
 
+const BlockIndicesType& KVCacheResource::blocks(int layer_id, int group_id) const {
+    return mutableBlockIds(layer_id, group_id).blocks();
+}
+
 const BlockIndicesType& KVCacheResource::kernelBlocks(int group_id) const {
     RTP_LLM_CHECK(group_block_ids.size() > static_cast<size_t>(group_id));
     return group_block_ids[group_id]->kernelBlocks();
 }
 
+const BlockIndicesType& KVCacheResource::kernelBlocks(int layer_id, int group_id) const {
+    return mutableBlockIds(layer_id, group_id).kernelBlocks();
+}
+
 BlockIds& KVCacheResource::mutableBlockIds(int group_id) const {
     RTP_LLM_CHECK(group_block_ids.size() > static_cast<size_t>(group_id));
     return *group_block_ids[group_id];
 }
 
+BlockIds& KVCacheResource::mutableBlockIds(int layer_id, int group_id) const {
+    RTP_LLM_CHECK(static_cast<size_t>(layer_id) < layer_group_block_ids.size());
+    RTP_LLM_CHECK(static_cast<size_t>(group_id) < layer_group_block_ids[static_cast<size_t>(layer_id)].size());
+    auto block_ids = layer_group_block_ids[static_cast<size_t>(layer_id)][static_cast<size_t>(group_id)];
+    RTP_LLM_CHECK_WITH_INFO(
+        block_ids != nullptr, "KVCacheResource: missing block ids for layer %d group_id %d", layer_id, group_id);
+    return *block_ids;
+}
+
 int KVCacheResource::groupNums() const {
     return static_cast<int>(group_block_ids.size());
 }
@@ -185,6 +233,19 @@ const LayerBlockIds& KVCacheResource::layerBlocks() const {
     return layer_block_ids;
 }
 
+const LayerAttnBlockIds& KVCacheResource::layerGroupBlocks() const {
+    return layer_group_block_ids;
+}
+
+int KVCacheResource::groupId(int layer_id, int group_id) const {
+    RTP_LLM_CHECK(static_cast<size_t>(layer_id) < layer_group_block_ids.size());
+    if (group_id < 0 || static_cast<size_t>(group_id) >= layer_group_block_ids[static_cast<size_t>(layer_id)].size()
+        || !layer_group_block_ids[static_cast<size_t>(layer_id)][static_cast<size_t>(group_id)]) {
+        return -1;
+    }
+    return group_id;
+}
+
 CacheKeysType& KVCacheResource::cacheKeys() {
     return cache_keys;
 }
@@ -193,6 +254,62 @@ const CacheKeysType& KVCacheResource::cacheKeys() const {
     return cache_keys;
 }
 
+void KVCacheResource::setCacheKeys(const CacheKeysType& keys) {
+    cache_keys = keys;
+    cache_keys_are_cp_canonical_ = false;
+    rebuildLinearBlockDependencies();
+}
+
+void KVCacheResource::setCacheKeys(CacheKeysType&& keys) {
+    cache_keys = std::move(keys);
+    cache_keys_are_cp_canonical_ = false;
+    rebuildLinearBlockDependencies();
+}
+
+bool KVCacheResource::cacheKeysAreCpCanonical() const {
+    return cache_keys_are_cp_canonical_;
+}
+
+void KVCacheResource::setCacheKeysAreCpCanonical(bool cache_keys_are_cp_canonical) {
+    cache_keys_are_cp_canonical_ = cache_keys_are_cp_canonical;
+}
+
+BlockDependenciesType& KVCacheResource::blockDependencies() {
+    return block_dependencies;
+}
+
+const BlockDependenciesType& KVCacheResource::blockDependencies() const {
+    return block_dependencies;
+}
+
+void KVCacheResource::setBlockDependencies(const BlockDependenciesType& dependencies) {
+    block_dependencies = dependencies;
+}
+
+void KVCacheResource::setBlockDependencies(BlockDependenciesType&& dependencies) {
+    block_dependencies = std::move(dependencies);
+}
+
+void KVCacheResource::rebuildLinearBlockDependencies() {
+    block_dependencies.clear();
+    block_dependencies.reserve(cache_keys.size());
+    for (size_t i = 0; i < cache_keys.size(); ++i) {
+        BlockDependency dependency;
+        dependency.ordinal = static_cast<uint32_t>(i);
+        if (i > 0) {
+            dependency.has_parent = true;
+            dependency.parent_key = cache_keys[i - 1];
+        }
+        block_dependencies.push_back(dependency);
+    }
+}
+
+void KVCacheResource::ensureLinearBlockDependencies() {
+    if (block_dependencies.size() != cache_keys.size()) {
+        rebuildLinearBlockDependencies();
+    }
+}
+
 size_t KVCacheResource::reuseBlockNum() const {
     return device_reuse_block_num_ + memory_reuse_block_num_ + remote_reuse_block_num_;
 }
diff --git a/rtp_llm/cpp/cache/KVCacheResource.h b/rtp_llm/cpp/cache/KVCacheResource.h
index a1ebe5219a..7de9c6c244 100644
--- a/rtp_llm/cpp/cache/KVCacheResource.h
+++ b/rtp_llm/cpp/cache/KVCacheResource.h
@@ -6,7 +6,7 @@
 #include <string>
 #include <vector>
 
-#include "rtp_llm/cpp/cache/CacheGroupType.h"
+#include "rtp_llm/cpp/cache/spec/CacheGroupType.h"
 #include "rtp_llm/cpp/utils/AssertUtils.h"
 
 namespace rtp_llm {
@@ -23,6 +23,14 @@ inline bool isNullBlockIdx(BlockIdxType block_idx) {
 using CacheKeysType    = std::vector<CacheKeyType>;
 using BlockIndicesType = std::vector<BlockIdxType>;
 
+struct BlockDependency {
+    bool         has_parent{false};
+    CacheKeyType parent_key{0};
+    uint32_t     ordinal{0};
+};
+
+using BlockDependenciesType = std::vector<BlockDependency>;
+
 class BlockIds {
 public:
     explicit BlockIds(size_t kernel_blocks_per_kv_block = 1):
@@ -67,32 +75,64 @@ class BlockIds {
     size_t           kernel_blocks_per_kv_block_ = 1;
 };
 
-using GroupBlockIds = std::vector<std::shared_ptr<BlockIds>>;
-using LayerBlockIds = std::vector<std::shared_ptr<BlockIds>>;
+using GroupBlockIds     = std::vector<std::shared_ptr<BlockIds>>;
+using LayerBlockIds     = std::vector<std::shared_ptr<BlockIds>>;
+using LayerAttnBlockIds = std::vector<std::vector<std::shared_ptr<BlockIds>>>;
 
 class KVCacheResource {
 public:
-    void initGroups(int                                group_num,
-                    int                                layer_num,
-                    const std::vector<int>&            layer_to_group_id          = {},
-                    size_t                             kernel_blocks_per_kv_block = 1,
-                    const std::vector<CacheGroupType>& group_types                = {});
+    void initGroups(int                                  group_num,
+                    int                                  layer_num,
+                    const std::vector<std::vector<int>>& layer_group_ids            = {},
+                    size_t                               kernel_blocks_per_kv_block = 1,
+                    const std::vector<CacheGroupType>&   group_types                = {});
     void resizeBlocks(int reserver_blocks, int value = 0);
 
     int                     blocksNum(int group_id = 0) const;
     const BlockIndicesType& blocks(int group_id = 0) const;
+    const BlockIndicesType& blocks(int layer_id, int group_id) const;
     const BlockIndicesType& kernelBlocks(int group_id = 0) const;
+    const BlockIndicesType& kernelBlocks(int layer_id, int group_id) const;
     BlockIds&               mutableBlockIds(int group_id = 0) const;
+    BlockIds&               mutableBlockIds(int layer_id, int group_id) const;
 
     int groupNums() const;
 
     GroupBlockIds&       groupBlocks();
     const GroupBlockIds& groupBlocks() const;
 
-    const LayerBlockIds& layerBlocks() const;
+    const LayerBlockIds&     layerBlocks() const;
+    const LayerAttnBlockIds& layerGroupBlocks() const;
+    int                      groupId(int layer_id, int group_id) const;
 
     CacheKeysType&       cacheKeys();
     const CacheKeysType& cacheKeys() const;
+    void                 setCacheKeys(const CacheKeysType& keys);
+    void                 setCacheKeys(CacheKeysType&& keys);
+    bool                 cacheKeysAreCpCanonical() const;
+    void                 setCacheKeysAreCpCanonical(bool cache_keys_are_cp_canonical);
+
+    BlockDependenciesType&       blockDependencies();
+    const BlockDependenciesType& blockDependencies() const;
+    void                         setBlockDependencies(const BlockDependenciesType& dependencies);
+    void                         setBlockDependencies(BlockDependenciesType&& dependencies);
+    void                         rebuildLinearBlockDependencies();
+    void                         ensureLinearBlockDependencies();
+
+    // Return rank-local cache keys: every cp_size-th key starting from cp_rank.
+    // localCacheKeys(r, s)[i] == cacheKeys()[i * s + r]
+    // Note: when cacheKeys().size() % cp_size != 0 (e.g. 1 real block, cp_size=2),
+    // localCacheKeys may return fewer entries than blocks().size().  This is
+    // intentional — padding blocks carry no real data and must NOT participate in
+    // device cache insert, PD transfer, or connector operations.  Downstream code
+    // (e.g. insertIntoCache) already uses min(keys, blocks) to handle this.
+    CacheKeysType localCacheKeys(int cp_rank, int cp_size) const {
+        CacheKeysType local;
+        for (int i = cp_rank; i < static_cast<int>(cache_keys.size()); i += cp_size) {
+            local.push_back(cache_keys[i]);
+        }
+        return local;
+    }
 
     size_t reuseBlockNum() const;
 
@@ -108,9 +148,6 @@ class KVCacheResource {
     bool lastBlockAligned() const;
     void setLastBlockAligned(bool last_block_aligned);
 
-    size_t remoteReuseBlocksNum() const;
-    void   setRemoteReuseBlocksNum(size_t remote_reuse_blocks_num);
-
     void swapBlocks(size_t group_id, size_t rhs, size_t lhs);
 
     std::string debugString() const;
@@ -118,9 +155,13 @@ class KVCacheResource {
 private:
     // layer_id -> block_indices
     LayerBlockIds layer_block_ids;
+    // layer_id -> group_id -> block_indices
+    LayerAttnBlockIds layer_group_block_ids;
     // group_id -> block_indices
     GroupBlockIds group_block_ids;
     CacheKeysType cache_keys;
+    BlockDependenciesType block_dependencies;
+    bool cache_keys_are_cp_canonical_{false};
 
     size_t device_reuse_block_num_{0};
     size_t memory_reuse_block_num_{0};
diff --git a/rtp_llm/cpp/cache/KVCacheSpecBase.h b/rtp_llm/cpp/cache/KVCacheSpecBase.h
deleted file mode 100644
index d5d192ce1f..0000000000
--- a/rtp_llm/cpp/cache/KVCacheSpecBase.h
+++ /dev/null
@@ -1,86 +0,0 @@
-#pragma once
-
-#include <memory>
-#include <sstream>
-#include <string>
-
-#include "rtp_llm/cpp/cache/Types.h"
-#include "rtp_llm/cpp/config/ConfigModules.h"
-#include "rtp_llm/cpp/utils/AssertUtils.h"
-#include "rtp_llm/models_py/bindings/core/Types.h"
-#include "rtp_llm/cpp/model_utils/AttentionConfig.h"
-
-namespace rtp_llm {
-
-enum KVCacheSpecType {
-    MultiHeadAttention,
-    MultiHeadLatentAttention,
-    LinearAttention,
-};
-
-inline const char* KVCacheSpecTypeToString(KVCacheSpecType t) {
-    switch (t) {
-        case KVCacheSpecType::MultiHeadAttention:
-            return "MultiHeadAttention";
-        case KVCacheSpecType::MultiHeadLatentAttention:
-            return "MultiHeadLatentAttention";
-        case KVCacheSpecType::LinearAttention:
-            return "LinearAttention";
-        default:
-            return "Unknown";
-    }
-}
-
-struct KVCacheSpec {
-    uint32_t layer_num;
-    uint32_t local_head_num_kv;
-    uint32_t seq_size_per_block = 1;
-
-    KVCacheSpecType   type;
-    rtp_llm::DataType dtype;
-
-    virtual size_t block_size() const   = 0;
-    virtual size_t k_block_size() const = 0;
-    virtual size_t v_block_size() const = 0;
-
-    virtual size_t block_size_bytes() const   = 0;
-    virtual size_t k_block_size_bytes() const = 0;
-    virtual size_t v_block_size_bytes() const = 0;
-
-    virtual size_t scale_block_size_bytes() const {
-        return 0;
-    }
-    virtual size_t k_scale_block_size_bytes() const {
-        return 0;
-    }
-    virtual size_t v_scale_block_size_bytes() const {
-        return 0;
-    }
-
-    virtual std::string debugString(size_t indent = 0) const = 0;
-
-protected:
-    // Helper method to generate common parts of debug string
-    std::string commonDebugString(size_t indent = 0) const {
-        const std::string indent_str = std::string(indent, ' ');
-        const std::string indent1    = indent_str + "  ";
-
-        std::ostringstream os;
-        os << indent1 << "type=" << KVCacheSpecTypeToString(type) << "(" << static_cast<int>(type) << ")\n";
-        os << indent1 << "dtype=" << static_cast<int>(dtype) << "\n";
-        os << indent1 << "layer_num=" << layer_num << "\n";
-        os << indent1 << "local_head_num_kv=" << local_head_num_kv << "\n";
-        os << indent1 << "seq_size_per_block=" << seq_size_per_block << "\n";
-        os << indent1 << "block_size=" << block_size() << "\n";
-        os << indent1 << "k_block_size=" << k_block_size() << "\n";
-        os << indent1 << "v_block_size=" << v_block_size() << "\n";
-        os << indent1 << "block_size_bytes=" << block_size_bytes() << "\n";
-        os << indent1 << "k_block_size_bytes=" << k_block_size_bytes() << "\n";
-        os << indent1 << "v_block_size_bytes=" << v_block_size_bytes() << "\n";
-        return os.str();
-    }
-};
-
-typedef std::shared_ptr<KVCacheSpec> KVCacheSpecPtr;
-
-}  // namespace rtp_llm
\ No newline at end of file
diff --git a/rtp_llm/cpp/cache/KVCacheTransferPlanner.cc b/rtp_llm/cpp/cache/KVCacheTransferPlanner.cc
new file mode 100644
index 0000000000..b6ce2d895a
--- /dev/null
+++ b/rtp_llm/cpp/cache/KVCacheTransferPlanner.cc
@@ -0,0 +1,121 @@
+#include "rtp_llm/cpp/cache/KVCacheTransferPlanner.h"
+
+#include <algorithm>
+
+namespace rtp_llm {
+
+std::vector<size_t> blockPositionsForCacheTransfer(size_t         block_num,
+                                                   size_t         reuse_block_size,
+                                                   bool           use_hybrid,
+                                                   CacheGroupType group_type,
+                                                   bool           hybrid_full_from_begin) {
+    return blockPositionsForCacheTransfer(
+        block_num,
+        reuse_block_size,
+        use_hybrid,
+        /*transfer_tail_blocks=*/group_type != CacheGroupType::FULL,
+        static_cast<size_t>(defaultCacheGroupPolicy(group_type).active_tail_blocks),
+        hybrid_full_from_begin);
+}
+
+std::vector<size_t> blockPositionsForCacheTransfer(size_t block_num,
+                                                   size_t reuse_block_size,
+                                                   bool   use_hybrid,
+                                                   bool   transfer_tail_blocks,
+                                                   size_t tail_block_count,
+                                                   bool   hybrid_full_from_begin) {
+    std::vector<size_t> block_pos_list;
+    block_pos_list.reserve(block_num);
+    if (use_hybrid && block_num > 0 && transfer_tail_blocks) {
+        const size_t tail_count = std::max<size_t>(1, tail_block_count);
+        const size_t start      = block_num > tail_count ? block_num - tail_count : 0;
+        for (size_t block_pos = start; block_pos < block_num; ++block_pos) {
+            block_pos_list.push_back(block_pos);
+        }
+        return block_pos_list;
+    }
+    const size_t start = use_hybrid && hybrid_full_from_begin ? 0 : reuse_block_size;
+    for (size_t block_pos = start; block_pos < block_num; ++block_pos) {
+        block_pos_list.push_back(block_pos);
+    }
+    return block_pos_list;
+}
+
+std::vector<CacheStoreBlockPair> buildCacheStoreBlockPlan(size_t         total_logical_blocks,
+                                                          size_t         reuse_block_size,
+                                                          bool           use_hybrid,
+                                                          CacheGroupType group_type,
+                                                          int            cp_rank,
+                                                          int            cp_size) {
+    const auto policy = defaultCacheGroupPolicy(group_type);
+    return buildCacheStoreBlockPlan(total_logical_blocks,
+                                    reuse_block_size,
+                                    use_hybrid,
+                                    /*cp_shardable=*/group_type == CacheGroupType::FULL,
+                                    /*cp_compact_tail_blocks=*/group_type == CacheGroupType::SWA,
+                                    static_cast<size_t>(policy.active_tail_blocks),
+                                    cp_rank,
+                                    cp_size);
+}
+
+std::vector<CacheStoreBlockPair> buildCacheStoreBlockPlan(size_t                      total_logical_blocks,
+                                                          size_t                      reuse_block_size,
+                                                          bool                        use_hybrid,
+                                                          bool                        cp_shardable,
+                                                          bool                        cp_compact_tail_blocks,
+                                                          size_t                      tail_block_count,
+                                                          int                         cp_rank,
+                                                          int                         cp_size) {
+    std::vector<CacheStoreBlockPair> plan;
+
+    const bool sharded_full      = (cp_size > 1) && cp_shardable;
+    const bool compact_swa_by_cp = (cp_size > 1) && cp_compact_tail_blocks;
+    if (compact_swa_by_cp) {
+        const size_t cp_size_t        = static_cast<size_t>(cp_size);
+        const size_t canonical_blocks = (total_logical_blocks + cp_size_t - 1) / cp_size_t;
+        const size_t tail_count = std::max<size_t>(1, tail_block_count);
+        const size_t start = use_hybrid ? (canonical_blocks > tail_count ? canonical_blocks - tail_count : 0) :
+                                          std::min(reuse_block_size, canonical_blocks);
+        plan.reserve(canonical_blocks - start);
+        for (size_t compact_idx = start; compact_idx < canonical_blocks; ++compact_idx) {
+            const size_t key_index = std::min((compact_idx + 1) * cp_size_t - 1, total_logical_blocks - 1);
+            plan.push_back({static_cast<int>(key_index), static_cast<int>(compact_idx)});
+        }
+        return plan;
+    }
+
+    auto positions = blockPositionsForCacheTransfer(total_logical_blocks,
+                                                    reuse_block_size,
+                                                    use_hybrid,
+                                                    /*transfer_tail_blocks=*/tail_block_count > 0,
+                                                    tail_block_count,
+                                                    /*hybrid_full_from_begin=*/true);
+
+    plan.reserve(positions.size());
+
+    if (!sharded_full && !compact_swa_by_cp) {
+        for (auto pos : positions) {
+            const int p = static_cast<int>(pos);
+            plan.push_back({p, p});
+        }
+        return plan;
+    }
+    for (auto pos : positions) {
+        const int p = static_cast<int>(pos);
+        if (p % cp_size != cp_rank) {
+            continue;
+        }
+        plan.push_back({p, p / cp_size});
+    }
+    return plan;
+}
+
+std::string layerTagCacheTransferKey(size_t request_id, size_t layer_id, const std::string& tag) {
+    auto key = std::to_string(request_id) + "-" + std::to_string(layer_id);
+    if (!tag.empty() && tag != "default") {
+        key += "-tag-" + tag;
+    }
+    return key;
+}
+
+}  // namespace rtp_llm
diff --git a/rtp_llm/cpp/cache/KVCacheTransferPlanner.h b/rtp_llm/cpp/cache/KVCacheTransferPlanner.h
new file mode 100644
index 0000000000..06afad8687
--- /dev/null
+++ b/rtp_llm/cpp/cache/KVCacheTransferPlanner.h
@@ -0,0 +1,66 @@
+#pragma once
+
+#include <cstddef>
+#include <string>
+#include <vector>
+
+#include "rtp_llm/cpp/cache/spec/CacheGroupType.h"
+
+namespace rtp_llm {
+
+std::vector<size_t> blockPositionsForCacheTransfer(
+    size_t block_num, size_t reuse_block_size, bool use_hybrid, CacheGroupType group_type, bool hybrid_full_from_begin);
+std::vector<size_t> blockPositionsForCacheTransfer(size_t                             block_num,
+                                                   size_t                             reuse_block_size,
+                                                   bool                               use_hybrid,
+                                                   bool                               transfer_tail_blocks,
+                                                   size_t                             tail_block_count,
+                                                   bool                               hybrid_full_from_begin);
+
+std::string layerTagCacheTransferKey(size_t request_id, size_t layer_id, const std::string& tag);
+
+// One iteration step of cache_store registration: pair the cache_key at
+// ``key_index`` (FULL-length namespace) with the kv_cache_offset slot at
+// ``offset_index`` (rank-local namespace). Outside CP-page-RR sharding the
+// two are equal; under sharding they diverge for FULL groups (see below).
+struct CacheStoreBlockPair {
+    int key_index;
+    int offset_index;
+};
+
+// Build the per-prefill-write iteration plan for cache_store registration.
+//
+// Background: ``cache_keys`` is always the FULL logical-block hash sequence
+// (length = total_logical_blocks). ``kv_cache_offset`` is per-group and
+// per-rank: for non-FULL groups every rank holds the full block list (length
+// = total_logical_blocks), for FULL groups under CP-page-RR sharding each
+// rank holds only the 1/cp_size logical blocks it owns, **compactly**, in
+// the order they appear within the rank — i.e. local index ``i`` ↔ logical
+// position ``cp_rank + i*cp_size``.
+//
+// To register the right key with the right buffer the planner emits:
+//   * (pos, pos)                              — non-CP / non-FULL groups
+//   * (cp_rank + i*cp_size, i) for owned i    — CP-sharded FULL groups
+//   * ((i+1)*cp_size-1, i)                    — CP-compact SWA/fixed groups
+//
+// Without this re-pairing the prefill side advertises ``cache_keys[i]``
+// (== key for logical position i) attached to data from logical position
+// ``cp_rank + i*cp_size`` — decode then receives content shifted by
+// ``cp_rank`` slots and produces coherent-but-wrong output (DSV4 PD reuse
+// regression seen 2026-05-12).
+std::vector<CacheStoreBlockPair> buildCacheStoreBlockPlan(size_t         total_logical_blocks,
+                                                          size_t         reuse_block_size,
+                                                          bool           use_hybrid,
+                                                          CacheGroupType group_type,
+                                                          int            cp_rank,
+                                                          int            cp_size);
+std::vector<CacheStoreBlockPair> buildCacheStoreBlockPlan(size_t                      total_logical_blocks,
+                                                          size_t                      reuse_block_size,
+                                                          bool                        use_hybrid,
+                                                          bool                        cp_shardable,
+                                                          bool                        cp_compact_tail_blocks,
+                                                          size_t                      tail_block_count,
+                                                          int                         cp_rank,
+                                                          int                         cp_size);
+
+}  // namespace rtp_llm
diff --git a/rtp_llm/cpp/cache/LinearKVCacheGroup.cc b/rtp_llm/cpp/cache/LinearKVCacheGroup.cc
deleted file mode 100644
index 45e7989279..0000000000
--- a/rtp_llm/cpp/cache/LinearKVCacheGroup.cc
+++ /dev/null
@@ -1,210 +0,0 @@
-#include "rtp_llm/cpp/cache/LinearKVCacheGroup.h"
-
-#include <algorithm>
-#include <unordered_set>
-
-#include "rtp_llm/cpp/utils/Logger.h"
-
-namespace rtp_llm {
-
-void LinearKVCacheGroup::filterValidBlocks(const BlockIndicesType& in, BlockIndicesType& out) const {
-    out.clear();
-    out.reserve(in.size());
-    for (auto b : in) {
-        if (!isNullBlockIdx(b)) {
-            out.push_back(b);
-        }
-    }
-}
-
-int LinearKVCacheGroup::needBlocksNum(int seq_len, int current_blocks, int reserve_step) const {
-    int extra_blocks = reserve_step ? reserve_step - 1 : 0;
-    return std::max((seq_len + seq_size_per_block_ - 1) / seq_size_per_block_ + extra_blocks - current_blocks, 0);
-}
-
-NeedBlocksInfo LinearKVCacheGroup::getNeedBlocks(
-    int common_seq_len, int seq_len, int reserve_step, int reuse_blocks_len, bool reuse_enabled) const {
-    const int reuse_begin = reuse_blocks_len;
-    const int step        = std::max(1, linear_step_);
-
-    // calculate the number of blocks in the range (begin, end]
-    auto count_linear_sparse_range = [&](int begin, int end) -> int {
-        if (end <= begin) {
-            return 0;
-        }
-        if (!reuse_enabled) {
-            // keeps only the tail block
-            return 1;
-        }
-        const int eligible = (end + 1) / step - (begin + 1) / step;
-        const int tail     = ((end + 1) % step == 0) ? 0 : 1;
-        return eligible + tail;
-    };
-
-    NeedBlocksInfo info;
-
-    // common_slots: blocks for common_seq_len (no reserve)
-    const int common_slots = needBlocksNum(common_seq_len, 0);
-    // seq_slots: blocks for seq_len (no reserve)
-    const int seq_slots = needBlocksNum(seq_len, 0);
-    // total_slots = seq_slots + reserve_step
-    const int total_slots = needBlocksNum(seq_len, 0, reserve_step);
-
-    info.common_blocks = count_linear_sparse_range(reuse_begin, common_slots);
-    info.extra_blocks  = count_linear_sparse_range(common_slots, seq_slots);
-    info.extra_blocks += std::max(total_slots - seq_slots, 0);  // for reserve_step
-
-    info.common_blocks = std::max(info.common_blocks, 0);
-    info.extra_blocks  = std::max(info.extra_blocks, 0);
-    return info;
-}
-
-MatchResult LinearKVCacheGroup::matchSingleKey(CacheKeyType cache_key) const {
-    MatchResult result;
-    auto        matched = block_cache_->match(cache_key, group_id_);
-    if (!isNullBlockIdx(matched.matched_index)) {
-        result.block_indices = {matched.matched_index};
-    }
-    return result;
-}
-
-MatchResult LinearKVCacheGroup::match(const CacheKeysType& cache_keys) {
-    return {};
-}
-
-bool LinearKVCacheGroup::malloc(BlockIds& block_ids, int seq_len, bool enable_reuse_cache, int reserve_step) {
-    const int step               = std::max(1, linear_step_);
-    const int current_blocks_len = static_cast<int>(block_ids.blocksNum());
-    const int seq_slots          = needBlocksNum(seq_len, 0, 0);
-    const int new_blocks_len     = needBlocksNum(seq_len, current_blocks_len, reserve_step);
-
-    if (new_blocks_len == 0) {
-        return true;
-    }
-
-    // LinearKVCacheGroup::malloc is responsible for:
-    // 1. allocating blocks for the current sequence length;
-    // 2. free unused blocks to reduce kvcache block usage;
-
-    // Two policies to follow:
-    // 1. Linear Steps: keep N * linear_step blocks if cache reuse enabled;
-    // 2. Allocate Tail Blocks: allocate the last partial block when initialization and keep last 2 block during
-    // decoding;
-
-    int need_alloc_blocks = 0;
-
-    for (int i = current_blocks_len; i < current_blocks_len + new_blocks_len; i++) {
-        const bool is_seq_tail  = (seq_slots > 0) && (i == seq_slots - 1);
-        const bool is_reserve   = (reserve_step > 0) && (i >= seq_slots);
-        const bool step_hit     = (((i + 1) % step) == 0);
-        const bool should_alloc = is_reserve || (enable_reuse_cache ? (step_hit || is_seq_tail) : is_seq_tail);
-        if (should_alloc) {
-            need_alloc_blocks++;
-        }
-    }
-
-    if (need_alloc_blocks > 0) {
-        const auto free_blocks_num = freeBlocksNum();
-        if (free_blocks_num < static_cast<size_t>(need_alloc_blocks)) {
-            if (!ensureFreeBlocks(need_alloc_blocks)) {
-                RTP_LLM_LOG_WARNING("Insufficient free blocks for LinearKVCacheGroup: need %d, have %zu",
-                                    need_alloc_blocks,
-                                    free_blocks_num);
-                return false;
-            }
-        }
-    }
-
-    BlockIndicesType new_ids;
-    new_ids.reserve(static_cast<size_t>(new_blocks_len));
-    for (int i = current_blocks_len; i < current_blocks_len + new_blocks_len; i++) {
-        const bool is_seq_tail  = (seq_slots > 0) && (i == seq_slots - 1);
-        const bool is_reserve   = (reserve_step > 0) && (i >= seq_slots);
-        const bool step_hit     = (((i + 1) % step) == 0);
-        const bool should_alloc = is_reserve || (enable_reuse_cache ? (step_hit || is_seq_tail) : is_seq_tail);
-        if (should_alloc) {
-            auto result = block_pool_->malloc(1);
-            if (result.empty()) {
-                return false;
-            }
-            new_ids.push_back(result[0]);
-        } else {
-            new_ids.push_back(NULL_BLOCK_IDX);
-        }
-    }
-    block_ids.add(new_ids);
-    return true;
-}
-
-void LinearKVCacheGroup::insertIntoCache(const CacheKeysType&    cache_keys,
-                                         const BlockIndicesType& block_indices,
-                                         bool                    is_resident) {
-    if (cache_keys.empty() || block_indices.empty()) {
-        return;
-    }
-    const size_t n = std::min(cache_keys.size(), block_indices.size());
-    for (size_t i = 0; i < n; ++i) {
-        const auto b = block_indices[i];
-        if (isNullBlockIdx(b)) {
-            continue;
-        }
-        BlockCache::CacheItem item;
-        item.cache_key   = cache_keys[i];
-        item.group_id    = group_id_;
-        item.block_index = b;
-        item.is_resident = is_resident;
-        if (block_cache_->put(item)) {
-            block_pool_->blockCacheReference(b);
-        }
-    }
-}
-
-void LinearKVCacheGroup::removeSkippedBlocks(BlockIds& block_ids, bool enable_reuse_cache, int reserve_step) {
-    const auto& block_indices = block_ids.blocks();  // const view for reading current state
-    if (block_indices.empty()) {
-        return;
-    }
-    const int step       = std::max(1, linear_step_);
-    const int block_size = static_cast<int>(block_indices.size());
-
-    BlockIndicesType    blocks_to_free;
-    std::vector<size_t> pos_to_remove;
-    // keep last 2 and every reserve_step
-    for (int i = block_size - 3 - reserve_step; i >= 0; i--) {
-        if (isNullBlockIdx(block_indices[i])) {
-            break;
-        }
-        if (enable_reuse_cache && ((i + 1) % step) == 0) {
-            continue;
-        }
-        blocks_to_free.push_back(block_indices[i]);
-        pos_to_remove.push_back(static_cast<size_t>(i));
-    }
-    if (!blocks_to_free.empty()) {
-        block_pool_->requestFree(blocks_to_free);
-        block_ids.remove(pos_to_remove);  // null-out by position, updates kernel slots incrementally
-    }
-}
-
-void LinearKVCacheGroup::free(const BlockIndicesType& block_indices) {
-    if (block_indices.empty()) {
-        return;
-    }
-    BlockIndicesType valid;
-    filterValidBlocks(block_indices, valid);
-    if (valid.empty()) {
-        return;
-    }
-    block_pool_->requestFree(valid);
-}
-
-void LinearKVCacheGroup::reference(BlockIds& block_ids, const BlockIndicesType& new_block_indices) {
-    block_ids.add(new_block_indices);
-    BlockIndicesType valid;
-    filterValidBlocks(new_block_indices, valid);
-    if (!valid.empty()) {
-        block_pool_->requestReference(valid);
-    }
-}
-
-}  // namespace rtp_llm
diff --git a/rtp_llm/cpp/cache/MemoryLayoutConfig.h b/rtp_llm/cpp/cache/MemoryLayoutConfig.h
index 088493556f..028f4ea09c 100644
--- a/rtp_llm/cpp/cache/MemoryLayoutConfig.h
+++ b/rtp_llm/cpp/cache/MemoryLayoutConfig.h
@@ -37,6 +37,13 @@ struct MemoryLayoutConfig {
     size_t local_head_num_kv  = 0;
     size_t seq_size_per_block = 0;
 
+    // Number of kernel blocks packed inside one BlockPool block.  When > 1,
+    // BlockPool allocates physical blocks (each = bpk × kernel block bytes), but
+    // kernels still address by kernel-block id; MemoryLayoutStrategy reshapes the
+    // KV tensor as (layer, block_num × bpk, kv_block_stride_bytes / bpk) so the
+    // kernel view sees per-kernel-block strides.
+    size_t kernel_blocks_per_kv_block = 1;
+
     bool enable_kv_scale         = false;
     bool enable_hybrid_attention = false;
 
diff --git a/rtp_llm/cpp/cache/MemoryLayoutStrategy.cc b/rtp_llm/cpp/cache/MemoryLayoutStrategy.cc
index 91e555a0e7..8e86cb5c3b 100644
--- a/rtp_llm/cpp/cache/MemoryLayoutStrategy.cc
+++ b/rtp_llm/cpp/cache/MemoryLayoutStrategy.cc
@@ -1,7 +1,7 @@
 #include "rtp_llm/cpp/cache/MemoryLayoutStrategy.h"
 #include "rtp_llm/models_py/bindings/core/torch_utils/TypeConvert.h"
 #include "rtp_llm/cpp/utils/Logger.h"
-#include "rtp_llm/cpp/cache/KVCacheSpec.h"
+#include "rtp_llm/cpp/cache/spec/KVCacheSpec.h"
 
 namespace rtp_llm {
 
@@ -74,10 +74,24 @@ void MemoryLayoutStrategy::processKVTensor(torch::Tensor& kv_cache_tensor) {
                               torch::str(layer_kv_tensors_[layer_id].sizes()).c_str());
         }
     } else {
-        // MHA: [layer_num, block_num, kv_block_stride_elems], per layer 2D
-        torch::Tensor reshaped_tensor = kv_cache_typed.reshape({static_cast<int64_t>(config_.layer_num),
-                                                                static_cast<int64_t>(config_.block_num),
-                                                                static_cast<int64_t>(kv_block_stride_elems)});
+        // MHA: [layer_num, block_num, kv_block_stride_elems], per layer 2D.
+        // When kernel_blocks_per_kv_block > 1 (e.g. DSV4 paged FULL pools with
+        // physical block > 256 tokens), reshape into the kernel-block view —
+        // (layer_num, block_num × bpk, kv_block_stride_elems / bpk) — so that
+        // kernels addressing by kernel-block id see per-kernel-block strides.
+        // The underlying memory is identical, only the shape interpretation
+        // changes; entries_per_block derived from tensor stride stays at the
+        // kernel-block size (e.g. 64 for compress_ratio=4) and FlashMLA's
+        // template instantiation constraint (block_kv == 64) holds.
+        const size_t bpk = std::max<size_t>(1, config_.kernel_blocks_per_kv_block);
+        RTP_LLM_CHECK_WITH_INFO(kv_block_stride_elems % bpk == 0,
+                                "kv_block_stride_elems(%zu) must be divisible by kernel_blocks_per_kv_block(%zu)",
+                                kv_block_stride_elems,
+                                bpk);
+        const int64_t kernel_block_count        = static_cast<int64_t>(config_.block_num) * static_cast<int64_t>(bpk);
+        const int64_t kernel_block_stride_elems = static_cast<int64_t>(kv_block_stride_elems / bpk);
+        torch::Tensor reshaped_tensor           = kv_cache_typed.reshape(
+            {static_cast<int64_t>(config_.layer_num), kernel_block_count, kernel_block_stride_elems});
         clearKVTensor(reshaped_tensor);
         for (uint32_t layer_id = 0; layer_id < config_.layer_num; ++layer_id) {
             layer_kv_tensors_.push_back(reshaped_tensor[layer_id]);
@@ -215,13 +229,29 @@ std::vector<BlockInfo> MemoryLayoutStrategy::createBasicBlockInfo(int layer_id,
 
     checkLayerIdValidity(layer_id);
     auto& layer_tensor = layer_kv_tensors_[layer_id];
-    void* kv_addr      = getBlockPtr(layer_tensor, block_id);
-    auto  kv_info      = makeBlockInfo(layer_tensor, kv_addr, static_cast<size_t>(config_.kv_block_stride_bytes));
+    void* kv_addr      = nullptr;
+    if (config_.kernel_blocks_per_kv_block > 1) {
+        RTP_LLM_CHECK_WITH_INFO(block_id >= 0 && static_cast<size_t>(block_id) < config_.block_num,
+                                "Physical block ID %d out of range (max: %zu)",
+                                block_id,
+                                config_.block_num);
+        kv_addr =
+            static_cast<char*>(layer_tensor.data_ptr()) + static_cast<size_t>(block_id) * config_.kv_block_stride_bytes;
+    } else {
+        kv_addr = getBlockPtr(layer_tensor, block_id);
+    }
+    auto kv_info = makeBlockInfo(layer_tensor, kv_addr, static_cast<size_t>(config_.kv_block_stride_bytes));
 
     if (config_.hasScale()) {
         auto& layer_scale_tensor = layer_kv_scale_tensors_[layer_id];
-        void* kv_scale_addr      = getBlockPtr(layer_scale_tensor, block_id);
-        auto  scale_info =
+        void* kv_scale_addr      = nullptr;
+        if (config_.kernel_blocks_per_kv_block > 1) {
+            kv_scale_addr = static_cast<char*>(layer_scale_tensor.data_ptr())
+                            + static_cast<size_t>(block_id) * config_.kv_scale_stride_bytes;
+        } else {
+            kv_scale_addr = getBlockPtr(layer_scale_tensor, block_id);
+        }
+        auto scale_info =
             makeBlockInfo(layer_scale_tensor, kv_scale_addr, static_cast<size_t>(config_.kv_scale_stride_bytes));
         return {kv_info, scale_info};
     }
@@ -307,4 +337,4 @@ void MemoryLayoutStrategy::checkLayerIdValidity(int layer_id) const {
                             layer_kv_tensors_.size());
 }
 
-}  // namespace rtp_llm
\ No newline at end of file
+}  // namespace rtp_llm
diff --git a/rtp_llm/cpp/cache/SharedBlockCache.cc b/rtp_llm/cpp/cache/SharedBlockCache.cc
new file mode 100644
index 0000000000..537af2f902
--- /dev/null
+++ b/rtp_llm/cpp/cache/SharedBlockCache.cc
@@ -0,0 +1,904 @@
+#include "rtp_llm/cpp/cache/SharedBlockCache.h"
+
+#include <algorithm>
+
+#include "rtp_llm/cpp/utils/Logger.h"
+#include "rtp_llm/cpp/utils/ProfilingScope.h"
+#include "rtp_llm/cpp/utils/TimeUtil.h"
+
+namespace rtp_llm {
+
+void SharedBlockCache::init(int group_num, const std::vector<BlockPoolPtr>& group_pools) {
+    std::lock_guard<std::mutex> lock(mu_);
+    RTP_LLM_CHECK_WITH_INFO(static_cast<int>(group_pools.size()) == group_num,
+                            "group_pools size %zu != group_num %d",
+                            group_pools.size(),
+                            group_num);
+    group_num_   = group_num;
+    group_pools_ = group_pools;
+}
+
+void SharedBlockCache::put(CacheKeyType cache_key, const std::vector<BlockIdxType>& group_slots, bool is_resident) {
+    BlockDependency dependency;
+    put(cache_key, group_slots, is_resident, kDefaultNamespace, dependency);
+}
+
+void SharedBlockCache::put(CacheKeyType                     cache_key,
+                           const std::vector<BlockIdxType>& group_slots,
+                           bool                             is_resident,
+                           NamespaceId                      namespace_id,
+                           const BlockDependency&           dependency,
+                           const std::vector<bool>&         matchable_slots) {
+    RTP_LLM_PROFILE_FUNCTION();
+    std::lock_guard<std::mutex> lock(mu_);
+
+    if (lru_cache_.contains(cache_key)) {
+        auto [success, existing_item] = lru_cache_.get(cache_key);
+        if (success) {
+            const auto now_us   = currentTimeUs();
+            const bool resident = existing_item.is_resident || is_resident;
+            if (resident != existing_item.is_resident) {
+                existing_item.is_resident = resident;
+            }
+            const bool dependency_updated = updateItemDependencyLocked(existing_item, namespace_id, dependency);
+            bool updated = false;
+            for (size_t gid = 0; gid < group_slots.size(); ++gid) {
+                if (isNullBlockIdx(group_slots[gid])) {
+                    continue;
+                }
+                if (gid >= existing_item.slots.size()) {
+                    existing_item.slots.resize(gid + 1, NULL_BLOCK_IDX);
+                }
+                if (gid >= existing_item.matchable_slots.size()) {
+                    existing_item.matchable_slots.resize(gid + 1, true);
+                }
+                if (gid >= existing_item.slot_created_time_us.size()) {
+                    existing_item.slot_created_time_us.resize(gid + 1, 0);
+                }
+                if (isNullBlockIdx(existing_item.slots[gid])) {
+                    existing_item.slots[gid] = group_slots[gid];
+                    existing_item.slot_created_time_us[gid] = now_us;
+                    existing_item.matchable_slots[gid] =
+                        matchable_slots.empty() || gid >= matchable_slots.size() ? true : matchable_slots[gid];
+                    updated                  = true;
+                    if (static_cast<int>(gid) < group_num_) {
+                        group_pools_[gid]->blockCacheReference(group_slots[gid]);
+                    }
+                } else if (!matchable_slots.empty() && gid < matchable_slots.size() && matchable_slots[gid]
+                           && !existing_item.matchable_slots[gid]) {
+                    existing_item.matchable_slots[gid] = true;
+                    updated                            = true;
+                }
+            }
+            if (updated || existing_item.is_resident || dependency_updated) {
+                lru_cache_.put(cache_key, existing_item);
+                ++version_;
+            }
+            if (existing_item.is_resident) {
+                markAllTreeAliasesResidentLocked(cache_key);
+            }
+            upsertTreeNodeLocked(cache_key, namespace_id, dependency, existing_item.is_resident);
+            refreshAllTreeAliasesLocked(cache_key);
+        }
+        return;
+    }
+
+    UnifiedCacheItem item;
+    const auto       now_us = currentTimeUs();
+    item.cache_key          = cache_key;
+    item.is_resident        = is_resident;
+    item.slots              = group_slots;
+    item.created_time_us    = now_us;
+    item.matchable_slots.resize(group_slots.size(), true);
+    item.slot_created_time_us.resize(group_slots.size(), 0);
+    for (size_t gid = 0; gid < group_slots.size() && gid < matchable_slots.size(); ++gid) {
+        item.matchable_slots[gid] = matchable_slots[gid];
+    }
+    for (size_t gid = 0; gid < group_slots.size(); ++gid) {
+        if (!isNullBlockIdx(group_slots[gid])) {
+            item.slot_created_time_us[gid] = now_us;
+        }
+    }
+    updateItemDependencyLocked(item, namespace_id, dependency);
+
+    lru_cache_.put(cache_key, item);
+    ++version_;
+    upsertTreeNodeLocked(cache_key, namespace_id, dependency, item.is_resident);
+    refreshAllTreeAliasesLocked(cache_key);
+
+    for (int gid = 0; gid < static_cast<int>(group_slots.size()) && gid < group_num_; ++gid) {
+        if (!isNullBlockIdx(group_slots[gid])) {
+            group_pools_[gid]->blockCacheReference(group_slots[gid]);
+        }
+    }
+}
+
+SharedBlockCache::MatchResult SharedBlockCache::match(CacheKeyType cache_key) {
+    RTP_LLM_PROFILE_FUNCTION();
+    std::lock_guard<std::mutex> lock(mu_);
+
+    auto [success, item] = lru_cache_.get(cache_key);
+    if (!success) {
+        return {false, {}};
+    }
+    touchTreeAliasesLocked(cache_key);
+    return {true, item.slots};
+}
+
+BlockIdxType SharedBlockCache::matchGroup(CacheKeyType cache_key, int group_id) {
+    RTP_LLM_PROFILE_FUNCTION();
+    std::lock_guard<std::mutex> lock(mu_);
+
+    auto [success, item] = lru_cache_.get(cache_key);
+    if (!success) {
+        return NULL_BLOCK_IDX;
+    }
+    touchTreeAliasesLocked(cache_key);
+    if (group_id < 0 || static_cast<size_t>(group_id) >= item.slots.size()) {
+        return NULL_BLOCK_IDX;
+    }
+    if (!slotMatchable(item, static_cast<size_t>(group_id))) {
+        return NULL_BLOCK_IDX;
+    }
+    const auto block = item.slots[group_id];
+    return block;
+}
+
+SharedBlockCache::EvictResult SharedBlockCache::selectAndEvict(size_t min_blocks) {
+    RTP_LLM_PROFILE_FUNCTION();
+    std::lock_guard<std::mutex> lock(mu_);
+
+    EvictResult result;
+    if (lru_cache_.empty() || min_blocks == 0) {
+        return result;
+    }
+
+    if (prefix_tree_enabled_ && !leaf_lru_.empty()) {
+        size_t selected_blocks = 0;
+        while (selected_blocks < min_blocks && !leaf_lru_.empty()) {
+            const auto leaf     = *leaf_lru_.begin();
+            const auto leaf_key = NamespacedKey{leaf.namespace_id, leaf.cache_key};
+            auto       chain    = collectEvictChainLocked(leaf_key);
+            if (chain.empty()) {
+                removeTreeAliasLocked(leaf_key);
+                continue;
+            }
+            std::vector<NamespacedKey> ordered_chain(chain.rbegin(), chain.rend());
+            for (const auto& tree_key : ordered_chain) {
+                UnifiedCacheItem removed_item;
+                if (!lru_cache_.remove(tree_key.cache_key, &removed_item)) {
+                    removeAllTreeAliasesForCacheKeyLocked(tree_key.cache_key);
+                    continue;
+                }
+                if (result.evicted_slots.find(tree_key.cache_key) == result.evicted_slots.end()) {
+                    result.evicted_keys.push_back(tree_key.cache_key);
+                    result.evicted_slots[tree_key.cache_key] = removed_item.slots;
+                    result.evicted_lifetime_ms[tree_key.cache_key] =
+                        std::max<int64_t>(0, (currentTimeUs() - removed_item.created_time_us) / 1000);
+                    result.evicted_namespaces[tree_key.cache_key] =
+                        removed_item.has_dependency ? removed_item.dependency_namespace : tree_key.namespace_id;
+                    if (removed_item.has_dependency) {
+                        result.evicted_dependencies[tree_key.cache_key] = removed_item.dependency;
+                    }
+                    for (const auto& slot : removed_item.slots) {
+                        if (!isNullBlockIdx(slot)) {
+                            selected_blocks++;
+                        }
+                    }
+                }
+                removeAllTreeAliasesForCacheKeyLocked(tree_key.cache_key);
+            }
+        }
+        return result;
+    }
+
+    std::unordered_set<CacheKeyType> resident_keys;
+    for (const auto& [key, item] : lru_cache_.items()) {
+        if (item.is_resident) {
+            resident_keys.insert(item.cache_key);
+        }
+    }
+
+    std::vector<CacheKeyType> lru_keys;
+    for (auto it = lru_cache_.items().rbegin(); it != lru_cache_.items().rend(); ++it) {
+        const auto& item = it->second;
+        if (item.is_resident || resident_keys.count(item.cache_key)) {
+            continue;
+        }
+        lru_keys.push_back(item.cache_key);
+    }
+
+    size_t selected_blocks = 0;
+    for (const auto cache_key : lru_keys) {
+        UnifiedCacheItem removed_item;
+        if (!lru_cache_.remove(cache_key, &removed_item)) {
+            continue;
+        }
+        removeAllTreeAliasesForCacheKeyLocked(cache_key);
+
+        result.evicted_keys.push_back(cache_key);
+        result.evicted_slots[cache_key] = removed_item.slots;
+        result.evicted_lifetime_ms[cache_key] =
+            std::max<int64_t>(0, (currentTimeUs() - removed_item.created_time_us) / 1000);
+        result.evicted_namespaces[cache_key] =
+            removed_item.has_dependency ? removed_item.dependency_namespace : kDefaultNamespace;
+        if (removed_item.has_dependency) {
+            result.evicted_dependencies[cache_key] = removed_item.dependency;
+        }
+
+        for (const auto& slot : removed_item.slots) {
+            if (!isNullBlockIdx(slot)) {
+                selected_blocks++;
+            }
+        }
+        if (selected_blocks >= min_blocks) {
+            break;
+        }
+    }
+
+    return result;
+}
+
+SharedBlockCache::EvictResult SharedBlockCache::selectAndEvictForGroup(int group_id, size_t min_blocks) {
+    RTP_LLM_PROFILE_FUNCTION();
+    if (min_blocks == 0) {
+        return {};
+    }
+
+    std::lock_guard<std::mutex> lock(mu_);
+    EvictResult                 result;
+    if (independent_group_eviction_enabled_ && prefix_tree_enabled_ && isIndependentEvictionGroupLocked(group_id)) {
+        if (selectIndependentGroupEvictionsLocked(group_id, min_blocks, result)) {
+            return result;
+        }
+    }
+    if (!result.evicted_keys.empty()) {
+        return result;
+    }
+
+    // Re-enter the normal selection path without taking the mutex twice.
+    if (lru_cache_.empty()) {
+        return result;
+    }
+    if (prefix_tree_enabled_ && !leaf_lru_.empty()) {
+        size_t selected_blocks = 0;
+        bool   made_progress   = true;
+        while (selected_blocks < min_blocks && made_progress && !leaf_lru_.empty()) {
+            made_progress = false;
+            std::vector<LeafKey> leaves(leaf_lru_.begin(), leaf_lru_.end());
+            for (const auto& leaf : leaves) {
+                if (selected_blocks >= min_blocks) {
+                    break;
+                }
+                const auto leaf_key = NamespacedKey{leaf.namespace_id, leaf.cache_key};
+                auto       chain    = collectEvictChainLocked(leaf_key);
+                if (chain.empty()) {
+                    removeTreeAliasLocked(leaf_key);
+                    made_progress = true;
+                    continue;
+                }
+                const bool chain_has_target = chainHasUsableSlotLocked(chain, group_id);
+                if (!chain_has_target && !chainHasReachableAncestorSlotLocked(chain, group_id)) {
+                    continue;
+                }
+                std::vector<NamespacedKey> ordered_chain(chain.rbegin(), chain.rend());
+                for (const auto& tree_key : ordered_chain) {
+                    UnifiedCacheItem removed_item;
+                    if (!lru_cache_.remove(tree_key.cache_key, &removed_item)) {
+                        removeAllTreeAliasesForCacheKeyLocked(tree_key.cache_key);
+                        continue;
+                    }
+                    made_progress = true;
+                    if (result.evicted_slots.find(tree_key.cache_key) == result.evicted_slots.end()) {
+                        result.evicted_keys.push_back(tree_key.cache_key);
+                        result.evicted_slots[tree_key.cache_key] = removed_item.slots;
+                        result.evicted_lifetime_ms[tree_key.cache_key] =
+                            std::max<int64_t>(0, (currentTimeUs() - removed_item.created_time_us) / 1000);
+                        result.evicted_namespaces[tree_key.cache_key] =
+                            removed_item.has_dependency ? removed_item.dependency_namespace : tree_key.namespace_id;
+                        if (removed_item.has_dependency) {
+                            result.evicted_dependencies[tree_key.cache_key] = removed_item.dependency;
+                        }
+                        if (hasUsableSlot(removed_item, group_id)) {
+                            selected_blocks++;
+                        }
+                    }
+                    removeAllTreeAliasesForCacheKeyLocked(tree_key.cache_key);
+                }
+            }
+        }
+        return result;
+    }
+
+    std::unordered_set<CacheKeyType> resident_keys;
+    for (const auto& [key, item] : lru_cache_.items()) {
+        if (item.is_resident) {
+            resident_keys.insert(item.cache_key);
+        }
+    }
+
+    std::vector<CacheKeyType> lru_keys;
+    for (auto it = lru_cache_.items().rbegin(); it != lru_cache_.items().rend(); ++it) {
+        const auto& item = it->second;
+        if (item.is_resident || resident_keys.count(item.cache_key)) {
+            continue;
+        }
+        lru_keys.push_back(item.cache_key);
+    }
+
+    size_t selected_blocks = 0;
+    for (const auto cache_key : lru_keys) {
+        UnifiedCacheItem removed_item;
+        bool has_target_slot = false;
+        for (const auto& [key, item] : lru_cache_.items()) {
+            if (key == cache_key) {
+                has_target_slot = hasUsableSlot(item, group_id);
+                break;
+            }
+        }
+        if (!has_target_slot) {
+            continue;
+        }
+        if (!lru_cache_.remove(cache_key, &removed_item)) {
+            continue;
+        }
+        removeAllTreeAliasesForCacheKeyLocked(cache_key);
+
+        result.evicted_keys.push_back(cache_key);
+        result.evicted_slots[cache_key] = removed_item.slots;
+        result.evicted_lifetime_ms[cache_key] =
+            std::max<int64_t>(0, (currentTimeUs() - removed_item.created_time_us) / 1000);
+        result.evicted_namespaces[cache_key] =
+            removed_item.has_dependency ? removed_item.dependency_namespace : kDefaultNamespace;
+        if (removed_item.has_dependency) {
+            result.evicted_dependencies[cache_key] = removed_item.dependency;
+        }
+
+        if (hasUsableSlot(removed_item, group_id)) {
+            selected_blocks++;
+        }
+        if (selected_blocks >= min_blocks) {
+            break;
+        }
+    }
+
+    return result;
+}
+
+size_t SharedBlockCache::evictAndFree(size_t min_blocks) {
+    RTP_LLM_PROFILE_FUNCTION();
+
+    auto evict_result = selectAndEvict(min_blocks);
+    if (evict_result.evicted_keys.empty()) {
+        return 0;
+    }
+
+    size_t freed = 0;
+    for (size_t i = 0; i < evict_result.evicted_keys.size(); ++i) {
+        const auto  cache_key = evict_result.evicted_keys[i];
+        const auto& slots     = evict_result.evicted_slots.at(cache_key);
+
+        for (int gid = 0; gid < static_cast<int>(slots.size()) && gid < group_num_; ++gid) {
+            if (!isNullBlockIdx(slots[gid])) {
+                group_pools_[gid]->blockCacheFree(slots[gid]);
+                freed++;
+            }
+        }
+    }
+    return freed;
+}
+
+size_t SharedBlockCache::evictAndFreeForGroup(int group_id, size_t min_blocks, EvictResult* evict_result_out) {
+    RTP_LLM_PROFILE_FUNCTION();
+
+    auto evict_result = selectAndEvictForGroup(group_id, min_blocks);
+    if (evict_result.evicted_keys.empty()) {
+        if (evict_result_out) {
+            *evict_result_out = std::move(evict_result);
+        }
+        return 0;
+    }
+
+    size_t freed = 0;
+    for (size_t i = 0; i < evict_result.evicted_keys.size(); ++i) {
+        const auto  cache_key = evict_result.evicted_keys[i];
+        const auto& slots     = evict_result.evicted_slots.at(cache_key);
+
+        for (int gid = 0; gid < static_cast<int>(slots.size()) && gid < group_num_; ++gid) {
+            if (!isNullBlockIdx(slots[gid])) {
+                group_pools_[gid]->blockCacheFree(slots[gid]);
+                if (gid == group_id) {
+                    freed++;
+                }
+            }
+        }
+    }
+    if (evict_result_out) {
+        *evict_result_out = std::move(evict_result);
+    }
+    return freed;
+}
+
+std::optional<SharedBlockCache::UnifiedCacheItem> SharedBlockCache::remove(CacheKeyType cache_key) {
+    std::lock_guard<std::mutex> lock(mu_);
+
+    UnifiedCacheItem removed_item;
+    if (!lru_cache_.remove(cache_key, &removed_item)) {
+        return std::nullopt;
+    }
+    removeAllTreeAliasesForCacheKeyLocked(cache_key);
+    return removed_item;
+}
+
+bool SharedBlockCache::contains(CacheKeyType cache_key) const {
+    std::lock_guard<std::mutex> lock(mu_);
+    return lru_cache_.contains(cache_key);
+}
+
+bool SharedBlockCache::empty() const {
+    std::lock_guard<std::mutex> lock(mu_);
+    return lru_cache_.empty();
+}
+
+size_t SharedBlockCache::size() const {
+    std::lock_guard<std::mutex> lock(mu_);
+    return lru_cache_.size();
+}
+
+std::vector<CacheKeyType> SharedBlockCache::allCacheKeys() const {
+    std::lock_guard<std::mutex> lock(mu_);
+    std::vector<CacheKeyType>   keys;
+    keys.reserve(lru_cache_.size());
+    for (const auto& [key, item] : lru_cache_.items()) {
+        keys.push_back(key);
+    }
+    return keys;
+}
+
+int64_t SharedBlockCache::version() const {
+    std::lock_guard<std::mutex> lock(mu_);
+    return version_;
+}
+
+void SharedBlockCache::setPrefixTreeEnabled(bool enabled) {
+    std::lock_guard<std::mutex> lock(mu_);
+    prefix_tree_enabled_ = enabled;
+}
+
+bool SharedBlockCache::prefixTreeEnabled() const {
+    std::lock_guard<std::mutex> lock(mu_);
+    return prefix_tree_enabled_;
+}
+
+void SharedBlockCache::setIndependentGroupEviction(bool enabled, const std::vector<int>& group_ids) {
+    std::lock_guard<std::mutex> lock(mu_);
+    independent_group_eviction_enabled_ = enabled;
+    independent_eviction_group_ids_.clear();
+    for (const auto gid : group_ids) {
+        if (gid >= 0) {
+            independent_eviction_group_ids_.insert(gid);
+        }
+    }
+}
+
+void SharedBlockCache::upsertTreeNodeLocked(CacheKeyType               cache_key,
+                                            NamespaceId                namespace_id,
+                                            const BlockDependency&     dependency,
+                                            bool                       is_resident) {
+    if (!prefix_tree_enabled_) {
+        return;
+    }
+    const NamespacedKey key{namespace_id, cache_key};
+    const bool          has_parent = dependency.has_parent && dependency.parent_key != cache_key;
+    const NamespacedKey parent{namespace_id, dependency.parent_key};
+    auto                it = tree_nodes_.find(key);
+    if (it == tree_nodes_.end()) {
+        PrefixTreeNode node;
+        node.key        = key;
+        node.parent     = parent;
+        node.has_parent = has_parent;
+        node.ordinal    = dependency.ordinal;
+        node.resident   = is_resident;
+        node.last_access_seq = ++tree_access_seq_;
+        auto [inserted_it, _] = tree_nodes_.emplace(key, std::move(node));
+        it = inserted_it;
+        aliases_by_cache_key_[cache_key].insert(key);
+    } else {
+        eraseLeafLocked(it->second);
+        if (it->second.has_parent && (it->second.parent == parent) == false) {
+            if (auto parent_it = tree_nodes_.find(it->second.parent); parent_it != tree_nodes_.end()) {
+                parent_it->second.children.erase(key);
+                refreshLeafLocked(parent_it->first);
+            } else {
+                detachPendingChildLocked(it->second.parent, key);
+            }
+        }
+        it->second.parent     = parent;
+        it->second.has_parent = has_parent;
+        it->second.ordinal    = dependency.ordinal;
+        it->second.resident   = it->second.resident || is_resident;
+        it->second.last_access_seq = ++tree_access_seq_;
+    }
+
+    if (has_parent) {
+        auto parent_it = tree_nodes_.find(parent);
+        if (parent_it != tree_nodes_.end()) {
+            eraseLeafLocked(parent_it->second);
+            parent_it->second.children.insert(key);
+        } else {
+            pending_children_by_parent_[parent].insert(key);
+        }
+    }
+    attachPendingChildrenLocked(it->second);
+    insertLeafIfEligibleLocked(it->second);
+}
+
+void SharedBlockCache::detachPendingChildLocked(const NamespacedKey& parent, const NamespacedKey& child) {
+    auto pending_it = pending_children_by_parent_.find(parent);
+    if (pending_it == pending_children_by_parent_.end()) {
+        return;
+    }
+    pending_it->second.erase(child);
+    if (pending_it->second.empty()) {
+        pending_children_by_parent_.erase(pending_it);
+    }
+}
+
+void SharedBlockCache::attachPendingChildrenLocked(PrefixTreeNode& node) {
+    auto pending_it = pending_children_by_parent_.find(node.key);
+    if (pending_it == pending_children_by_parent_.end()) {
+        return;
+    }
+    for (const auto& child_key : pending_it->second) {
+        auto child_it = tree_nodes_.find(child_key);
+        if (child_it != tree_nodes_.end() && child_it->second.has_parent && child_it->second.parent == node.key) {
+            eraseLeafLocked(node);
+            node.children.insert(child_key);
+        }
+    }
+    pending_children_by_parent_.erase(pending_it);
+}
+
+void SharedBlockCache::touchTreeAliasesLocked(CacheKeyType cache_key) {
+    if (!prefix_tree_enabled_) {
+        return;
+    }
+    auto aliases_it = aliases_by_cache_key_.find(cache_key);
+    if (aliases_it == aliases_by_cache_key_.end()) {
+        return;
+    }
+    std::vector<NamespacedKey> aliases(aliases_it->second.begin(), aliases_it->second.end());
+    for (const auto& key : aliases) {
+        auto node_it = tree_nodes_.find(key);
+        if (node_it != tree_nodes_.end()) {
+            touchTreeNodeLocked(node_it->second);
+        }
+    }
+}
+
+void SharedBlockCache::touchTreeNodeLocked(PrefixTreeNode& node) {
+    eraseLeafLocked(node);
+    node.last_access_seq = ++tree_access_seq_;
+    insertLeafIfEligibleLocked(node);
+}
+
+void SharedBlockCache::eraseLeafLocked(const PrefixTreeNode& node) {
+    leaf_lru_.erase(LeafKey{node.last_access_seq, node.key.namespace_id, node.key.cache_key});
+}
+
+void SharedBlockCache::insertLeafIfEligibleLocked(const PrefixTreeNode& node) {
+    if (node.resident || !node.children.empty() || !hasFlatItemLocked(node.key.cache_key)
+        || isFlatItemResidentLocked(node.key.cache_key)) {
+        return;
+    }
+    if (node.key.namespace_id != kGpuCpCanonicalNamespace && flatItemHasCanonicalDependencyLocked(node.key.cache_key)) {
+        return;
+    }
+    leaf_lru_.insert(LeafKey{node.last_access_seq, node.key.namespace_id, node.key.cache_key});
+}
+
+void SharedBlockCache::refreshLeafLocked(const NamespacedKey& key) {
+    auto it = tree_nodes_.find(key);
+    if (it == tree_nodes_.end()) {
+        return;
+    }
+    eraseLeafLocked(it->second);
+    insertLeafIfEligibleLocked(it->second);
+}
+
+void SharedBlockCache::removeTreeAliasLocked(const NamespacedKey& key) {
+    auto it = tree_nodes_.find(key);
+    if (it == tree_nodes_.end()) {
+        return;
+    }
+    PrefixTreeNode node = it->second;
+    eraseLeafLocked(node);
+    if (node.has_parent) {
+        auto parent_it = tree_nodes_.find(node.parent);
+        if (parent_it != tree_nodes_.end()) {
+            parent_it->second.children.erase(key);
+            refreshLeafLocked(parent_it->first);
+        } else {
+            detachPendingChildLocked(node.parent, key);
+        }
+    }
+    for (const auto& child : node.children) {
+        auto child_it = tree_nodes_.find(child);
+        if (child_it != tree_nodes_.end() && child_it->second.parent == key) {
+            child_it->second.has_parent = false;
+        }
+    }
+    auto aliases_it = aliases_by_cache_key_.find(key.cache_key);
+    if (aliases_it != aliases_by_cache_key_.end()) {
+        aliases_it->second.erase(key);
+        if (aliases_it->second.empty()) {
+            aliases_by_cache_key_.erase(aliases_it);
+        }
+    }
+    tree_nodes_.erase(it);
+}
+
+void SharedBlockCache::removeAllTreeAliasesForCacheKeyLocked(CacheKeyType cache_key) {
+    auto aliases_it = aliases_by_cache_key_.find(cache_key);
+    if (aliases_it == aliases_by_cache_key_.end()) {
+        return;
+    }
+    std::vector<NamespacedKey> aliases(aliases_it->second.begin(), aliases_it->second.end());
+    for (const auto& key : aliases) {
+        removeTreeAliasLocked(key);
+    }
+}
+
+void SharedBlockCache::markAllTreeAliasesResidentLocked(CacheKeyType cache_key) {
+    auto aliases_it = aliases_by_cache_key_.find(cache_key);
+    if (aliases_it == aliases_by_cache_key_.end()) {
+        return;
+    }
+    for (const auto& key : aliases_it->second) {
+        auto node_it = tree_nodes_.find(key);
+        if (node_it == tree_nodes_.end() || node_it->second.resident) {
+            continue;
+        }
+        eraseLeafLocked(node_it->second);
+        node_it->second.resident = true;
+    }
+}
+
+void SharedBlockCache::refreshAllTreeAliasesLocked(CacheKeyType cache_key) {
+    auto aliases_it = aliases_by_cache_key_.find(cache_key);
+    if (aliases_it == aliases_by_cache_key_.end()) {
+        return;
+    }
+    std::vector<NamespacedKey> aliases(aliases_it->second.begin(), aliases_it->second.end());
+    for (const auto& key : aliases) {
+        refreshLeafLocked(key);
+    }
+}
+
+bool SharedBlockCache::flatItemHasCanonicalDependencyLocked(CacheKeyType cache_key) const {
+    for (const auto& [key, item] : lru_cache_.items()) {
+        if (key == cache_key) {
+            return item.has_dependency && item.dependency_namespace == kGpuCpCanonicalNamespace;
+        }
+    }
+    return false;
+}
+
+bool SharedBlockCache::updateItemDependencyLocked(UnifiedCacheItem&      item,
+                                                  NamespaceId            namespace_id,
+                                                  const BlockDependency& dependency) const {
+    if (item.has_dependency && item.dependency_namespace == kGpuCpCanonicalNamespace
+        && namespace_id != kGpuCpCanonicalNamespace) {
+        return false;
+    }
+    if (item.has_dependency && item.dependency_namespace == namespace_id
+        && item.dependency.has_parent == dependency.has_parent && item.dependency.parent_key == dependency.parent_key
+        && item.dependency.ordinal == dependency.ordinal) {
+        return false;
+    }
+    item.dependency           = dependency;
+    item.dependency_namespace = namespace_id;
+    item.has_dependency       = true;
+    return true;
+}
+
+bool SharedBlockCache::slotMatchable(const UnifiedCacheItem& item, size_t group_id) {
+    return group_id >= item.matchable_slots.size() || item.matchable_slots[group_id];
+}
+
+bool SharedBlockCache::hasUsableSlot(const UnifiedCacheItem& item, int group_id) {
+    return group_id >= 0 && static_cast<size_t>(group_id) < item.slots.size()
+           && !isNullBlockIdx(item.slots[static_cast<size_t>(group_id)]);
+}
+
+std::vector<SharedBlockCache::NamespacedKey>
+SharedBlockCache::collectEvictChainLocked(const NamespacedKey& leaf_key) const {
+    std::vector<NamespacedKey> chain;
+    auto                       it = tree_nodes_.find(leaf_key);
+    if (it == tree_nodes_.end() || it->second.resident || !it->second.children.empty()
+        || !hasFlatItemLocked(it->second.key.cache_key) || isFlatItemResidentLocked(it->second.key.cache_key)) {
+        return chain;
+    }
+
+    NamespacedKey cur = leaf_key;
+    while (true) {
+        auto node_it = tree_nodes_.find(cur);
+        if (node_it == tree_nodes_.end() || node_it->second.resident || !hasFlatItemLocked(cur.cache_key)
+            || isFlatItemResidentLocked(cur.cache_key)) {
+            break;
+        }
+        chain.push_back(cur);
+        if (!node_it->second.has_parent) {
+            break;
+        }
+        auto parent_it = tree_nodes_.find(node_it->second.parent);
+        if (parent_it == tree_nodes_.end() || parent_it->second.resident
+            || isFlatItemResidentLocked(parent_it->first.cache_key)) {
+            break;
+        }
+        if (parent_it->second.children.size() != 1) {
+            break;
+        }
+        cur = parent_it->first;
+    }
+    return chain;
+}
+
+bool SharedBlockCache::chainHasUsableSlotLocked(const std::vector<NamespacedKey>& chain, int group_id) const {
+    for (const auto& key : chain) {
+        for (const auto& [cache_key, item] : lru_cache_.items()) {
+            if (cache_key == key.cache_key && hasUsableSlot(item, group_id)) {
+                return true;
+            }
+        }
+    }
+    return false;
+}
+
+bool SharedBlockCache::chainHasReachableAncestorSlotLocked(const std::vector<NamespacedKey>& chain,
+                                                           int                               group_id) const {
+    if (chain.empty()) {
+        return false;
+    }
+    auto node_it = tree_nodes_.find(chain.back());
+    while (node_it != tree_nodes_.end() && node_it->second.has_parent) {
+        auto parent_it = tree_nodes_.find(node_it->second.parent);
+        if (parent_it == tree_nodes_.end() || parent_it->second.resident
+            || !hasFlatItemLocked(parent_it->first.cache_key) || isFlatItemResidentLocked(parent_it->first.cache_key)) {
+            return false;
+        }
+        bool parent_has_target_slot = false;
+        for (const auto& [cache_key, item] : lru_cache_.items()) {
+            if (cache_key == parent_it->first.cache_key && hasUsableSlot(item, group_id)) {
+                parent_has_target_slot = true;
+                break;
+            }
+        }
+        if (parent_has_target_slot) {
+            bool all_children_evictable = true;
+            for (const auto& child : parent_it->second.children) {
+                if (!subtreeEvictableForAncestorSlotLocked(child)) {
+                    all_children_evictable = false;
+                    break;
+                }
+            }
+            if (all_children_evictable) {
+                return true;
+            }
+        }
+        node_it = parent_it;
+    }
+    return false;
+}
+
+bool SharedBlockCache::subtreeEvictableForAncestorSlotLocked(const NamespacedKey& key) const {
+    auto node_it = tree_nodes_.find(key);
+    if (node_it == tree_nodes_.end() || node_it->second.resident || !hasFlatItemLocked(key.cache_key)
+        || isFlatItemResidentLocked(key.cache_key)) {
+        return false;
+    }
+    for (const auto& child : node_it->second.children) {
+        if (!subtreeEvictableForAncestorSlotLocked(child)) {
+            return false;
+        }
+    }
+    return true;
+}
+
+bool SharedBlockCache::selectIndependentGroupEvictionsLocked(int group_id, size_t min_blocks, EvictResult& result) {
+    if (group_id < 0 || (group_num_ > 0 && group_id >= group_num_) || min_blocks == 0) {
+        return false;
+    }
+    size_t selected_blocks = 0;
+    std::vector<LeafKey> leaves(leaf_lru_.begin(), leaf_lru_.end());
+    for (const auto& leaf : leaves) {
+        if (selected_blocks >= min_blocks) {
+            break;
+        }
+        const auto leaf_key = NamespacedKey{leaf.namespace_id, leaf.cache_key};
+        auto       chain    = collectEvictChainLocked(leaf_key);
+        if (chain.size() <= 1) {
+            continue;
+        }
+        // Keep the leaf group tail block when possible. Scan from leaf-parent
+        // upward and drop the deepest non-tail slot first.
+        for (size_t chain_idx = 1; chain_idx < chain.size(); ++chain_idx) {
+            const auto& key = chain[chain_idx];
+            auto [success, item] = lru_cache_.get(key.cache_key);
+            if (!success || item.is_resident || static_cast<size_t>(group_id) >= item.slots.size()
+                || isNullBlockIdx(item.slots[static_cast<size_t>(group_id)])) {
+                continue;
+            }
+            removeSlotFromItemLocked(key.cache_key, group_id, result);
+            ++selected_blocks;
+            break;
+        }
+    }
+    return selected_blocks >= min_blocks;
+}
+
+void SharedBlockCache::removeSlotFromItemLocked(CacheKeyType cache_key, int group_id, EvictResult& result) {
+    UnifiedCacheItem item;
+    if (!lru_cache_.remove(cache_key, &item)) {
+        return;
+    }
+    if (group_id < 0 || static_cast<size_t>(group_id) >= item.slots.size()
+        || isNullBlockIdx(item.slots[static_cast<size_t>(group_id)])) {
+        lru_cache_.put(cache_key, item);
+        return;
+    }
+
+    std::vector<BlockIdxType> evicted_slots(item.slots.size(), NULL_BLOCK_IDX);
+    evicted_slots[static_cast<size_t>(group_id)] = item.slots[static_cast<size_t>(group_id)];
+    result.evicted_keys.push_back(cache_key);
+    result.evicted_slots[cache_key] = std::move(evicted_slots);
+    result.evicted_namespaces[cache_key] =
+        item.has_dependency ? item.dependency_namespace : SharedBlockCache::kGpuLogicalNamespace;
+    if (item.has_dependency) {
+        result.evicted_dependencies[cache_key] = item.dependency;
+    }
+    const int64_t created_time_us =
+        static_cast<size_t>(group_id) < item.slot_created_time_us.size() ?
+            item.slot_created_time_us[static_cast<size_t>(group_id)] :
+            item.created_time_us;
+    result.evicted_lifetime_ms[cache_key] = std::max<int64_t>(0, (currentTimeUs() - created_time_us) / 1000);
+    result.evicted_independent_group[cache_key] = group_id;
+
+    item.slots[static_cast<size_t>(group_id)] = NULL_BLOCK_IDX;
+    if (static_cast<size_t>(group_id) < item.matchable_slots.size()) {
+        item.matchable_slots[static_cast<size_t>(group_id)] = false;
+    }
+    if (static_cast<size_t>(group_id) < item.slot_created_time_us.size()) {
+        item.slot_created_time_us[static_cast<size_t>(group_id)] = 0;
+    }
+
+    const bool has_any_slot = std::any_of(item.slots.begin(), item.slots.end(), [](BlockIdxType slot) {
+        return !isNullBlockIdx(slot);
+    });
+    if (has_any_slot) {
+        lru_cache_.put(cache_key, item);
+        refreshAllTreeAliasesLocked(cache_key);
+    } else {
+        removeAllTreeAliasesForCacheKeyLocked(cache_key);
+    }
+    ++version_;
+}
+
+bool SharedBlockCache::hasFlatItemLocked(CacheKeyType cache_key) const {
+    return lru_cache_.contains(cache_key);
+}
+
+bool SharedBlockCache::isFlatItemResidentLocked(CacheKeyType cache_key) const {
+    for (const auto& [key, item] : lru_cache_.items()) {
+        if (key == cache_key) {
+            return item.is_resident;
+        }
+    }
+    return false;
+}
+
+bool SharedBlockCache::isIndependentEvictionGroupLocked(int group_id) const {
+    return independent_eviction_group_ids_.find(group_id) != independent_eviction_group_ids_.end();
+}
+
+}  // namespace rtp_llm
diff --git a/rtp_llm/cpp/cache/SharedBlockCache.h b/rtp_llm/cpp/cache/SharedBlockCache.h
new file mode 100644
index 0000000000..2fc0b2d6ac
--- /dev/null
+++ b/rtp_llm/cpp/cache/SharedBlockCache.h
@@ -0,0 +1,187 @@
+#pragma once
+
+#include <mutex>
+#include <memory>
+#include <optional>
+#include <set>
+#include <unordered_map>
+#include <unordered_set>
+#include <vector>
+
+#include "rtp_llm/cpp/utils/LRUCache.h"
+#include "rtp_llm/cpp/cache/Types.h"
+#include "rtp_llm/cpp/cache/BlockPool.h"
+#include "rtp_llm/cpp/cache/KVCacheResource.h"
+
+namespace rtp_llm {
+
+class SharedBlockCache {
+public:
+    using NamespaceId = uint32_t;
+
+    static constexpr NamespaceId kDefaultNamespace     = 0;
+    static constexpr NamespaceId kGpuLogicalNamespace  = 1;
+    static constexpr NamespaceId kGpuCpCanonicalNamespace = 2;
+
+    struct NamespacedKey {
+        NamespaceId  namespace_id{0};
+        CacheKeyType cache_key{0};
+
+        bool operator==(const NamespacedKey& other) const {
+            return namespace_id == other.namespace_id && cache_key == other.cache_key;
+        }
+    };
+
+    struct NamespacedKeyHash {
+        size_t operator()(const NamespacedKey& key) const {
+            return std::hash<uint64_t>()((static_cast<uint64_t>(key.namespace_id) << 32)
+                                         ^ static_cast<uint64_t>(key.cache_key));
+        }
+    };
+
+    struct UnifiedCacheItem {
+        CacheKeyType              cache_key;
+        bool                      is_resident = false;
+        std::vector<BlockIdxType> slots;
+        std::vector<bool>         matchable_slots;
+        std::vector<int64_t>      slot_created_time_us;
+        int64_t                   created_time_us = 0;
+        BlockDependency           dependency;
+        NamespaceId               dependency_namespace = kDefaultNamespace;
+        bool                      has_dependency = false;
+    };
+
+    struct EvictResult {
+        std::vector<CacheKeyType>                                   evicted_keys;
+        std::unordered_map<CacheKeyType, std::vector<BlockIdxType>> evicted_slots;
+        std::unordered_map<CacheKeyType, BlockDependency>           evicted_dependencies;
+        std::unordered_map<CacheKeyType, NamespaceId>               evicted_namespaces;
+        std::unordered_map<CacheKeyType, int64_t>                   evicted_lifetime_ms;
+        std::unordered_map<CacheKeyType, int>                       evicted_independent_group;
+    };
+
+    struct MatchResult {
+        bool                      found = false;
+        std::vector<BlockIdxType> group_blocks;
+    };
+
+    using LRUCacheType = LRUCache<CacheKeyType, UnifiedCacheItem>;
+
+public:
+    explicit SharedBlockCache(): lru_cache_(kCacheMaxCapacity) {}
+
+    void init(int group_num, const std::vector<BlockPoolPtr>& group_pools);
+
+    void put(CacheKeyType cache_key, const std::vector<BlockIdxType>& group_slots, bool is_resident);
+    void put(CacheKeyType                 cache_key,
+             const std::vector<BlockIdxType>& group_slots,
+             bool                         is_resident,
+             NamespaceId                  namespace_id,
+             const BlockDependency&       dependency,
+             const std::vector<bool>&     matchable_slots = {});
+
+    MatchResult match(CacheKeyType cache_key);
+
+    BlockIdxType matchGroup(CacheKeyType cache_key, int group_id);
+
+    EvictResult selectAndEvict(size_t min_blocks);
+    EvictResult selectAndEvictForGroup(int group_id, size_t min_blocks);
+
+    size_t evictAndFree(size_t min_blocks);
+    size_t evictAndFreeForGroup(int group_id, size_t min_blocks, EvictResult* evict_result_out = nullptr);
+
+    std::optional<UnifiedCacheItem> remove(CacheKeyType cache_key);
+
+    bool contains(CacheKeyType cache_key) const;
+
+    bool empty() const;
+
+    size_t size() const;
+
+    std::vector<CacheKeyType> allCacheKeys() const;
+
+    int64_t version() const;
+    void    setPrefixTreeEnabled(bool enabled);
+    bool    prefixTreeEnabled() const;
+    void    setIndependentGroupEviction(bool enabled, const std::vector<int>& group_ids);
+
+private:
+    static const size_t kCacheMaxCapacity = 10000000;
+
+    struct PrefixTreeNode {
+        NamespacedKey key;
+        NamespacedKey parent;
+        bool          has_parent{false};
+        bool          resident{false};
+        uint32_t      ordinal{0};
+        uint64_t      last_access_seq{0};
+        std::unordered_set<NamespacedKey, NamespacedKeyHash> children;
+    };
+
+    struct LeafKey {
+        uint64_t      last_access_seq{0};
+        NamespaceId   namespace_id{0};
+        CacheKeyType  cache_key{0};
+
+        bool operator<(const LeafKey& other) const {
+            if (last_access_seq != other.last_access_seq) {
+                return last_access_seq < other.last_access_seq;
+            }
+            if (namespace_id != other.namespace_id) {
+                return namespace_id < other.namespace_id;
+            }
+            return cache_key < other.cache_key;
+        }
+    };
+
+    void upsertTreeNodeLocked(CacheKeyType                 cache_key,
+                              NamespaceId                  namespace_id,
+                              const BlockDependency&       dependency,
+                              bool                         is_resident);
+    void detachPendingChildLocked(const NamespacedKey& parent, const NamespacedKey& child);
+    void attachPendingChildrenLocked(PrefixTreeNode& node);
+    void touchTreeAliasesLocked(CacheKeyType cache_key);
+    void touchTreeNodeLocked(PrefixTreeNode& node);
+    void eraseLeafLocked(const PrefixTreeNode& node);
+    void insertLeafIfEligibleLocked(const PrefixTreeNode& node);
+    void refreshLeafLocked(const NamespacedKey& key);
+    void removeTreeAliasLocked(const NamespacedKey& key);
+    void removeAllTreeAliasesForCacheKeyLocked(CacheKeyType cache_key);
+    void markAllTreeAliasesResidentLocked(CacheKeyType cache_key);
+    void refreshAllTreeAliasesLocked(CacheKeyType cache_key);
+    bool flatItemHasCanonicalDependencyLocked(CacheKeyType cache_key) const;
+    bool updateItemDependencyLocked(UnifiedCacheItem& item,
+                                    NamespaceId       namespace_id,
+                                    const BlockDependency& dependency) const;
+    static bool slotMatchable(const UnifiedCacheItem& item, size_t group_id);
+    static bool hasUsableSlot(const UnifiedCacheItem& item, int group_id);
+    std::vector<NamespacedKey> collectEvictChainLocked(const NamespacedKey& leaf_key) const;
+    bool chainHasUsableSlotLocked(const std::vector<NamespacedKey>& chain, int group_id) const;
+    bool chainHasReachableAncestorSlotLocked(const std::vector<NamespacedKey>& chain, int group_id) const;
+    bool subtreeEvictableForAncestorSlotLocked(const NamespacedKey& key) const;
+    bool selectIndependentGroupEvictionsLocked(int group_id, size_t min_blocks, EvictResult& result);
+    void removeSlotFromItemLocked(CacheKeyType cache_key, int group_id, EvictResult& result);
+    bool hasFlatItemLocked(CacheKeyType cache_key) const;
+    bool isFlatItemResidentLocked(CacheKeyType cache_key) const;
+    bool isIndependentEvictionGroupLocked(int group_id) const;
+
+    LRUCacheType       lru_cache_;
+    mutable std::mutex mu_;
+    int64_t            version_{0};
+    bool               prefix_tree_enabled_{true};
+    bool               independent_group_eviction_enabled_{false};
+    uint64_t           tree_access_seq_{0};
+
+    int                       group_num_ = 0;
+    std::vector<BlockPoolPtr> group_pools_;
+    std::unordered_map<NamespacedKey, PrefixTreeNode, NamespacedKeyHash> tree_nodes_;
+    std::unordered_map<CacheKeyType, std::unordered_set<NamespacedKey, NamespacedKeyHash>> aliases_by_cache_key_;
+    std::unordered_map<NamespacedKey, std::unordered_set<NamespacedKey, NamespacedKeyHash>, NamespacedKeyHash>
+        pending_children_by_parent_;
+    std::set<LeafKey> leaf_lru_;
+    std::unordered_set<int> independent_eviction_group_ids_;
+};
+
+using SharedBlockCachePtr = std::shared_ptr<SharedBlockCache>;
+
+}  // namespace rtp_llm
diff --git a/rtp_llm/cpp/cache/SingleConfigCreator.cc b/rtp_llm/cpp/cache/SingleConfigCreator.cc
deleted file mode 100644
index 1a67d34057..0000000000
--- a/rtp_llm/cpp/cache/SingleConfigCreator.cc
+++ /dev/null
@@ -1,71 +0,0 @@
-#include "rtp_llm/cpp/cache/SingleConfigCreator.h"
-
-#include "rtp_llm/cpp/cache/KVCacheSpec.h"
-#include "rtp_llm/cpp/cache/MemoryEvaluationHelper.h"
-#include "rtp_llm/cpp/utils/Logger.h"
-
-namespace rtp_llm {
-
-CacheConfig SingleConfigCreator::createSingleConfig(const ModelConfig&       model_config,
-                                                    const ParallelismConfig& parallelism_config,
-                                                    bool                     is_mtp) {
-    auto dtype = MemoryEvaluationHelper::getDataTypeForCache(model_config);
-
-    auto layer_num = model_config.num_layers;
-
-    std::vector<int> all_layer_ids(layer_num);
-    for (int i = 0; i < layer_num; ++i) {
-        all_layer_ids[i] = i;
-    }
-
-    CacheConfig config;
-    config.layer_num          = static_cast<uint32_t>(layer_num);
-    config.layer_all_num      = static_cast<uint32_t>(layer_num);
-    config.block_num          = 0;
-    config.seq_size_per_block = static_cast<uint32_t>(model_config.attn_config.tokens_per_block);
-
-    config.use_mla   = model_config.attn_config.use_mla;
-    config.dtype     = dtype;
-    config.is_sparse = model_config.attn_config.is_sparse;
-
-    KVCacheSpecPtr spec;
-    if (model_config.attn_config.use_mla && model_config.mla_ops_type != rtp_llm::MlaOpsType::MHA) {
-        spec = std::make_shared<MLAKVCacheSpec>(model_config.attn_config, parallelism_config);
-    } else {
-        spec = std::make_shared<MHAKVCacheSpec>(model_config.attn_config, parallelism_config);
-    }
-    spec->dtype = dtype;
-    config.cache_specs.push_back(spec);
-    config.group_types.push_back(CacheGroupType::FULL);
-
-    // Using spec interface for block size and scale
-    config.kv_block_stride_bytes = config.cache_specs[0]->block_size_bytes();
-    config.kv_block_size_bytes   = static_cast<size_t>(config.layer_num) * config.kv_block_stride_bytes;
-
-    // Scale handling - no need to check dtype as scale_block_size_bytes() returns 0 if no scale support
-    config.kv_scale_stride_bytes = config.cache_specs[0]->scale_block_size_bytes();
-    config.kv_scale_size_bytes   = static_cast<size_t>(config.layer_num) * config.kv_scale_stride_bytes;
-
-    if (config.is_sparse) {
-        auto indexer_dim             = model_config.attn_config.indexer_head_dim;
-        config.kv_scale_stride_bytes = (indexer_dim + indexer_dim / 128 * 4) * spec->seq_size_per_block;
-        config.kv_scale_size_bytes   = static_cast<size_t>(config.layer_num) * config.kv_scale_stride_bytes;
-    }
-
-    config.block_size_bytes = config.kv_block_size_bytes + config.kv_scale_size_bytes;
-    config.group_layer_num  = layer_num;  // only 1 group for SingleConfig
-
-    // Per-layer block stride (kv + scale).
-    const size_t per_layer_stride_bytes = config.kv_block_stride_bytes + config.kv_scale_stride_bytes;
-    config.layer_to_block_stride_bytes.assign(static_cast<size_t>(config.layer_all_num),
-                                              static_cast<int>(per_layer_stride_bytes));
-
-    // Global layer ids are the indices used by BlockPool::convertIndexToAddr (0..N-1 in a single-model case).
-    config.global_layer_ids.push_back(all_layer_ids);
-    config.layer_ids.push_back(all_layer_ids);
-    config.layer_to_group_id.assign(config.layer_num, 0);
-    config.layer_attn_types.assign(config.layer_num, CacheGroupType::FULL);
-    return config;
-}
-
-}  // namespace rtp_llm
\ No newline at end of file
diff --git a/rtp_llm/cpp/cache/Types.cc b/rtp_llm/cpp/cache/Types.cc
new file mode 100644
index 0000000000..f4a65b82d3
--- /dev/null
+++ b/rtp_llm/cpp/cache/Types.cc
@@ -0,0 +1,11 @@
+#include "rtp_llm/cpp/cache/Types.h"
+
+#include "rtp_llm/cpp/engine_base/stream/CompleteTokenIds.h"
+
+namespace rtp_llm {
+
+int MallocInfo::incrSeqLen() const {
+    return incr_seq_len_override >= 0 ? incr_seq_len_override : complete_token_ids->seqLength();
+}
+
+}  // namespace rtp_llm
diff --git a/rtp_llm/cpp/cache/Types.h b/rtp_llm/cpp/cache/Types.h
index 3a025e06bc..75908ddf05 100644
--- a/rtp_llm/cpp/cache/Types.h
+++ b/rtp_llm/cpp/cache/Types.h
@@ -1,17 +1,21 @@
 #pragma once
 
 #include <cstddef>
+#include <memory>
 #include <vector>
 #include <cstdint>
 
 #include "rtp_llm/cpp/cache/BlockInfo.h"
-#include "rtp_llm/cpp/cache/CacheGroupType.h"
+#include "rtp_llm/cpp/cache/spec/CacheGroupType.h"
+#include "rtp_llm/cpp/cache/CPSlotMapper.h"
 #include "rtp_llm/models_py/bindings/core/Types.h"
 #include "rtp_llm/cpp/cache/BatchKVCacheResource.h"
-#include "rtp_llm/cpp/engine_base/stream/CompleteTokenIds.h"
 
 namespace rtp_llm {
 
+class CompleteTokenIds;
+using CompleteTokenIdsPtr = std::shared_ptr<CompleteTokenIds>;
+
 typedef int32_t          GroupIdType;
 typedef std::vector<int> LayerIdsType;
 
@@ -48,14 +52,21 @@ struct KVPartitionBytes {
 };
 
 struct MallocInfo {
-    BatchKVCacheResourcePtr batch_kv_cache_resource;
-    CompleteTokenIdsPtr     complete_token_ids;
-    int64_t                 request_id          = 0;
-    bool                    verbose             = true;  // for failed log
-    bool                    reuse_cache         = true;
-    bool                    enable_device_cache = true;
-    // Sparse linear-block cleanup is only valid for incremental allocation.
+    BatchKVCacheResourcePtr       batch_kv_cache_resource;
+    CompleteTokenIdsPtr           complete_token_ids;
+    int64_t                       request_id          = 0;
+    bool                          verbose             = true;  // for failed log
+    bool                          reuse_cache         = true;
+    bool                          enable_device_cache = true;
+    // Sparse tail-group cleanup is only valid for incremental allocation.
+    // Prefill init keeps reused prefix slots intact because model-path kernels
+    // still read them by prefix_length.
     bool enable_remove_skipped_blocks = true;
+    // Override for incrMalloc's seqLength read; -1 = fall back to complete_token_ids->seqLength().
+    // Lets the state machine feed the publish-time value instead of racing with the async worker.
+    int incr_seq_len_override = -1;
+
+    int incrSeqLen() const;
 };
 
 struct MallocResult {
@@ -73,9 +84,9 @@ struct FreeInfo {
 };
 
 struct InsertInfo {
-    BatchKVCacheResourcePtr batch_kv_cache_resource;
-    CompleteTokenIdsPtr     complete_token_ids;
-    bool                    is_resident;
+    BatchKVCacheResourcePtr       batch_kv_cache_resource;
+    CompleteTokenIdsPtr           complete_token_ids;
+    bool                          is_resident;
 };
 
-}  // namespace rtp_llm
\ No newline at end of file
+}  // namespace rtp_llm
diff --git a/rtp_llm/cpp/cache/allocator/HybridKVCacheAllocator.cc b/rtp_llm/cpp/cache/allocator/HybridKVCacheAllocator.cc
new file mode 100644
index 0000000000..e5e76ee768
--- /dev/null
+++ b/rtp_llm/cpp/cache/allocator/HybridKVCacheAllocator.cc
@@ -0,0 +1,755 @@
+#include "rtp_llm/cpp/cache/allocator/HybridKVCacheAllocator.h"
+
+#include <algorithm>
+#include <unordered_map>
+#include <unordered_set>
+
+#include "rtp_llm/cpp/cache/BlockPoolConfigHelper.h"
+#include "rtp_llm/cpp/cache/CPSlotMapper.h"
+#include "rtp_llm/cpp/engine_base/stream/CompleteTokenIds.h"
+#include "rtp_llm/cpp/utils/Logger.h"
+#include "rtp_llm/cpp/utils/TimeUtil.h"
+
+namespace rtp_llm {
+namespace {
+
+// CP shard helpers: when mapper is null/passthrough, all helpers no-op.
+inline int cpEffectiveSeqLen(const std::shared_ptr<CPSlotMapper>& mapper, int seq_len) {
+    return (mapper && mapper->isSharded()) ? mapper->effectiveSeqLenForAlloc(seq_len) : seq_len;
+}
+
+inline CacheKeysType cpEffectiveCacheKeys(const std::shared_ptr<CPSlotMapper>& mapper, const CacheKeysType& full) {
+    if (!mapper || !mapper->isSharded()) {
+        return full;
+    }
+    CacheKeysType local;
+    const int     cp_size = mapper->cpSize();
+    const int     start   = cp_size - 1;
+    for (int i = start; i < static_cast<int>(full.size()); i += cp_size) {
+        local.push_back(full[i]);
+    }
+    return local;
+}
+
+inline int cpVirtualBlockSize(const std::shared_ptr<CPSlotMapper>& mapper, int block_size) {
+    return (mapper && mapper->isSharded()) ? mapper->virtualBlockSize() : block_size;
+}
+
+inline bool containsGroupId(const std::vector<int>& group_ids, int gid) {
+    return std::find(group_ids.begin(), group_ids.end(), gid) != group_ids.end();
+}
+
+inline bool cpShardThisGroup(const std::shared_ptr<CPSlotMapper>& mapper, const KVCacheGroupPtr& group) {
+    return mapper && mapper->isSharded() && group && group->isCpShardable();
+}
+
+inline int cpEffectiveSeqLenForGroup(const std::shared_ptr<CPSlotMapper>& mapper,
+                                     const KVCacheGroupPtr&               group,
+                                     int                                  seq_len) {
+    return cpShardThisGroup(mapper, group) ? mapper->effectiveSeqLenForAlloc(seq_len) : seq_len;
+}
+
+inline int cpVirtualBlockSizeForGroup(const std::shared_ptr<CPSlotMapper>& mapper,
+                                      const KVCacheGroupPtr&               group,
+                                      int                                  block_size) {
+    return cpShardThisGroup(mapper, group) ? mapper->virtualBlockSize() : block_size;
+}
+
+inline size_t groupSeqSize(const CacheConfig& config, int gid, size_t fallback) {
+    return (gid >= 0 && static_cast<size_t>(gid) < config.group_seq_size_per_block.size()
+            && config.group_seq_size_per_block[static_cast<size_t>(gid)] > 0) ?
+               config.group_seq_size_per_block[static_cast<size_t>(gid)] :
+               fallback;
+}
+
+BlockIndicesType validBlocksAfter(const BlockIndicesType& blocks, size_t begin) {
+    BlockIndicesType valid;
+    if (begin >= blocks.size()) {
+        return valid;
+    }
+    valid.reserve(blocks.size() - begin);
+    for (size_t i = begin; i < blocks.size(); ++i) {
+        if (!isNullBlockIdx(blocks[i])) {
+            valid.push_back(blocks[i]);
+        }
+    }
+    return valid;
+}
+
+}  // namespace
+
+bool HybridKVCacheAllocator::skipReuseCacheGroup(int gid) const {
+    return gid >= 0 && static_cast<size_t>(gid) < kv_cache_groups_.size()
+           && kv_cache_groups_[static_cast<size_t>(gid)]->reusePolicy() == CacheReusePolicy::NON_REUSABLE;
+}
+
+std::vector<int> HybridKVCacheAllocator::independentEvictionGroupIds() const {
+    std::vector<int> group_ids;
+    for (size_t gid = 0; gid < kv_cache_groups_.size(); ++gid) {
+        if (kv_cache_groups_[gid]->evictPolicy() == CacheEvictPolicy::INDEPENDENT) {
+            group_ids.push_back(static_cast<int>(gid));
+        }
+    }
+    return group_ids;
+}
+
+bool HybridKVCacheAllocator::cpCompactSwaGroup(int gid, const std::shared_ptr<CPSlotMapper>& mapper) const {
+    if (!mapper || !mapper->isSharded() || gid < 0 || static_cast<size_t>(gid) >= kv_cache_groups_.size()
+        || !kv_cache_groups_[static_cast<size_t>(gid)]->cpCompactTailBlocks()) {
+        return false;
+    }
+    const auto row_tokens = groupSeqSize(config_, gid, seqSizePerBlock());
+    return row_tokens == static_cast<size_t>(mapper->virtualBlockSize());
+}
+
+HybridKVCacheAllocator::HybridKVCacheAllocator(const CacheConfig&                 config,
+                                               AllocationType                     allocation_type,
+                                               const kmonitor::MetricsReporterPtr metrics_reporter,
+                                               int64_t                            reserve_block_ratio):
+    KVCacheAllocator(config, allocation_type, metrics_reporter, reserve_block_ratio) {}
+
+int HybridKVCacheAllocator::reuseCache(const CacheKeysType&                 cache_keys,
+                                       BatchKVCacheResource&                kv_resource,
+                                       const std::shared_ptr<CPSlotMapper>& cp_mapper) {
+    // Under cp shard, FULL groups index block_ids by cp-virtual-block units
+    // (one entry covers cp_size physical blocks). LINEAR/SWA groups index by
+    // raw block_size logical blocks. So when populating tail blocks for
+    // LINEAR/SWA we need to scale the array length and matched-block position
+    // back to the logical-block coordinate system.
+    const int                     cp_scale = (cp_mapper && cp_mapper->isSharded()) ? cp_mapper->cpSize() : 1;
+    int                           min_full_reuse_blocks = static_cast<int>(cache_keys.size());
+    std::vector<BlockIndicesType> full_matched_blocks(kv_cache_groups_.size());
+
+    for (int gid : full_group_ids_) {
+        auto match_result     = kv_cache_groups_[static_cast<size_t>(gid)]->match(cache_keys);
+        min_full_reuse_blocks = std::min(min_full_reuse_blocks, static_cast<int>(match_result.reuse_blocks));
+        full_matched_blocks[static_cast<size_t>(gid)] = std::move(match_result.block_indices);
+    }
+
+    int                           pos = min_full_reuse_blocks - 1;
+    std::vector<BlockIdxType>     linear_tail_blocks(linear_group_ids_.size(), NULL_BLOCK_IDX);
+    std::vector<BlockIndicesType> swa_tail_blocks(swa_group_ids_.size());
+    const bool                    has_tail_groups = !linear_group_ids_.empty() || !swa_group_ids_.empty();
+    for (; pos >= 0 && has_tail_groups; --pos) {
+        bool                          all_tail_groups_matched = true;
+        std::vector<BlockIdxType>     candidate_linear_tail_blocks(linear_group_ids_.size(), NULL_BLOCK_IDX);
+        std::vector<BlockIndicesType> candidate_swa_tail_blocks(swa_group_ids_.size());
+        for (size_t i = 0; i < linear_group_ids_.size(); ++i) {
+            const int gid      = linear_group_ids_[i];
+            auto result = kv_cache_groups_[static_cast<size_t>(gid)]->matchSingleKey(cache_keys[static_cast<size_t>(pos)]);
+            if (result.block_indices.empty()) {
+                all_tail_groups_matched = false;
+                break;
+            }
+            candidate_linear_tail_blocks[i] = result.block_indices[0];
+        }
+        if (!all_tail_groups_matched) {
+            continue;
+        }
+        for (size_t i = 0; i < swa_group_ids_.size(); ++i) {
+            const int gid       = swa_group_ids_[i];
+            if (skipReuseCacheGroup(gid)) {
+                continue;
+            }
+            auto result = kv_cache_groups_[static_cast<size_t>(gid)]->matchSingleKey(cache_keys[static_cast<size_t>(pos)]);
+            if (result.block_indices.empty()) {
+                all_tail_groups_matched = false;
+                break;
+            }
+            candidate_swa_tail_blocks[i].push_back(result.block_indices[0]);
+        }
+        if (all_tail_groups_matched) {
+            linear_tail_blocks = std::move(candidate_linear_tail_blocks);
+            swa_tail_blocks    = std::move(candidate_swa_tail_blocks);
+            break;
+        }
+    }
+
+    const int reuse_blocks_len = has_tail_groups ? std::max(pos + 1, 0) : std::max(min_full_reuse_blocks, 0);
+    if (reuse_blocks_len <= 0) {
+        return 0;
+    }
+
+    for (int gid : full_group_ids_) {
+        BlockIndicesType full_blocks = full_matched_blocks[static_cast<size_t>(gid)];
+        if (static_cast<int>(full_blocks.size()) > reuse_blocks_len) {
+            full_blocks.resize(static_cast<size_t>(reuse_blocks_len));
+        }
+        kv_resource.mutableBlockIds(0, gid).assign(std::move(full_blocks));
+    }
+
+    // LINEAR/SWA arrays are sized in logical-block units (cp_size× larger
+    // than the FULL groups' cp-virtual-block units). The matched tail block
+    // corresponds to the LAST logical block in the canonical (last-rank)
+    // namespace, so its index is `(reuse_blocks_len * cp_size) - 1` in
+    // logical units, NOT `reuse_blocks_len - 1`.
+    const int logical_reuse_len = reuse_blocks_len * cp_scale;
+    for (size_t i = 0; i < linear_group_ids_.size(); ++i) {
+        const int gid = linear_group_ids_[i];
+        kv_resource.mutableBlockIds(0, gid).assign(
+            BlockIndicesType(static_cast<size_t>(logical_reuse_len), NULL_BLOCK_IDX));
+        kv_resource.mutableBlockIds(0, gid).setAt(static_cast<size_t>(logical_reuse_len - 1), linear_tail_blocks[i]);
+    }
+    for (size_t i = 0; i < swa_group_ids_.size(); ++i) {
+        const int gid = swa_group_ids_[i];
+        const int group_reuse_len = cpCompactSwaGroup(gid, cp_mapper) ? reuse_blocks_len : logical_reuse_len;
+        kv_resource.mutableBlockIds(0, gid).assign(
+            BlockIndicesType(static_cast<size_t>(group_reuse_len), NULL_BLOCK_IDX));
+        if (skipReuseCacheGroup(gid)) {
+            continue;
+        }
+        const size_t tail_begin =
+            static_cast<size_t>(std::max(group_reuse_len - static_cast<int>(swa_tail_blocks[i].size()), 0));
+        for (size_t j = 0; j < swa_tail_blocks[i].size(); ++j) {
+            kv_resource.mutableBlockIds(0, gid).setAt(tail_begin + j, swa_tail_blocks[i][j]);
+        }
+    }
+    return reuse_blocks_len;
+}
+
+MallocResult HybridKVCacheAllocator::initMallocForCommonLen(const MallocInfo& malloc_info) {
+    auto&     kv_resource = malloc_info.batch_kv_cache_resource;
+    const int batch_size  = kv_resource->batchSize();
+    RTP_LLM_CHECK_WITH_INFO(batch_size == 1, "currently batch size should be 1 in hybrid attention but %d", batch_size);
+
+    const int   seq_len        = malloc_info.complete_token_ids->seqLength();
+    const int   common_seq_len = std::min(malloc_info.complete_token_ids->commonSeqLength(), seq_len);
+    const auto& cp_mapper      = cp_slot_mapper_;
+    // reuse_unit_tokens is computed against the canonical (paged FULL) group's
+    // block_size: cache_keys reuse only happens for paged groups so virtual block
+    // size = canonical block_size * cp_size; non-paged groups don't enter reuse.
+    const KVCacheGroupPtr reuse_group =
+        full_group_ids_.empty() ? KVCacheGroupPtr{} : kv_cache_groups_[static_cast<size_t>(full_group_ids_.front())];
+    const int reuse_unit_tokens = cpVirtualBlockSizeForGroup(cp_mapper, reuse_group, seqSizePerBlock());
+
+    const auto&                   cache_keys         = kv_resource->cacheKeys(0);
+    int64_t                       match_cost_time_us = 0;
+    const size_t                  reserve_blocks     = reserveBlockNum();
+    int                           reuse_blocks       = 0;
+    std::vector<BlockIndicesType> referenced_blocks(static_cast<size_t>(kv_resource->groupNums()));
+
+    if (malloc_info.enable_device_cache) {
+        // CP-sharded: subsample to last-rank canonical key namespace before matching.
+        CacheKeysType cp_keys = cpEffectiveCacheKeys(cp_mapper, cache_keys);
+        // Off mode drops the last key to skip the partial trailing block. Under
+        // CP sharding cpEffectiveCacheKeys already excludes the partial block
+        // (last-rank stride lands inside completed full blocks only), so the
+        // extra drop would discard a valid full-block key — costing the SWA
+        // tail-loop its only matchable key (full_keys[cp_size-1 + (n-1)*cp_size]
+        // is exactly what the non-sharded SWA group caches).
+        const bool    cp_active = cp_mapper && cp_mapper->isSharded();
+        CacheKeysType match_keys(cp_keys.begin(),
+                                 cp_active ? cp_keys.end() : (cp_keys.empty() ? cp_keys.end() : cp_keys.end() - 1));
+        auto          begin_us = currentTimeUs();
+        reuse_blocks           = reuseCache(match_keys, *kv_resource, cp_mapper);
+        match_cost_time_us     = currentTimeUs() - begin_us;
+
+        for (int gid = 0; gid < kv_resource->groupNums(); ++gid) {
+            const auto&      blocks = kv_resource->blocks(0, gid);
+            BlockIndicesType valid;
+            valid.reserve(blocks.size());
+            for (auto b : blocks) {
+                if (!isNullBlockIdx(b)) {
+                    valid.push_back(b);
+                }
+            }
+            if (!valid.empty()) {
+                referenceBlocksInGroup(gid, valid);
+                referenced_blocks[static_cast<size_t>(gid)] = std::move(valid);
+            }
+        }
+        kv_resource->cacheResource(0).setDeviceReuseBlockNum(reuse_blocks);
+    }
+
+    if (reserve_blocks > 0 && !hasAvailableBlocksForReserve(malloc_info, reserve_blocks)) {
+        rollbackInitMalloc(*kv_resource, referenced_blocks, {});
+        return {false, 0};
+    }
+
+    std::vector<size_t> original_sizes(static_cast<size_t>(kv_resource->groupNums()));
+    for (int gid = 0; gid < kv_resource->groupNums(); ++gid) {
+        original_sizes[static_cast<size_t>(gid)] = kv_resource->blocksNum(0, gid);
+    }
+    for (int gid = 0; gid < kv_resource->groupNums(); ++gid) {
+        auto&      block_ids_0   = kv_resource->mutableBlockIds(0, gid);
+        const int  group_seq_len =
+            cpEffectiveSeqLenForGroup(cp_mapper, kv_cache_groups_[static_cast<size_t>(gid)], common_seq_len);
+        if (!kv_cache_groups_[static_cast<size_t>(gid)]->malloc(
+                block_ids_0, group_seq_len, malloc_info.reuse_cache, 0)) {
+            rollbackInitMalloc(*kv_resource, referenced_blocks, original_sizes);
+            return {false, 0};
+        }
+    }
+
+    for (int b = 1; b < batch_size; ++b) {
+        for (int gid = 0; gid < kv_resource->groupNums(); ++gid) {
+            kv_cache_groups_[static_cast<size_t>(gid)]->reference(kv_resource->mutableBlockIds(b, gid),
+                                                                  kv_resource->blocks(0, gid));
+        }
+    }
+    return {true, reuse_blocks * reuse_unit_tokens, match_cost_time_us};
+}
+
+MallocResult HybridKVCacheAllocator::incrMalloc(const MallocInfo& malloc_info) {
+    auto&       kv_resource  = malloc_info.batch_kv_cache_resource;
+    const auto& cp_mapper    = cp_slot_mapper_;
+    const int   batch_size   = kv_resource->batchSize();
+    const int   raw_seq_len  = malloc_info.incrSeqLen();
+    const int   reserve_step = malloc_info.complete_token_ids->getReserveStep();
+
+    std::vector<std::vector<BlockIndicesType>> original_blocks(static_cast<size_t>(batch_size));
+    for (int b = 0; b < batch_size; ++b) {
+        original_blocks[static_cast<size_t>(b)].resize(static_cast<size_t>(kv_resource->groupNums()));
+        for (int gid = 0; gid < kv_resource->groupNums(); ++gid) {
+            original_blocks[static_cast<size_t>(b)][static_cast<size_t>(gid)] = kv_resource->blocks(b, gid);
+        }
+    }
+
+    bool all_success  = true;
+    int  failed_batch = -1;
+    int  failed_group = -1;
+    for (int b = 0; b < batch_size; ++b) {
+        for (int gid = 0; gid < kv_resource->groupNums(); ++gid) {
+            auto&      block_ids     = kv_resource->mutableBlockIds(b, gid);
+            const int group_seq_len =
+                cpEffectiveSeqLenForGroup(cp_mapper, kv_cache_groups_[static_cast<size_t>(gid)], raw_seq_len);
+            if (!kv_cache_groups_[static_cast<size_t>(gid)]->malloc(
+                    block_ids, group_seq_len, malloc_info.reuse_cache, reserve_step)) {
+                all_success  = false;
+                failed_batch = b;
+                failed_group = gid;
+                break;
+            }
+        }
+        if (!all_success) {
+            break;
+        }
+    }
+
+    if (all_success) {
+        if (!malloc_info.enable_remove_skipped_blocks) {
+            return {true, 0};
+        }
+        for (int b = 0; b < batch_size; ++b) {
+            for (int gid = 0; gid < kv_resource->groupNums(); ++gid) {
+                kv_cache_groups_[static_cast<size_t>(gid)]->removeSkippedBlocks(
+                    kv_resource->mutableBlockIds(b, gid), malloc_info.reuse_cache, reserve_step);
+            }
+        }
+        return {true, 0};
+    }
+
+    for (int b = 0; b <= failed_batch && b < batch_size; ++b) {
+        for (int gid = 0; gid < kv_resource->groupNums(); ++gid) {
+            auto&       block_ids = kv_resource->mutableBlockIds(b, gid);
+            const auto& original  = original_blocks[static_cast<size_t>(b)][static_cast<size_t>(gid)];
+
+            std::unordered_set<BlockIdxType> original_valid_blocks;
+            original_valid_blocks.reserve(original.size());
+            for (auto block : original) {
+                if (!isNullBlockIdx(block)) {
+                    original_valid_blocks.insert(block);
+                }
+            }
+
+            BlockIndicesType blocks_to_free;
+            for (auto block : block_ids.blocks()) {
+                if (!isNullBlockIdx(block) && original_valid_blocks.find(block) == original_valid_blocks.end()) {
+                    blocks_to_free.push_back(block);
+                }
+            }
+            if (!blocks_to_free.empty()) {
+                freeBlocksInGroup(gid, blocks_to_free);
+            }
+            block_ids.assign(original);
+        }
+    }
+    RTP_LLM_LOG_WARNING("Hybrid incrMalloc failed at batch=%d group=%d", failed_batch, failed_group);
+    return {false, 0};
+}
+
+void HybridKVCacheAllocator::free(const FreeInfo& free_info) {
+    auto& kv_cache_resource = free_info.batch_kv_cache_resource;
+    if (kv_cache_resource->curBlocksNum() == 0) {
+        return;
+    }
+    for (int batch_id = 0; batch_id < kv_cache_resource->batchSize(); ++batch_id) {
+        for (int gid = 0; gid < kv_cache_resource->groupNums(); ++gid) {
+            kv_cache_groups_[static_cast<size_t>(gid)]->free(kv_cache_resource->blocks(batch_id, gid));
+        }
+    }
+    kv_cache_resource->clearBlocks();
+}
+
+void HybridKVCacheAllocator::insertIntoCache(const InsertInfo& insert_info) {
+    auto& kv_cache_resource = insert_info.batch_kv_cache_resource;
+    RTP_LLM_CHECK(kv_cache_resource != nullptr);
+    if (!shared_block_cache_) {
+        return;
+    }
+
+    const auto& cp_mapper  = cp_slot_mapper_;
+    const bool  cp_active  = cp_mapper && cp_mapper->isSharded();
+    const int   group_nums = kv_cache_resource->groupNums();
+    const int   batch_size = kv_cache_resource->batchSize();
+
+    for (int batch_id = 0; batch_id < batch_size; ++batch_id) {
+        kv_cache_resource->cacheResource(batch_id).ensureLinearBlockDependencies();
+        const auto& full_keys = kv_cache_resource->cacheKeys(batch_id);
+        if (full_keys.empty()) {
+            continue;
+        }
+        const auto& full_dependencies = kv_cache_resource->cacheResource(batch_id).blockDependencies();
+
+        if (!cp_active) {
+            // Preserve the legacy non-CP GPU reuse surface: aggregate all groups
+            // under one key. The prefix tree only receives extra dependency
+            // metadata here.
+            const size_t max_keys = full_keys.size();
+            for (size_t pos = max_keys; pos > 0; --pos) {
+                const size_t              i = pos - 1;
+                std::vector<BlockIdxType> group_slots(static_cast<size_t>(group_nums), NULL_BLOCK_IDX);
+                bool                      has_valid = false;
+                for (int gid = 0; gid < group_nums; ++gid) {
+                    if (skipReuseCacheGroup(gid)) {
+                        continue;
+                    }
+                    const auto& blocks = kv_cache_resource->blocks(batch_id, gid);
+                    if (i >= blocks.size()) {
+                        continue;
+                    }
+                    if (!isNullBlockIdx(blocks[i])) {
+                        group_slots[static_cast<size_t>(gid)] = blocks[i];
+                        has_valid                             = true;
+                    }
+                }
+                if (has_valid) {
+                    const auto dependency =
+                        i < full_dependencies.size() ? full_dependencies[i] :
+                                                       BlockDependency{false, 0, static_cast<uint32_t>(i)};
+                    shared_block_cache_->put(full_keys[i],
+                                             group_slots,
+                                             insert_info.is_resident,
+                                             SharedBlockCache::kGpuLogicalNamespace,
+                                             dependency);
+                }
+            }
+            continue;
+        }
+
+        // Per-group key namespace, per-(key, group) put. SharedBlockCache::put
+        // merges multiple puts on the same key into a single item with each group's slot
+        // populated independently (NULL_BLOCK_IDX entries are skipped by the merge path).
+        //
+        // CP per-group key namespace: paged FULL groups use cp-subsampled (last-rank) keys
+        // to align 1:1 with rank-local blocks; non-paged groups (SWA / LINEAR) keep the
+        // full key sequence so their tail blocks (real entries at positions >= length-2)
+        // get inserted alongside the keys that the reuseCache tail-loop later queries.
+        CacheKeysType cp_keys   = cpEffectiveCacheKeys(cp_mapper, full_keys);
+        BlockDependenciesType cp_dependencies;
+        cp_dependencies.reserve(cp_keys.size());
+        for (size_t i = 0; i < cp_keys.size(); ++i) {
+            BlockDependency dependency;
+            dependency.ordinal = static_cast<uint32_t>(i);
+            if (i > 0) {
+                dependency.has_parent = true;
+                dependency.parent_key = cp_keys[i - 1];
+            }
+            cp_dependencies.push_back(dependency);
+        }
+        auto          token_ids = insert_info.complete_token_ids->completeTokenIdsVec(batch_id);
+        if (token_ids.size() <= 1) {
+            continue;
+        }
+        const size_t token_len = token_ids.size() - 1;
+
+        for (int gid = 0; gid < group_nums; ++gid) {
+            if (skipReuseCacheGroup(gid)) {
+                continue;
+            }
+            const int  raw_group_seq = kv_cache_groups_[static_cast<size_t>(gid)]->seqSizePerBlock();
+            const bool gp_sharded = cpShardThisGroup(cp_mapper, kv_cache_groups_[static_cast<size_t>(gid)]);
+            const bool           compact_swa   = cpCompactSwaGroup(gid, cp_mapper);
+            const bool           use_cp_keys   = cp_active && (gp_sharded || compact_swa);
+            const CacheKeysType& src_keys      = use_cp_keys ? cp_keys : full_keys;
+            const auto&          dependencies  = use_cp_keys ? cp_dependencies : full_dependencies;
+            const auto           namespace_id  = use_cp_keys ? SharedBlockCache::kGpuCpCanonicalNamespace :
+                                                               SharedBlockCache::kGpuLogicalNamespace;
+            if (src_keys.empty()) {
+                continue;
+            }
+            const int group_seq_size =
+                cpVirtualBlockSizeForGroup(cp_mapper, kv_cache_groups_[static_cast<size_t>(gid)], raw_group_seq);
+            const size_t full_blocks_num = token_len / static_cast<size_t>(group_seq_size);
+            const size_t n               = std::min(src_keys.size(), full_blocks_num);
+            const auto&  blocks          = kv_cache_resource->blocks(batch_id, gid);
+            const size_t loop_end        = std::min(n, blocks.size());
+
+            // Reverse iterate so prefix-base keys land at MRU end (matches non-CP path).
+            for (size_t pos = loop_end; pos > 0; --pos) {
+                const size_t i = pos - 1;
+                if (isNullBlockIdx(blocks[i])) {
+                    continue;
+                }
+                std::vector<BlockIdxType> group_slots(static_cast<size_t>(group_nums), NULL_BLOCK_IDX);
+                std::vector<bool>         matchable_slots(static_cast<size_t>(group_nums), true);
+                group_slots[static_cast<size_t>(gid)] = blocks[i];
+                const auto dependency =
+                    i < dependencies.size() ? dependencies[i] : BlockDependency{false, 0, static_cast<uint32_t>(i)};
+                shared_block_cache_->put(
+                    src_keys[i], group_slots, insert_info.is_resident, namespace_id, dependency, matchable_slots);
+            }
+        }
+    }
+}
+
+std::shared_ptr<KVCacheResource> HybridKVCacheAllocator::incrKVCacheRef(const KVCacheResource& kvcache_resource,
+                                                                        const CacheKeysType&   cache_keys,
+                                                                        bool                   is_connector) {
+    if (cache_keys.empty() || kvcache_resource.groupNums() <= 0) {
+        return nullptr;
+    }
+
+    std::unordered_map<CacheKeyType, size_t> key_to_pos;
+    const auto&                              resource_keys = kvcache_resource.cacheKeys();
+    for (size_t i = 0; i < resource_keys.size(); ++i) {
+        key_to_pos.emplace(resource_keys[i], i);
+    }
+
+    auto selected_resource_ptr = new KVCacheResource(kvcache_resource);
+    auto deleter               = [self = shared_from_this(), is_connector](KVCacheResource* resource) {
+        self->decrKVCacheRef(*resource, is_connector);
+        delete resource;
+    };
+    std::shared_ptr<KVCacheResource> selected_resource(selected_resource_ptr, deleter);
+    selected_resource->initGroups(kvcache_resource.groupNums(),
+                                  static_cast<int>(config_.layer_all_num),
+                                  config_.layerGroupIdsSnapshot(),
+                                  config_.kernelBlocksPerKvBlock(),
+                                  config_.groupTypesSnapshot());
+
+    CacheKeysType                 selected_keys;
+    BlockDependenciesType         selected_dependencies;
+    std::vector<BlockIndicesType> selected_blocks(static_cast<size_t>(kvcache_resource.groupNums()));
+    const auto&                   source_dependencies = kvcache_resource.blockDependencies();
+
+    selected_dependencies.reserve(cache_keys.size());
+    selected_keys.reserve(cache_keys.size());
+    for (auto key : cache_keys) {
+        auto it = key_to_pos.find(key);
+        if (it == key_to_pos.end()) {
+            continue;
+        }
+        const size_t pos = it->second;
+        bool any_valid_block = false;
+        std::vector<BlockIdxType> blocks_for_key(static_cast<size_t>(kvcache_resource.groupNums()), NULL_BLOCK_IDX);
+        for (int gid = 0; gid < kvcache_resource.groupNums(); ++gid) {
+            const auto& src_blocks = kvcache_resource.blocks(gid);
+            const auto  block      = pos < src_blocks.size() ? src_blocks[pos] : NULL_BLOCK_IDX;
+            blocks_for_key[static_cast<size_t>(gid)] = block;
+            any_valid_block = any_valid_block || (!isNullBlockIdx(block) && block > 0);
+        }
+        const bool preserve_connector_tail = is_connector && !kvcache_resource.lastBlockAligned()
+                                             && pos + 1 == resource_keys.size() && !selected_keys.empty();
+        if (!any_valid_block && !preserve_connector_tail) {
+            continue;
+        }
+        selected_keys.push_back(key);
+        selected_dependencies.push_back(
+            pos < source_dependencies.size() ?
+                source_dependencies[pos] :
+                BlockDependency{false, 0, static_cast<uint32_t>(selected_dependencies.size())});
+        for (int gid = 0; gid < kvcache_resource.groupNums(); ++gid) {
+            selected_blocks[static_cast<size_t>(gid)].push_back(blocks_for_key[static_cast<size_t>(gid)]);
+        }
+    }
+
+    if (selected_keys.empty()) {
+        return nullptr;
+    }
+
+    selected_resource->cacheKeys() = std::move(selected_keys);
+    selected_resource->setBlockDependencies(std::move(selected_dependencies));
+    for (int gid = 0; gid < kvcache_resource.groupNums(); ++gid) {
+        BlockIndicesType valid;
+        for (auto b : selected_blocks[static_cast<size_t>(gid)]) {
+            if (!isNullBlockIdx(b) && b > 0) {
+                valid.push_back(b);
+            }
+        }
+        if (!valid.empty()) {
+            referenceBlocksInGroup(gid, valid, is_connector);
+        }
+        selected_resource->mutableBlockIds(gid).assign(std::move(selected_blocks[static_cast<size_t>(gid)]));
+    }
+    return selected_resource;
+}
+
+void HybridKVCacheAllocator::decrKVCacheRef(const KVCacheResource& kvcache_resource, bool is_connector) {
+    for (int gid = 0; gid < kvcache_resource.groupNums(); ++gid) {
+        BlockIndicesType valid;
+        for (auto b : kvcache_resource.blocks(gid)) {
+            if (!isNullBlockIdx(b) && b > 0) {
+                valid.push_back(b);
+            }
+        }
+        if (!valid.empty()) {
+            freeBlocksInGroup(gid, valid, is_connector);
+        }
+    }
+}
+
+bool HybridKVCacheAllocator::updateKVBlock(const BatchKVCacheResourcePtr& batch_kv_cache_resource,
+                                           const std::vector<int>&        block_src_batch,
+                                           bool                           copy_last_block,
+                                           std::vector<BlockIdPair>&      block_update_mapping) {
+    (void)batch_kv_cache_resource;
+    (void)block_src_batch;
+    (void)copy_last_block;
+    (void)block_update_mapping;
+    RTP_LLM_FAIL("HybridKVCacheAllocator::updateKVBlock is not supported");
+}
+
+int HybridKVCacheAllocator::seqSizePerBlock() const {
+    return static_cast<int>(config_.seq_size_per_block);
+}
+
+bool HybridKVCacheAllocator::hasAvailableBlocksForReserve(const MallocInfo& malloc_info, size_t reserve_blocks) const {
+    const int need_blocks = getNeedBlocks(malloc_info);
+    if (need_blocks <= 0) {
+        return true;
+    }
+    const size_t available_blocks = availableBlocksNum();
+    const bool   accepted         = available_blocks >= static_cast<size_t>(need_blocks) + reserve_blocks;
+    if (!accepted && malloc_info.verbose) {
+        RTP_LLM_LOG_INFO("Hybrid initMalloc rejected by reserve blocks: request_id=%ld "
+                         "need_blocks=%d available_blocks=%zu reserve_blocks=%zu",
+                         malloc_info.request_id,
+                         need_blocks,
+                         available_blocks,
+                         reserve_blocks);
+    }
+    return accepted;
+}
+
+void HybridKVCacheAllocator::rollbackBlockIdsToSize(int gid, BlockIds& block_ids, size_t original_size) {
+    if (block_ids.blocksNum() <= original_size) {
+        return;
+    }
+    const auto blocks_to_free = validBlocksAfter(block_ids.blocks(), original_size);
+    block_ids.resize(original_size);
+    if (!blocks_to_free.empty()) {
+        freeBlocksInGroup(gid, blocks_to_free);
+    }
+}
+
+void HybridKVCacheAllocator::rollbackInitMalloc(BatchKVCacheResource&                kv_resource,
+                                                const std::vector<BlockIndicesType>& referenced_blocks,
+                                                const std::vector<size_t>&           original_sizes) {
+    for (int gid = 0; gid < kv_resource.groupNums(); ++gid) {
+        auto& block_ids = kv_resource.mutableBlockIds(0, gid);
+        if (!original_sizes.empty() && static_cast<size_t>(gid) < original_sizes.size()
+            && block_ids.blocksNum() > original_sizes[static_cast<size_t>(gid)]) {
+            rollbackBlockIdsToSize(gid, block_ids, original_sizes[static_cast<size_t>(gid)]);
+        }
+        if (static_cast<size_t>(gid) < referenced_blocks.size()
+            && !referenced_blocks[static_cast<size_t>(gid)].empty()) {
+            freeBlocksInGroup(gid, referenced_blocks[static_cast<size_t>(gid)]);
+        }
+        block_ids.resize(0);
+    }
+    kv_resource.cacheResource(0).setDeviceReuseBlockNum(0);
+}
+
+void HybridKVCacheAllocator::rollbackIncrMalloc(BatchKVCacheResource&                   kv_resource,
+                                                const std::vector<std::vector<size_t>>& original_sizes,
+                                                int                                     failed_batch) {
+    const int last_touched_batch = std::min(failed_batch, kv_resource.batchSize() - 1);
+    for (int b = 0; b <= last_touched_batch; ++b) {
+        for (int gid = 0; gid < kv_resource.groupNums(); ++gid) {
+            auto&        block_ids    = kv_resource.mutableBlockIds(b, gid);
+            const size_t original_num = original_sizes[static_cast<size_t>(b)][static_cast<size_t>(gid)];
+            rollbackBlockIdsToSize(gid, block_ids, original_num);
+        }
+    }
+}
+
+int HybridKVCacheAllocator::getNeedBlocks(const MallocInfo& malloc_info) const {
+    if (!malloc_info.batch_kv_cache_resource || !malloc_info.complete_token_ids) {
+        return 0;
+    }
+    const auto& cp_mapper          = cp_slot_mapper_;
+    const int   batch_size         = malloc_info.batch_kv_cache_resource->batchSize();
+    const int   total_seq_len      = malloc_info.complete_token_ids->totalSeqLength();
+    const int   raw_common_seq_len = std::min(malloc_info.complete_token_ids->commonSeqLength(), total_seq_len);
+    const int   raw_seq_len        = malloc_info.complete_token_ids->seqLength();
+    const int   reserve_step       = malloc_info.complete_token_ids->getReserveStep();
+    const bool  reuse_enabled      = malloc_info.reuse_cache;
+    const int   reuse_blocks_len   = reuse_enabled ? malloc_info.batch_kv_cache_resource->curBlocksNum() : 0;
+
+    int common_blocks_total = 0;
+    int extra_blocks_total  = 0;
+    for (int gid = 0; gid < static_cast<int>(kv_cache_groups_.size()); ++gid) {
+        const auto group            = kv_cache_groups_[static_cast<size_t>(gid)];
+        const int  group_common_seq = cpEffectiveSeqLenForGroup(cp_mapper, group, raw_common_seq_len);
+        const int  group_seq_len    = cpEffectiveSeqLenForGroup(cp_mapper, group, raw_seq_len);
+        const auto need             = kv_cache_groups_[static_cast<size_t>(gid)]->getNeedBlocks(
+            group_common_seq, group_seq_len, reserve_step, reuse_blocks_len, reuse_enabled);
+        common_blocks_total += need.common_blocks;
+        extra_blocks_total += need.extra_blocks;
+    }
+    return common_blocks_total + batch_size * extra_blocks_total;
+}
+
+void HybridKVCacheAllocator::checkCPShardedMallocResult(const MallocInfo& malloc_info) const {
+    if (!cp_slot_mapper_ || !cp_slot_mapper_->isSharded()) {
+        return;
+    }
+
+    const auto& kv_resource  = malloc_info.batch_kv_cache_resource;
+    const int   seq_len      = malloc_info.incrSeqLen();
+    const int   reserve_step = malloc_info.complete_token_ids->getReserveStep();
+
+    for (int batch_id = 0; batch_id < kv_resource->batchSize(); ++batch_id) {
+        for (int gid = 0; gid < kv_resource->groupNums(); ++gid) {
+            const auto group = kv_cache_groups_[static_cast<size_t>(gid)];
+            if (!cpShardThisGroup(cp_slot_mapper_, group)) {
+                continue;
+            }
+            const int effective_seq_len = cpEffectiveSeqLenForGroup(cp_slot_mapper_, group, seq_len);
+            const int expected_blocks =
+                kv_cache_groups_[static_cast<size_t>(gid)]->needBlocksNum(effective_seq_len, 0, reserve_step);
+            const int actual_blocks = kv_resource->blocksNum(batch_id, gid);
+            RTP_LLM_CHECK_WITH_INFO(actual_blocks == expected_blocks,
+                                    "CP invariant violated: batch=%d group=%d blocks=%d != expected_local_blocks=%d "
+                                    "(seq_len=%d, effective_seq_len=%d, reserve_step=%d, cp_size=%d, "
+                                    "block_size=%d, cacheKeys=%zu)",
+                                    batch_id,
+                                    gid,
+                                    actual_blocks,
+                                    expected_blocks,
+                                    seq_len,
+                                    effective_seq_len,
+                                    reserve_step,
+                                    cp_slot_mapper_->cpSize(),
+                                    cp_slot_mapper_->blockSize(),
+                                    kv_resource->cacheKeys(batch_id).size());
+        }
+    }
+}
+
+int HybridKVCacheAllocator::singleBatchNeedBlocks(const BatchKVCacheResourcePtr& batch_kv_cache_resource,
+                                                  int                            seq_len,
+                                                  int                            reserve_step) const {
+    int need_blocks = 0;
+    for (int gid = 0; gid < batch_kv_cache_resource->groupNums(); ++gid) {
+        const int effective_seq_len =
+            cpEffectiveSeqLenForGroup(cp_slot_mapper_, kv_cache_groups_[static_cast<size_t>(gid)], seq_len);
+        const int cur_blocks        = batch_kv_cache_resource->blocksNum(0, gid);
+        need_blocks +=
+            kv_cache_groups_[static_cast<size_t>(gid)]->needBlocksNum(effective_seq_len, cur_blocks, reserve_step);
+    }
+    return need_blocks;
+}
+
+}  // namespace rtp_llm
diff --git a/rtp_llm/cpp/cache/allocator/HybridKVCacheAllocator.h b/rtp_llm/cpp/cache/allocator/HybridKVCacheAllocator.h
new file mode 100644
index 0000000000..f68b5e9c00
--- /dev/null
+++ b/rtp_llm/cpp/cache/allocator/HybridKVCacheAllocator.h
@@ -0,0 +1,70 @@
+#pragma once
+
+#include <memory>
+#include <vector>
+
+#include "rtp_llm/cpp/cache/group/FullKVCacheGroup.h"
+#include "rtp_llm/cpp/cache/allocator/KVCacheAllocator.h"
+#include "rtp_llm/cpp/cache/group/LinearKVCacheGroup.h"
+#include "rtp_llm/cpp/cache/group/SWAKVCacheGroup.h"
+
+namespace rtp_llm {
+
+class HybridKVCacheAllocator: public KVCacheAllocator, public std::enable_shared_from_this<HybridKVCacheAllocator> {
+public:
+    HybridKVCacheAllocator(const CacheConfig&                 config,
+                           AllocationType                     allocation_type     = AllocationType::DEVICE,
+                           const kmonitor::MetricsReporterPtr metrics_reporter    = nullptr,
+                           int64_t                            reserve_block_ratio = 0);
+
+    void free(const FreeInfo& free_info) override;
+    void insertIntoCache(const InsertInfo& insert_info) override;
+
+    std::shared_ptr<KVCacheResource> incrKVCacheRef(const KVCacheResource& kvcache_resource,
+                                                    const CacheKeysType&   cache_keys,
+                                                    bool                   is_connector = false) override;
+
+    bool updateKVBlock(const BatchKVCacheResourcePtr& batch_kv_cache_resource,
+                       const std::vector<int>&        block_src_batch,
+                       bool                           copy_last_block,
+                       std::vector<BlockIdPair>&      block_update_mapping) override;
+
+    int seqSizePerBlock() const override;
+    int singleBatchNeedBlocks(const BatchKVCacheResourcePtr& batch_kv_cache_resource,
+                              int                            seq_len,
+                              int                            reserve_step) const override;
+    std::vector<int> independentEvictionGroupIds() const override;
+
+protected:
+    MallocResult incrMalloc(const MallocInfo& malloc_info) override;
+    MallocResult initMallocForCommonLen(const MallocInfo& malloc_info) override;
+    int          getNeedBlocks(const MallocInfo& malloc_info) const override;
+    void         checkCPShardedMallocResult(const MallocInfo& malloc_info) const override;
+    void         decrKVCacheRef(const KVCacheResource& kvcache_resource, bool is_connector = false) override;
+
+    int reuseCache(const CacheKeysType&                 cache_keys,
+                   BatchKVCacheResource&                kv_resource,
+                   const std::shared_ptr<CPSlotMapper>& cp_mapper);
+
+    virtual void referenceBlocksInGroup(int gid, const BlockIndicesType& blocks, bool is_connector = false) const = 0;
+    virtual void freeBlocksInGroup(int gid, const BlockIndicesType& blocks, bool is_connector = false)            = 0;
+    virtual bool hasAvailableBlocksForReserve(const MallocInfo& malloc_info, size_t reserve_blocks) const;
+    bool         skipReuseCacheGroup(int gid) const;
+    bool         cpCompactSwaGroup(int gid, const std::shared_ptr<CPSlotMapper>& mapper) const;
+    void         rollbackBlockIdsToSize(int gid, BlockIds& block_ids, size_t original_size);
+    void         rollbackInitMalloc(BatchKVCacheResource&                kv_resource,
+                                    const std::vector<BlockIndicesType>& referenced_blocks,
+                                    const std::vector<size_t>&           original_sizes);
+    void         rollbackIncrMalloc(BatchKVCacheResource&                   kv_resource,
+                                    const std::vector<std::vector<size_t>>& original_sizes,
+                                    int                                     failed_batch);
+
+    std::vector<KVCacheGroupPtr> kv_cache_groups_;
+    std::vector<int>             full_group_ids_;
+    std::vector<int>             linear_group_ids_;
+    std::vector<int>             swa_group_ids_;
+};
+
+using HybridKVCacheAllocatorPtr = std::shared_ptr<HybridKVCacheAllocator>;
+
+}  // namespace rtp_llm
diff --git a/rtp_llm/cpp/cache/allocator/HybridPoolKVCacheAllocator.cc b/rtp_llm/cpp/cache/allocator/HybridPoolKVCacheAllocator.cc
new file mode 100644
index 0000000000..0d59ec02d1
--- /dev/null
+++ b/rtp_llm/cpp/cache/allocator/HybridPoolKVCacheAllocator.cc
@@ -0,0 +1,687 @@
+#include "rtp_llm/cpp/cache/allocator/HybridPoolKVCacheAllocator.h"
+
+#include <algorithm>
+#include <iomanip>
+#include <limits>
+#include <numeric>
+#include <sstream>
+#include <string>
+#include <unordered_map>
+#include <unordered_set>
+#include <utility>
+
+#include "rtp_llm/cpp/cache/BlockPoolConfigHelper.h"
+#include "rtp_llm/cpp/cache/CPSlotMapper.h"
+#include "rtp_llm/cpp/engine_base/stream/CompleteTokenIds.h"
+#include "rtp_llm/cpp/metrics/RtpLLMMetrics.h"
+#include "rtp_llm/cpp/utils/Logger.h"
+#include "rtp_llm/models_py/bindings/core/ExecOps.h"
+#include "rtp_llm/models_py/bindings/core/OpData.h"
+
+namespace rtp_llm {
+namespace {
+
+inline bool cpShardThisGroupForReserve(const std::shared_ptr<CPSlotMapper>& mapper, CacheGroupType group_type) {
+    return mapper && mapper->isSharded() && group_type == CacheGroupType::FULL;
+}
+
+inline int
+cpEffectiveSeqLenForReserve(const std::shared_ptr<CPSlotMapper>& mapper, CacheGroupType group_type, int seq_len) {
+    return cpShardThisGroupForReserve(mapper, group_type) ? mapper->effectiveSeqLenForAlloc(seq_len) : seq_len;
+}
+
+void appendPoolSummary(std::ostringstream&     os,
+                       bool&                   has_any,
+                       int                     gid,
+                       const std::string&      tag,
+                       CacheGroupType          group_type,
+                       const BlockPoolConfig&  pool_config) {
+    static constexpr double kBytesPerMB = 1024.0 * 1024.0;
+    if (has_any) {
+        os << "; ";
+    }
+    has_any = true;
+    os << "pool_name=" << pool_config.pool_name << ", gid=" << gid << ", tag=" << tag
+       << ", type=" << cacheGroupTypeName(group_type) << ", size=" << pool_config.total_size_bytes << " bytes("
+       << std::fixed << std::setprecision(2) << static_cast<double>(pool_config.total_size_bytes) / kBytesPerMB
+       << " MB)"
+       << ", blocks=" << pool_config.block_num;
+}
+
+}  // namespace
+
+HybridPoolKVCacheAllocator::HybridPoolKVCacheAllocator(const CacheConfig&                 config,
+                                                       AllocationType                     allocation_type,
+                                                       const kmonitor::MetricsReporterPtr metrics_reporter,
+                                                       int64_t                            reserve_block_ratio,
+                                                       RoleType                           role_type):
+    HybridKVCacheAllocator(config, allocation_type, metrics_reporter, reserve_block_ratio), role_type_(role_type) {}
+
+bool HybridPoolKVCacheAllocator::doInit() {
+    RTP_LLM_CHECK_WITH_INFO(config_.groupNums() > 0, "no cache groups found in CacheConfig");
+
+    const int group_nums = config_.groupNums();
+    group_block_pools_.reserve(static_cast<size_t>(group_nums));
+    kv_cache_groups_.reserve(static_cast<size_t>(group_nums));
+
+    SharedBlockCache*       shared_cache_raw = shared_block_cache_ ? shared_block_cache_.get() : nullptr;
+    static constexpr double kBytesPerMB = 1024.0 * 1024.0;
+    std::ostringstream      pool_summary;
+    size_t                  pool_total_bytes  = 0;
+    size_t                  pool_total_blocks = 0;
+    bool                    has_pool          = false;
+
+    std::vector<BlockPoolConfig> group_pool_configs;
+    group_pool_configs.reserve(static_cast<size_t>(group_nums));
+    for (int gid = 0; gid < group_nums; ++gid) {
+        auto pool_config = BlockPoolConfigHelper::createConfigForGroup(config_, static_cast<size_t>(gid));
+        const auto tag        = config_.tagForGroup(static_cast<size_t>(gid));
+        const auto group_type = config_.typeForGroup(static_cast<size_t>(gid));
+        appendPoolSummary(pool_summary, has_pool, gid, tag, group_type, pool_config);
+        pool_total_bytes += pool_config.total_size_bytes;
+        pool_total_blocks += pool_config.block_num;
+        group_pool_configs.push_back(std::move(pool_config));
+    }
+
+    if (has_pool) {
+        const auto summary = pool_summary.str();
+        RTP_LLM_LOG_INFO("HybridPool pool summary: pools=[%s], total_size=%zu bytes total_size_mb=%.2f "
+                         "total_blocks=%zu",
+                         summary.c_str(),
+                         pool_total_bytes,
+                         static_cast<double>(pool_total_bytes) / kBytesPerMB,
+                         pool_total_blocks);
+    }
+
+    for (int gid = 0; gid < group_nums; ++gid) {
+        const auto& pool_config = group_pool_configs[static_cast<size_t>(gid)];
+        const auto  group_type = config_.typeForGroup(static_cast<size_t>(gid));
+
+        auto group_pool = std::make_shared<BlockPool>(pool_config,
+                                                      allocation_type_,
+                                                      /*use_pinned_cpu_backing=*/false,
+                                                      use_cuda_malloc_block_pool_);
+        RTP_LLM_CHECK_WITH_INFO(
+            group_pool->init(), "Failed to initialize block pool %s(group %d)", pool_config.pool_name.c_str(), gid);
+
+        const auto& ids    = config_.layerIdsForGroup(static_cast<size_t>(gid));
+        auto        spec   = config_.specForGroup(static_cast<size_t>(gid));
+        const auto  policy = config_.policyForGroup(static_cast<size_t>(gid));
+
+        KVCacheGroupPtr group;
+        if (group_type == CacheGroupType::LINEAR) {
+            group = std::make_shared<LinearKVCacheGroup>(
+                ids, spec, group_pool, gid, config_.linear_step, shared_cache_raw, metrics_reporter_, policy);
+            linear_group_ids_.push_back(gid);
+        } else if (group_type == CacheGroupType::SWA) {
+            group = std::make_shared<SWAKVCacheGroup>(
+                ids, spec, group_pool, gid, config_.linear_step, shared_cache_raw, metrics_reporter_, policy);
+            swa_group_ids_.push_back(gid);
+        } else {
+            group = std::make_shared<FullKVCacheGroup>(ids, spec, group_pool, gid, shared_cache_raw, metrics_reporter_, policy);
+            full_group_ids_.push_back(gid);
+        }
+
+        RTP_LLM_CHECK_WITH_INFO(
+            group->init(), "Failed to initialize KVCacheGroup %s(gid %d)", pool_config.pool_name.c_str(), gid);
+        group_block_pools_.push_back(group_pool);
+        kv_cache_groups_.push_back(group);
+    }
+
+    if (shared_block_cache_) {
+        shared_block_cache_->init(group_nums, group_block_pools_);
+    }
+
+    RTP_LLM_LOG_INFO("HybridPoolKVCacheAllocator init success, group pools=%zu", group_block_pools_.size());
+    return true;
+}
+
+int HybridPoolKVCacheAllocator::defaultGroupIdForLayer(int layer_id) const {
+    if (layer_id < 0 || static_cast<size_t>(layer_id) >= config_.layer_all_num) {
+        RTP_LLM_FAIL("invalid layer_id=%d", layer_id);
+    }
+    const int gid = config_.groupIdFor(layer_id);
+    RTP_LLM_CHECK_WITH_INFO(gid >= 0 && gid < static_cast<int>(kv_cache_groups_.size()),
+                            "invalid default group id %d for layer %d",
+                            gid,
+                            layer_id);
+    return gid;
+}
+
+int HybridPoolKVCacheAllocator::validateGroupIdForLayer(int layer_id, int group_id) const {
+    RTP_LLM_CHECK_WITH_INFO(group_id >= 0 && group_id < static_cast<int>(kv_cache_groups_.size()),
+                            "invalid group id %d for layer %d",
+                            group_id,
+                            layer_id);
+    RTP_LLM_CHECK_WITH_INFO(layer_id >= 0 && static_cast<size_t>(layer_id) < config_.layer_all_num,
+                            "invalid layer id %d for layer_all_num=%u",
+                            layer_id,
+                            config_.layer_all_num);
+    const auto& group_ids = config_.groupIdsForLayer(layer_id);
+    RTP_LLM_CHECK_WITH_INFO(std::find(group_ids.begin(), group_ids.end(), group_id) != group_ids.end(),
+                            "layer %d does not own cache group %d",
+                            layer_id,
+                            group_id);
+    return group_id;
+}
+
+void HybridPoolKVCacheAllocator::referenceBlocksInGroup(int                     gid,
+                                                        const BlockIndicesType& blocks,
+                                                        bool                    is_connector) const {
+    if (is_connector) {
+        group_block_pools_[static_cast<size_t>(gid)]->connectorReference(blocks);
+    } else {
+        group_block_pools_[static_cast<size_t>(gid)]->requestReference(blocks);
+    }
+}
+
+void HybridPoolKVCacheAllocator::freeBlocksInGroup(int gid, const BlockIndicesType& blocks, bool is_connector) {
+    if (is_connector) {
+        group_block_pools_[static_cast<size_t>(gid)]->connectorFree(blocks);
+    } else {
+        group_block_pools_[static_cast<size_t>(gid)]->requestFree(blocks);
+    }
+}
+
+CacheLayerLayout HybridPoolKVCacheAllocator::allLayerCacheBase() const {
+    CacheLayerLayout layout;
+    const auto layer_group_ids      = config_.layerGroupIdsSnapshot();
+    layout.layer_to_group_ids       = layer_group_ids;
+    layout.group_types              = config_.groupTypesSnapshot();
+    layout.group_tags               = config_.groupTagsSnapshot();
+    layout.layer_tag_to_group_id    = config_.layerTagToGroupIdSnapshot();
+    layout.group_seq_size_per_block = config_.group_seq_size_per_block;
+    layout.layer_group_types.resize(config_.layer_all_num, CacheGroupType::FULL);
+    for (size_t layer_id = 0; layer_id < layer_group_ids.size() && layer_id < layout.layer_group_types.size();
+        ++layer_id) {
+        if (!layer_group_ids[layer_id].empty()) {
+            layout.layer_group_types[layer_id] =
+                config_.typeForGroup(static_cast<size_t>(layer_group_ids[layer_id].front()));
+        }
+    }
+
+    layout.layers_to_kv_buffer_ptrs.resize(config_.layer_all_num);
+    layout.layers_to_scale_buffer_ptrs.resize(config_.layer_all_num);
+    const size_t group_count = kv_cache_groups_.size();
+    layout.layers_to_kv_buffer_ptrs_by_group.resize(config_.layer_all_num);
+    layout.layers_to_scale_buffer_ptrs_by_group.resize(config_.layer_all_num);
+    for (size_t layer_id = 0; layer_id < static_cast<size_t>(config_.layer_all_num); ++layer_id) {
+        layout.layers_to_kv_buffer_ptrs_by_group[layer_id].resize(group_count);
+        layout.layers_to_scale_buffer_ptrs_by_group[layer_id].resize(group_count);
+    }
+
+    for (size_t layer_id = 0; layer_id < static_cast<size_t>(config_.layer_all_num); ++layer_id) {
+        if (layer_id >= layer_group_ids.size() || layer_group_ids[layer_id].size() != 1) {
+            continue;
+        }
+        const int  gid           = layer_group_ids[layer_id][0];
+        RTP_LLM_CHECK_WITH_INFO(gid >= 0 && gid < static_cast<int>(kv_cache_groups_.size()),
+                                "invalid single-tag group id %d for layer %zu",
+                                gid,
+                                layer_id);
+        const auto layer_tensors = kv_cache_groups_[static_cast<size_t>(gid)]->allLayerCacheBase();
+        const auto scale_tensors = kv_cache_groups_[static_cast<size_t>(gid)]->allLayerScaleCacheBase();
+        auto       it            = layer_tensors.find(static_cast<int>(layer_id));
+        if (it != layer_tensors.end()) {
+            layout.layers_to_kv_buffer_ptrs[layer_id] = it->second;
+        }
+        auto scale_it = scale_tensors.find(static_cast<int>(layer_id));
+        if (scale_it != scale_tensors.end()) {
+            layout.layers_to_scale_buffer_ptrs[layer_id] = scale_it->second;
+        }
+    }
+
+    for (int gid = 0; gid < static_cast<int>(kv_cache_groups_.size()); ++gid) {
+        const auto layer_tensors = kv_cache_groups_[static_cast<size_t>(gid)]->allLayerCacheBase();
+        const auto scale_tensors = kv_cache_groups_[static_cast<size_t>(gid)]->allLayerScaleCacheBase();
+        for (const auto& [layer_id, tensor] : layer_tensors) {
+            RTP_LLM_CHECK_WITH_INFO(
+                layer_id >= 0 && static_cast<size_t>(layer_id) < layout.layers_to_kv_buffer_ptrs_by_group.size(),
+                "layer_id %d out of by-group kv layout range %zu",
+                layer_id,
+                layout.layers_to_kv_buffer_ptrs_by_group.size());
+            layout.layers_to_kv_buffer_ptrs_by_group[static_cast<size_t>(layer_id)][static_cast<size_t>(gid)] = tensor;
+        }
+        for (const auto& [layer_id, tensor] : scale_tensors) {
+            RTP_LLM_CHECK_WITH_INFO(
+                layer_id >= 0 && static_cast<size_t>(layer_id) < layout.layers_to_scale_buffer_ptrs_by_group.size(),
+                "layer_id %d out of by-group scale layout range %zu",
+                layer_id,
+                layout.layers_to_scale_buffer_ptrs_by_group.size());
+            layout.layers_to_scale_buffer_ptrs_by_group[static_cast<size_t>(layer_id)][static_cast<size_t>(gid)] =
+                tensor;
+        }
+    }
+    return layout;
+}
+
+BlockAddrInfo HybridPoolKVCacheAllocator::convertIndexToAddr(int layer_id, int block_id) const {
+    const int gid = defaultGroupIdForLayer(layer_id);
+    return kv_cache_groups_[static_cast<size_t>(gid)]->convertIndexToAddr(layer_id, block_id);
+}
+
+std::vector<BlockInfo> HybridPoolKVCacheAllocator::convertIndexToBuffer(int layer_id, int block_id) const {
+    const int gid = defaultGroupIdForLayer(layer_id);
+    return kv_cache_groups_[static_cast<size_t>(gid)]->convertIndexToBuffer(layer_id, block_id);
+}
+
+std::vector<BlockInfo> HybridPoolKVCacheAllocator::convertIndexToBuffer(int layer_id,
+                                                                        int block_id,
+                                                                        int partition_count,
+                                                                        int partition_id) const {
+    const int gid = defaultGroupIdForLayer(layer_id);
+    return kv_cache_groups_[static_cast<size_t>(gid)]->convertIndexToBuffer(
+        layer_id, block_id, partition_count, partition_id);
+}
+
+BlockAddrInfo HybridPoolKVCacheAllocator::convertIndexToAddr(int layer_id, int group_id, int block_id) const {
+    const int gid = validateGroupIdForLayer(layer_id, group_id);
+    return kv_cache_groups_[static_cast<size_t>(gid)]->convertIndexToAddr(layer_id, block_id);
+}
+
+std::vector<BlockInfo> HybridPoolKVCacheAllocator::convertIndexToBuffer(int layer_id, int group_id, int block_id) const {
+    const int gid = validateGroupIdForLayer(layer_id, group_id);
+    return kv_cache_groups_[static_cast<size_t>(gid)]->convertIndexToBuffer(layer_id, block_id);
+}
+
+std::vector<BlockInfo> HybridPoolKVCacheAllocator::convertIndexToBuffer(
+    int layer_id, int group_id, int block_id, int partition_count, int partition_id) const {
+    const int gid = validateGroupIdForLayer(layer_id, group_id);
+    return kv_cache_groups_[static_cast<size_t>(gid)]->convertIndexToBuffer(
+        layer_id, block_id, partition_count, partition_id);
+}
+
+void HybridPoolKVCacheAllocator::blockBatchCopy(const BlockIdPair* begin_ptr, const BlockIdPair* end_ptr) {
+    if (end_ptr == begin_ptr) {
+        return;
+    }
+
+    size_t copy_nums[BatchCopyParams::TYPE_SIZE] = {};
+    for (int gid = 0; gid < static_cast<int>(kv_cache_groups_.size()); ++gid) {
+        RTP_LLM_CHECK_WITH_INFO(
+            static_cast<size_t>(gid) < group_block_pools_.size(), "missing block pool for group %d", gid);
+        const auto   copy_type = BatchCopyParams::get_copy_type(group_block_pools_[static_cast<size_t>(gid)]->where(),
+                                                              group_block_pools_[static_cast<size_t>(gid)]->where());
+        const auto&  spec      = config_.specForGroup(static_cast<size_t>(gid));
+        const size_t buffers_per_layer = spec->scale_block_size_bytes() > 0 ? 2 : 1;
+        copy_nums[copy_type] += config_.layerIdsForGroup(static_cast<size_t>(gid)).size()
+                                * static_cast<size_t>(end_ptr - begin_ptr) * buffers_per_layer;
+    }
+
+    BatchCopyParams copy_params;
+    for (size_t i = 0; i < BatchCopyParams::TYPE_SIZE; ++i) {
+        copy_params.reserve(static_cast<BatchCopyParams::CopyType>(i), copy_nums[i]);
+    }
+
+    for (auto it = begin_ptr; it != end_ptr; ++it) {
+        auto [src_block_index, dest_block_index] = *it;
+
+        for (int gid = 0; gid < static_cast<int>(kv_cache_groups_.size()); ++gid) {
+            const auto&  spec                = config_.specForGroup(static_cast<size_t>(gid));
+            const size_t kv_block_size_bytes = spec->block_size_bytes();
+            const size_t scale_block_bytes   = spec->scale_block_size_bytes();
+            const auto   copy_type =
+                BatchCopyParams::get_copy_type(group_block_pools_[static_cast<size_t>(gid)]->where(),
+                                               group_block_pools_[static_cast<size_t>(gid)]->where());
+
+            for (int layer_id : config_.layerIdsForGroup(static_cast<size_t>(gid))) {
+                auto src_addr_info =
+                    kv_cache_groups_[static_cast<size_t>(gid)]->convertIndexToAddr(layer_id, src_block_index);
+                auto dst_addr_info =
+                    kv_cache_groups_[static_cast<size_t>(gid)]->convertIndexToAddr(layer_id, dest_block_index);
+
+                if (!src_addr_info.kv_addr || !dst_addr_info.kv_addr) {
+                    RTP_LLM_LOG_ERROR("Failed to get block address for pool %s(group %d) layer %d, src_block %d, "
+                                      "dst_block %d",
+                                      group_block_pools_[static_cast<size_t>(gid)]->poolName().c_str(),
+                                      gid,
+                                      layer_id,
+                                      src_block_index,
+                                      dest_block_index);
+                    continue;
+                }
+
+                copy_params.add(dst_addr_info.kv_addr, src_addr_info.kv_addr, kv_block_size_bytes, copy_type);
+
+                if (scale_block_bytes > 0 && src_addr_info.kv_scale_addr && dst_addr_info.kv_scale_addr) {
+                    copy_params.add(
+                        dst_addr_info.kv_scale_addr, src_addr_info.kv_scale_addr, scale_block_bytes, copy_type);
+                }
+            }
+        }
+    }
+
+    execBatchCopy(copy_params);
+}
+
+size_t HybridPoolKVCacheAllocator::freeBlocksNum() const {
+    size_t total = 0;
+    for (const auto& pool : group_block_pools_) {
+        total += pool->freeBlocksNum();
+    }
+    return total;
+}
+
+size_t HybridPoolKVCacheAllocator::availableBlocksNum() const {
+    size_t total = 0;
+    for (const auto& pool : group_block_pools_) {
+        total += pool->availableBlocksNum();
+    }
+    return total;
+}
+
+BatchKVCacheResourcePtr HybridPoolKVCacheAllocator::popBlocksFromCache(size_t min_blocks_to_free) {
+    if (min_blocks_to_free == 0 || !shared_block_cache_) {
+        return nullptr;
+    }
+
+    auto evict_result = shared_block_cache_->selectAndEvict(min_blocks_to_free);
+    if (evict_result.evicted_keys.empty()) {
+        return nullptr;
+    }
+    if (metrics_reporter_) {
+        for (const auto& [cache_key, lifetime_ms] : evict_result.evicted_lifetime_ms) {
+            RtpLLMCacheEvictionMetricsCollector collector;
+            collector.lifetime_ms = lifetime_ms;
+            kmonitor::MetricsTags tags("scope", "gpu");
+            tags.AddTag("evict_policy",
+                        evict_result.evicted_independent_group.count(cache_key) ? "independent" : "chain");
+            tags.AddTag("backing", "device");
+            metrics_reporter_->report<RtpLLMCacheEvictionMetrics, RtpLLMCacheEvictionMetricsCollector>(&tags,
+                                                                                                       &collector);
+        }
+    }
+
+    auto batch_resource = std::make_shared<BatchKVCacheResource>();
+    batch_resource->resetBatchSize(1);
+    batch_resource->initGroups(config_.groupNums(),
+                               static_cast<int>(config_.layer_all_num),
+                               config_.layerGroupIdsSnapshot(),
+                               config_.kernelBlocksPerKvBlock(),
+                               config_.groupTypesSnapshot());
+    batch_resource->setLastBlockAligned(true);
+
+    for (int gid = 0; gid < config_.groupNums(); ++gid) {
+        batch_resource->mutableBlockIds(0, gid).resize(evict_result.evicted_keys.size(), NULL_BLOCK_IDX);
+    }
+
+    CacheKeysType         evicted_keys;
+    BlockDependenciesType evicted_dependencies;
+    evicted_keys.reserve(evict_result.evicted_keys.size());
+    evicted_dependencies.reserve(evict_result.evicted_keys.size());
+    for (size_t evicted_idx = 0; evicted_idx < evict_result.evicted_keys.size(); ++evicted_idx) {
+        const auto  cache_key = evict_result.evicted_keys[evicted_idx];
+        const auto& slots     = evict_result.evicted_slots.at(cache_key);
+        evicted_keys.push_back(cache_key);
+        auto dep_it = evict_result.evicted_dependencies.find(cache_key);
+        if (dep_it != evict_result.evicted_dependencies.end()) {
+            evicted_dependencies.push_back(dep_it->second);
+        } else {
+            BlockDependency dependency;
+            dependency.ordinal = static_cast<uint32_t>(evicted_idx);
+            if (evicted_idx > 0) {
+                dependency.has_parent = true;
+                dependency.parent_key = evict_result.evicted_keys[evicted_idx - 1];
+            }
+            evicted_dependencies.push_back(dependency);
+        }
+        for (int gid = 0; gid < static_cast<int>(slots.size()) && gid < config_.groupNums(); ++gid) {
+            if (!isNullBlockIdx(slots[gid])) {
+                batch_resource->mutableBlockIds(0, gid).setAt(evicted_idx, slots[gid]);
+            }
+        }
+    }
+    batch_resource->cacheResource(0).setCacheKeys(std::move(evicted_keys));
+    batch_resource->cacheResource(0).setBlockDependencies(std::move(evicted_dependencies));
+    // Evicted keys already come from the GPU cache's actual key namespace.
+    // Under CP this can be a mixed batch of canonical paged keys and logical
+    // state/SWA keys, so coordinator must not remap the whole batch again.
+    batch_resource->cacheResource(0).setCacheKeysAreCpCanonical(true);
+    return batch_resource;
+}
+
+void HybridPoolKVCacheAllocator::blockCacheFree(const BatchKVCacheResourcePtr& batch_kv_cache_resource) {
+    if (!batch_kv_cache_resource) {
+        return;
+    }
+    for (int batch_id = 0; batch_id < batch_kv_cache_resource->batchSize(); ++batch_id) {
+        for (int gid = 0; gid < batch_kv_cache_resource->groupNums(); ++gid) {
+            BlockIndicesType                 blocks_to_free;
+            std::unordered_set<BlockIdxType> seen_blocks;
+            for (auto block_idx : batch_kv_cache_resource->blocks(batch_id, gid)) {
+                if (isNullBlockIdx(block_idx) || !seen_blocks.insert(block_idx).second) {
+                    continue;
+                }
+                blocks_to_free.push_back(block_idx);
+            }
+            if (!blocks_to_free.empty()) {
+                group_block_pools_[static_cast<size_t>(gid)]->blockCacheFree(blocks_to_free);
+            }
+        }
+    }
+}
+
+size_t HybridPoolKVCacheAllocator::requestRefBlocksNum() const {
+    size_t total = 0;
+    for (const auto& pool : group_block_pools_) {
+        total += pool->requestRefBlocksNum();
+    }
+    return total;
+}
+
+size_t HybridPoolKVCacheAllocator::connectorRefBlocksNum() const {
+    size_t total = 0;
+    for (const auto& pool : group_block_pools_) {
+        total += pool->connectorRefBlocksNum();
+    }
+    return total;
+}
+
+size_t HybridPoolKVCacheAllocator::blockCacheRefBlocksNum() const {
+    size_t total = 0;
+    for (const auto& pool : group_block_pools_) {
+        total += pool->blockCacheRefBlocksNum();
+    }
+    return total;
+}
+
+size_t HybridPoolKVCacheAllocator::notInUseBlocksNum() const {
+    size_t total = 0;
+    for (const auto& pool : group_block_pools_) {
+        total += pool->notInUseBlocksNum();
+    }
+    return total;
+}
+
+size_t HybridPoolKVCacheAllocator::minTokenCapacity(bool use_available_blocks, bool full_groups_only) const {
+    if (group_block_pools_.empty()) {
+        return 0;
+    }
+
+    auto calculate = [&](bool only_full_groups) {
+        size_t min_tokens = std::numeric_limits<size_t>::max();
+        bool   saw_group  = false;
+        for (size_t gid = 0; gid < group_block_pools_.size(); ++gid) {
+            if (only_full_groups && config_.typeForGroup(gid) != CacheGroupType::FULL) {
+                continue;
+            }
+            if (!group_block_pools_[gid]) {
+                continue;
+            }
+            saw_group        = true;
+            const auto block = use_available_blocks ? group_block_pools_[gid]->availableBlocksNum() :
+                                                      group_block_pools_[gid]->totalBlocksNum();
+            min_tokens       = std::min(min_tokens, block * logicalSeqSizePerBlockForCapacity(gid));
+        }
+        return std::make_pair(saw_group, min_tokens);
+    };
+
+    if (full_groups_only) {
+        const auto [saw_full_group, min_tokens] = calculate(/*only_full_groups=*/true);
+        if (saw_full_group) {
+            return min_tokens;
+        }
+    }
+
+    const auto [saw_group, min_tokens] = calculate(/*only_full_groups=*/false);
+    return saw_group ? min_tokens : 0;
+}
+
+size_t HybridPoolKVCacheAllocator::availableTokensNum() const {
+    return minTokenCapacity(/*use_available_blocks=*/true, /*full_groups_only=*/true);
+}
+
+size_t HybridPoolKVCacheAllocator::totalTokensNum() const {
+    return minTokenCapacity(/*use_available_blocks=*/false, /*full_groups_only=*/true);
+}
+
+size_t HybridPoolKVCacheAllocator::totalBlocksNum() const {
+    size_t total = 0;
+    for (const auto& pool : group_block_pools_) {
+        total += pool->totalBlocksNum();
+    }
+    return total;
+}
+
+size_t HybridPoolKVCacheAllocator::maxAvailableTokensNum() const {
+    return minTokenCapacity(/*use_available_blocks=*/false, /*full_groups_only=*/true);
+}
+
+KVCacheTokenCapacity HybridPoolKVCacheAllocator::tokenCapacity(size_t default_seq_size_per_block) const {
+    if (group_block_pools_.empty()) {
+        return {};
+    }
+    size_t total_tokens     = std::numeric_limits<size_t>::max();
+    size_t available_tokens = std::numeric_limits<size_t>::max();
+    bool   has_pool         = false;
+    for (size_t gid = 0; gid < group_block_pools_.size(); ++gid) {
+        const auto& pool = group_block_pools_[gid];
+        if (!pool) {
+            continue;
+        }
+        const size_t seq_size =
+            (gid < config_.group_seq_size_per_block.size() && config_.group_seq_size_per_block[gid] > 0) ?
+                config_.group_seq_size_per_block[gid] :
+                default_seq_size_per_block;
+        total_tokens     = std::min(total_tokens, pool->totalBlocksNum() * seq_size);
+        available_tokens = std::min(available_tokens, pool->availableBlocksNum() * seq_size);
+        has_pool         = true;
+    }
+    return has_pool ? KVCacheTokenCapacity{total_tokens, available_tokens} : KVCacheTokenCapacity{};
+}
+
+std::vector<KVCachePoolMetricsSnapshot> HybridPoolKVCacheAllocator::poolMetricsSnapshots() const {
+    std::vector<KVCachePoolMetricsSnapshot> snapshots;
+    snapshots.reserve(group_block_pools_.size());
+    const size_t reserve_blocks                     = reserveBlockNum();
+    const size_t total_reservable_available_blocks = totalReservableAvailableBlocks();
+    for (size_t gid = 0; gid < group_block_pools_.size(); ++gid) {
+        const auto& pool = group_block_pools_[gid];
+        if (!pool) {
+            continue;
+        }
+        KVCachePoolMetricsSnapshot snapshot;
+        snapshot.pool_index           = gid;
+        snapshot.pool_name            = pool->poolName();
+        snapshot.total_blocks         = pool->totalBlocksNum();
+        snapshot.available_blocks     = pool->availableBlocksNum();
+        snapshot.free_blocks          = pool->freeBlocksNum();
+        snapshot.request_ref_blocks   = pool->requestRefBlocksNum();
+        snapshot.connector_ref_blocks = pool->connectorRefBlocksNum();
+        snapshot.reserve_blocks       = reserveBlocksForPool(gid, reserve_blocks, total_reservable_available_blocks);
+        snapshot.used_ratio           = (snapshot.total_blocks == 0) ?
+                                            0.0f :
+                                            static_cast<float>(100.0 * (snapshot.total_blocks - snapshot.available_blocks)
+                                                     / static_cast<double>(snapshot.total_blocks));
+        snapshots.push_back(snapshot);
+    }
+    return snapshots;
+}
+
+void HybridPoolKVCacheAllocator::regUserMr(size_t model_id, std::shared_ptr<CacheStore> cache_store) {
+    for (auto& pool : group_block_pools_) {
+        pool->regUserMr(model_id, cache_store);
+    }
+}
+
+int64_t HybridPoolKVCacheAllocator::getMrCostTimeMs() const {
+    int64_t total = 0;
+    for (const auto& pool : group_block_pools_) {
+        total += pool->getMrCostTimeMs();
+    }
+    return total;
+}
+
+size_t HybridPoolKVCacheAllocator::totalReservableAvailableBlocks() const {
+    size_t total = 0;
+    for (size_t gid = 0; gid < group_block_pools_.size(); ++gid) {
+        if (!group_block_pools_[gid] || config_.usesExplicitIndependentBlocks(gid)) {
+            continue;
+        }
+        total += group_block_pools_[gid]->availableBlocksNum();
+    }
+    return total;
+}
+
+size_t HybridPoolKVCacheAllocator::reserveBlocksForPool(size_t gid,
+                                                        size_t reserve_blocks,
+                                                        size_t total_reservable_available_blocks) const {
+    if (gid >= group_block_pools_.size() || !group_block_pools_[gid] || config_.usesExplicitIndependentBlocks(gid)
+        || total_reservable_available_blocks == 0) {
+        return 0;
+    }
+    return reserve_blocks * group_block_pools_[gid]->availableBlocksNum() / total_reservable_available_blocks;
+}
+
+bool HybridPoolKVCacheAllocator::hasAvailableBlocksForReserve(const MallocInfo& malloc_info,
+                                                              size_t            reserve_blocks) const {
+    if (!malloc_info.batch_kv_cache_resource || !malloc_info.complete_token_ids) {
+        return true;
+    }
+    const auto& cp_mapper          = cp_slot_mapper_;
+    const int   batch_size         = malloc_info.batch_kv_cache_resource->batchSize();
+    const int   total_seq_len      = malloc_info.complete_token_ids->totalSeqLength();
+    const int   raw_common_seq_len = std::min(malloc_info.complete_token_ids->commonSeqLength(), total_seq_len);
+    const int   raw_seq_len        = malloc_info.complete_token_ids->seqLength();
+    const int   reserve_step       = malloc_info.complete_token_ids->getReserveStep();
+    const bool  reuse_enabled      = malloc_info.reuse_cache;
+
+    const size_t total_reservable_available_blocks = totalReservableAvailableBlocks();
+
+    for (int gid = 0; gid < static_cast<int>(kv_cache_groups_.size()); ++gid) {
+        const auto group_type             = config_.typeForGroup(static_cast<size_t>(gid));
+        const int  group_common_seq       = cpEffectiveSeqLenForReserve(cp_mapper, group_type, raw_common_seq_len);
+        const int  group_seq_len          = cpEffectiveSeqLenForReserve(cp_mapper, group_type, raw_seq_len);
+        const int  group_reuse_blocks_len = reuse_enabled ? malloc_info.batch_kv_cache_resource->blocksNum(0, gid) : 0;
+        const auto need                   = kv_cache_groups_[static_cast<size_t>(gid)]->getNeedBlocks(
+            group_common_seq, group_seq_len, reserve_step, group_reuse_blocks_len, reuse_enabled);
+        const int need_blocks = need.common_blocks + batch_size * need.extra_blocks;
+        if (need_blocks <= 0) {
+            continue;
+        }
+        const auto&  pool                 = group_block_pools_[static_cast<size_t>(gid)];
+        const size_t available_blocks     = pool->availableBlocksNum();
+        const size_t total_blocks         = pool->totalBlocksNum();
+        const size_t group_reserve_blocks =
+            reserveBlocksForPool(static_cast<size_t>(gid), reserve_blocks, total_reservable_available_blocks);
+        if (available_blocks < static_cast<size_t>(need_blocks) + group_reserve_blocks) {
+            if (malloc_info.verbose) {
+                RTP_LLM_LOG_INFO("HybridPool initMalloc rejected by reserve blocks: request_id=%ld pool_name=%s "
+                                 "group=%d need_blocks=%d total_blocks=%zu available_blocks=%zu "
+                                 "reserve_blocks=%zu group_reserve_blocks=%zu",
+                                 malloc_info.request_id,
+                                 pool->poolName().c_str(),
+                                 gid,
+                                 need_blocks,
+                                 total_blocks,
+                                 available_blocks,
+                                 reserve_blocks,
+                                 group_reserve_blocks);
+            }
+            return false;
+        }
+    }
+    return true;
+}
+
+}  // namespace rtp_llm
diff --git a/rtp_llm/cpp/cache/allocator/HybridPoolKVCacheAllocator.h b/rtp_llm/cpp/cache/allocator/HybridPoolKVCacheAllocator.h
new file mode 100644
index 0000000000..4c5dc81c60
--- /dev/null
+++ b/rtp_llm/cpp/cache/allocator/HybridPoolKVCacheAllocator.h
@@ -0,0 +1,72 @@
+#pragma once
+
+#include <memory>
+#include <vector>
+
+#include "rtp_llm/cpp/cache/allocator/HybridKVCacheAllocator.h"
+#include "rtp_llm/cpp/config/ConfigModules.h"
+
+namespace rtp_llm {
+
+class HybridPoolKVCacheAllocator: public HybridKVCacheAllocator {
+public:
+    HybridPoolKVCacheAllocator(const CacheConfig&                 config,
+                               AllocationType                     allocation_type     = AllocationType::DEVICE,
+                               const kmonitor::MetricsReporterPtr metrics_reporter    = nullptr,
+                               int64_t                            reserve_block_ratio = 0,
+                               RoleType                           role_type           = RoleType::PDFUSION);
+
+    BlockAddrInfo          convertIndexToAddr(int layer_id, int block_id) const override;
+    std::vector<BlockInfo> convertIndexToBuffer(int layer_id, int block_id) const override;
+    std::vector<BlockInfo>
+    convertIndexToBuffer(int layer_id, int block_id, int partition_count, int partition_id) const override;
+    BlockAddrInfo          convertIndexToAddr(int layer_id, int group_id, int block_id) const override;
+    std::vector<BlockInfo> convertIndexToBuffer(int layer_id, int group_id, int block_id) const override;
+    std::vector<BlockInfo> convertIndexToBuffer(
+        int layer_id, int group_id, int block_id, int partition_count, int partition_id) const override;
+    void blockBatchCopy(const BlockIdPair* copy_mapping_begin, const BlockIdPair* copy_mapping_end) override;
+
+    CacheLayerLayout allLayerCacheBase() const override;
+
+    size_t                  freeBlocksNum() const override;
+    size_t                  availableBlocksNum() const override;
+    BatchKVCacheResourcePtr popBlocksFromCache(size_t min_blocks_to_free) override;
+    void                    blockCacheFree(const BatchKVCacheResourcePtr& batch_kv_cache_resource) override;
+    size_t                  requestRefBlocksNum() const override;
+    size_t                  connectorRefBlocksNum() const override;
+    size_t                  blockCacheRefBlocksNum() const override;
+    size_t                  notInUseBlocksNum() const override;
+    size_t                  availableTokensNum() const override;
+    size_t                  totalTokensNum() const override;
+    size_t                  totalBlocksNum() const override;
+    size_t                  maxAvailableTokensNum() const override;
+    KVCacheTokenCapacity    tokenCapacity(size_t default_seq_size_per_block) const override;
+    std::vector<KVCachePoolMetricsSnapshot> poolMetricsSnapshots() const override;
+    void                    regUserMr(size_t model_id, std::shared_ptr<CacheStore> cache_store = nullptr) override;
+    int64_t                 getMrCostTimeMs() const override;
+
+    // Per-pool access for diagnostics / per-pool metrics reporting.
+    const std::vector<BlockPoolPtr>& groupBlockPools() const {
+        return group_block_pools_;
+    }
+
+private:
+    bool doInit() override;
+
+    void referenceBlocksInGroup(int gid, const BlockIndicesType& blocks, bool is_connector = false) const override;
+    void freeBlocksInGroup(int gid, const BlockIndicesType& blocks, bool is_connector = false) override;
+    bool hasAvailableBlocksForReserve(const MallocInfo& malloc_info, size_t reserve_blocks) const override;
+
+    int validateGroupIdForLayer(int layer_id, int group_id) const;
+    int defaultGroupIdForLayer(int layer_id) const;
+    size_t minTokenCapacity(bool use_available_blocks, bool full_groups_only) const;
+    size_t totalReservableAvailableBlocks() const;
+    size_t reserveBlocksForPool(size_t gid, size_t reserve_blocks, size_t total_reservable_available_blocks) const;
+
+    std::vector<BlockPoolPtr> group_block_pools_;
+    RoleType                  role_type_{RoleType::PDFUSION};
+};
+
+using HybridPoolKVCacheAllocatorPtr = std::shared_ptr<HybridPoolKVCacheAllocator>;
+
+}  // namespace rtp_llm
diff --git a/rtp_llm/cpp/cache/allocator/HybridTypeKVCacheAllocator.cc b/rtp_llm/cpp/cache/allocator/HybridTypeKVCacheAllocator.cc
new file mode 100644
index 0000000000..0efe811580
--- /dev/null
+++ b/rtp_llm/cpp/cache/allocator/HybridTypeKVCacheAllocator.cc
@@ -0,0 +1,192 @@
+#include "rtp_llm/cpp/cache/allocator/HybridTypeKVCacheAllocator.h"
+
+#include <algorithm>
+#include <utility>
+
+#include "rtp_llm/cpp/cache/BlockPoolConfigHelper.h"
+#include "rtp_llm/cpp/utils/Logger.h"
+
+namespace rtp_llm {
+
+HybridTypeKVCacheAllocator::HybridTypeKVCacheAllocator(const CacheConfig&                 config,
+                                                       AllocationType                     allocation_type,
+                                                       const kmonitor::MetricsReporterPtr metrics_reporter,
+                                                       int64_t                            reserve_block_ratio):
+    HybridKVCacheAllocator(config, allocation_type, metrics_reporter, reserve_block_ratio) {}
+
+bool HybridTypeKVCacheAllocator::doInit() {
+    RTP_LLM_CHECK_WITH_INFO(config_.groupNums() > 0, "no cache groups found in CacheConfig");
+
+    auto pool_config = BlockPoolConfigHelper::createConfig(config_);
+    block_pool_      = std::make_shared<BlockPool>(
+        pool_config, allocation_type_, /*use_pinned_cpu_backing=*/false, use_cuda_malloc_block_pool_);
+    RTP_LLM_CHECK_WITH_INFO(block_pool_->init(), "Failed to initialize block pool for HybridTypeKVCacheAllocator");
+
+    const int group_nums = config_.groupNums();
+    kv_cache_groups_.reserve(group_nums);
+
+    SharedBlockCache* shared_cache_raw = shared_block_cache_ ? shared_block_cache_.get() : nullptr;
+
+    if (shared_block_cache_) {
+        std::vector<BlockPoolPtr> group_pools(static_cast<size_t>(group_nums), block_pool_);
+        shared_block_cache_->init(group_nums, group_pools);
+    }
+
+    for (int gid = 0; gid < group_nums; ++gid) {
+        KVCacheSpecPtr spec = config_.specForGroup(static_cast<size_t>(gid));
+        const auto&    ids  = config_.layerIdsForGroup(static_cast<size_t>(gid));
+
+        KVCacheGroupPtr group;
+        const auto      group_type = config_.typeForGroup(static_cast<size_t>(gid));
+        const auto      policy     = config_.policyForGroup(static_cast<size_t>(gid));
+        if (group_type == CacheGroupType::SWA) {
+            group = std::make_shared<SWAKVCacheGroup>(
+                ids, spec, block_pool_, gid, config_.linear_step, shared_cache_raw, nullptr, policy);
+            swa_group_ids_.push_back(gid);
+        } else if (group_type == CacheGroupType::LINEAR || (spec && spec->type == KVCacheSpecType::LinearAttention)) {
+            group = std::make_shared<LinearKVCacheGroup>(
+                ids, spec, block_pool_, gid, config_.linear_step, shared_cache_raw, nullptr, policy);
+            linear_group_ids_.push_back(gid);
+        } else {
+            group = std::make_shared<FullKVCacheGroup>(ids, spec, block_pool_, gid, shared_cache_raw, nullptr, policy);
+            full_group_ids_.push_back(gid);
+        }
+
+        RTP_LLM_CHECK_WITH_INFO(group->init(), "Failed to initialize KVCacheGroup gid %d", gid);
+        kv_cache_groups_.push_back(group);
+    }
+
+    global_layer_to_local_id_.assign(static_cast<size_t>(config_.layer_all_num), -1);
+    for (int gid = 0; gid < group_nums; ++gid) {
+        const auto& cur_group_layers = config_.layerIdsForGroup(static_cast<size_t>(gid));
+        for (size_t local_layer_idx = 0; local_layer_idx < cur_group_layers.size(); ++local_layer_idx) {
+            const int global_layer_idx = cur_group_layers[local_layer_idx];
+            if (global_layer_idx >= 0 && static_cast<size_t>(global_layer_idx) < global_layer_to_local_id_.size()) {
+                global_layer_to_local_id_[static_cast<size_t>(global_layer_idx)] = static_cast<int>(local_layer_idx);
+            }
+        }
+    }
+
+    RTP_LLM_LOG_INFO("HybridTypeKVCacheAllocator init success");
+    return true;
+}
+
+void HybridTypeKVCacheAllocator::referenceBlocksInGroup(int                     gid,
+                                                        const BlockIndicesType& blocks,
+                                                        bool                    is_connector) const {
+    (void)gid;
+    if (is_connector) {
+        block_pool_->connectorReference(blocks);
+    } else {
+        block_pool_->requestReference(blocks);
+    }
+}
+
+void HybridTypeKVCacheAllocator::freeBlocksInGroup(int gid, const BlockIndicesType& blocks, bool is_connector) {
+    (void)gid;
+    if (is_connector) {
+        block_pool_->connectorFree(blocks);
+    } else {
+        block_pool_->requestFree(blocks);
+    }
+}
+
+CacheLayerLayout HybridTypeKVCacheAllocator::allLayerCacheBase() const {
+    CacheLayerLayout layout;
+    const auto       layer_tensors = block_pool_->allLayerCacheBase();
+    const auto       scale_tensors = block_pool_->allLayerScaleCacheBase();
+
+    layout.layer_to_group_ids = config_.layerGroupIdsSnapshot();
+    layout.layers_to_kv_buffer_ptrs.resize(config_.layer_all_num);
+    layout.layers_to_scale_buffer_ptrs.resize(config_.layer_all_num);
+
+    for (size_t layer_id = 0; layer_id < static_cast<size_t>(config_.layer_all_num); ++layer_id) {
+        int32_t      local     = global_layer_to_local_id_[layer_id];
+        const size_t local_idx = static_cast<size_t>(local);
+
+        if (local_idx < layer_tensors.size() && layer_tensors[local_idx].defined()
+            && layer_tensors[local_idx].numel() > 0) {
+            layout.layers_to_kv_buffer_ptrs[layer_id] = layer_tensors[local_idx];
+        }
+
+        if (!scale_tensors.empty() && local_idx < scale_tensors.size() && scale_tensors[local_idx].defined()
+            && scale_tensors[local_idx].numel() > 0) {
+            layout.layers_to_scale_buffer_ptrs[layer_id] = scale_tensors[local_idx];
+        }
+    }
+    return layout;
+}
+
+int HybridTypeKVCacheAllocator::defaultGroupIdForLayer(int layer_id) const {
+    if (layer_id < 0 || static_cast<size_t>(layer_id) >= config_.layer_all_num) {
+        RTP_LLM_FAIL("invalid layer_id=%d", layer_id);
+    }
+    const int gid = config_.groupIdFor(layer_id);
+    RTP_LLM_CHECK_WITH_INFO(gid >= 0 && gid < static_cast<int>(kv_cache_groups_.size()), "invalid group id mapping");
+    return gid;
+}
+
+int HybridTypeKVCacheAllocator::validateGroupIdForLayer(int layer_id, int group_id) const {
+    RTP_LLM_CHECK_WITH_INFO(group_id >= 0 && group_id < static_cast<int>(kv_cache_groups_.size()),
+                            "invalid group id %d for layer %d",
+                            group_id,
+                            layer_id);
+    RTP_LLM_CHECK_WITH_INFO(layer_id >= 0 && static_cast<size_t>(layer_id) < config_.layer_all_num,
+                            "invalid layer id %d for layer_all_num=%u",
+                            layer_id,
+                            config_.layer_all_num);
+    const auto& group_ids = config_.groupIdsForLayer(layer_id);
+    RTP_LLM_CHECK_WITH_INFO(std::find(group_ids.begin(), group_ids.end(), group_id) != group_ids.end(),
+                            "layer %d does not own cache group %d",
+                            layer_id,
+                            group_id);
+    return group_id;
+}
+
+BlockAddrInfo HybridTypeKVCacheAllocator::convertIndexToAddr(int layer_id, int block_id) const {
+    if (layer_id < 0 || static_cast<size_t>(layer_id) >= config_.layer_all_num) {
+        RTP_LLM_FAIL("convertIndexToAddr invalid layer_id=%d", layer_id);
+    }
+    const int gid = defaultGroupIdForLayer(layer_id);
+    return kv_cache_groups_[static_cast<size_t>(gid)]->convertIndexToAddr(layer_id, block_id);
+}
+
+std::vector<BlockInfo> HybridTypeKVCacheAllocator::convertIndexToBuffer(int layer_id, int block_id) const {
+    if (layer_id < 0 || static_cast<size_t>(layer_id) >= config_.layer_all_num) {
+        RTP_LLM_FAIL("convertIndexToBuffer invalid layer_id=%d", layer_id);
+    }
+    const int gid = defaultGroupIdForLayer(layer_id);
+    return kv_cache_groups_[static_cast<size_t>(gid)]->convertIndexToBuffer(layer_id, block_id);
+}
+
+std::vector<BlockInfo> HybridTypeKVCacheAllocator::convertIndexToBuffer(int layer_id,
+                                                                        int block_id,
+                                                                        int partition_count,
+                                                                        int partition_id) const {
+    if (layer_id < 0 || static_cast<size_t>(layer_id) >= config_.layer_all_num) {
+        RTP_LLM_FAIL("convertIndexToBuffer(partition) invalid layer_id=%d", layer_id);
+    }
+    const int gid = defaultGroupIdForLayer(layer_id);
+    return kv_cache_groups_[static_cast<size_t>(gid)]->convertIndexToBuffer(
+        layer_id, block_id, partition_count, partition_id);
+}
+
+BlockAddrInfo HybridTypeKVCacheAllocator::convertIndexToAddr(int layer_id, int group_id, int block_id) const {
+    const int gid = validateGroupIdForLayer(layer_id, group_id);
+    return kv_cache_groups_[static_cast<size_t>(gid)]->convertIndexToAddr(layer_id, block_id);
+}
+
+std::vector<BlockInfo>
+HybridTypeKVCacheAllocator::convertIndexToBuffer(int layer_id, int group_id, int block_id) const {
+    const int gid = validateGroupIdForLayer(layer_id, group_id);
+    return kv_cache_groups_[static_cast<size_t>(gid)]->convertIndexToBuffer(layer_id, block_id);
+}
+
+std::vector<BlockInfo> HybridTypeKVCacheAllocator::convertIndexToBuffer(
+    int layer_id, int group_id, int block_id, int partition_count, int partition_id) const {
+    const int gid = validateGroupIdForLayer(layer_id, group_id);
+    return kv_cache_groups_[static_cast<size_t>(gid)]->convertIndexToBuffer(
+        layer_id, block_id, partition_count, partition_id);
+}
+
+}  // namespace rtp_llm
diff --git a/rtp_llm/cpp/cache/allocator/HybridTypeKVCacheAllocator.h b/rtp_llm/cpp/cache/allocator/HybridTypeKVCacheAllocator.h
new file mode 100644
index 0000000000..7dc4b3f119
--- /dev/null
+++ b/rtp_llm/cpp/cache/allocator/HybridTypeKVCacheAllocator.h
@@ -0,0 +1,42 @@
+#pragma once
+
+#include <memory>
+#include <vector>
+
+#include "rtp_llm/cpp/cache/allocator/HybridKVCacheAllocator.h"
+
+namespace rtp_llm {
+
+class HybridTypeKVCacheAllocator: public HybridKVCacheAllocator {
+public:
+    HybridTypeKVCacheAllocator(const CacheConfig&                 config,
+                               AllocationType                     allocation_type     = AllocationType::DEVICE,
+                               const kmonitor::MetricsReporterPtr metrics_reporter    = nullptr,
+                               int64_t                            reserve_block_ratio = 0);
+
+    BlockAddrInfo          convertIndexToAddr(int layer_id, int block_id) const override;
+    std::vector<BlockInfo> convertIndexToBuffer(int layer_id, int block_id) const override;
+    std::vector<BlockInfo>
+    convertIndexToBuffer(int layer_id, int block_id, int partition_count, int partition_id) const override;
+    BlockAddrInfo          convertIndexToAddr(int layer_id, int group_id, int block_id) const override;
+    std::vector<BlockInfo> convertIndexToBuffer(int layer_id, int group_id, int block_id) const override;
+    std::vector<BlockInfo> convertIndexToBuffer(
+        int layer_id, int group_id, int block_id, int partition_count, int partition_id) const override;
+    CacheLayerLayout allLayerCacheBase() const override;
+
+private:
+    bool doInit() override;
+
+    void referenceBlocksInGroup(int gid, const BlockIndicesType& blocks, bool is_connector = false) const override;
+    void freeBlocksInGroup(int gid, const BlockIndicesType& blocks, bool is_connector = false) override;
+
+    int defaultGroupIdForLayer(int layer_id) const;
+    int validateGroupIdForLayer(int layer_id, int group_id) const;
+
+    // global layer id -> local layer id
+    std::vector<int> global_layer_to_local_id_;
+};
+
+using HybridTypeKVCacheAllocatorPtr = std::shared_ptr<HybridTypeKVCacheAllocator>;
+
+}  // namespace rtp_llm
diff --git a/rtp_llm/cpp/cache/KVCacheAllocator.cc b/rtp_llm/cpp/cache/allocator/KVCacheAllocator.cc
similarity index 57%
rename from rtp_llm/cpp/cache/KVCacheAllocator.cc
rename to rtp_llm/cpp/cache/allocator/KVCacheAllocator.cc
index 484c520596..32d2e3db0d 100644
--- a/rtp_llm/cpp/cache/KVCacheAllocator.cc
+++ b/rtp_llm/cpp/cache/allocator/KVCacheAllocator.cc
@@ -7,7 +7,7 @@
 #include "rtp_llm/models_py/bindings/core/OpData.h"
 #include "rtp_llm/cpp/cache/BlockPoolConfigHelper.h"
 #include "rtp_llm/cpp/engine_base/stream/CompleteTokenIds.h"
-#include "rtp_llm/cpp/cache/KVCacheAllocator.h"
+#include "rtp_llm/cpp/cache/allocator/KVCacheAllocator.h"
 #include "rtp_llm/cpp/metrics/RtpLLMMetrics.h"
 
 namespace rtp_llm {
@@ -51,7 +51,8 @@ MallocResult KVCacheAllocator::initMalloc(const MallocInfo& malloc_info) {
             if (malloc_info.batch_kv_cache_resource) {
                 const auto& cache_keys      = malloc_info.batch_kv_cache_resource->cacheKeys(0);
                 size_t      match_keys_size = cache_keys.size();
-                device_input_length         = static_cast<int64_t>(match_keys_size) * config_.seq_size_per_block;
+                device_input_length =
+                    static_cast<int64_t>(match_keys_size) * deviceCacheMetricTokensPerBlock();
             }
 
             if (device_input_length > 0) {
@@ -81,11 +82,16 @@ MallocResult KVCacheAllocator::malloc(const MallocInfo& malloc_info) {
         return {false, 0};
     }
 
+    MallocResult result;
     if (malloc_info.batch_kv_cache_resource->curBlocksNum() == 0) {
-        return initMalloc(malloc_info);
+        result = initMalloc(malloc_info);
     } else {
-        return incrMalloc(malloc_info);
+        result = incrMalloc(malloc_info);
     }
+    if (result.success) {
+        checkCPShardedMallocResult(malloc_info);
+    }
+    return result;
 }
 
 uint32_t KVCacheAllocator::convertToGlobalLayerId(size_t model_id, int local_layer_id) const {
@@ -110,17 +116,60 @@ uint32_t KVCacheAllocator::convertToGlobalLayerId(size_t model_id, int local_lay
         RTP_LLM_LOG_ERROR("convertToGlobalLayerId: mtp_sub_configs[%zu] is null", model_id - 1);
         return std::numeric_limits<uint32_t>::max();
     }
-    if (sub->global_layer_ids.empty()) {
-        RTP_LLM_LOG_ERROR("convertToGlobalLayerId: mtp_sub_configs[%zu] global_layer_ids is empty", model_id - 1);
+    if (sub->groupNums() <= 0) {
+        RTP_LLM_LOG_ERROR("convertToGlobalLayerId: mtp_sub_configs[%zu] cache groups are empty", model_id - 1);
         return std::numeric_limits<uint32_t>::max();
     }
-    if (local_layer_id >= 0 && static_cast<size_t>(local_layer_id) < sub->global_layer_ids[0].size()) {
-        return sub->global_layer_ids[0][static_cast<size_t>(local_layer_id)];
+    // SWA-only DSV4 propose configs put the single MTP layer in the SWA group
+    // (gid=6), not FULL[0], so ``global_layer_ids[0]`` is empty.  Flatten across
+    // all groups — matches ``KVCacheManager::getMTPModuleCacheLayerLayout``.
+    size_t flat_idx = 0;
+    for (int gid = 0; gid < sub->groupNums(); ++gid) {
+        const auto& group_ids = sub->layerIdsForGroup(static_cast<size_t>(gid));
+        for (int gid_val : group_ids) {
+            if (static_cast<int>(flat_idx) == local_layer_id) {
+                return static_cast<uint32_t>(gid_val);
+            }
+            ++flat_idx;
+        }
     }
     RTP_LLM_LOG_ERROR("convertToGlobalLayerId: local_layer_id=%d is invalid", local_layer_id);
     return std::numeric_limits<uint32_t>::max();
 }
 
+BlockAddrInfo KVCacheAllocator::convertIndexToAddr(int layer_id, int group_id, int block_id) const {
+    (void)group_id;
+    return convertIndexToAddr(layer_id, block_id);
+}
+
+std::vector<BlockInfo> KVCacheAllocator::convertIndexToBuffer(int layer_id, int group_id, int block_id) const {
+    (void)group_id;
+    return convertIndexToBuffer(layer_id, block_id);
+}
+
+std::vector<BlockInfo> KVCacheAllocator::convertIndexToBuffer(
+    int layer_id, int group_id, int block_id, int partition_count, int partition_id) const {
+    (void)group_id;
+    return convertIndexToBuffer(layer_id, block_id, partition_count, partition_id);
+}
+
+BlockAddrInfo KVCacheAllocator::convertIndexToAddrByTag(int layer_id, const std::string& tag, int block_id) const {
+    const int group_id = config_.groupIdForLayerTag(layer_id, tag);
+    return convertIndexToAddr(layer_id, group_id, block_id);
+}
+
+std::vector<BlockInfo>
+KVCacheAllocator::convertIndexToBufferByTag(int layer_id, const std::string& tag, int block_id) const {
+    const int group_id = config_.groupIdForLayerTag(layer_id, tag);
+    return convertIndexToBuffer(layer_id, group_id, block_id);
+}
+
+std::vector<BlockInfo> KVCacheAllocator::convertIndexToBufferByTag(
+    int layer_id, const std::string& tag, int block_id, int partition_count, int partition_id) const {
+    const int group_id = config_.groupIdForLayerTag(layer_id, tag);
+    return convertIndexToBuffer(layer_id, group_id, block_id, partition_count, partition_id);
+}
+
 void KVCacheAllocator::blockCopy(int src_block_index, int dest_block_index) {
     BlockIdPair copy_mapping{src_block_index, dest_block_index};
     blockBatchCopy(&copy_mapping, &copy_mapping + 1);
@@ -158,7 +207,7 @@ void KVCacheAllocator::blockBatchCopy(const BlockIdPair* begin_ptr, const BlockI
         copy_params.reserve(static_cast<CopyType>(i), copy_nums[i]);
     }
 
-    auto&  spec                = config_.cache_specs[0];
+    auto&  spec                = config_.specForGroup(0);
     size_t kv_block_size_bytes = spec->block_size_bytes();
 
     for (auto it = begin_ptr; it != end_ptr; ++it) {
@@ -181,7 +230,7 @@ void KVCacheAllocator::blockBatchCopy(const BlockIdPair* begin_ptr, const BlockI
             if (src_addr_info.kv_scale_addr && dst_addr_info.kv_scale_addr) {
                 copy_params.add(dst_addr_info.kv_scale_addr,
                                 src_addr_info.kv_scale_addr,
-                                static_cast<size_t>(config_.kv_scale_stride_bytes),
+                                config_.kvScaleStrideBytesForGroup(0),
                                 copy_type);
             }
         }
@@ -203,43 +252,72 @@ size_t KVCacheAllocator::availableBlocksNum() const {
 }
 
 BatchKVCacheResourcePtr KVCacheAllocator::popBlocksFromCache(size_t min_blocks_to_free) {
-    if (!block_pool_ || min_blocks_to_free == 0) {
+    if (!shared_block_cache_ || min_blocks_to_free == 0) {
         return nullptr;
     }
 
-    auto block_cache = block_pool_->blockCache();
-    if (!block_cache) {
-        return nullptr;
-    }
-
-    auto evict_result = block_cache->selectAndEvict(min_blocks_to_free);
+    auto evict_result = shared_block_cache_->selectAndEvict(min_blocks_to_free);
     if (evict_result.evicted_keys.empty()) {
         return nullptr;
     }
+    if (metrics_reporter_) {
+        for (const auto& [cache_key, lifetime_ms] : evict_result.evicted_lifetime_ms) {
+            RtpLLMCacheEvictionMetricsCollector collector;
+            collector.lifetime_ms = lifetime_ms;
+            kmonitor::MetricsTags tags("scope", "gpu");
+            tags.AddTag("evict_policy",
+                        evict_result.evicted_independent_group.count(cache_key) ? "independent" : "chain");
+            tags.AddTag("backing", "device");
+            metrics_reporter_->report<RtpLLMCacheEvictionMetrics, RtpLLMCacheEvictionMetricsCollector>(&tags,
+                                                                                                       &collector);
+        }
+    }
 
     auto batch_resource = std::make_shared<BatchKVCacheResource>();
     batch_resource->resetBatchSize(1);
-    batch_resource->initGroups(config_.groupNums(), static_cast<int>(config_.layer_all_num), config_.layer_to_group_id);
+    batch_resource->initGroups(config_.groupNums(),
+                               static_cast<int>(config_.layer_all_num),
+                               config_.layerGroupIdsSnapshot(),
+                               config_.kernelBlocksPerKvBlock(),
+                               config_.groupTypesSnapshot());
     batch_resource->setLastBlockAligned(true);
 
     for (int gid = 0; gid < config_.groupNums(); ++gid) {
         batch_resource->mutableBlockIds(0, gid).resize(evict_result.evicted_keys.size(), NULL_BLOCK_IDX);
     }
 
-    size_t evicted_idx = 0;
-    for (const auto cache_key : evict_result.evicted_keys) {
-        batch_resource->pushBackCacheKey(0, cache_key);
-        auto& items = evict_result.evicted_items.at(cache_key);
-        for (const auto& item : items) {
-            auto& block_ids = batch_resource->mutableBlockIds(0, item.group_id);
-            RTP_LLM_CHECK_WITH_INFO(evicted_idx < block_ids.blocksNum(),
-                                    "evicted index out of range: idx=%zu, blocks_num=%zu",
-                                    evicted_idx,
-                                    block_ids.blocksNum());
-            block_ids.setAt(evicted_idx, item.block_index);
+    CacheKeysType          evicted_keys;
+    BlockDependenciesType  evicted_dependencies;
+    evicted_keys.reserve(evict_result.evicted_keys.size());
+    evicted_dependencies.reserve(evict_result.evicted_keys.size());
+    for (size_t evicted_idx = 0; evicted_idx < evict_result.evicted_keys.size(); ++evicted_idx) {
+        const auto  cache_key = evict_result.evicted_keys[evicted_idx];
+        const auto& slots     = evict_result.evicted_slots.at(cache_key);
+        evicted_keys.push_back(cache_key);
+        auto dep_it = evict_result.evicted_dependencies.find(cache_key);
+        if (dep_it != evict_result.evicted_dependencies.end()) {
+            evicted_dependencies.push_back(dep_it->second);
+        } else {
+            BlockDependency dependency;
+            dependency.ordinal = static_cast<uint32_t>(evicted_idx);
+            if (evicted_idx > 0) {
+                dependency.has_parent = true;
+                dependency.parent_key = evict_result.evicted_keys[evicted_idx - 1];
+            }
+            evicted_dependencies.push_back(dependency);
+        }
+        for (int gid = 0; gid < static_cast<int>(slots.size()) && gid < config_.groupNums(); ++gid) {
+            if (!isNullBlockIdx(slots[gid])) {
+                batch_resource->mutableBlockIds(0, gid).setAt(evicted_idx, slots[gid]);
+            }
         }
-        ++evicted_idx;
     }
+    batch_resource->cacheResource(0).setCacheKeys(std::move(evicted_keys));
+    batch_resource->cacheResource(0).setBlockDependencies(std::move(evicted_dependencies));
+    // Evicted keys already come from the GPU cache's actual key namespace.
+    // Under CP this can be a mixed batch of canonical paged keys and logical
+    // state/SWA keys, so coordinator must not remap the whole batch again.
+    batch_resource->cacheResource(0).setCacheKeysAreCpCanonical(true);
     return batch_resource;
 }
 
@@ -282,7 +360,11 @@ size_t KVCacheAllocator::notInUseBlocksNum() const {
 }
 
 size_t KVCacheAllocator::availableTokensNum() const {
-    return block_pool_ ? (block_pool_->availableBlocksNum() * seqSizePerBlock()) : 0;
+    return block_pool_ ? (block_pool_->availableBlocksNum() * logicalSeqSizePerBlockForCapacity(/*gid=*/0)) : 0;
+}
+
+size_t KVCacheAllocator::totalTokensNum() const {
+    return block_pool_ ? (block_pool_->totalBlocksNum() * logicalSeqSizePerBlockForCapacity(/*gid=*/0)) : 0;
 }
 
 size_t KVCacheAllocator::totalBlocksNum() const {
@@ -290,7 +372,51 @@ size_t KVCacheAllocator::totalBlocksNum() const {
 }
 
 size_t KVCacheAllocator::maxAvailableTokensNum() const {
-    return block_pool_ ? (block_pool_->totalBlocksNum() * seqSizePerBlock()) : 0;
+    return totalTokensNum();
+}
+
+bool KVCacheAllocator::cpShardThisGroupForCapacity(size_t gid) const {
+    if (!cp_slot_mapper_ || !cp_slot_mapper_->isSharded()) {
+        return false;
+    }
+    return gid >= static_cast<size_t>(config_.groupNums()) || config_.typeForGroup(gid) == CacheGroupType::FULL;
+}
+
+size_t KVCacheAllocator::logicalSeqSizePerBlockForCapacity(size_t gid) const {
+    if (cpShardThisGroupForCapacity(gid)) {
+        return static_cast<size_t>(cp_slot_mapper_->virtualBlockSize());
+    }
+    return (gid < config_.group_seq_size_per_block.size() && config_.group_seq_size_per_block[gid] > 0) ?
+               config_.group_seq_size_per_block[gid] :
+               config_.seq_size_per_block;
+}
+
+int KVCacheAllocator::cpEffectiveSeqLenForAlloc(size_t gid, int seq_len) const {
+    if (cpShardThisGroupForCapacity(gid)) {
+        return cp_slot_mapper_->effectiveSeqLenForAlloc(seq_len);
+    }
+    return seq_len;
+}
+
+int KVCacheAllocator::deviceCacheMetricTokensPerBlock() const {
+    if (cp_slot_mapper_ && cp_slot_mapper_->isSharded()) {
+        return cp_slot_mapper_->virtualBlockSize();
+    }
+    return seqSizePerBlock();
+}
+
+KVCacheTokenCapacity KVCacheAllocator::tokenCapacity(size_t default_seq_size_per_block) const {
+    const size_t total_blocks     = totalBlocksNum();
+    const size_t available_blocks = availableBlocksNum();
+    return {total_blocks * default_seq_size_per_block, available_blocks * default_seq_size_per_block};
+}
+
+std::vector<KVCachePoolMetricsSnapshot> KVCacheAllocator::poolMetricsSnapshots() const {
+    return {};
+}
+
+std::vector<int> KVCacheAllocator::independentEvictionGroupIds() const {
+    return {};
 }
 
 void KVCacheAllocator::regUserMr(size_t model_id, std::shared_ptr<CacheStore> cache_store) {
diff --git a/rtp_llm/cpp/cache/KVCacheAllocator.h b/rtp_llm/cpp/cache/allocator/KVCacheAllocator.h
similarity index 50%
rename from rtp_llm/cpp/cache/KVCacheAllocator.h
rename to rtp_llm/cpp/cache/allocator/KVCacheAllocator.h
index 73c7584faa..5023a0d946 100644
--- a/rtp_llm/cpp/cache/KVCacheAllocator.h
+++ b/rtp_llm/cpp/cache/allocator/KVCacheAllocator.h
@@ -1,18 +1,38 @@
 #pragma once
 
+#include <cstddef>
 #include <cstdint>
 #include <limits>
 #include <memory>
+#include <string>
 #include <vector>
 
 #include "kmonitor/client/MetricsReporter.h"
 #include "rtp_llm/cpp/cache/Types.h"
 #include "rtp_llm/cpp/cache/CacheConfig.h"
 #include "rtp_llm/cpp/cache/BlockPool.h"
+#include "rtp_llm/cpp/cache/SharedBlockCache.h"
 #include "rtp_llm/cpp/cache/BufferTypes.h"
 
 namespace rtp_llm {
 
+struct KVCacheTokenCapacity {
+    size_t total_tokens     = 0;
+    size_t available_tokens = 0;
+};
+
+struct KVCachePoolMetricsSnapshot {
+    size_t pool_index           = 0;
+    std::string pool_name       = "unnamed";
+    size_t free_blocks          = 0;
+    size_t available_blocks     = 0;
+    size_t request_ref_blocks   = 0;
+    size_t connector_ref_blocks = 0;
+    size_t total_blocks         = 0;
+    size_t reserve_blocks       = 0;
+    float  used_ratio           = 0.0f;
+};
+
 class KVCacheAllocator {
 public:
     KVCacheAllocator(const CacheConfig&                 config,
@@ -33,9 +53,17 @@ class KVCacheAllocator {
     virtual std::vector<BlockInfo> convertIndexToBuffer(int layer_id, int block_id) const = 0;
     virtual std::vector<BlockInfo>
     convertIndexToBuffer(int layer_id, int block_id, int partition_count, int partition_id) const = 0;
+    virtual BlockAddrInfo convertIndexToAddr(int layer_id, int group_id, int block_id) const;
+    virtual std::vector<BlockInfo> convertIndexToBuffer(int layer_id, int group_id, int block_id) const;
+    virtual std::vector<BlockInfo>
+    convertIndexToBuffer(int layer_id, int group_id, int block_id, int partition_count, int partition_id) const;
+    virtual BlockAddrInfo convertIndexToAddrByTag(int layer_id, const std::string& tag, int block_id) const;
+    virtual std::vector<BlockInfo> convertIndexToBufferByTag(int layer_id, const std::string& tag, int block_id) const;
+    virtual std::vector<BlockInfo> convertIndexToBufferByTag(
+        int layer_id, const std::string& tag, int block_id, int partition_count, int partition_id) const;
     virtual std::shared_ptr<KVCacheResource> incrKVCacheRef(const KVCacheResource& kvcache_resource,
                                                             const CacheKeysType&   cache_keys,
-                                                            bool                   is_connector = false)            = 0;
+                                                            bool                   is_connector = false) = 0;
 
     virtual CacheLayerLayout allLayerCacheBase() const                                     = 0;
     virtual bool             updateKVBlock(const BatchKVCacheResourcePtr& batch_kv_cache_resource,
@@ -45,18 +73,38 @@ class KVCacheAllocator {
     virtual int              seqSizePerBlock() const                                       = 0;
     virtual int              singleBatchNeedBlocks(const BatchKVCacheResourcePtr& batch_kv_cache_resource,
                                                    int                            seq_len,
-                                                   int                            reserve_step) const                 = 0;
+                                                   int                            reserve_step) const              = 0;
 
     MallocResult malloc(const MallocInfo& malloc_info);
-    void         blockCopy(int src_block_index, int dest_block_index);
-    void         blockBatchCopy(const std::vector<BlockIdPair>& copy_mapping);
-    void         blockBatchCopy(const BlockIdPair* copy_mapping_begin, const BlockIdPair* copy_mapping_end);
-    void         blockBatchCopy(const torch::Tensor& copy_mapping);
+    virtual void blockCopy(int src_block_index, int dest_block_index);
+    virtual void blockBatchCopy(const std::vector<BlockIdPair>& copy_mapping);
+    virtual void blockBatchCopy(const BlockIdPair* copy_mapping_begin, const BlockIdPair* copy_mapping_end);
+    virtual void blockBatchCopy(const torch::Tensor& copy_mapping);
 
     BlockPoolPtr getBlockPool() const {
         return block_pool_;
     }
 
+    SharedBlockCachePtr sharedBlockCache() const {
+        return shared_block_cache_;
+    }
+
+    void setSharedBlockCache(SharedBlockCachePtr shared_block_cache) {
+        shared_block_cache_ = std::move(shared_block_cache);
+    }
+
+    void setUseCudaMallocBlockPool(bool use_cuda_malloc_block_pool) {
+        use_cuda_malloc_block_pool_ = use_cuda_malloc_block_pool;
+    }
+
+    void setCPSlotMapper(std::shared_ptr<CPSlotMapper> cp_slot_mapper) {
+        cp_slot_mapper_ = std::move(cp_slot_mapper);
+    }
+
+    std::shared_ptr<CPSlotMapper> cpSlotMapper() const {
+        return cp_slot_mapper_;
+    }
+
     // Reserve some blocks for already-running streams' future allocations.
     // Only applied to "init malloc" requests where batch_kv_cache_resource has no blocks yet.
     void setReserveBlockNum(size_t reserve_block_num) {
@@ -66,19 +114,23 @@ class KVCacheAllocator {
         return reserve_block_num_;
     }
 
-    void                    regUserMr(size_t model_id, std::shared_ptr<CacheStore> cache_store = nullptr);
-    int64_t                 getMrCostTimeMs() const;
-    size_t                  freeBlocksNum() const;
-    size_t                  availableBlocksNum() const;
-    BatchKVCacheResourcePtr popBlocksFromCache(size_t min_blocks_to_free);
-    void                    blockCacheFree(const BatchKVCacheResourcePtr& batch_kv_cache_resource);
-    size_t                  requestRefBlocksNum() const;
-    size_t                  connectorRefBlocksNum() const;
-    size_t                  blockCacheRefBlocksNum() const;
-    size_t                  notInUseBlocksNum() const;
-    size_t                  availableTokensNum() const;
-    size_t                  totalBlocksNum() const;
-    size_t                  maxAvailableTokensNum() const;
+    virtual void                    regUserMr(size_t model_id, std::shared_ptr<CacheStore> cache_store = nullptr);
+    virtual int64_t                 getMrCostTimeMs() const;
+    virtual size_t                  freeBlocksNum() const;
+    virtual size_t                  availableBlocksNum() const;
+    virtual BatchKVCacheResourcePtr popBlocksFromCache(size_t min_blocks_to_free);
+    virtual void                    blockCacheFree(const BatchKVCacheResourcePtr& batch_kv_cache_resource);
+    virtual size_t                  requestRefBlocksNum() const;
+    virtual size_t                  connectorRefBlocksNum() const;
+    virtual size_t                  blockCacheRefBlocksNum() const;
+    virtual size_t                  notInUseBlocksNum() const;
+    virtual size_t                  availableTokensNum() const;
+    virtual size_t                  totalTokensNum() const;
+    virtual size_t                  totalBlocksNum() const;
+    virtual size_t                  maxAvailableTokensNum() const;
+    virtual KVCacheTokenCapacity    tokenCapacity(size_t default_seq_size_per_block) const;
+    virtual std::vector<KVCachePoolMetricsSnapshot> poolMetricsSnapshots() const;
+    virtual std::vector<int> independentEvictionGroupIds() const;
     /// Returns global layer id; std::numeric_limits<uint32_t>::max() indicates invalid (caller must check).
     uint32_t convertToGlobalLayerId(size_t model_id, int local_layer_id) const;
 
@@ -88,12 +140,20 @@ class KVCacheAllocator {
     virtual MallocResult incrMalloc(const MallocInfo& malloc_info)                                          = 0;
     virtual MallocResult initMallocForCommonLen(const MallocInfo& malloc_info)                              = 0;
     virtual int          getNeedBlocks(const MallocInfo& malloc_info) const                                 = 0;
+    virtual void         checkCPShardedMallocResult(const MallocInfo&) const {}
     virtual void         decrKVCacheRef(const KVCacheResource& kvcache_resource, bool is_connector = false) = 0;
+    bool                 cpShardThisGroupForCapacity(size_t gid) const;
+    size_t               logicalSeqSizePerBlockForCapacity(size_t gid) const;
+    int                  cpEffectiveSeqLenForAlloc(size_t gid, int seq_len) const;
+    int                  deviceCacheMetricTokensPerBlock() const;
 
     CacheConfig                        config_;
     AllocationType                     allocation_type_;
     BlockPoolPtr                       block_pool_;
-    const kmonitor::MetricsReporterPtr metrics_reporter_ = nullptr;
+    SharedBlockCachePtr                shared_block_cache_;
+    std::shared_ptr<CPSlotMapper>      cp_slot_mapper_;
+    const kmonitor::MetricsReporterPtr metrics_reporter_           = nullptr;
+    bool                               use_cuda_malloc_block_pool_ = false;
 
     size_t  reserve_block_num_{0};
     int64_t reserve_block_ratio_{0};
diff --git a/rtp_llm/cpp/cache/SingleTypeKVCacheAllocator.cc b/rtp_llm/cpp/cache/allocator/SingleTypeKVCacheAllocator.cc
similarity index 65%
rename from rtp_llm/cpp/cache/SingleTypeKVCacheAllocator.cc
rename to rtp_llm/cpp/cache/allocator/SingleTypeKVCacheAllocator.cc
index 3475fe1023..c7bca4db4a 100644
--- a/rtp_llm/cpp/cache/SingleTypeKVCacheAllocator.cc
+++ b/rtp_llm/cpp/cache/allocator/SingleTypeKVCacheAllocator.cc
@@ -1,4 +1,4 @@
-#include "rtp_llm/cpp/cache/SingleTypeKVCacheAllocator.h"
+#include "rtp_llm/cpp/cache/allocator/SingleTypeKVCacheAllocator.h"
 
 #include <algorithm>
 #include <unordered_map>
@@ -18,15 +18,49 @@ int SingleTypeKVCacheAllocator::getNeedBlocks(const MallocInfo& malloc_info) con
     const bool reuse_enabled    = malloc_info.reuse_cache;
     const int  reuse_blocks_len = reuse_enabled ? malloc_info.batch_kv_cache_resource->curBlocksNum() : 0;
     const int  batch_size       = malloc_info.batch_kv_cache_resource->batchSize();
-    const int  seq_len          = malloc_info.complete_token_ids->seqLength();
+    int        seq_len          = malloc_info.complete_token_ids->seqLength();
     const int  reserve_step     = malloc_info.complete_token_ids->getReserveStep();
-    const int  common_seq_len   = std::min(malloc_info.complete_token_ids->commonSeqLength(), seq_len);
+    int        common_seq_len   = std::min(malloc_info.complete_token_ids->commonSeqLength(), seq_len);
+
+    if (cp_slot_mapper_ && cp_slot_mapper_->isSharded()) {
+        seq_len        = cp_slot_mapper_->effectiveSeqLenForAlloc(seq_len);
+        common_seq_len = cp_slot_mapper_->effectiveSeqLenForAlloc(common_seq_len);
+    }
 
     const auto need =
         full_kv_cache_group_->getNeedBlocks(common_seq_len, seq_len, reserve_step, reuse_blocks_len, reuse_enabled);
     return (batch_size <= 0) ? 0 : (need.common_blocks + batch_size * need.extra_blocks);
 }
 
+void SingleTypeKVCacheAllocator::checkCPShardedMallocResult(const MallocInfo& malloc_info) const {
+    if (!cp_slot_mapper_ || !cp_slot_mapper_->isSharded()) {
+        return;
+    }
+
+    const auto& kv_resource       = malloc_info.batch_kv_cache_resource;
+    const int   seq_len           = malloc_info.incrSeqLen();
+    const int   reserve_step      = malloc_info.complete_token_ids->getReserveStep();
+    const int   effective_seq_len = cp_slot_mapper_->effectiveSeqLenForAlloc(seq_len);
+    const int   expected_blocks   = full_kv_cache_group_->needBlocksNum(effective_seq_len, 0, reserve_step);
+
+    for (int batch_id = 0; batch_id < kv_resource->batchSize(); ++batch_id) {
+        const int actual_blocks = kv_resource->blocksNum(batch_id);
+        RTP_LLM_CHECK_WITH_INFO(actual_blocks == expected_blocks,
+                                "CP invariant violated: batch=%d blocks=%d != expected_local_blocks=%d "
+                                "(seq_len=%d, effective_seq_len=%d, reserve_step=%d, cp_size=%d, "
+                                "block_size=%d, cacheKeys=%zu)",
+                                batch_id,
+                                actual_blocks,
+                                expected_blocks,
+                                seq_len,
+                                effective_seq_len,
+                                reserve_step,
+                                cp_slot_mapper_->cpSize(),
+                                cp_slot_mapper_->blockSize(),
+                                kv_resource->cacheKeys(batch_id).size());
+    }
+}
+
 SingleTypeKVCacheAllocator::SingleTypeKVCacheAllocator(const CacheConfig&                 config,
                                                        AllocationType                     allocation_type,
                                                        const kmonitor::MetricsReporterPtr metrics_reporter,
@@ -34,8 +68,8 @@ SingleTypeKVCacheAllocator::SingleTypeKVCacheAllocator(const CacheConfig&
     KVCacheAllocator(config, allocation_type, metrics_reporter, reserve_block_ratio) {}
 
 bool SingleTypeKVCacheAllocator::doInit() {
-    RTP_LLM_CHECK_WITH_INFO(!config_.cache_specs.empty(), "cache specs must not be empty");
-    auto& spec = config_.cache_specs[0];
+    RTP_LLM_CHECK_WITH_INFO(config_.groupNums() > 0, "cache groups must not be empty");
+    auto& spec = config_.specForGroup(0);
     RTP_LLM_CHECK_WITH_INFO(spec != nullptr, "cache spec[0] is null");
     RTP_LLM_CHECK_WITH_INFO(spec->type == rtp_llm::KVCacheSpecType::MultiHeadAttention
                                 || spec->type == rtp_llm::KVCacheSpecType::MultiHeadLatentAttention,
@@ -44,14 +78,22 @@ bool SingleTypeKVCacheAllocator::doInit() {
     BlockPoolConfig pool_config;
 
     pool_config = BlockPoolConfigHelper::createConfig(config_);
-    block_pool_ = std::make_shared<BlockPool>(pool_config, allocation_type_);
+    block_pool_ = std::make_shared<BlockPool>(
+        pool_config, allocation_type_, /*use_pinned_cpu_backing=*/false, use_cuda_malloc_block_pool_);
     if (!block_pool_->init()) {
         RTP_LLM_LOG_ERROR("Failed to initialize block pool for SingleTypeKVCacheAllocator");
         return false;
     }
 
-    std::vector<int> layer_ids(config_.global_layer_ids[0]);
-    full_kv_cache_group_ = std::make_shared<FullKVCacheGroup>(layer_ids, spec, block_pool_, 0);
+    SharedBlockCache* shared_cache_raw = shared_block_cache_ ? shared_block_cache_.get() : nullptr;
+
+    if (shared_block_cache_) {
+        std::vector<BlockPoolPtr> group_pools = {block_pool_};
+        shared_block_cache_->init(1, group_pools);
+    }
+
+    std::vector<int> layer_ids(config_.layerIdsForGroup(0));
+    full_kv_cache_group_ = std::make_shared<FullKVCacheGroup>(layer_ids, spec, block_pool_, 0, shared_cache_raw);
 
     if (!full_kv_cache_group_->init()) {
         RTP_LLM_LOG_ERROR("Failed to initialize FullKVCacheGroup");
@@ -68,6 +110,10 @@ MallocResult SingleTypeKVCacheAllocator::initMallocForCommonLen(const MallocInfo
     int   common_seq_len =
         std::min(malloc_info.complete_token_ids->commonSeqLength(), malloc_info.complete_token_ids->totalSeqLength());
 
+    if (cp_slot_mapper_ && cp_slot_mapper_->isSharded()) {
+        common_seq_len = cp_slot_mapper_->effectiveSeqLenForAlloc(common_seq_len);
+    }
+
     const auto& cache_keys         = kv_resource->cacheKeys(0);
     auto&       block_ids_0        = kv_resource->mutableBlockIds(0);
     int64_t     match_cost_time_us = 0;
@@ -82,12 +128,27 @@ MallocResult SingleTypeKVCacheAllocator::initMallocForCommonLen(const MallocInfo
     // 2. if the last block is full and matched, the reuse length will be equal to the seq_len, which causes core dump
     // in computing ops.
     if (malloc_info.enable_device_cache) {
-        CacheKeysType match_keys(cache_keys.begin(), cache_keys.empty() ? cache_keys.end() : cache_keys.end() - 1);
-        auto          match_begin_time_us = currentTimeUs();
-        auto          match_result        = full_kv_cache_group_->match(match_keys);
-        match_cost_time_us                = currentTimeUs() - match_begin_time_us;
-        reuse_len                         = static_cast<int>(match_result.reuse_length);
-        reuse_blocks                      = static_cast<int>(match_result.reuse_blocks);
+        CacheKeysType match_keys;
+        if (cp_slot_mapper_ && cp_slot_mapper_->isSharded()) {
+            // Drop the last virtual-block key (same reasoning as non-CP) to avoid
+            // a full-len reuse / empty-block crash. Use last-rank stride so all
+            // ranks share one canonical key namespace.
+            int  cp_size     = cp_slot_mapper_->cpSize();
+            auto vblock_keys = kv_resource->cacheResource(0).localCacheKeys(cp_size - 1, cp_size);
+            match_keys.assign(vblock_keys.begin(), vblock_keys.empty() ? vblock_keys.end() : vblock_keys.end() - 1);
+        } else {
+            match_keys.assign(cache_keys.begin(), cache_keys.empty() ? cache_keys.end() : cache_keys.end() - 1);
+        }
+        auto        match_begin_time_us = currentTimeUs();
+        MatchResult match_result        = full_kv_cache_group_->match(match_keys);
+        if (cp_slot_mapper_ && cp_slot_mapper_->isSharded()) {
+            // virtual block ⇒ reuse_length covers cp_size physical blocks of
+            // tokens; reuse_blocks counts virtual blocks.
+            match_result.reuse_length = match_result.reuse_blocks * cp_slot_mapper_->virtualBlockSize();
+        }
+        match_cost_time_us = currentTimeUs() - match_begin_time_us;
+        reuse_len          = static_cast<int>(match_result.reuse_length);
+        reuse_blocks       = static_cast<int>(match_result.reuse_blocks);
         kv_resource->cacheResource(0).setDeviceReuseBlockNum(reuse_blocks);
         full_kv_cache_group_->reference(block_ids_0, match_result.block_indices);
     }
@@ -128,9 +189,13 @@ MallocResult SingleTypeKVCacheAllocator::incrMalloc(const MallocInfo& malloc_inf
     auto& kv_resource    = malloc_info.batch_kv_cache_resource;
     int   batch_size     = kv_resource->batchSize();
     int   current_blocks = kv_resource->curBlocksNum();
-    int   seq_len        = malloc_info.complete_token_ids->seqLength();
+    int   seq_len        = malloc_info.incrSeqLen();
     int   reserve_step   = malloc_info.complete_token_ids->getReserveStep();
 
+    if (cp_slot_mapper_ && cp_slot_mapper_->isSharded()) {
+        seq_len = cp_slot_mapper_->effectiveSeqLenForAlloc(seq_len);
+    }
+
     auto need_blocks = full_kv_cache_group_->needBlocksNum(seq_len, current_blocks, reserve_step);
     if (need_blocks == 0) {
         return {true, 0};
@@ -189,24 +254,55 @@ void SingleTypeKVCacheAllocator::free(const FreeInfo& free_info) {
 
 void SingleTypeKVCacheAllocator::insertIntoCache(const InsertInfo& insert_info) {
     auto& kv_resource = insert_info.batch_kv_cache_resource;
-    int   batch_size  = kv_resource->batchSize();
+    if (!shared_block_cache_) {
+        return;
+    }
 
-    // TODO(chanyin): set batch_size to 1 for now
-    batch_size = 1;
+    int batch_size = kv_resource->batchSize();
+    batch_size     = 1;
 
     for (int batch_id = 0; batch_id < batch_size; ++batch_id) {
-        const auto& cache_keys = kv_resource->cacheKeys(batch_id);
-        const auto& blocks     = kv_resource->blocks(batch_id);
+        kv_resource->cacheResource(batch_id).ensureLinearBlockDependencies();
+        const auto& blocks = kv_resource->blocks(batch_id);
+
+        // Under CP sharding, use the same last-rank-key namespace as match()
+        // (see initMallocForCommonLen) so the device cache stays consistent
+        // across ranks without any cross-rank coordination.
+        CacheKeysType insert_keys;
+        SharedBlockCache::NamespaceId namespace_id = SharedBlockCache::kGpuLogicalNamespace;
+        if (cp_slot_mapper_ && cp_slot_mapper_->isSharded()) {
+            int cp_size = cp_slot_mapper_->cpSize();
+            insert_keys = kv_resource->cacheResource(batch_id).localCacheKeys(cp_size - 1, cp_size);
+            namespace_id = SharedBlockCache::kGpuCpCanonicalNamespace;
+        } else {
+            insert_keys = kv_resource->cacheKeys(batch_id);
+        }
+        BlockDependenciesType dependencies;
+        dependencies.reserve(insert_keys.size());
+        for (size_t i = 0; i < insert_keys.size(); ++i) {
+            BlockDependency dependency;
+            dependency.ordinal = static_cast<uint32_t>(i);
+            if (i > 0) {
+                dependency.has_parent = true;
+                dependency.parent_key = insert_keys[i - 1];
+            }
+            dependencies.push_back(dependency);
+        }
 
-        size_t block_num = std::min(size_t(cache_keys.size()), size_t(blocks.size()));
+        size_t block_num = std::min(size_t(insert_keys.size()), size_t(blocks.size()));
         if (block_num == 0) {
             continue;
         }
 
-        CacheKeysType    put_cache_keys(cache_keys.begin(), cache_keys.begin() + block_num);
-        BlockIndicesType put_block_ids(blocks.begin(), blocks.begin() + block_num);
-
-        full_kv_cache_group_->insertIntoCache(put_cache_keys, put_block_ids, insert_info.is_resident);
+        for (size_t i = block_num; i > 0; --i) {
+            const size_t idx = i - 1;
+            if (isNullBlockIdx(blocks[idx])) {
+                continue;
+            }
+            std::vector<BlockIdxType> group_slots = {blocks[idx]};
+            shared_block_cache_->put(
+                insert_keys[idx], group_slots, insert_info.is_resident, namespace_id, dependencies[idx]);
+        }
     }
 }
 
@@ -226,10 +322,10 @@ CacheLayerLayout SingleTypeKVCacheAllocator::allLayerCacheBase() const {
             layout.layers_to_scale_buffer_ptrs[layer_id] = scale_tensors[layer_id];
         }
     }
-    layout.layer_to_groups.reserve(config_.layer_all_num);
-    int group_id = full_kv_cache_group_->group_id();
-    for (int layed_id = 0; layed_id < config_.layer_all_num; layed_id++) {
-        layout.layer_to_groups.push_back(group_id);
+    layout.layer_to_group_ids.resize(config_.layer_all_num);
+    const int group_id = full_kv_cache_group_->group_id();
+    for (int layer_id = 0; layer_id < config_.layer_all_num; ++layer_id) {
+        layout.layer_to_group_ids[static_cast<size_t>(layer_id)] = {group_id};
     }
     return layout;
 }
@@ -272,13 +368,15 @@ std::shared_ptr<KVCacheResource> SingleTypeKVCacheAllocator::incrKVCacheRef(cons
         delete resource;
     };
     std::shared_ptr<KVCacheResource> selected_resource(selected_resource_ptr, deleter);
-    selected_resource->initGroups(
-        1, config_.layer_all_num, config_.layer_to_group_id, config_.kernelBlocksPerKvBlock());
+    selected_resource->initGroups(1, config_.layer_all_num, config_.layerGroupIdsSnapshot(), config_.kernelBlocksPerKvBlock());
 
-    CacheKeysType    selected_cache_keys;
-    BlockIndicesType selected_blocks;
+    CacheKeysType          selected_cache_keys;
+    BlockDependenciesType  selected_dependencies;
+    BlockIndicesType       selected_blocks;
+    BlockIndicesType       referenced_blocks;
 
-    const auto& src_blocks = kvcache_resource.blocks(0);
+    const auto& src_blocks           = kvcache_resource.blocks(0);
+    const auto& source_dependencies  = kvcache_resource.blockDependencies();
 
     for (auto key : cache_keys) {
         auto it = key_to_pos.find(key);
@@ -286,27 +384,37 @@ std::shared_ptr<KVCacheResource> SingleTypeKVCacheAllocator::incrKVCacheRef(cons
             continue;
         }
         const size_t pos = it->second;
-        if (pos >= src_blocks.size()) {
+        const bool preserve_connector_tail = is_connector && !kvcache_resource.lastBlockAligned()
+                                             && pos + 1 == resource_keys.size() && !selected_cache_keys.empty();
+        if (pos >= src_blocks.size() && !preserve_connector_tail) {
             continue;
         }
-        const auto block = src_blocks[pos];
-        if (block > 0 && !isNullBlockIdx(block)) {
+        const auto block = pos < src_blocks.size() ? src_blocks[pos] : NULL_BLOCK_IDX;
+        if ((block > 0 && !isNullBlockIdx(block)) || preserve_connector_tail) {
             selected_cache_keys.push_back(key);
+            selected_dependencies.push_back(
+                pos < source_dependencies.size() ?
+                    source_dependencies[pos] :
+                    BlockDependency{false, 0, static_cast<uint32_t>(selected_dependencies.size())});
             selected_blocks.push_back(block);
+            if (block > 0 && !isNullBlockIdx(block)) {
+                referenced_blocks.push_back(block);
+            }
         }
     }
 
-    if (selected_blocks.empty()) {
+    if (referenced_blocks.empty()) {
         return nullptr;
     }
 
     if (is_connector) {
-        block_pool_->connectorReference(selected_blocks);
+        block_pool_->connectorReference(referenced_blocks);
     } else {
-        block_pool_->requestReference(selected_blocks);
+        block_pool_->requestReference(referenced_blocks);
     }
     selected_resource->mutableBlockIds(0).assign(std::move(selected_blocks));
-    selected_resource->cacheKeys() = std::move(selected_cache_keys);
+    selected_resource->setCacheKeys(std::move(selected_cache_keys));
+    selected_resource->setBlockDependencies(std::move(selected_dependencies));
 
     return selected_resource;
 }
@@ -315,7 +423,12 @@ void SingleTypeKVCacheAllocator::decrKVCacheRef(const KVCacheResource& kvcache_r
     RTP_LLM_CHECK_WITH_INFO(
         kvcache_resource.groupNums() == 1, "decrKVCacheRef expects groupNums==1, got %d", kvcache_resource.groupNums());
 
-    const auto& blocks_to_free = kvcache_resource.blocks(0);
+    BlockIndicesType blocks_to_free;
+    for (auto block : kvcache_resource.blocks(0)) {
+        if (block > 0 && !isNullBlockIdx(block)) {
+            blocks_to_free.push_back(block);
+        }
+    }
     if (!blocks_to_free.empty()) {
         if (is_connector) {
             block_pool_->connectorFree(blocks_to_free);
@@ -380,8 +493,7 @@ bool SingleTypeKVCacheAllocator::updateKVBlock(const BatchKVCacheResourcePtr& kv
     kv_cache_resource->resetAndReturnOldResources(new_batch_size, old_resources);
 
     // init for all batch
-    kv_cache_resource->initGroups(
-        1, config_.layer_all_num, config_.layer_to_group_id, config_.kernelBlocksPerKvBlock());
+    kv_cache_resource->initGroups(1, config_.layer_all_num, config_.layerGroupIdsSnapshot(), config_.kernelBlocksPerKvBlock());
 
     for (int new_batch_idx = 0; new_batch_idx < new_batch_size; ++new_batch_idx) {
         const int old_batch_idx = block_src_batch[new_batch_idx];
@@ -420,7 +532,10 @@ int SingleTypeKVCacheAllocator::seqSizePerBlock() const {
 int SingleTypeKVCacheAllocator::singleBatchNeedBlocks(const BatchKVCacheResourcePtr& batch_kv_cache_resource,
                                                       int                            seq_len,
                                                       int                            reserve_step) const {
-    return full_kv_cache_group_->needBlocksNum(seq_len, 0, reserve_step);
+    (void)batch_kv_cache_resource;
+    const int effective_seq_len =
+        (cp_slot_mapper_ && cp_slot_mapper_->isSharded()) ? cp_slot_mapper_->effectiveSeqLenForAlloc(seq_len) : seq_len;
+    return full_kv_cache_group_->needBlocksNum(effective_seq_len, 0, reserve_step);
 }
 
 }  // namespace rtp_llm
diff --git a/rtp_llm/cpp/cache/SingleTypeKVCacheAllocator.h b/rtp_llm/cpp/cache/allocator/SingleTypeKVCacheAllocator.h
similarity index 90%
rename from rtp_llm/cpp/cache/SingleTypeKVCacheAllocator.h
rename to rtp_llm/cpp/cache/allocator/SingleTypeKVCacheAllocator.h
index 28adcfe212..b0b3d39b18 100644
--- a/rtp_llm/cpp/cache/SingleTypeKVCacheAllocator.h
+++ b/rtp_llm/cpp/cache/allocator/SingleTypeKVCacheAllocator.h
@@ -1,12 +1,12 @@
 #pragma once
 
 #include <memory>
-#include "rtp_llm/cpp/cache/KVCacheAllocator.h"
-#include "rtp_llm/cpp/cache/FullKVCacheGroup.h"
+#include "rtp_llm/cpp/cache/allocator/KVCacheAllocator.h"
+#include "rtp_llm/cpp/cache/group/FullKVCacheGroup.h"
 
 namespace rtp_llm {
 
-// SingleTypedKVCacheAllocator is used for model with full attentions only
+// SingleTypeKVCacheAllocator is used for model with full attentions only
 class SingleTypeKVCacheAllocator:
     public KVCacheAllocator,
     public std::enable_shared_from_this<SingleTypeKVCacheAllocator> {
@@ -42,6 +42,7 @@ class SingleTypeKVCacheAllocator:
     MallocResult incrMalloc(const MallocInfo& malloc_info) override;
     MallocResult initMallocForCommonLen(const MallocInfo& malloc_info) override;
     int          getNeedBlocks(const MallocInfo& malloc_info) const override;
+    void         checkCPShardedMallocResult(const MallocInfo& malloc_info) const override;
     void         decrKVCacheRef(const KVCacheResource& kvcache_resource, bool is_connector = false) override;
 
 private:
diff --git a/rtp_llm/cpp/cache/config_creator/CacheConfigCreator.cc b/rtp_llm/cpp/cache/config_creator/CacheConfigCreator.cc
new file mode 100644
index 0000000000..4bf86b3ed0
--- /dev/null
+++ b/rtp_llm/cpp/cache/config_creator/CacheConfigCreator.cc
@@ -0,0 +1,317 @@
+#include "rtp_llm/cpp/cache/config_creator/CacheConfigCreator.h"
+
+#include <numeric>
+#include <algorithm>
+
+#include "rtp_llm/cpp/cache/config_creator/HybridPoolConfigCreator.h"
+#include "rtp_llm/cpp/cache/config_creator/HybridConfigCreator.h"
+#include "rtp_llm/cpp/cache/config_creator/MemoryEvaluationHelper.h"
+#include "rtp_llm/cpp/cache/config_creator/SingleConfigCreator.h"
+#include "rtp_llm/cpp/cache/spec/KVCacheSpecDesc.h"
+#include "rtp_llm/cpp/utils/Logger.h"
+#include "rtp_llm/cpp/utils/AssertUtils.h"
+
+namespace rtp_llm {
+
+namespace {
+
+size_t steppedBytes(size_t bytes, int step) {
+    return (bytes > 0 && step > 1) ? bytes / static_cast<size_t>(step) : bytes;
+}
+
+size_t nonExplicitFixedPoolHbmBytes(const CacheConfig& config) {
+    // Only independent-pool configs use per-group HBM accounting; SingleConfig
+    // and HybridConfig leave use_independent_block_pools false.
+    if (!config.use_independent_block_pools) {
+        return 0;
+    }
+
+    size_t bytes = 0;
+    for (size_t gid = 0; gid < static_cast<size_t>(config.groupNums()); ++gid) {
+        const auto& spec = config.specForGroup(gid);
+        if (spec == nullptr || !spec->isFixedCache()) {
+            continue;
+        }
+        if (!config.usesExplicitIndependentBlocks(gid)) {
+            bytes += config.blockSizeBytesForGroup(gid);
+        }
+    }
+    return bytes;
+}
+
+size_t effectivePagedBlockBytes(const CacheConfig& config, int step) {
+    return config.block_size_bytes + steppedBytes(nonExplicitFixedPoolHbmBytes(config), step);
+}
+
+void setupKernelSeqSize(CacheConfig& config, const KVCacheConfig& kv_cache_config, const char* config_name) {
+    if (kv_cache_config.kernel_seq_size_per_block > 0) {
+        const auto kernel_seq_size_per_block = static_cast<size_t>(kv_cache_config.kernel_seq_size_per_block);
+        // Generic divisibility check. Desc-based hybrid pool layouts validate
+        // their own stricter alignment during createBasicConfig().
+        RTP_LLM_CHECK_WITH_INFO(config.seq_size_per_block % kernel_seq_size_per_block == 0,
+                                "%s seq_size_per_block(%zu) must be divisible by kernel_seq_size_per_block(%zu)",
+                                config_name,
+                                config.seq_size_per_block,
+                                kernel_seq_size_per_block);
+        config.kernel_seq_size_per_block = kernel_seq_size_per_block;
+    } else if (config.kernel_seq_size_per_block == 0 || config.kernel_seq_size_per_block == config.seq_size_per_block) {
+        config.kernel_seq_size_per_block = config.seq_size_per_block;
+    }
+}
+
+uint32_t computeBlockNum(CacheConfig&                                     config,
+                         const ModelConfig&                               model_config,
+                         const RuntimeConfig&                             runtime_config,
+                         const KVCacheConfig&                             kv_cache_config,
+                         const ParallelismConfig&                         parallelism_config,
+                         const std::optional<WarmUpResult>&               warm_up_result,
+                         const std::optional<SpeculativeExecutionConfig>& sp_config) {
+    if (kv_cache_config.test_block_num > 0) {
+        RTP_LLM_LOG_INFO("KVCacheConfig explicitly specified kv cache block num %d", kv_cache_config.test_block_num);
+        config.finalizeBlockNums(kv_cache_config.test_block_num, runtime_config);
+        return static_cast<uint32_t>(kv_cache_config.test_block_num);
+    }
+
+    const auto kv_cache_mem_size = MemoryEvaluationHelper::getKVCacheMemorySize(
+        runtime_config, kv_cache_config, model_config, parallelism_config, warm_up_result, sp_config);
+    // Explicitly-sized pool reservation depends on runtime scheduler limits,
+    // so finalize it here after RuntimeConfig is available.
+    config.finalizeBlockNums(0, runtime_config);
+
+    size_t paged_budget = kv_cache_mem_size;
+    if (config.explicitly_sized_pool_reserve_bytes > 0) {
+        RTP_LLM_CHECK_WITH_INFO(kv_cache_mem_size > config.explicitly_sized_pool_reserve_bytes,
+                                "kv cache budget %zu MiB is smaller than explicitly-sized pool reservation %zu MiB "
+                                "(reduce explicitly sized pool blocks if needed)",
+                                kv_cache_mem_size / 1024 / 1024,
+                                config.explicitly_sized_pool_reserve_bytes / 1024 / 1024);
+        paged_budget = kv_cache_mem_size - config.explicitly_sized_pool_reserve_bytes;
+        RTP_LLM_LOG_INFO("kv cache: total budget %zu MiB, explicitly-sized pool reserve %zu MiB, paged budget %zu MiB",
+                         kv_cache_mem_size / 1024 / 1024,
+                         config.explicitly_sized_pool_reserve_bytes / 1024 / 1024,
+                         paged_budget / 1024 / 1024);
+    }
+    const int joint_step = std::max(1, config.linear_step);
+    return static_cast<uint32_t>(paged_budget / effectivePagedBlockBytes(config, joint_step));
+}
+
+}  // namespace
+
+LayerKVCacheSpecs CacheConfigCreator::buildLayerSpecsFromDescs(const LayerKVCacheSpecDescs& layer_descs,
+                                                               const SpecBuildContext&      ctx,
+                                                               int64_t                      expected_layer_num) {
+    RTP_LLM_CHECK_WITH_INFO(layer_descs.size() == static_cast<size_t>(expected_layer_num),
+                            "kv_cache_spec_descs size %zu != num_layers %ld",
+                            layer_descs.size(),
+                            expected_layer_num);
+    LayerKVCacheSpecs layer_specs(layer_descs.size());
+    for (size_t layer_id = 0; layer_id < layer_descs.size(); ++layer_id) {
+        const auto& descs = layer_descs[layer_id];
+        RTP_LLM_CHECK_WITH_INFO(!descs.empty(), "kv_cache_spec_descs layer %zu has no descs", layer_id);
+        auto& specs = layer_specs[layer_id];
+        specs.reserve(descs.size());
+        for (const auto& desc : descs) {
+            specs.push_back(SpecBuilder::build(desc, ctx));
+        }
+    }
+    return layer_specs;
+}
+
+CacheConfig CacheConfigCreator::createBasicConfig(const ModelConfig&       model_config,
+                                                  const ParallelismConfig& parallelism_config,
+                                                  const KVCacheConfig&     kv_cache_config,
+                                                  bool                     is_mtp,
+                                                  int                      gen_num_per_cycle) {
+    // Routing priority:
+    //   1. enable_independent_kv_cache_pools=true  → HybridPool (independent BlockPool per group)
+    //   2. enable_hybrid_attention=true             → HybridType  (shared BlockPool across groups)
+    //   3. else                                     → Single       (standard MHA/MLA path)
+    if (model_config.hybrid_attention_config.enable_independent_kv_cache_pools) {
+        return HybridPoolConfigCreator::createConfig(
+            model_config, parallelism_config, kv_cache_config, is_mtp, gen_num_per_cycle);
+    } else if (model_config.hybrid_attention_config.enable_hybrid_attention) {
+        return HybridConfigCreator::createHybridConfig(
+            model_config, parallelism_config, kv_cache_config, is_mtp, gen_num_per_cycle);
+    } else {
+        return SingleConfigCreator::createSingleConfig(
+            model_config, parallelism_config, kv_cache_config, is_mtp, gen_num_per_cycle);
+    }
+}
+
+CacheConfig CacheConfigCreator::createConfig(const ModelConfig&                               model_config,
+                                             const ParallelismConfig&                         parallelism_config,
+                                             const RuntimeConfig&                             runtime_config,
+                                             const KVCacheConfig&                             kv_cache_config,
+                                             const std::optional<WarmUpResult>&               warm_up_result,
+                                             const std::optional<SpeculativeExecutionConfig>& sp_config) {
+    CacheConfig config =
+        CacheConfigCreator::createBasicConfig(model_config, parallelism_config, kv_cache_config, false, 0);
+
+    config.linear_step = kv_cache_config.linear_step;
+    setupKernelSeqSize(config, kv_cache_config, "cache");
+
+    uint32_t block_num = computeBlockNum(config, model_config, runtime_config, kv_cache_config,
+                                         parallelism_config, warm_up_result, sp_config);
+    RTP_LLM_CHECK_WITH_INFO(block_num > 0,
+                            "kv cache needs at least 1 block but %ld, each block needs %ld MiB memory",
+                            block_num,
+                            static_cast<long>(config.block_size_bytes / 1024 / 1024));
+
+    const auto kv_cache_seq_len = static_cast<size_t>(block_num) * config.seq_size_per_block;
+    config.block_num            = static_cast<int>(block_num);
+    config.finalizeBlockNums(block_num, runtime_config);
+    RTP_LLM_LOG_INFO("kv cache block nums is %u, allows storing %ld tokens", block_num, kv_cache_seq_len);
+    if (kv_cache_seq_len < model_config.max_seq_len) {
+        RTP_LLM_LOG_WARNING("kv cache block nums %u can only store %ld tokens, less than max_seq_len %ld, "
+                            "this is dangerous, consider decrease max_seq_len",
+                            block_num,
+                            kv_cache_seq_len,
+                            model_config.max_seq_len);
+    }
+    return config;
+}
+
+CacheConfig CacheConfigCreator::createSpConfig(const ModelConfig&                 score_model_config,
+                                               const ModelConfig&                 propose_model_config,
+                                               const ParallelismConfig&           parallelism_config,
+                                               const RuntimeConfig&               runtime_config,
+                                               const KVCacheConfig&               kv_cache_config,
+                                               const SpeculativeExecutionConfig&  sp_config,
+                                               const std::optional<WarmUpResult>& warm_up_result,
+                                               bool                               is_mtp,
+                                               bool                               is_eagle) {
+    CacheConfig score_config = CacheConfigCreator::createBasicConfig(
+        score_model_config, parallelism_config, kv_cache_config, false, sp_config.gen_num_per_cycle);
+    CacheConfig propose_config = CacheConfigCreator::createBasicConfig(
+        propose_model_config, parallelism_config, kv_cache_config, is_mtp, sp_config.gen_num_per_cycle);
+
+    setupKernelSeqSize(score_config, kv_cache_config, "score");
+    setupKernelSeqSize(propose_config, kv_cache_config, "propose");
+
+    int num_mtp_modules = 1;
+    if (is_mtp) {
+        num_mtp_modules = sp_config.gen_num_per_cycle;
+        if (is_eagle) {
+            num_mtp_modules = 1;
+        }
+    }
+
+    // Fixed-pool block counts depend on runtime scheduler limits. Finalize the
+    // score and propose configs before sizing the shared paged budget so fixed
+    // state pools are accounted outside the paged KV-cache block budget.
+    score_config.finalizeBlockNums(0, runtime_config);
+    propose_config.finalizeBlockNums(0, runtime_config);
+
+    uint32_t total_layer_num = score_config.layer_num;
+    for (int i = 0; i < num_mtp_modules; ++i) {
+        total_layer_num += propose_config.layer_num;
+    }
+
+    size_t total_block_size_bytes = score_config.block_size_bytes;
+    for (int i = 0; i < num_mtp_modules; ++i) {
+        total_block_size_bytes += propose_config.block_size_bytes;
+    }
+
+    const size_t explicit_pool_reserve =
+        score_config.explicitly_sized_pool_reserve_bytes
+        + propose_config.explicitly_sized_pool_reserve_bytes * static_cast<size_t>(num_mtp_modules);
+
+    size_t block_num = 0;
+    if (kv_cache_config.test_block_num > 0) {
+        block_num = kv_cache_config.test_block_num;
+    } else {
+        const auto kv_cache_mem_size = MemoryEvaluationHelper::getKVCacheMemorySize(
+            runtime_config, kv_cache_config, score_model_config, parallelism_config, warm_up_result, sp_config);
+
+        size_t paged_budget = kv_cache_mem_size;
+        if (explicit_pool_reserve > 0) {
+            RTP_LLM_CHECK_WITH_INFO(kv_cache_mem_size > explicit_pool_reserve,
+                                    "sp kv cache budget %zu MiB is smaller than explicitly-sized pool reservation %zu MiB "
+                                    "(reduce explicitly sized pool blocks if needed)",
+                                    kv_cache_mem_size / 1024 / 1024,
+                                    explicit_pool_reserve / 1024 / 1024);
+            paged_budget = kv_cache_mem_size - explicit_pool_reserve;
+            RTP_LLM_LOG_INFO(
+                "sp kv cache: total budget %zu MiB, explicitly-sized pool reserve %zu MiB (score=%zu MiB + propose=%zu MiB x %d), paged budget %zu MiB",
+                kv_cache_mem_size / 1024 / 1024,
+                explicit_pool_reserve / 1024 / 1024,
+                score_config.explicitly_sized_pool_reserve_bytes / 1024 / 1024,
+                propose_config.explicitly_sized_pool_reserve_bytes / 1024 / 1024,
+                num_mtp_modules,
+                paged_budget / 1024 / 1024);
+        }
+
+        const int joint_step     = std::max(1, kv_cache_config.linear_step);
+        auto      effective_size = [&](const CacheConfig& cfg) -> size_t {
+            return effectivePagedBlockBytes(cfg, joint_step);
+        };
+        block_num =
+            paged_budget
+            / (effective_size(score_config) + effective_size(propose_config) * static_cast<size_t>(num_mtp_modules));
+    }
+
+    RTP_LLM_CHECK_WITH_INFO(block_num > 0, "kv cache needs at least 1 block but %zu", block_num);
+
+    CacheConfig config      = score_config;
+    config.linear_step      = std::max(1, kv_cache_config.linear_step);
+    config.layer_all_num    = total_layer_num;
+    config.block_size_bytes = total_block_size_bytes;
+    config.block_num                = block_num;
+    config.explicitly_sized_pool_reserve_bytes = explicit_pool_reserve;
+
+    const uint32_t main_layer_num = score_config.layer_num;
+    const uint32_t mtp_layer_num  = propose_config.layer_num;
+
+    // Each sub-model needs an independent CacheConfig because global_layer_ids differs per module.
+    config.mtp_sub_configs.clear();
+    config.mtp_sub_configs.reserve(num_mtp_modules);
+    config.resizeLayerRoutes(static_cast<size_t>(total_layer_num));
+    config.layer_to_block_stride_bytes.assign(static_cast<size_t>(total_layer_num), 0);
+
+    // Main(score) model per-layer stride (kv + scale).
+    // This is expected to be fully populated by createBasicConfig() (Single/Hybrid creators).
+    const size_t score_layers = static_cast<size_t>(main_layer_num);
+    RTP_LLM_CHECK_WITH_INFO(score_config.layer_to_block_stride_bytes.size() == score_layers,
+                            "score_config.layer_to_block_stride_bytes size mismatch, got=%zu need=%zu",
+                            score_config.layer_to_block_stride_bytes.size(),
+                            score_layers);
+    for (size_t l = 0; l < score_layers; ++l) {
+        config.layer_to_block_stride_bytes[l] = score_config.layer_to_block_stride_bytes[l];
+    }
+
+    for (int m = 0; m < num_mtp_modules; ++m) {
+        RTP_LLM_CHECK_WITH_INFO(propose_config.layer_to_block_stride_bytes.size() == static_cast<size_t>(mtp_layer_num),
+                                "sub_cfg.layer_to_block_stride_bytes size mismatch, got=%zu need=%u",
+                                propose_config.layer_to_block_stride_bytes.size(),
+                                mtp_layer_num);
+        auto sub_cfg = config.mergeMTPModule(propose_config, m, main_layer_num);
+        sub_cfg->finalizeBlockNums(static_cast<uint32_t>(block_num), runtime_config);
+        config.mtp_sub_configs.push_back(sub_cfg);
+    }
+
+    config.finalizeBlockNums(static_cast<uint32_t>(block_num), runtime_config);
+    config.explicitly_sized_pool_reserve_bytes = explicit_pool_reserve;
+
+    const auto kv_cache_seq_len = static_cast<size_t>(block_num) * config.seq_size_per_block;
+    RTP_LLM_LOG_INFO("CacheConfig created: is_mtp=%d, total_layers=%u, num_mtp_modules=%d, block_num=%zu, "
+                     "allows storing %zu tokens, total_block_size=%zu bytes (main=%zu + %d*propose=%zu)",
+                     is_mtp,
+                     total_layer_num,
+                     num_mtp_modules,
+                     block_num,
+                     kv_cache_seq_len,
+                     total_block_size_bytes,
+                     score_config.block_size_bytes,
+                     num_mtp_modules,
+                     propose_config.block_size_bytes);
+
+    RTP_LLM_LOG_INFO("CacheConfig debugString(main_score_model):\n%s", score_config.debugString().c_str());
+    for (size_t i = 0; i < config.mtp_sub_configs.size(); ++i) {
+        const auto& sub = config.mtp_sub_configs[i];
+        RTP_LLM_LOG_INFO("CacheConfig debugString(sub_propose_model[%zu]):\n%s", i, sub->debugString().c_str());
+    }
+
+    return config;
+}
+
+}  // namespace rtp_llm
diff --git a/rtp_llm/cpp/cache/CacheConfigCreator.h b/rtp_llm/cpp/cache/config_creator/CacheConfigCreator.h
similarity index 76%
rename from rtp_llm/cpp/cache/CacheConfigCreator.h
rename to rtp_llm/cpp/cache/config_creator/CacheConfigCreator.h
index fc52e975ed..f3abe53fa6 100644
--- a/rtp_llm/cpp/cache/CacheConfigCreator.h
+++ b/rtp_llm/cpp/cache/config_creator/CacheConfigCreator.h
@@ -6,6 +6,7 @@
 #include "absl/status/statusor.h"
 #include "rtp_llm/cpp/cache/CacheConfig.h"
 #include "rtp_llm/cpp/cache/WarmUpResult.h"
+#include "rtp_llm/cpp/cache/spec/KVCacheSpecDesc.h"
 #include "rtp_llm/cpp/config/ConfigModules.h"
 #include "rtp_llm/cpp/config/ModelConfig.h"
 
@@ -15,7 +16,9 @@ class CacheConfigCreator {
 public:
     static CacheConfig createBasicConfig(const ModelConfig&       model_config,
                                          const ParallelismConfig& parallelism_config,
-                                         bool                     is_mtp = false);
+                                         const KVCacheConfig&     kv_cache_config,
+                                         bool                     is_mtp,
+                                         int                      gen_num_per_cycle);
     static CacheConfig createConfig(const ModelConfig&                               model_config,
                                     const ParallelismConfig&                         parallelism_config,
                                     const RuntimeConfig&                             runtime_config,
@@ -32,15 +35,12 @@ class CacheConfigCreator {
                                       bool                               is_mtp,
                                       bool                               is_eagle);
 
-private:
-    // Removed functions moved to MemoryEvaluationHelper:
-    // getDefaultRuntimeMemorySize
-    // getKVCacheMemorySize
+    // Unified desc->spec conversion. Callers provide the runtime build context;
+    // descs remain read-only.
+    static LayerKVCacheSpecs buildLayerSpecsFromDescs(const LayerKVCacheSpecDescs& layer_descs,
+                                                      const SpecBuildContext&      ctx,
+                                                      int64_t                      expected_layer_num);
 
-    // Removed functions moved to dedicated creators:
-    // createSingleConfig
-    // createHybridConfig
-    // splitIntoGroups (moved to HybridConfigCreator)
 };
 
 }  // namespace rtp_llm
diff --git a/rtp_llm/cpp/cache/config_creator/HybridConfigCreator.cc b/rtp_llm/cpp/cache/config_creator/HybridConfigCreator.cc
new file mode 100644
index 0000000000..93830012fb
--- /dev/null
+++ b/rtp_llm/cpp/cache/config_creator/HybridConfigCreator.cc
@@ -0,0 +1,305 @@
+#include "rtp_llm/cpp/cache/config_creator/HybridConfigCreator.h"
+
+#include <numeric>
+
+#include "rtp_llm/cpp/cache/spec/KVCacheSpec.h"
+#include "rtp_llm/cpp/cache/config_creator/CacheConfigCreator.h"
+#include "rtp_llm/cpp/cache/config_creator/MemoryEvaluationHelper.h"
+
+namespace rtp_llm {
+
+std::vector<std::vector<int>> HybridConfigCreator::splitIntoGroups(const std::vector<int>& ids, int group_layer_num) {
+    std::vector<std::vector<int>> groups;
+    if (ids.empty()) {
+        return groups;
+    }
+    const int n = static_cast<int>(ids.size());
+    const int s = std::max(group_layer_num, 1);
+    groups.reserve((n + s - 1) / s);
+    for (int i = 0; i < n; i += s) {
+        const int end = std::min(i + s, n);
+        groups.emplace_back(ids.begin() + i, ids.begin() + end);
+    }
+    return groups;
+}
+
+int HybridConfigCreator::calculateGroupLayerNum(int linear_layer_count, int full_layer_count) {
+    int group_layer_num = 0;
+    if (linear_layer_count > 0 && full_layer_count > 0) {
+        group_layer_num = std::gcd(linear_layer_count, full_layer_count);
+    } else {
+        group_layer_num = std::max(linear_layer_count, full_layer_count);
+    }
+    group_layer_num = std::max(group_layer_num, 1);
+    return group_layer_num;
+}
+
+std::pair<std::vector<int>, std::vector<int>>
+HybridConfigCreator::splitLayersByAttentionType(const ModelConfig& model_config) {
+    int64_t layer_num = model_config.num_layers;
+    RTP_LLM_CHECK_WITH_INFO(layer_num > 0, "invalid model_config.num_layers=%ld", layer_num);
+
+    std::vector<int> linear_layers;
+    std::vector<int> full_layers;
+    linear_layers.reserve(layer_num);
+    full_layers.reserve(layer_num);
+
+    const auto& types = model_config.hybrid_attention_config.hybrid_attention_types;
+    RTP_LLM_CHECK_WITH_INFO(types.size() == static_cast<size_t>(layer_num),
+                            "hybrid_attention_types size %zu != num_layers %ld",
+                            types.size(),
+                            layer_num);
+    for (int i = 0; i < static_cast<int>(layer_num); ++i) {
+        if (types[static_cast<size_t>(i)] == HybridAttentionType::LINEAR) {
+            linear_layers.push_back(i);
+        } else {
+            full_layers.push_back(i);
+        }
+    }
+
+    return std::make_pair(std::move(linear_layers), std::move(full_layers));
+}
+
+CacheConfig HybridConfigCreator::initializeConfig(const ModelConfig&      model_config,
+                                                  const std::vector<int>& linear_layers,
+                                                  const std::vector<int>& full_layers,
+                                                  rtp_llm::DataType       dtype) {
+    int64_t layer_num = model_config.num_layers;
+
+    CacheConfig config;
+    config.layer_num          = static_cast<uint32_t>(layer_num);
+    config.layer_all_num      = static_cast<uint32_t>(layer_num);
+    config.block_num          = 0;
+    config.seq_size_per_block = static_cast<uint32_t>(model_config.attn_config.tokens_per_block);
+    config.use_mla            = model_config.attn_config.use_mla;
+    config.dtype              = dtype;
+    config.linear_step        = 1;
+
+    return config;
+}
+
+KVCacheSpecPtr HybridConfigCreator::getSpecFromLayers(const LayerKVCacheSpecs& runtime_specs,
+                                                      const std::vector<int>&  layer_ids,
+                                                      const char*              spec_role) {
+    KVCacheSpecPtr result;
+    std::string    fingerprint;
+    for (int layer_id : layer_ids) {
+        RTP_LLM_CHECK_WITH_INFO(static_cast<size_t>(layer_id) < runtime_specs.size()
+                                    && !runtime_specs[static_cast<size_t>(layer_id)].empty(),
+                                "missing runtime kv_cache specs for %s layer %d",
+                                spec_role,
+                                layer_id);
+        RTP_LLM_CHECK_WITH_INFO(runtime_specs[static_cast<size_t>(layer_id)].size() == 1,
+                                "%s layer %d must have exactly one runtime kv_cache spec, got %zu",
+                                spec_role,
+                                layer_id,
+                                runtime_specs[static_cast<size_t>(layer_id)].size());
+        const auto& spec = runtime_specs[static_cast<size_t>(layer_id)][0];
+        RTP_LLM_CHECK_WITH_INFO(spec != nullptr, "%s layer %d has null kv_cache spec", spec_role, layer_id);
+        if (result == nullptr) {
+            result      = spec;
+            fingerprint = spec->fingerprint();
+        } else {
+            RTP_LLM_CHECK_WITH_INFO(fingerprint == spec->fingerprint(),
+                                    "%s layers have different kv_cache spec fingerprints",
+                                    spec_role);
+        }
+    }
+    RTP_LLM_CHECK_WITH_INFO(result != nullptr, "no %s layers found", spec_role);
+    return result->clone();
+}
+
+void HybridConfigCreator::prepareFullAttentionSpec(KVCacheSpecPtr            spec,
+                                                   const ModelConfig&       model_config,
+                                                   const ParallelismConfig& parallelism_config,
+                                                   rtp_llm::DataType        dtype,
+                                                   uint32_t                 layer_num) {
+    if (model_config.attn_config.use_mla && model_config.mla_ops_type != rtp_llm::MlaOpsType::MHA) {
+        auto* mla_spec = dynamic_cast<MLAKVCacheSpec*>(spec.get());
+        RTP_LLM_CHECK_WITH_INFO(mla_spec != nullptr && spec->type == KVCacheSpecType::MultiHeadLatentAttention,
+                                "full kv_cache spec must be MLAKVCacheSpec for MLA model");
+        // local_head_num_kv is already set to 1 by Python-side MLAKVCacheSpec default.
+        // kv_lora_rank, rope_head_dim, seq_size_per_block are already populated by Python.
+    } else {
+        auto* mha_spec = dynamic_cast<MHAKVCacheSpec*>(spec.get());
+        RTP_LLM_CHECK_WITH_INFO(mha_spec != nullptr && spec->type == KVCacheSpecType::MultiHeadAttention,
+                                "full kv_cache spec must be MHAKVCacheSpec for MHA/GQA model");
+        // local_head_num_kv depends on TP and cannot be provided by Python-side spec.
+        spec->local_head_num_kv = static_cast<uint32_t>(
+            (model_config.attn_config.kv_head_num % parallelism_config.get_attn_tp_size() == 0) ?
+                model_config.attn_config.kv_head_num / parallelism_config.get_attn_tp_size() :
+                model_config.attn_config.kv_head_num
+                    / std::gcd(model_config.attn_config.kv_head_num, parallelism_config.get_attn_tp_size()));
+        // size_per_head, seq_size_per_block are already populated by Python.
+    }
+    // dtype depends on runtime quantization config and cannot be provided by Python-side spec.
+    spec->dtype = dtype;
+}
+
+void HybridConfigCreator::prepareLinearAttentionSpec(KVCacheSpecPtr            spec,
+                                                     const ModelConfig&       model_config,
+                                                     const ParallelismConfig& parallelism_config,
+                                                     rtp_llm::DataType        dtype,
+                                                     uint32_t                 layer_num) {
+    auto* linear_spec = dynamic_cast<LinearKVCacheSpec*>(spec.get());
+    RTP_LLM_CHECK_WITH_INFO(linear_spec != nullptr && spec->type == KVCacheSpecType::LinearAttention,
+                            "linear kv_cache spec must be LinearKVCacheSpec");
+
+    const auto& linear_config = model_config.linear_attention_config;
+    RTP_LLM_CHECK_WITH_INFO(linear_config.linear_key_head_dim > 0 && linear_config.linear_value_head_dim > 0,
+                            "invalid linear head dim");
+    RTP_LLM_CHECK_WITH_INFO(linear_config.linear_conv_kernel_dim > 1,
+                            "invalid linear_conv_kernel_dim=%d",
+                            linear_config.linear_conv_kernel_dim);
+    RTP_LLM_CHECK_WITH_INFO(linear_config.linear_num_key_heads > 0 && linear_config.linear_num_value_heads > 0,
+                            "invalid linear heads");
+    RTP_LLM_CHECK_WITH_INFO(linear_config.linear_key_head_dim == linear_config.linear_value_head_dim,
+                            "linear head dims must match (current impl): k=%d v=%d",
+                            linear_config.linear_key_head_dim,
+                            linear_config.linear_value_head_dim);
+
+    // local_num_k_heads, local_num_v_heads, and local_head_num_kv depend on TP
+    // and cannot be provided by Python-side spec.
+    const int tp = std::max(1, static_cast<int>(parallelism_config.get_attn_tp_size()));
+    linear_spec->local_num_k_heads = static_cast<uint32_t>(linear_config.linear_num_key_heads / tp);
+    linear_spec->local_num_v_heads = static_cast<uint32_t>(linear_config.linear_num_value_heads / tp);
+    RTP_LLM_CHECK_WITH_INFO(linear_spec->local_num_k_heads > 0 && linear_spec->local_num_v_heads > 0,
+                            "invalid local heads for linear attention: k=%d v=%d tp=%d",
+                            linear_spec->local_num_k_heads,
+                            linear_spec->local_num_v_heads,
+                            tp);
+    spec->local_head_num_kv = static_cast<uint32_t>(std::max(
+        1,
+        (linear_config.linear_num_value_heads > 1) ?
+            static_cast<int>(linear_config.linear_num_value_heads / parallelism_config.get_attn_tp_size()) :
+            static_cast<int>(linear_config.linear_num_value_heads)));
+    // dtype depends on runtime quantization config and cannot be provided by Python-side spec.
+    spec->dtype = dtype;
+    // seq_size_per_block, head_k_dim, head_v_dim, conv_kernel_dim,
+    // ssm_state_dtype, conv_state_dtype are already populated by Python.
+}
+
+std::pair<std::vector<std::vector<int>>, std::vector<std::vector<int>>> HybridConfigCreator::createLayerGroups(
+    const std::vector<int>& linear_layers, const std::vector<int>& full_layers, int& group_layer_num) {
+    const int linear_cnt = static_cast<int>(linear_layers.size());
+    const int full_cnt   = static_cast<int>(full_layers.size());
+    group_layer_num      = HybridConfigCreator::calculateGroupLayerNum(linear_cnt, full_cnt);
+
+    const auto linear_groups = HybridConfigCreator::splitIntoGroups(linear_layers, group_layer_num);
+    const auto full_groups   = HybridConfigCreator::splitIntoGroups(full_layers, group_layer_num);
+
+    return std::make_pair(std::move(linear_groups), std::move(full_groups));
+}
+
+void HybridConfigCreator::setupCacheConfigSpecs(CacheConfig&                         config,
+                                                const std::vector<std::vector<int>>& linear_groups,
+                                                const std::vector<std::vector<int>>& full_groups,
+                                                const KVCacheSpecPtr&                linear_spec,
+                                                const KVCacheSpecPtr&                full_spec) {
+    std::vector<GroupBase> groups;
+    std::vector<LayerBase> layers(static_cast<size_t>(config.layer_num));
+
+    auto append_group = [&](const KVCacheSpecPtr& spec, CacheGroupType type, const std::vector<int>& layer_ids) {
+        GroupBase group;
+        group.spec      = spec;
+        group.policy    = defaultCacheGroupPolicy(type);
+        group.layer_ids = layer_ids;
+        const int gid   = static_cast<int>(groups.size());
+        groups.push_back(group);
+        for (int layer_id : layer_ids) {
+            auto& layer = layers[static_cast<size_t>(layer_id)];
+            layer.group_ids.push_back(gid);
+            layer.tag_to_gid[spec->tag] = gid;
+        }
+    };
+
+    // Keep order: all full groups first, then linear groups.
+    for (const auto& g : full_groups) {
+        append_group(full_spec, CacheGroupType::FULL, g);
+    }
+    for (const auto& g : linear_groups) {
+        append_group(linear_spec, CacheGroupType::LINEAR, g);
+    }
+    config.setTopology(std::move(groups), std::move(layers));
+}
+
+void HybridConfigCreator::setupPhysicalSizes(CacheConfig&          config,
+                                             const KVCacheSpecPtr& full_spec,
+                                             const KVCacheSpecPtr& linear_spec) {
+    // Decide the physical KV block/scale sizes by taking max between full and linear specs.
+    const size_t full_kv_block_stride_bytes   = full_spec->block_size_bytes();
+    const size_t linear_kv_block_stride_bytes = linear_spec->block_size_bytes();
+
+    // now we only support that linear attention block have padding
+    RTP_LLM_CHECK_WITH_INFO(full_kv_block_stride_bytes >= linear_kv_block_stride_bytes,
+                            "not support full attention with padding now");
+
+    config.kv_block_stride_bytes = full_kv_block_stride_bytes;
+    config.kv_block_size_bytes   = static_cast<size_t>(config.group_layer_num) * config.kv_block_stride_bytes;
+    config.kv_scale_stride_bytes = full_spec->scale_block_size_bytes();
+    config.kv_scale_size_bytes   = static_cast<size_t>(config.group_layer_num) * config.kv_scale_stride_bytes;
+    config.block_size_bytes      = config.kv_block_size_bytes + config.kv_scale_size_bytes;
+}
+
+CacheConfig HybridConfigCreator::createHybridConfig(const ModelConfig&       model_config,
+                                                    const ParallelismConfig& parallelism_config,
+                                                    const KVCacheConfig&     kv_cache_config,
+                                                    bool                     is_mtp,
+                                                    int                      gen_num_per_cycle) {
+    (void)is_mtp;
+    auto dtype = MemoryEvaluationHelper::getDataTypeForCache(model_config);
+    const auto physical_tokens_per_block =
+        kv_cache_config.seq_size_per_block > 0 ? static_cast<uint32_t>(kv_cache_config.seq_size_per_block) :
+                                                 static_cast<uint32_t>(model_config.attn_config.tokens_per_block);
+    const auto kernel_tokens_per_block =
+        kv_cache_config.kernel_seq_size_per_block > 0 ? static_cast<uint32_t>(kv_cache_config.kernel_seq_size_per_block) :
+                                                        physical_tokens_per_block;
+    RTP_LLM_CHECK_WITH_INFO(physical_tokens_per_block > 0, "hybrid seq_size_per_block must be > 0");
+    RTP_LLM_CHECK_WITH_INFO(kernel_tokens_per_block > 0, "hybrid kernel_seq_size_per_block must be > 0");
+    SpecBuildContext ctx;
+    ctx.dtype                   = dtype;
+    ctx.seq_size_per_block      = physical_tokens_per_block;
+    ctx.attn_tp_size            = static_cast<uint32_t>(parallelism_config.get_attn_tp_size());
+    ctx.kernel_tokens_per_block = kernel_tokens_per_block;
+    ctx.gen_num_per_cycle       = static_cast<uint32_t>(gen_num_per_cycle);
+    const auto runtime_specs =
+        CacheConfigCreator::buildLayerSpecsFromDescs(model_config.kv_cache_spec_descs, ctx, model_config.num_layers);
+
+    // Split layers by attention type
+    auto [linear_layers, full_layers] = HybridConfigCreator::splitLayersByAttentionType(model_config);
+
+    // Initialize config
+    CacheConfig config = HybridConfigCreator::initializeConfig(model_config, linear_layers, full_layers, dtype);
+    config.seq_size_per_block        = physical_tokens_per_block;
+    config.kernel_seq_size_per_block = kernel_tokens_per_block;
+
+    auto full_spec   = HybridConfigCreator::getSpecFromLayers(runtime_specs, full_layers, "full attention");
+    auto linear_spec = HybridConfigCreator::getSpecFromLayers(runtime_specs, linear_layers, "linear attention");
+
+    // Create layer groups and calculate group layer number
+    int group_layer_num = 0;
+    auto [linear_groups, full_groups] =
+        HybridConfigCreator::createLayerGroups(linear_layers, full_layers, group_layer_num);
+    config.group_layer_num = group_layer_num;
+
+    HybridConfigCreator::prepareFullAttentionSpec(
+        full_spec, model_config, parallelism_config, dtype, static_cast<uint32_t>(full_layers.size()));
+    HybridConfigCreator::prepareLinearAttentionSpec(
+        linear_spec, model_config, parallelism_config, dtype, static_cast<uint32_t>(linear_layers.size()));
+
+    // Setup cache config specs
+    HybridConfigCreator::setupCacheConfigSpecs(config, linear_groups, full_groups, linear_spec, full_spec);
+
+    // Setup physical sizes
+    HybridConfigCreator::setupPhysicalSizes(config, full_spec, linear_spec);
+
+    // Per-layer block stride (kv + scale).
+    // For hybrid attention, the physical per-layer stride follows the selected physical layout stride.
+    const size_t per_layer_stride_bytes = config.kv_block_stride_bytes + config.kv_scale_stride_bytes;
+    config.layer_to_block_stride_bytes.assign(static_cast<size_t>(config.layer_all_num),
+                                              static_cast<int>(per_layer_stride_bytes));
+
+    return config;
+}
+
+}  // namespace rtp_llm
diff --git a/rtp_llm/cpp/cache/HybridConfigCreator.h b/rtp_llm/cpp/cache/config_creator/HybridConfigCreator.h
similarity index 59%
rename from rtp_llm/cpp/cache/HybridConfigCreator.h
rename to rtp_llm/cpp/cache/config_creator/HybridConfigCreator.h
index c8cf684d01..e4542a373e 100644
--- a/rtp_llm/cpp/cache/HybridConfigCreator.h
+++ b/rtp_llm/cpp/cache/config_creator/HybridConfigCreator.h
@@ -3,7 +3,9 @@
 #include <memory>
 #include <vector>
 #include <utility>
+#include <string>
 #include "rtp_llm/cpp/cache/CacheConfig.h"
+#include "rtp_llm/cpp/config/ConfigModules.h"
 #include "rtp_llm/cpp/config/ModelConfig.h"
 
 namespace rtp_llm {
@@ -12,7 +14,9 @@ class HybridConfigCreator {
 public:
     static CacheConfig                   createHybridConfig(const ModelConfig&       model_config,
                                                             const ParallelismConfig& parallelism_config,
-                                                            bool                     is_mtp = false);
+                                                            const KVCacheConfig&     kv_cache_config,
+                                                            bool                     is_mtp = false,
+                                                            int                      gen_num_per_cycle = 0);
     static std::vector<std::vector<int>> splitIntoGroups(const std::vector<int>& ids, int group_layer_num);
 
     // Calculate the number of layers per group based on linear and full layers count
@@ -25,12 +29,19 @@ class HybridConfigCreator {
                                                                           const std::vector<int>& linear_layers,
                                                                           const std::vector<int>& full_layers,
                                                                           rtp_llm::DataType       dtype);
-    static KVCacheSpecPtr                                createFullAttentionSpec(const ModelConfig&       model_config,
-                                                                                 const ParallelismConfig& parallelism_config,
-                                                                                 rtp_llm::DataType        dtype);
-    static KVCacheSpecPtr                                createLinearAttentionSpec(const ModelConfig&       model_config,
-                                                                                   const ParallelismConfig& parallelism_config,
-                                                                                   rtp_llm::DataType        dtype);
+    static KVCacheSpecPtr getSpecFromLayers(const LayerKVCacheSpecs& runtime_specs,
+                                            const std::vector<int>&  layer_ids,
+                                            const char*              spec_role);
+    static void           prepareFullAttentionSpec(KVCacheSpecPtr            spec,
+                                                   const ModelConfig&       model_config,
+                                                   const ParallelismConfig& parallelism_config,
+                                                   rtp_llm::DataType        dtype,
+                                                   uint32_t                 layer_num);
+    static void           prepareLinearAttentionSpec(KVCacheSpecPtr            spec,
+                                                     const ModelConfig&       model_config,
+                                                     const ParallelismConfig& parallelism_config,
+                                                     rtp_llm::DataType        dtype,
+                                                     uint32_t                 layer_num);
     static std::pair<std::vector<std::vector<int>>, std::vector<std::vector<int>>>
     createLayerGroups(const std::vector<int>& linear_layers, const std::vector<int>& full_layers, int& group_layer_num);
     static void setupCacheConfigSpecs(CacheConfig&                         config,
@@ -40,7 +51,6 @@ class HybridConfigCreator {
                                       const KVCacheSpecPtr&                full_spec);
     static void
     setupPhysicalSizes(CacheConfig& config, const KVCacheSpecPtr& full_spec, const KVCacheSpecPtr& linear_spec);
-    static void setupLayerToGroupMapping(CacheConfig& config);
 };
 
-}  // namespace rtp_llm
\ No newline at end of file
+}  // namespace rtp_llm
diff --git a/rtp_llm/cpp/cache/config_creator/HybridPoolConfigCreator.cc b/rtp_llm/cpp/cache/config_creator/HybridPoolConfigCreator.cc
new file mode 100644
index 0000000000..befe99de4f
--- /dev/null
+++ b/rtp_llm/cpp/cache/config_creator/HybridPoolConfigCreator.cc
@@ -0,0 +1,370 @@
+#include "rtp_llm/cpp/cache/config_creator/HybridPoolConfigCreator.h"
+
+#include <algorithm>
+#include <map>
+#include <set>
+#include <utility>
+
+#include "rtp_llm/cpp/cache/spec/KVCacheSpec.h"
+#include "rtp_llm/cpp/cache/spec/KVCacheSpecDesc.h"
+#include "rtp_llm/cpp/cache/config_creator/CacheConfigCreator.h"
+#include "rtp_llm/cpp/cache/config_creator/MemoryEvaluationHelper.h"
+#include "rtp_llm/cpp/utils/AssertUtils.h"
+
+namespace rtp_llm {
+
+namespace {
+
+uint32_t fixedRegionCpSize(const ParallelismConfig& parallelism_config) {
+    if (!parallelism_config.prefill_cp_config.kv_cache_sharded) {
+        return 1;
+    }
+    if (parallelism_config.role_type == RoleType::PREFILL && parallelism_config.tp_size > 1) {
+        return static_cast<uint32_t>(parallelism_config.tp_size);
+    }
+    if (parallelism_config.role_type == RoleType::DECODE && parallelism_config.prefill_cp_config.is_prefill_enabled()) {
+        RTP_LLM_CHECK_WITH_INFO(
+            parallelism_config.prefill_cp_config.prefill_cp_size > 1,
+            "fixed/SWA CP sharding decode requires explicit prefill_cp_size when PREFILL_CP and kv_cache_sharded are enabled");
+        return static_cast<uint32_t>(parallelism_config.prefill_cp_config.prefill_cp_size);
+    }
+    return 1;
+}
+
+bool isPrefillCpSliced(const ParallelismConfig& parallelism_config) {
+    return parallelism_config.role_type == RoleType::PREFILL && fixedRegionCpSize(parallelism_config) > 1;
+}
+
+CacheGroupPolicy policyFromSpecDesc(const KVCacheSpecDesc& desc) {
+    CacheGroupPolicy policy = defaultCacheGroupPolicy(SpecBuilder::groupType(desc));
+    if (desc.is_state_cache) {
+        policy.evict_policy = CacheEvictPolicy::INDEPENDENT;
+    }
+    if (desc.skip_prefix_reuse) {
+        policy.reuse_policy         = CacheReusePolicy::NON_REUSABLE;
+        policy.active_tail_blocks   = 1;
+        policy.validate_tail_blocks = false;
+    }
+    if (desc.has_reuse_policy) {
+        policy.reuse_policy = desc.reuse_policy;
+    }
+    if (desc.has_evict_policy) {
+        policy.evict_policy = desc.evict_policy;
+    }
+    if (desc.has_active_tail_blocks) {
+        policy.active_tail_blocks = desc.active_tail_blocks;
+    }
+    if (desc.has_validate_tail_blocks) {
+        policy.validate_tail_blocks = desc.validate_tail_blocks;
+    }
+    policy.explicit_block_num        = desc.extra.explicit_block_num;
+    policy.reserve_from_paged_budget = desc.extra.reserve_from_paged_budget;
+    if (desc.has_prefix_reusable) {
+        policy.prefix_reusable = desc.prefix_reusable;
+    }
+    policy.uses_pinned_cpu_backing = desc.uses_pinned_cpu_backing;
+    if (desc.has_is_cp_shardable) {
+        policy.is_cp_shardable = desc.is_cp_shardable;
+    }
+    if (desc.cache_type == CacheType::COMPRESSED_KV) {
+        policy.has_sparse_slots = true;
+    }
+    if (desc.has_sparse_slots) {
+        policy.has_sparse_slots = desc.sparse_slots;
+    }
+    if (desc.has_kernel_block_subdiv) {
+        policy.has_kernel_block_subdiv = desc.kernel_block_subdiv;
+    }
+    if (desc.has_cp_compact_tail_blocks) {
+        policy.cp_compact_tail_blocks = desc.cp_compact_tail_blocks;
+    }
+    if (desc.has_is_reservable) {
+        policy.is_reservable = desc.is_reservable;
+    }
+    return policy;
+}
+
+void validateHybridPoolDescs(const ModelConfig& model_config,
+                             uint32_t           kernel_tokens_per_block,
+                             int                gen_num_per_cycle) {
+    RTP_LLM_CHECK_WITH_INFO(model_config.kv_cache_spec_descs.size() == static_cast<size_t>(model_config.num_layers),
+                            "hybrid-pool desc config requires layer-wise kv_cache_spec_descs for every layer, got %zu/%ld",
+                            model_config.kv_cache_spec_descs.size(),
+                            model_config.num_layers);
+    RTP_LLM_CHECK_WITH_INFO(gen_num_per_cycle >= 0,
+                            "hybrid-pool desc config requires non-negative gen_num_per_cycle, got %d",
+                            gen_num_per_cycle);
+
+    for (int64_t layer_id = 0; layer_id < model_config.num_layers; ++layer_id) {
+        const auto& layer_descs = model_config.kv_cache_spec_descs[static_cast<size_t>(layer_id)];
+        RTP_LLM_CHECK_WITH_INFO(!layer_descs.empty(),
+                                "hybrid-pool desc config layer %ld has no descs",
+                                layer_id);
+        for (const auto& desc : layer_descs) {
+            RTP_LLM_CHECK_WITH_INFO(
+                desc.cache_type != CacheType::MHA || desc.num_kv_heads > 0,
+                "hybrid-pool MHA desc tag=%s missing num_kv_heads (must be set by Python model)",
+                desc.tag.c_str());
+            RTP_LLM_CHECK_WITH_INFO(
+                desc.cache_type != CacheType::LINEAR || (desc.num_k_heads > 0 && desc.num_v_heads > 0),
+                "hybrid-pool LINEAR desc tag=%s missing num_k_heads/num_v_heads (must be set by Python model)",
+                desc.tag.c_str());
+            if (desc.extra.derive_entries_from_kernel_block) {
+                RTP_LLM_CHECK_WITH_INFO(desc.compression_ratio > 0,
+                                        "desc tag=%s derives entries from kernel block but has invalid compression_ratio=%u",
+                                        desc.tag.c_str(),
+                                        desc.compression_ratio);
+                RTP_LLM_CHECK_WITH_INFO(kernel_tokens_per_block > 0,
+                                        "desc tag=%s derives entries from kernel block but kernel_tokens_per_block is 0",
+                                        desc.tag.c_str());
+                RTP_LLM_CHECK_WITH_INFO(kernel_tokens_per_block % desc.compression_ratio == 0,
+                                        "desc tag=%s compression_ratio=%u must divide kernel block %u",
+                                        desc.tag.c_str(),
+                                        desc.compression_ratio,
+                                        kernel_tokens_per_block);
+            }
+            if (desc.extra.state_ring_compression_ratio > 0) {
+                RTP_LLM_CHECK_WITH_INFO(desc.extra.state_ring_compression_ratio > 0,
+                                        "state ring desc tag=%s requires positive state_ring_compression_ratio",
+                                        desc.tag.c_str());
+            }
+        }
+    }
+}
+
+void populateGroupsFromLayerSpecs(CacheConfig&                  config,
+                                  const LayerKVCacheSpecDescs& layer_descs,
+                                  const LayerKVCacheSpecs&     layer_specs) {
+    RTP_LLM_CHECK_WITH_INFO(layer_descs.size() == static_cast<size_t>(config.layer_num),
+                            "hybrid-pool layer desc count %zu != layer_num %u",
+                            layer_descs.size(),
+                            config.layer_num);
+    RTP_LLM_CHECK_WITH_INFO(layer_specs.size() == static_cast<size_t>(config.layer_num),
+                            "hybrid-pool layer spec count %zu != layer_num %u",
+                            layer_specs.size(),
+                            config.layer_num);
+
+    struct GroupBuildState {
+        KVCacheSpecPtr   spec;
+        std::string      fingerprint;
+        CacheGroupType   type;
+        CacheGroupPolicy policy;
+        std::vector<int> layers;
+    };
+
+    std::map<std::string, GroupBuildState> group_by_tag;
+    std::vector<std::string>               ordered_tags;
+
+    for (uint32_t layer = 0; layer < config.layer_num; ++layer) {
+        const auto& descs = layer_descs[layer];
+        const auto& specs = layer_specs[layer];
+        RTP_LLM_CHECK_WITH_INFO(!descs.empty(), "hybrid-pool layer %u has no descs", layer);
+        RTP_LLM_CHECK_WITH_INFO(descs.size() == specs.size(),
+                                "hybrid-pool layer %u desc count %zu != spec count %zu",
+                                layer,
+                                descs.size(),
+                                specs.size());
+        std::set<std::string> layer_tags;
+        for (size_t idx = 0; idx < descs.size(); ++idx) {
+            const auto& desc = descs[idx];
+            const auto& spec = specs[idx];
+            RTP_LLM_CHECK_WITH_INFO(spec != nullptr, "hybrid-pool layer %u has null spec", layer);
+            RTP_LLM_CHECK_WITH_INFO(layer_tags.insert(spec->tag).second,
+                                    "hybrid-pool layer %u has duplicate tag=%s",
+                                    layer,
+                                    spec->tag.c_str());
+            const auto policy   = policyFromSpecDesc(desc);
+            const auto type     = SpecBuilder::groupType(desc);
+            auto       group_it = group_by_tag.find(spec->tag);
+            if (group_it == group_by_tag.end()) {
+                GroupBuildState state;
+                state.spec        = spec;
+                state.fingerprint = spec->fingerprint();
+                state.type        = type;
+                state.policy      = policy;
+                group_it          = group_by_tag.emplace(spec->tag, std::move(state)).first;
+                ordered_tags.push_back(spec->tag);
+            } else {
+                RTP_LLM_CHECK_WITH_INFO(group_it->second.fingerprint == spec->fingerprint(),
+                                        "hybrid-pool tag=%s has multiple physical prototypes",
+                                        spec->tag.c_str());
+                RTP_LLM_CHECK_WITH_INFO(group_it->second.type == type,
+                                        "hybrid-pool tag=%s has inconsistent group type",
+                                        spec->tag.c_str());
+                RTP_LLM_CHECK_WITH_INFO(CacheConfig::samePolicy(group_it->second.policy, policy),
+                                        "hybrid-pool tag=%s has inconsistent policy",
+                                        spec->tag.c_str());
+            }
+            group_it->second.layers.push_back(static_cast<int>(layer));
+        }
+    }
+
+    std::vector<GroupBase> groups;
+    std::vector<LayerBase> layers(static_cast<size_t>(config.layer_num));
+    groups.reserve(ordered_tags.size());
+    for (const auto& tag : ordered_tags) {
+        const auto& state = group_by_tag.at(tag);
+        GroupBase   group;
+        group.spec      = state.spec;
+        group.policy    = state.policy;
+        group.layer_ids = state.layers;
+        const int gid   = static_cast<int>(groups.size());
+        groups.push_back(group);
+        for (int layer_id : state.layers) {
+            auto& layer = layers[static_cast<size_t>(layer_id)];
+            layer.group_ids.push_back(gid);
+            layer.tag_to_gid[tag] = gid;
+        }
+    }
+    config.setTopology(std::move(groups), std::move(layers));
+}
+
+size_t kernelBlocksPerKvBlockForGroup(const CacheConfig& config, size_t group_id) {
+    const bool is_full = config.typeForGroup(group_id) == CacheGroupType::FULL;
+    return is_full ? config.kernelBlocksPerKvBlock() : 1;
+}
+
+void setupIndependentPoolSizes(CacheConfig& config, bool is_mtp) {
+    config.use_independent_block_pools = true;
+    const auto group_num               = static_cast<size_t>(config.groupNums());
+    std::vector<uint32_t> group_block_nums(group_num, 0);
+    config.group_seq_size_per_block.resize(group_num, config.seq_size_per_block);
+    std::vector<size_t> group_kv_block_stride_bytes(group_num, 0);
+    std::vector<size_t> group_kv_scale_stride_bytes(group_num, 0);
+
+    size_t   max_kv_stride           = 0;
+    size_t   max_scale_stride        = 0;
+    size_t   total_kv_block_bytes    = 0;
+    size_t   total_scale_block_bytes = 0;
+    uint32_t max_group_layers        = 0;
+
+    config.layer_to_block_stride_bytes.assign(config.layer_all_num, 0);
+    for (size_t gid = 0; gid < group_num; ++gid) {
+        const auto& spec = config.specForGroup(gid);
+        RTP_LLM_CHECK_WITH_INFO(spec != nullptr, "cache_specs[%zu] is null", gid);
+        const auto   layer_count                = static_cast<uint32_t>(config.layerIdsForGroup(gid).size());
+        const size_t kernel_kv_stride           = spec->block_size_bytes();
+        const auto   kernel_scale               = spec->scale_block_size_bytes();
+        const size_t group_bpk                  = kernelBlocksPerKvBlockForGroup(config, gid);
+        const size_t kv_stride                  = kernel_kv_stride * group_bpk;
+        const size_t scale_stride               = kernel_scale * group_bpk;
+        group_kv_block_stride_bytes[gid]        = kv_stride;
+        group_kv_scale_stride_bytes[gid]        = scale_stride;
+        const auto type     = config.typeForGroup(gid);
+        const bool is_state = spec->is_state_cache;
+        if (!is_state && type == CacheGroupType::FULL) {
+            total_kv_block_bytes += static_cast<size_t>(layer_count) * kv_stride;
+            total_scale_block_bytes += static_cast<size_t>(layer_count) * scale_stride;
+        }
+        max_kv_stride    = std::max(max_kv_stride, kv_stride);
+        max_scale_stride = std::max(max_scale_stride, scale_stride);
+        max_group_layers = std::max(max_group_layers, layer_count);
+
+        for (int layer_id : config.layerIdsForGroup(gid)) {
+            config.layer_to_block_stride_bytes[static_cast<size_t>(layer_id)] =
+                static_cast<int>(kv_stride + scale_stride);
+        }
+    }
+
+    config.group_layer_num         = static_cast<int>(std::max<uint32_t>(1, max_group_layers));
+    config.kv_block_stride_bytes   = max_kv_stride;
+    config.kv_scale_stride_bytes   = max_scale_stride;
+    config.kv_block_size_bytes     = total_kv_block_bytes;
+    config.kv_scale_size_bytes     = total_scale_block_bytes;
+    const size_t paged_block_bytes = config.kv_block_size_bytes + config.kv_scale_size_bytes;
+    if (paged_block_bytes == 0) {
+        RTP_LLM_CHECK_WITH_INFO(is_mtp && config.use_typed_cache_regions,
+                                "hybrid-pool paged groups produced zero block bytes");
+        config.kv_block_size_bytes = 1;
+        config.kv_scale_size_bytes = 0;
+        config.block_size_bytes    = 1;
+    } else {
+        config.block_size_bytes = paged_block_bytes;
+    }
+    config.explicitly_sized_pool_reserve_bytes = 0;
+    config.setGroupBlockLayout(group_block_nums, group_kv_block_stride_bytes, group_kv_scale_stride_bytes);
+}
+
+CacheConfig createHybridAttentionPoolConfig(const ModelConfig&       model_config,
+                                            const ParallelismConfig& parallelism_config,
+                                            const KVCacheConfig&     kv_cache_config,
+                                            bool                     is_mtp,
+                                            int                      gen_num_per_cycle) {
+    const auto dtype = MemoryEvaluationHelper::getDataTypeForCache(model_config);
+    const auto physical_tokens_per_block =
+        kv_cache_config.seq_size_per_block > 0 ? static_cast<uint32_t>(kv_cache_config.seq_size_per_block) :
+                                                 static_cast<uint32_t>(model_config.attn_config.tokens_per_block);
+    const auto kernel_tokens_per_block =
+        kv_cache_config.kernel_seq_size_per_block > 0 ? static_cast<uint32_t>(kv_cache_config.kernel_seq_size_per_block) :
+                                                        physical_tokens_per_block;
+    RTP_LLM_CHECK_WITH_INFO(physical_tokens_per_block > 0, "hybrid-pool seq_size_per_block must be > 0");
+    RTP_LLM_CHECK_WITH_INFO(kernel_tokens_per_block > 0, "hybrid-pool kernel_seq_size_per_block must be > 0");
+    RTP_LLM_CHECK_WITH_INFO(physical_tokens_per_block >= kernel_tokens_per_block
+                                && physical_tokens_per_block % kernel_tokens_per_block == 0,
+                            "hybrid-pool seq_size_per_block=%u must be >= kernel_seq_size_per_block=%u and divisible by it",
+                            physical_tokens_per_block,
+                            kernel_tokens_per_block);
+
+    CacheConfig config;
+    config.layer_num          = static_cast<uint32_t>(model_config.num_layers);
+    config.layer_all_num      = config.layer_num;
+    config.block_num          = 0;
+    config.seq_size_per_block = physical_tokens_per_block;
+    config.kernel_seq_size_per_block = kernel_tokens_per_block;
+    config.use_mla            = model_config.attn_config.use_mla;
+    config.dtype              = dtype;
+    config.linear_step        = 1;
+    config.is_sparse          = model_config.attn_config.is_sparse;
+
+    if (!model_config.kv_cache_spec_descs.empty()) {
+        validateHybridPoolDescs(model_config, kernel_tokens_per_block, gen_num_per_cycle);
+        SpecBuildContext ctx;
+        ctx.dtype                   = dtype;
+        ctx.seq_size_per_block      = physical_tokens_per_block;
+        ctx.attn_tp_size            = static_cast<uint32_t>(parallelism_config.get_attn_tp_size());
+        ctx.kernel_tokens_per_block = kernel_tokens_per_block;
+        ctx.gen_num_per_cycle       = static_cast<uint32_t>(gen_num_per_cycle);
+        ctx.cp_size                 = fixedRegionCpSize(parallelism_config);
+        ctx.cp_prefill_sliced       = isPrefillCpSliced(parallelism_config);
+        auto refreshed_specs = CacheConfigCreator::buildLayerSpecsFromDescs(
+            model_config.kv_cache_spec_descs, ctx, model_config.num_layers);
+        populateGroupsFromLayerSpecs(config, model_config.kv_cache_spec_descs, refreshed_specs);
+        config.group_seq_size_per_block.resize(static_cast<size_t>(config.groupNums()), config.seq_size_per_block);
+        for (size_t gid = 0; gid < static_cast<size_t>(config.groupNums()); ++gid) {
+            const auto& spec = config.specForGroup(gid);
+            RTP_LLM_CHECK_WITH_INFO(spec != nullptr, "hybrid-pool desc config produced null spec gid=%zu", gid);
+            config.group_seq_size_per_block[gid] = spec->seq_size_per_block;
+            config.use_typed_cache_regions =
+                config.use_typed_cache_regions || spec->type == KVCacheSpecType::OpaqueKV
+                || spec->type == KVCacheSpecType::OpaqueState;
+            config.use_opaque_kv_cache_store =
+                config.use_opaque_kv_cache_store || spec->type == KVCacheSpecType::OpaqueKV
+                || spec->type == KVCacheSpecType::OpaqueState;
+        }
+        for (const auto& layer_descs : model_config.kv_cache_spec_descs) {
+            for (const auto& desc : layer_descs) {
+                config.is_sparse = config.is_sparse || desc.cache_type == CacheType::COMPRESSED_KV;
+            }
+        }
+        config.disable_decode_first_malloc_device_reuse =
+            config.disable_decode_first_malloc_device_reuse || config.use_opaque_kv_cache_store;
+    } else {
+        RTP_LLM_CHECK_WITH_INFO(false, "HybridPoolConfigCreator requires kv_cache_spec_descs");
+    }
+
+    RTP_LLM_CHECK_WITH_INFO(config.groupNums() > 0, "hybrid-pool config produced no cache specs");
+    setupIndependentPoolSizes(config, is_mtp);
+    return config;
+}
+
+}  // namespace
+
+CacheConfig HybridPoolConfigCreator::createConfig(const ModelConfig&       model_config,
+                                                  const ParallelismConfig& parallelism_config,
+                                                  const KVCacheConfig&     kv_cache_config,
+                                                  bool                     is_mtp,
+                                                  int                      gen_num_per_cycle) {
+    return createHybridAttentionPoolConfig(model_config, parallelism_config, kv_cache_config, is_mtp, gen_num_per_cycle);
+}
+
+}  // namespace rtp_llm
diff --git a/rtp_llm/cpp/cache/config_creator/HybridPoolConfigCreator.h b/rtp_llm/cpp/cache/config_creator/HybridPoolConfigCreator.h
new file mode 100644
index 0000000000..dac00b5099
--- /dev/null
+++ b/rtp_llm/cpp/cache/config_creator/HybridPoolConfigCreator.h
@@ -0,0 +1,18 @@
+#pragma once
+
+#include "rtp_llm/cpp/cache/CacheConfig.h"
+#include "rtp_llm/cpp/config/ConfigModules.h"
+#include "rtp_llm/cpp/config/ModelConfig.h"
+
+namespace rtp_llm {
+
+class HybridPoolConfigCreator {
+public:
+    static CacheConfig createConfig(const ModelConfig&       model_config,
+                                    const ParallelismConfig& parallelism_config,
+                                    const KVCacheConfig&     kv_cache_config,
+                                    bool                     is_mtp,
+                                    int                      gen_num_per_cycle);
+};
+
+}  // namespace rtp_llm
diff --git a/rtp_llm/cpp/cache/MemoryEvaluationHelper.cc b/rtp_llm/cpp/cache/config_creator/MemoryEvaluationHelper.cc
similarity index 99%
rename from rtp_llm/cpp/cache/MemoryEvaluationHelper.cc
rename to rtp_llm/cpp/cache/config_creator/MemoryEvaluationHelper.cc
index f2813ffeda..71838bef0d 100644
--- a/rtp_llm/cpp/cache/MemoryEvaluationHelper.cc
+++ b/rtp_llm/cpp/cache/config_creator/MemoryEvaluationHelper.cc
@@ -1,4 +1,4 @@
-#include "rtp_llm/cpp/cache/MemoryEvaluationHelper.h"
+#include "rtp_llm/cpp/cache/config_creator/MemoryEvaluationHelper.h"
 
 #include <numeric>
 
@@ -20,12 +20,12 @@ namespace rtp_llm {
 // Helper function to update memory size if below minimum requirement
 void MemoryEvaluationHelper::updateMemoryIfNeeded(size_t& current_size, size_t min_required, const char* scenario) {
     if (current_size < min_required) {
-        current_size = min_required;
         RTP_LLM_LOG_INFO("%s needs at least %ld MiB memory for runtime by default, "
                          "but only %ld MiB memory reserved. adjust to minimal value.",
                          scenario,
                          min_required / 1024 / 1024,
                          current_size / 1024 / 1024);
+        current_size = min_required;
     }
 }
 
diff --git a/rtp_llm/cpp/cache/MemoryEvaluationHelper.h b/rtp_llm/cpp/cache/config_creator/MemoryEvaluationHelper.h
similarity index 100%
rename from rtp_llm/cpp/cache/MemoryEvaluationHelper.h
rename to rtp_llm/cpp/cache/config_creator/MemoryEvaluationHelper.h
diff --git a/rtp_llm/cpp/cache/config_creator/SingleConfigCreator.cc b/rtp_llm/cpp/cache/config_creator/SingleConfigCreator.cc
new file mode 100644
index 0000000000..855c688202
--- /dev/null
+++ b/rtp_llm/cpp/cache/config_creator/SingleConfigCreator.cc
@@ -0,0 +1,131 @@
+#include "rtp_llm/cpp/cache/config_creator/SingleConfigCreator.h"
+
+#include "rtp_llm/cpp/cache/spec/KVCacheSpec.h"
+#include "rtp_llm/cpp/cache/config_creator/CacheConfigCreator.h"
+#include "rtp_llm/cpp/cache/config_creator/MemoryEvaluationHelper.h"
+#include "rtp_llm/cpp/utils/Logger.h"
+
+#include <numeric>
+
+namespace rtp_llm {
+
+namespace {
+
+KVCacheSpecPtr getDefaultSpecFromRuntimeSpecs(const ModelConfig&        model_config,
+                                              const LayerKVCacheSpecs& runtime_specs) {
+    RTP_LLM_CHECK_WITH_INFO(runtime_specs.size() == static_cast<size_t>(model_config.num_layers),
+                            "single cache config requires layer-wise runtime specs for every layer, got %zu/%ld",
+                            runtime_specs.size(),
+                            model_config.num_layers);
+    RTP_LLM_CHECK_WITH_INFO(!runtime_specs.empty(), "single cache config requires at least one runtime spec");
+    RTP_LLM_CHECK_WITH_INFO(runtime_specs[0].size() == 1,
+                            "single cache config requires exactly one spec for layer 0, got %zu",
+                            runtime_specs[0].size());
+    auto spec = runtime_specs[0][0];
+    RTP_LLM_CHECK_WITH_INFO(spec != nullptr, "single cache config got null runtime spec for layer 0");
+    RTP_LLM_CHECK_WITH_INFO(spec->tag == "default",
+                            "single cache config requires tag=default for layer 0, got=%s",
+                            spec->tag.c_str());
+    const auto fingerprint = spec->fingerprint();
+    for (int64_t layer_id = 1; layer_id < model_config.num_layers; ++layer_id) {
+        const auto layer = static_cast<size_t>(layer_id);
+        RTP_LLM_CHECK_WITH_INFO(runtime_specs[layer].size() == 1,
+                                "single cache config requires exactly one spec for layer %ld, got %zu",
+                                layer_id,
+                                runtime_specs[layer].size());
+        const auto& layer_spec = runtime_specs[layer][0];
+        RTP_LLM_CHECK_WITH_INFO(layer_spec != nullptr, "single cache config got null runtime spec for layer %ld", layer_id);
+        RTP_LLM_CHECK_WITH_INFO(layer_spec->tag == "default",
+                                "single cache config requires tag=default for layer %ld, got=%s",
+                                layer_id,
+                                layer_spec->tag.c_str());
+        RTP_LLM_CHECK_WITH_INFO(layer_spec->fingerprint() == fingerprint,
+                                "single cache config default spec differs at layer %ld",
+                                layer_id);
+    }
+    return spec->clone();
+}
+
+}  // namespace
+
+CacheConfig SingleConfigCreator::createSingleConfig(const ModelConfig&       model_config,
+                                                    const ParallelismConfig& parallelism_config,
+                                                    const KVCacheConfig&     kv_cache_config,
+                                                    bool                     is_mtp,
+                                                    int                      gen_num_per_cycle) {
+    (void)is_mtp;
+    auto dtype = MemoryEvaluationHelper::getDataTypeForCache(model_config);
+    const auto physical_tokens_per_block =
+        kv_cache_config.seq_size_per_block > 0 ? static_cast<uint32_t>(kv_cache_config.seq_size_per_block) :
+                                                 static_cast<uint32_t>(model_config.attn_config.tokens_per_block);
+    const auto kernel_tokens_per_block =
+        kv_cache_config.kernel_seq_size_per_block > 0 ? static_cast<uint32_t>(kv_cache_config.kernel_seq_size_per_block) :
+                                                        physical_tokens_per_block;
+    RTP_LLM_CHECK_WITH_INFO(physical_tokens_per_block > 0, "single seq_size_per_block must be > 0");
+    RTP_LLM_CHECK_WITH_INFO(kernel_tokens_per_block > 0, "single kernel_seq_size_per_block must be > 0");
+    SpecBuildContext ctx;
+    ctx.dtype                   = dtype;
+    ctx.seq_size_per_block      = physical_tokens_per_block;
+    ctx.attn_tp_size            = static_cast<uint32_t>(parallelism_config.get_attn_tp_size());
+    ctx.kernel_tokens_per_block = kernel_tokens_per_block;
+    ctx.gen_num_per_cycle       = static_cast<uint32_t>(gen_num_per_cycle);
+    const auto runtime_specs =
+        CacheConfigCreator::buildLayerSpecsFromDescs(model_config.kv_cache_spec_descs, ctx, model_config.num_layers);
+
+    auto layer_num = model_config.num_layers;
+
+    CacheConfig config;
+    config.layer_num          = static_cast<uint32_t>(layer_num);
+    config.layer_all_num      = static_cast<uint32_t>(layer_num);
+    config.block_num          = 0;
+    config.seq_size_per_block        = physical_tokens_per_block;
+    config.kernel_seq_size_per_block = kernel_tokens_per_block;
+
+    config.use_mla   = model_config.attn_config.use_mla;
+    config.dtype     = dtype;
+    config.is_sparse = model_config.attn_config.is_sparse;
+
+    auto spec = getDefaultSpecFromRuntimeSpecs(model_config, runtime_specs);
+
+    std::vector<int> layer_ids(static_cast<size_t>(layer_num));
+    std::iota(layer_ids.begin(), layer_ids.end(), 0);
+    GroupBase group;
+    group.spec      = spec;
+    group.policy    = defaultCacheGroupPolicy(CacheGroupType::FULL);
+    group.layer_ids = layer_ids;
+
+    std::vector<LayerBase> layers(static_cast<size_t>(layer_num));
+    for (int64_t layer_id = 0; layer_id < layer_num; ++layer_id) {
+        auto& layer                  = layers[static_cast<size_t>(layer_id)];
+        layer.group_ids              = {0};
+        layer.tag_to_gid[spec->tag]  = 0;
+    }
+    config.setTopology({group}, std::move(layers));
+    RTP_LLM_CHECK_WITH_INFO(config.groupNums() == 1, "single config expected one cache group");
+
+    // Using spec interface for block size and scale
+    config.kv_block_stride_bytes = spec->block_size_bytes();
+    config.kv_block_size_bytes   = static_cast<size_t>(config.layer_num) * config.kv_block_stride_bytes;
+
+    // Scale handling - no need to check dtype as scale_block_size_bytes() returns 0 if no scale support
+    config.kv_scale_stride_bytes = spec->scale_block_size_bytes();
+    config.kv_scale_size_bytes   = static_cast<size_t>(config.layer_num) * config.kv_scale_stride_bytes;
+
+    if (config.is_sparse) {
+        auto indexer_dim             = model_config.attn_config.indexer_head_dim;
+        config.kv_scale_stride_bytes = (indexer_dim + indexer_dim / 128 * 4) * spec->seq_size_per_block;
+        config.kv_scale_size_bytes   = static_cast<size_t>(config.layer_num) * config.kv_scale_stride_bytes;
+    }
+
+    config.block_size_bytes = config.kv_block_size_bytes + config.kv_scale_size_bytes;
+    config.group_layer_num  = layer_num;  // only 1 group for SingleConfig
+
+    // Per-layer block stride (kv + scale).
+    const size_t per_layer_stride_bytes = config.kv_block_stride_bytes + config.kv_scale_stride_bytes;
+    config.layer_to_block_stride_bytes.assign(static_cast<size_t>(config.layer_all_num),
+                                              static_cast<int>(per_layer_stride_bytes));
+
+    return config;
+}
+
+}  // namespace rtp_llm
diff --git a/rtp_llm/cpp/cache/SingleConfigCreator.h b/rtp_llm/cpp/cache/config_creator/SingleConfigCreator.h
similarity index 72%
rename from rtp_llm/cpp/cache/SingleConfigCreator.h
rename to rtp_llm/cpp/cache/config_creator/SingleConfigCreator.h
index 032a254636..0a64fd75f1 100644
--- a/rtp_llm/cpp/cache/SingleConfigCreator.h
+++ b/rtp_llm/cpp/cache/config_creator/SingleConfigCreator.h
@@ -12,7 +12,9 @@ class SingleConfigCreator {
 public:
     static CacheConfig createSingleConfig(const ModelConfig&       model_config,
                                           const ParallelismConfig& parallelism_config,
-                                          bool                     is_mtp = false);
+                                          const KVCacheConfig&     kv_cache_config,
+                                          bool                     is_mtp = false,
+                                          int                      gen_num_per_cycle = 0);
 };
 
 }  // namespace rtp_llm
\ No newline at end of file
diff --git a/rtp_llm/cpp/cache/connector/KVCacheConnectorCoordinator.cc b/rtp_llm/cpp/cache/connector/KVCacheConnectorCoordinator.cc
index 16a748eaf3..5091874bdb 100644
--- a/rtp_llm/cpp/cache/connector/KVCacheConnectorCoordinator.cc
+++ b/rtp_llm/cpp/cache/connector/KVCacheConnectorCoordinator.cc
@@ -1,8 +1,9 @@
 #include "rtp_llm/cpp/cache/connector/KVCacheConnectorCoordinator.h"
 
 #include <utility>
+#include <vector>
 
-#include "rtp_llm/cpp/cache/KVCacheAllocator.h"
+#include "rtp_llm/cpp/cache/allocator/KVCacheAllocator.h"
 #include "rtp_llm/cpp/utils/Logger.h"
 #include "rtp_llm/cpp/utils/ProfilingScope.h"
 #include "rtp_llm/cpp/cache/connector/KVCacheConnectorReadWriteContext.h"
@@ -14,6 +15,121 @@
 #endif
 
 namespace rtp_llm {
+namespace {
+
+CacheGroupType groupTypeForConnector(const CacheConfig& cache_config, int group_id) {
+    if (group_id >= 0 && group_id < cache_config.groupNums()) {
+        return cache_config.typeForGroup(static_cast<size_t>(group_id));
+    }
+    return CacheGroupType::FULL;
+}
+
+bool isCpCompactSliceGroup(const CacheConfig& cache_config, int group_id, int cp_size) {
+    if (cp_size <= 1 || group_id < 0 || group_id >= cache_config.groupNums()
+        || static_cast<size_t>(group_id) >= cache_config.group_seq_size_per_block.size()) {
+        return false;
+    }
+    const auto& spec = cache_config.specForGroup(static_cast<size_t>(group_id));
+    if (!spec || !spec->supportsCpSlice()) {
+        return false;
+    }
+    const auto row_tokens = cache_config.group_seq_size_per_block[static_cast<size_t>(group_id)];
+    return row_tokens > 0 && row_tokens == cache_config.seq_size_per_block * static_cast<size_t>(cp_size);
+}
+
+bool isCompactFullBlockList(const KVCacheResource& source,
+                            const BlockIndicesType& src_blocks,
+                            const CacheKeysType& selected_keys) {
+    return src_blocks.size() <= selected_keys.size() || src_blocks.size() < source.cacheKeys().size();
+}
+
+bool selectedLastRankKeysAreAligned(const KVCacheResource& source, int cp_size) {
+    if (source.lastBlockAligned()) {
+        return true;
+    }
+    const auto& keys = source.cacheKeys();
+    if (keys.empty() || cp_size <= 1) {
+        return source.lastBlockAligned();
+    }
+    const int partial_key_pos = static_cast<int>(keys.size() - 1);
+    const int last_rank       = cp_size - 1;
+    return partial_key_pos % cp_size != last_rank;
+}
+
+KVCacheResource makeCpShardedConnectorResource(const KVCacheResource& source,
+                                               const CacheConfig&     cache_config,
+                                               const CacheKeysType&   selected_keys,
+                                               int                    cp_size) {
+    std::vector<CacheGroupType> group_types;
+    group_types.reserve(static_cast<size_t>(source.groupNums()));
+    for (int gid = 0; gid < source.groupNums(); ++gid) {
+        group_types.push_back(groupTypeForConnector(cache_config, gid));
+    }
+
+    KVCacheResource selected = source;
+    selected.initGroups(source.groupNums(),
+                        static_cast<int>(cache_config.layer_all_num),
+                        cache_config.layerGroupIdsSnapshot(),
+                        cache_config.kernelBlocksPerKvBlock(),
+                        group_types);
+    selected.setCacheKeys(selected_keys);
+    const bool selected_aligned = selectedLastRankKeysAreAligned(source, cp_size);
+    selected.setLastBlockAligned(selected_aligned);
+
+    // Memory connector intentionally drops the last key to avoid matching a
+    // partial tail.  After CP Page-RR remap, a source partial can belong to a
+    // non-last rank, making the selected last-rank key complete.  Append the
+    // original partial key as a connector-only dummy tail so the drop-last
+    // contract discards the dummy, not the usable selected key.
+    if (!source.lastBlockAligned() && selected_aligned && !source.cacheKeys().empty()) {
+        selected.cacheKeys().push_back(source.cacheKeys().back());
+        selected.rebuildLinearBlockDependencies();
+        selected.setLastBlockAligned(false);
+    }
+
+    for (int gid = 0; gid < source.groupNums(); ++gid) {
+        const auto&      src_blocks = source.blocks(gid);
+        BlockIndicesType dst_blocks;
+        dst_blocks.reserve(selected_keys.size());
+
+        if (isCpCompactSliceGroup(cache_config, gid, cp_size)) {
+            // Intra-block CP-sliced groups can be compact by using a row size
+            // of seq_size_per_block * cp_size, so their block list is already
+            // in the canonical last-rank key namespace.
+            for (size_t i = 0; i < selected_keys.size(); ++i) {
+                dst_blocks.push_back(i < src_blocks.size() ? src_blocks[i] : NULL_BLOCK_IDX);
+            }
+        } else if (group_types[static_cast<size_t>(gid)] == CacheGroupType::FULL) {
+            // Prefill rank-local FULL blocks are compact already. Decode-side
+            // FULL blocks are full-logical, so select the canonical last-rank
+            // logical positions.
+            if (isCompactFullBlockList(source, src_blocks, selected_keys)) {
+                for (size_t i = 0; i < selected_keys.size(); ++i) {
+                    dst_blocks.push_back(i < src_blocks.size() ? src_blocks[i] : NULL_BLOCK_IDX);
+                }
+            } else {
+                for (size_t logical_pos = static_cast<size_t>(cp_size - 1); dst_blocks.size() < selected_keys.size();
+                     logical_pos += static_cast<size_t>(cp_size)) {
+                    dst_blocks.push_back(logical_pos < src_blocks.size() ? src_blocks[logical_pos] : NULL_BLOCK_IDX);
+                }
+            }
+        } else {
+            // SWA/state groups keep the non-sharded logical coordinate system.
+            // Select the block at the original logical key position instead of
+            // reinterpreting the group as rank-local compact storage.
+            for (size_t logical_pos = static_cast<size_t>(cp_size - 1); dst_blocks.size() < selected_keys.size();
+                 logical_pos += static_cast<size_t>(cp_size)) {
+                dst_blocks.push_back(logical_pos < src_blocks.size() ? src_blocks[logical_pos] : NULL_BLOCK_IDX);
+            }
+        }
+
+        selected.mutableBlockIds(gid).assign(std::move(dst_blocks));
+    }
+
+    return selected;
+}
+
+}  // namespace
 
 KVCacheConnectorCoordinator::KVCacheConnectorCoordinator(const CacheConfig&                       cache_config,
                                                          const KVCacheConfig&                     kv_cache_config,
@@ -116,7 +232,23 @@ KVCacheConnectorCoordinator::asyncRead(const std::shared_ptr<KVCacheConnectorRea
         return nullptr;
     }
 
-    auto resource = allocator_->incrKVCacheRef(kvcache_resource, kvcache_resource.cacheKeys(), true);
+    const int       cp_size      = cpSize();
+    CacheKeysType   ref_keys     = kvcache_resource.cacheKeys();
+    KVCacheResource ref_resource = kvcache_resource;
+    if (cp_size > 1) {
+        if (!kvcache_resource.cacheKeysAreCpCanonical()) {
+            ref_keys = kvcache_resource.localCacheKeys(cp_size - 1, cp_size);
+            // Short requests (< cp_size logical blocks) have no complete virtual
+            // block, so the canonical last-rank-key namespace is empty by design.
+            // Skip silently — connector activity for these is a no-op anyway.
+            if (ref_keys.empty()) {
+                return nullptr;
+            }
+            ref_resource = makeCpShardedConnectorResource(kvcache_resource, cache_config_, ref_keys, cp_size);
+            ref_keys     = ref_resource.cacheKeys();
+        }
+    }
+    auto resource = allocator_->incrKVCacheRef(ref_resource, ref_keys, true);
     if (!resource) {
         RTP_LLM_LOG_WARNING("async read failed, incr kvcache ref failed, resource: [%s]",
                             kvcache_resource.debugString().c_str());
@@ -154,7 +286,20 @@ KVCacheConnectorCoordinator::asyncWrite(const std::shared_ptr<KVCacheConnectorRe
         return nullptr;
     }
 
-    auto resource = allocator_->incrKVCacheRef(kvcache_resource, kvcache_resource.cacheKeys(), true);
+    const int       cp_size      = cpSize();
+    CacheKeysType   ref_keys     = kvcache_resource.cacheKeys();
+    KVCacheResource ref_resource = kvcache_resource;
+    if (cp_size > 1) {
+        if (!kvcache_resource.cacheKeysAreCpCanonical()) {
+            ref_keys = kvcache_resource.localCacheKeys(cp_size - 1, cp_size);
+            if (ref_keys.empty()) {
+                return nullptr;  // request shorter than one virtual block — nothing to write
+            }
+            ref_resource = makeCpShardedConnectorResource(kvcache_resource, cache_config_, ref_keys, cp_size);
+            ref_keys     = ref_resource.cacheKeys();
+        }
+    }
+    auto resource = allocator_->incrKVCacheRef(ref_resource, ref_keys, true);
     if (!resource) {
         RTP_LLM_LOG_WARNING("async write failed, incr kvcache ref failed, resource: [%s]",
                             kvcache_resource.debugString().c_str());
@@ -193,8 +338,12 @@ KVCacheConnectorCoordinator::asyncWriteByLayer(int
 }
 
 std::shared_ptr<KVCacheMemoryConnector> KVCacheConnectorCoordinator::initMemoryConnector() {
-    auto memory_connector = std::make_shared<KVCacheMemoryConnector>(
-        cache_config_, kv_cache_config_, allocator_, runtime_config_.worker_grpc_addrs, metrics_reporter_);
+    auto memory_connector = std::make_shared<KVCacheMemoryConnector>(cache_config_,
+                                                                     kv_cache_config_,
+                                                                     parallelism_config_,
+                                                                     allocator_,
+                                                                     runtime_config_.worker_grpc_addrs,
+                                                                     metrics_reporter_);
     RTP_LLM_CHECK_WITH_INFO(memory_connector->init(), "memory connector init failed");
     return memory_connector;
 }
@@ -220,6 +369,21 @@ std::shared_ptr<RemoteConnector> KVCacheConnectorCoordinator::initRemoteConnecto
 #endif
 }
 
+int KVCacheConnectorCoordinator::cpSize() const {
+    const auto& cp_cfg = parallelism_config_.prefill_cp_config;
+    if (!cp_cfg.kv_cache_sharded) {
+        return 1;
+    }
+    if (parallelism_config_.tp_size > 1) {
+        return static_cast<int>(parallelism_config_.tp_size);
+    }
+    if (parallelism_config_.role_type == RoleType::DECODE && cp_cfg.is_prefill_enabled()
+        && cp_cfg.prefill_cp_size > 1) {
+        return static_cast<int>(cp_cfg.prefill_cp_size);
+    }
+    return 1;
+}
+
 void KVCacheConnectorCoordinator::updateOnce() {
     RTP_LLM_PROFILE_FUNCTION();
     processReadContexts();
diff --git a/rtp_llm/cpp/cache/connector/KVCacheConnectorCoordinator.h b/rtp_llm/cpp/cache/connector/KVCacheConnectorCoordinator.h
index ba8bfaa2a8..253bf86d10 100644
--- a/rtp_llm/cpp/cache/connector/KVCacheConnectorCoordinator.h
+++ b/rtp_llm/cpp/cache/connector/KVCacheConnectorCoordinator.h
@@ -11,7 +11,7 @@
 #include "rtp_llm/cpp/cache/connector/AsyncContext.h"
 #include "rtp_llm/cpp/cache/connector/IKVCacheConnectorCoordinator.h"
 #include "rtp_llm/cpp/cache/connector/KVCacheConnector.h"
-#include "rtp_llm/cpp/cache/KVCacheAllocator.h"
+#include "rtp_llm/cpp/cache/allocator/KVCacheAllocator.h"
 #include "rtp_llm/cpp/config/ConfigModules.h"
 #include "rtp_llm/cpp/model_rpc/proto/model_rpc_service.grpc.pb.h"
 #include "rtp_llm/cpp/model_rpc/proto/model_rpc_service.pb.h"
@@ -67,10 +67,12 @@ class KVCacheConnectorCoordinator: public IKVCacheConnectorCoordinator {
     std::shared_ptr<KVCacheMemoryConnector> initMemoryConnector();
     std::shared_ptr<RemoteConnector>        initRemoteConnector();
     bool                                    initP2PConnectorInternal();
-    void                                    initUpdateThread();
-    void                                    updateOnce();
-    void                                    processReadContexts();
-    void                                    processWriteContexts();
+    // Returns CP size when page-level RR sharding is active; 1 otherwise.
+    int  cpSize() const;
+    void initUpdateThread();
+    void updateOnce();
+    void processReadContexts();
+    void processWriteContexts();
     void asyncReadAfterMatch(std::shared_ptr<FusedAsyncReadContext> fused_read_context);
 
     bool isPdInvertMode() const;
diff --git a/rtp_llm/cpp/cache/connector/memory/BUILD b/rtp_llm/cpp/cache/connector/memory/BUILD
index e2a7955837..7d6ec080a3 100644
--- a/rtp_llm/cpp/cache/connector/memory/BUILD
+++ b/rtp_llm/cpp/cache/connector/memory/BUILD
@@ -12,10 +12,9 @@ cc_library(
     deps = [
         "//rtp_llm/cpp/cache/connector:connector_base",
         "//rtp_llm/cpp/config:config_modules",
-        "//rtp_llm/models_py/bindings/core:exec_ops_hdr",
         "//rtp_llm/cpp/metrics:metrics",
         "//rtp_llm/cpp/model_rpc:broadcast_manager",
-        "//rtp_llm/cpp/cache:kv_cache_allocator",
+        "//rtp_llm/cpp/cache:kv_cache_allocator_hdr",
         "//rtp_llm/cpp/utils:lru_cache",
         "//rtp_llm/cpp/utils:profiling_scope",
         "@havenask//aios/autil:json",
diff --git a/rtp_llm/cpp/cache/connector/memory/CacheBlockKind.h b/rtp_llm/cpp/cache/connector/memory/CacheBlockKind.h
new file mode 100644
index 0000000000..1ccd56dccd
--- /dev/null
+++ b/rtp_llm/cpp/cache/connector/memory/CacheBlockKind.h
@@ -0,0 +1,30 @@
+#pragma once
+
+namespace rtp_llm {
+
+enum class CacheBlockKind {
+    COMPLETE   = 0,
+    INCOMPLETE = 1,
+    COMPRESSED_KV = 2,
+    STATE_SWA_KV  = 3,
+};
+
+inline CacheBlockKind blockKindFromComplete(bool is_complete) {
+    return is_complete ? CacheBlockKind::COMPLETE : CacheBlockKind::INCOMPLETE;
+}
+
+inline const char* cacheBlockKindName(CacheBlockKind kind) {
+    switch (kind) {
+        case CacheBlockKind::COMPLETE:
+            return "complete";
+        case CacheBlockKind::INCOMPLETE:
+            return "incomplete";
+        case CacheBlockKind::COMPRESSED_KV:
+            return "compressed_kv";
+        case CacheBlockKind::STATE_SWA_KV:
+            return "state_swa_kv";
+    }
+    return "unknown";
+}
+
+}  // namespace rtp_llm
diff --git a/rtp_llm/cpp/cache/connector/memory/DiskBlockIO.cc b/rtp_llm/cpp/cache/connector/memory/DiskBlockIO.cc
new file mode 100644
index 0000000000..fe9a480c19
--- /dev/null
+++ b/rtp_llm/cpp/cache/connector/memory/DiskBlockIO.cc
@@ -0,0 +1,151 @@
+#include "rtp_llm/cpp/cache/connector/memory/DiskBlockIO.h"
+
+#include <cerrno>
+#include <cstring>
+#include <fcntl.h>
+#include <sstream>
+#include <unistd.h>
+
+#include "rtp_llm/cpp/utils/Logger.h"
+
+namespace rtp_llm {
+namespace {
+
+constexpr size_t kDirectIOAlignment = 4096;
+
+}  // namespace
+
+PosixDiskBlockIO::~PosixDiskBlockIO() {
+    close();
+}
+
+bool PosixDiskBlockIO::openAndPreallocate(const std::string& file_path, size_t bytes, bool buffered_io) {
+    close();
+    file_path_   = file_path;
+    bytes_       = bytes;
+    buffered_io_ = buffered_io;
+
+    int flags = O_CREAT | O_EXCL | O_RDWR | O_CLOEXEC;
+    if (!buffered_io_) {
+#ifdef O_DIRECT
+        flags |= O_DIRECT;
+#else
+        RTP_LLM_LOG_ERROR("O_DIRECT is not supported on this platform");
+        return false;
+#endif
+    }
+
+    fd_ = ::open(file_path.c_str(), flags, 0600);
+    if (fd_ < 0) {
+        RTP_LLM_LOG_ERROR("open disk kv file failed, file=%s, error=%s", file_path.c_str(), std::strerror(errno));
+        return false;
+    }
+
+    const int rc = ::posix_fallocate(fd_, 0, static_cast<off_t>(bytes));
+    if (rc != 0) {
+        RTP_LLM_LOG_ERROR("posix_fallocate disk kv file failed, file=%s, bytes=%zu, error=%s",
+                          file_path.c_str(),
+                          bytes,
+                          std::strerror(rc));
+        close();
+        return false;
+    }
+    return true;
+}
+
+bool PosixDiskBlockIO::checkDirectIOAlignment(uint64_t offset, const void* buffer, size_t bytes) const {
+    if (buffered_io_) {
+        return true;
+    }
+    const auto addr = reinterpret_cast<uintptr_t>(buffer);
+    if (offset % kDirectIOAlignment != 0 || addr % kDirectIOAlignment != 0 || bytes % kDirectIOAlignment != 0) {
+        RTP_LLM_LOG_ERROR("direct disk io alignment failed, file=%s, offset=%lu, addr=%p, bytes=%zu",
+                          file_path_.c_str(),
+                          offset,
+                          buffer,
+                          bytes);
+        return false;
+    }
+    return true;
+}
+
+bool PosixDiskBlockIO::read(uint64_t offset, void* dst, size_t bytes) {
+    if (fd_ < 0 || dst == nullptr || offset + bytes > bytes_ || !checkDirectIOAlignment(offset, dst, bytes)) {
+        return false;
+    }
+    size_t done = 0;
+    while (done < bytes) {
+        const auto rc = ::pread(fd_, static_cast<char*>(dst) + done, bytes - done, static_cast<off_t>(offset + done));
+        if (rc < 0) {
+            if (errno == EINTR) {
+                continue;
+            }
+            RTP_LLM_LOG_ERROR("pread disk kv file failed, file=%s, offset=%lu, bytes=%zu, done=%zu, error=%s",
+                              file_path_.c_str(),
+                              offset,
+                              bytes,
+                              done,
+                              std::strerror(errno));
+            return false;
+        }
+        if (rc == 0) {
+            RTP_LLM_LOG_ERROR("pread disk kv file got EOF, file=%s, offset=%lu, bytes=%zu, done=%zu",
+                              file_path_.c_str(),
+                              offset,
+                              bytes,
+                              done);
+            return false;
+        }
+        done += static_cast<size_t>(rc);
+    }
+    return true;
+}
+
+bool PosixDiskBlockIO::write(uint64_t offset, const void* src, size_t bytes) {
+    if (fd_ < 0 || src == nullptr || offset + bytes > bytes_ || !checkDirectIOAlignment(offset, src, bytes)) {
+        return false;
+    }
+    size_t done = 0;
+    while (done < bytes) {
+        const auto rc =
+            ::pwrite(fd_, static_cast<const char*>(src) + done, bytes - done, static_cast<off_t>(offset + done));
+        if (rc < 0) {
+            if (errno == EINTR) {
+                continue;
+            }
+            RTP_LLM_LOG_ERROR("pwrite disk kv file failed, file=%s, offset=%lu, bytes=%zu, done=%zu, error=%s",
+                              file_path_.c_str(),
+                              offset,
+                              bytes,
+                              done,
+                              std::strerror(errno));
+            return false;
+        }
+        if (rc == 0) {
+            RTP_LLM_LOG_ERROR("pwrite disk kv file made no progress, file=%s, offset=%lu, bytes=%zu, done=%zu",
+                              file_path_.c_str(),
+                              offset,
+                              bytes,
+                              done);
+            return false;
+        }
+        done += static_cast<size_t>(rc);
+    }
+    return true;
+}
+
+void PosixDiskBlockIO::close() {
+    if (fd_ >= 0) {
+        ::close(fd_);
+        fd_ = -1;
+    }
+}
+
+std::string PosixDiskBlockIO::debugString() const {
+    std::ostringstream oss;
+    oss << "PosixDiskBlockIO{file=" << file_path_ << ", bytes=" << bytes_
+        << ", io=" << (buffered_io_ ? "buffered" : "direct") << "}";
+    return oss.str();
+}
+
+}  // namespace rtp_llm
diff --git a/rtp_llm/cpp/cache/connector/memory/DiskBlockIO.h b/rtp_llm/cpp/cache/connector/memory/DiskBlockIO.h
new file mode 100644
index 0000000000..6c818eab6b
--- /dev/null
+++ b/rtp_llm/cpp/cache/connector/memory/DiskBlockIO.h
@@ -0,0 +1,41 @@
+#pragma once
+
+#include <cstddef>
+#include <cstdint>
+#include <string>
+
+namespace rtp_llm {
+
+class IDiskBlockIO {
+public:
+    virtual ~IDiskBlockIO() = default;
+
+    virtual bool        openAndPreallocate(const std::string& file_path, size_t bytes, bool buffered_io) = 0;
+    virtual bool        read(uint64_t offset, void* dst, size_t bytes)                                   = 0;
+    virtual bool        write(uint64_t offset, const void* src, size_t bytes)                            = 0;
+    virtual void        close()                                                                          = 0;
+    virtual std::string debugString() const                                                              = 0;
+};
+
+class PosixDiskBlockIO: public IDiskBlockIO {
+public:
+    PosixDiskBlockIO() = default;
+    ~PosixDiskBlockIO() override;
+
+    bool        openAndPreallocate(const std::string& file_path, size_t bytes, bool buffered_io) override;
+    bool        read(uint64_t offset, void* dst, size_t bytes) override;
+    bool        write(uint64_t offset, const void* src, size_t bytes) override;
+    void        close() override;
+    std::string debugString() const override;
+
+private:
+    bool checkDirectIOAlignment(uint64_t offset, const void* buffer, size_t bytes) const;
+
+private:
+    int         fd_{-1};
+    std::string file_path_;
+    size_t      bytes_{0};
+    bool        buffered_io_{true};
+};
+
+}  // namespace rtp_llm
diff --git a/rtp_llm/cpp/cache/connector/memory/DiskBlockPool.cc b/rtp_llm/cpp/cache/connector/memory/DiskBlockPool.cc
new file mode 100644
index 0000000000..917fac1193
--- /dev/null
+++ b/rtp_llm/cpp/cache/connector/memory/DiskBlockPool.cc
@@ -0,0 +1,344 @@
+#include "rtp_llm/cpp/cache/connector/memory/DiskBlockPool.h"
+
+#include <cerrno>
+#include <cstring>
+#include <dirent.h>
+#include <fcntl.h>
+#include <sstream>
+#include <sys/file.h>
+#include <sys/stat.h>
+#include <utility>
+#include <unistd.h>
+
+#include "rtp_llm/cpp/utils/Logger.h"
+#include "rtp_llm/cpp/utils/StringUtil.h"
+
+namespace rtp_llm {
+namespace {
+
+constexpr size_t kDiskIOAlignment = 4096;
+
+std::string joinPath(const std::string& parent, const std::string& child) {
+    if (parent.empty() || parent.back() == '/') {
+        return parent + child;
+    }
+    return parent + "/" + child;
+}
+
+bool mkdirIfMissing(const std::string& path) {
+    if (::mkdir(path.c_str(), 0755) == 0) {
+        return true;
+    }
+    if (errno == EEXIST) {
+        struct stat st;
+        return ::stat(path.c_str(), &st) == 0 && S_ISDIR(st.st_mode);
+    }
+    return false;
+}
+
+bool directoryExists(const std::string& path) {
+    struct stat st;
+    return ::stat(path.c_str(), &st) == 0 && S_ISDIR(st.st_mode);
+}
+
+}  // namespace
+
+DiskMountGuard::~DiskMountGuard() {
+    unlockAndClose();
+}
+
+bool DiskMountGuard::init(const std::string& mount_path) {
+    mount_path_ = mount_path;
+    work_dir_   = joinPath(mount_path_, "rtp_llm_disk_kv");
+    lock_path_  = joinPath(work_dir_, ".lock");
+    if (!initDirectoryAndLock() || !cleanupStaleFiles()) {
+        unlockAndClose();
+        return false;
+    }
+    RTP_LLM_LOG_INFO("disk kv mount guard init success: %s", debugString().c_str());
+    return true;
+}
+
+const std::string& DiskMountGuard::workDir() const {
+    return work_dir_;
+}
+
+const std::string& DiskMountGuard::mountPath() const {
+    return mount_path_;
+}
+
+std::string DiskMountGuard::debugString() const {
+    std::ostringstream oss;
+    oss << "DiskMountGuard{mount=" << mount_path_ << ", work_dir=" << work_dir_ << ", lock=" << lock_path_ << "}";
+    return oss.str();
+}
+
+bool DiskMountGuard::initDirectoryAndLock() {
+    if (!directoryExists(mount_path_)) {
+        RTP_LLM_LOG_ERROR("disk kv mount path does not exist or is not a directory, mount=%s, error=%s",
+                          mount_path_.c_str(),
+                          std::strerror(errno));
+        return false;
+    }
+
+    if (!mkdirIfMissing(work_dir_)) {
+        RTP_LLM_LOG_ERROR("create disk kv directory failed, mount=%s, work_dir=%s, error=%s",
+                          mount_path_.c_str(),
+                          work_dir_.c_str(),
+                          std::strerror(errno));
+        return false;
+    }
+
+    lock_fd_ = ::open(lock_path_.c_str(), O_CREAT | O_RDWR | O_CLOEXEC, 0600);
+    if (lock_fd_ < 0) {
+        RTP_LLM_LOG_ERROR("open disk kv lock failed, lock=%s, error=%s", lock_path_.c_str(), std::strerror(errno));
+        return false;
+    }
+    if (::flock(lock_fd_, LOCK_EX | LOCK_NB) != 0) {
+        RTP_LLM_LOG_ERROR("lock disk kv mount failed, lock=%s, error=%s", lock_path_.c_str(), std::strerror(errno));
+        unlockAndClose();
+        return false;
+    }
+    return true;
+}
+
+bool DiskMountGuard::cleanupStaleFiles() {
+    DIR* dir = ::opendir(work_dir_.c_str());
+    if (dir == nullptr) {
+        RTP_LLM_LOG_ERROR("open disk kv work dir failed, dir=%s, error=%s", work_dir_.c_str(), std::strerror(errno));
+        return false;
+    }
+    while (auto* entry = ::readdir(dir)) {
+        const std::string name(entry->d_name);
+        if (name == "." || name == ".." || name == ".lock") {
+            continue;
+        }
+        const bool framework_file =
+            (startsWith(name, "rank_") && (name.size() >= 3 && name.substr(name.size() - 3) == ".kv"))
+            || (name.size() >= 4 && name.substr(name.size() - 4) == ".tmp");
+        if (!framework_file) {
+            continue;
+        }
+        const auto path = joinPath(work_dir_, name);
+        if (::unlink(path.c_str()) != 0 && errno != ENOENT) {
+            RTP_LLM_LOG_ERROR(
+                "remove stale disk kv file failed, file=%s, error=%s", path.c_str(), std::strerror(errno));
+            ::closedir(dir);
+            return false;
+        }
+    }
+    ::closedir(dir);
+    return true;
+}
+
+void DiskMountGuard::unlockAndClose() {
+    if (lock_fd_ >= 0) {
+        ::flock(lock_fd_, LOCK_UN);
+        ::close(lock_fd_);
+        lock_fd_ = -1;
+    }
+}
+
+DiskBlockPool::DiskBlockPool(DiskBlockPoolConfig config, std::unique_ptr<IDiskBlockIO> io):
+    config_(std::move(config)), io_(std::move(io)) {
+    if (!io_) {
+        io_ = std::make_unique<PosixDiskBlockIO>();
+    }
+}
+
+DiskBlockPool::~DiskBlockPool() {
+    if (io_) {
+        io_->close();
+    }
+}
+
+size_t DiskBlockPool::alignUp(size_t value, size_t alignment) {
+    return ((value + alignment - 1) / alignment) * alignment;
+}
+
+bool DiskBlockPool::init() {
+    if (config_.work_dir.empty() || config_.disk_size_bytes == 0 || config_.block_size_bytes == 0) {
+        RTP_LLM_LOG_ERROR("init disk block pool failed, invalid config: %s", debugString().c_str());
+        return false;
+    }
+    slot_stride_bytes_ = alignUp(config_.block_size_bytes, kDiskIOAlignment);
+    slot_count_        = config_.disk_size_bytes / slot_stride_bytes_;
+    if (slot_count_ == 0) {
+        RTP_LLM_LOG_ERROR("init disk block pool failed, disk size too small, disk=%zu, block=%zu, stride=%zu",
+                          config_.disk_size_bytes,
+                          config_.block_size_bytes,
+                          slot_stride_bytes_);
+        return false;
+    }
+
+    if (!initFile()) {
+        if (!file_path_.empty()) {
+            ::unlink(file_path_.c_str());
+        }
+        return false;
+    }
+
+    {
+        std::lock_guard<std::mutex> lock(mutex_);
+        slots_.assign(slot_count_, SlotState{});
+        free_slots_.clear();
+        for (size_t i = 0; i < slot_count_; ++i) {
+            free_slots_.insert(static_cast<int32_t>(i));
+        }
+    }
+
+    RTP_LLM_LOG_INFO("disk kv block pool init success: %s", debugString().c_str());
+    return true;
+}
+
+bool DiskBlockPool::initFile() {
+    file_path_ = joinPath(config_.work_dir,
+                          fmtstr("rank_%ld_world_%ld_%s.kv",
+                                 config_.local_rank,
+                                 config_.world_rank,
+                                 cacheBlockKindName(config_.pool_kind)));
+    return io_->openAndPreallocate(file_path_, slot_count_ * slot_stride_bytes_, config_.buffered_io);
+}
+
+std::optional<int32_t> DiskBlockPool::malloc() {
+    std::lock_guard<std::mutex> lock(mutex_);
+    if (free_slots_.empty()) {
+        return std::nullopt;
+    }
+    const auto slot = *free_slots_.begin();
+    free_slots_.erase(free_slots_.begin());
+    slots_[static_cast<size_t>(slot)].request_ref++;
+    return slot;
+}
+
+bool DiskBlockPool::validSlot(int32_t slot) const {
+    return slot >= 0 && static_cast<size_t>(slot) < slot_count_;
+}
+
+void DiskBlockPool::requestReference(int32_t slot) {
+    std::lock_guard<std::mutex> lock(mutex_);
+    if (!validSlot(slot)) {
+        return;
+    }
+    auto& state = slots_[static_cast<size_t>(slot)];
+    state.request_ref++;
+    free_slots_.erase(slot);
+}
+
+void DiskBlockPool::requestFree(int32_t slot) {
+    std::lock_guard<std::mutex> lock(mutex_);
+    if (!validSlot(slot)) {
+        return;
+    }
+    auto& state = slots_[static_cast<size_t>(slot)];
+    if (state.request_ref > 0) {
+        state.request_ref--;
+    }
+    tryFreeSlotLocked(slot);
+}
+
+void DiskBlockPool::blockCacheReference(int32_t slot) {
+    std::lock_guard<std::mutex> lock(mutex_);
+    if (!validSlot(slot)) {
+        return;
+    }
+    auto& state = slots_[static_cast<size_t>(slot)];
+    state.cache_ref++;
+    free_slots_.erase(slot);
+}
+
+void DiskBlockPool::blockCacheFree(int32_t slot) {
+    std::lock_guard<std::mutex> lock(mutex_);
+    if (!validSlot(slot)) {
+        return;
+    }
+    auto& state = slots_[static_cast<size_t>(slot)];
+    if (state.cache_ref > 0) {
+        state.cache_ref--;
+    }
+    tryFreeSlotLocked(slot);
+}
+
+void DiskBlockPool::tryFreeSlotLocked(int32_t slot) {
+    auto& state = slots_[static_cast<size_t>(slot)];
+    if (state.request_ref == 0 && state.cache_ref == 0) {
+        free_slots_.insert(slot);
+    }
+}
+
+bool DiskBlockPool::read(int32_t slot, void* dst, size_t bytes) {
+    if (!validSlot(slot) || bytes > slot_stride_bytes_) {
+        return false;
+    }
+    const uint64_t offset = static_cast<uint64_t>(slot) * slot_stride_bytes_;
+    if (!io_->read(offset, dst, bytes)) {
+        return false;
+    }
+    read_bytes_.fetch_add(bytes, std::memory_order_relaxed);
+    return true;
+}
+
+bool DiskBlockPool::write(int32_t slot, const void* src, size_t bytes) {
+    if (!validSlot(slot) || bytes > slot_stride_bytes_) {
+        return false;
+    }
+    const uint64_t offset = static_cast<uint64_t>(slot) * slot_stride_bytes_;
+    if (!io_->write(offset, src, bytes)) {
+        return false;
+    }
+    write_bytes_.fetch_add(bytes, std::memory_order_relaxed);
+    return true;
+}
+
+size_t DiskBlockPool::totalSlots() const {
+    return slot_count_;
+}
+
+size_t DiskBlockPool::freeSlots() const {
+    std::lock_guard<std::mutex> lock(mutex_);
+    return free_slots_.size();
+}
+
+size_t DiskBlockPool::availableSlots() const {
+    std::lock_guard<std::mutex> lock(mutex_);
+    size_t                      available = 0;
+    for (const auto& state : slots_) {
+        if (state.request_ref == 0) {
+            ++available;
+        }
+    }
+    return available;
+}
+
+size_t DiskBlockPool::blockSizeBytes() const {
+    return config_.block_size_bytes;
+}
+
+size_t DiskBlockPool::slotStrideBytes() const {
+    return slot_stride_bytes_;
+}
+
+size_t DiskBlockPool::readBytes() const {
+    return read_bytes_.load(std::memory_order_relaxed);
+}
+
+size_t DiskBlockPool::writeBytes() const {
+    return write_bytes_.load(std::memory_order_relaxed);
+}
+
+const std::string& DiskBlockPool::filePath() const {
+    return file_path_;
+}
+
+std::string DiskBlockPool::debugString() const {
+    std::ostringstream oss;
+    oss << "DiskBlockPool{work_dir=" << config_.work_dir << ", file=" << file_path_
+        << ", local_rank=" << config_.local_rank << ", world_rank=" << config_.world_rank
+        << ", kind=" << cacheBlockKindName(config_.pool_kind) << ", disk_size=" << config_.disk_size_bytes
+        << ", block_size=" << config_.block_size_bytes
+        << ", stride=" << slot_stride_bytes_ << ", slots=" << slot_count_
+        << ", io=" << (config_.buffered_io ? "buffered" : "direct") << "}";
+    return oss.str();
+}
+
+}  // namespace rtp_llm
diff --git a/rtp_llm/cpp/cache/connector/memory/DiskBlockPool.h b/rtp_llm/cpp/cache/connector/memory/DiskBlockPool.h
new file mode 100644
index 0000000000..b4c6e21654
--- /dev/null
+++ b/rtp_llm/cpp/cache/connector/memory/DiskBlockPool.h
@@ -0,0 +1,107 @@
+#pragma once
+
+#include <atomic>
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+#include <mutex>
+#include <optional>
+#include <set>
+#include <string>
+#include <vector>
+
+#include "rtp_llm/cpp/cache/connector/memory/CacheBlockKind.h"
+#include "rtp_llm/cpp/cache/connector/memory/DiskBlockIO.h"
+
+namespace rtp_llm {
+
+class DiskMountGuard {
+public:
+    DiskMountGuard() = default;
+    ~DiskMountGuard();
+
+    bool               init(const std::string& mount_path);
+    const std::string& workDir() const;
+    const std::string& mountPath() const;
+    std::string        debugString() const;
+
+private:
+    bool initDirectoryAndLock();
+    bool cleanupStaleFiles();
+    void unlockAndClose();
+
+private:
+    std::string mount_path_;
+    std::string work_dir_;
+    std::string lock_path_;
+    int         lock_fd_{-1};
+};
+
+struct DiskBlockPoolConfig {
+    std::string work_dir;
+    int64_t     local_rank{0};
+    int64_t     world_rank{0};
+    size_t      disk_size_bytes{0};
+    size_t      block_size_bytes{0};
+    bool        buffered_io{true};
+    CacheBlockKind pool_kind{CacheBlockKind::COMPLETE};
+};
+
+class DiskBlockPool {
+public:
+    explicit DiskBlockPool(DiskBlockPoolConfig config, std::unique_ptr<IDiskBlockIO> io = nullptr);
+    ~DiskBlockPool();
+
+    bool init();
+
+    // Slot allocation is driven by the copy-plan owner, matching the existing
+    // memory connector metadata model. Follower ranks receive the slot id in
+    // the broadcast copy plan and use it as an externally assigned file offset;
+    // they do not independently allocate or evict disk slots.
+    std::optional<int32_t> malloc();
+    void                   requestReference(int32_t slot);
+    void                   requestFree(int32_t slot);
+    void                   blockCacheReference(int32_t slot);
+    void                   blockCacheFree(int32_t slot);
+
+    bool read(int32_t slot, void* dst, size_t bytes);
+    bool write(int32_t slot, const void* src, size_t bytes);
+
+    bool               validSlot(int32_t slot) const;
+    size_t             totalSlots() const;
+    size_t             freeSlots() const;
+    size_t             availableSlots() const;
+    size_t             blockSizeBytes() const;
+    size_t             slotStrideBytes() const;
+    size_t             readBytes() const;
+    size_t             writeBytes() const;
+    const std::string& filePath() const;
+    std::string        debugString() const;
+
+    static size_t alignUp(size_t value, size_t alignment);
+
+private:
+    struct SlotState {
+        uint32_t request_ref{0};
+        uint32_t cache_ref{0};
+    };
+
+    bool initFile();
+    void tryFreeSlotLocked(int32_t slot);
+
+private:
+    DiskBlockPoolConfig           config_;
+    std::unique_ptr<IDiskBlockIO> io_;
+    std::string                   file_path_;
+    size_t                        slot_stride_bytes_{0};
+    size_t                        slot_count_{0};
+    mutable std::mutex            mutex_;
+    std::set<int32_t>             free_slots_;
+    std::vector<SlotState>        slots_;
+    std::atomic<size_t>           read_bytes_{0};
+    std::atomic<size_t>           write_bytes_{0};
+};
+
+using DiskBlockPoolPtr = std::shared_ptr<DiskBlockPool>;
+
+}  // namespace rtp_llm
diff --git a/rtp_llm/cpp/cache/connector/memory/KVCacheMemoryConnector.cc b/rtp_llm/cpp/cache/connector/memory/KVCacheMemoryConnector.cc
index 3f1b7109d2..99fe7aceae 100644
--- a/rtp_llm/cpp/cache/connector/memory/KVCacheMemoryConnector.cc
+++ b/rtp_llm/cpp/cache/connector/memory/KVCacheMemoryConnector.cc
@@ -1,21 +1,24 @@
 #include "rtp_llm/cpp/cache/connector/memory/KVCacheMemoryConnector.h"
 
+#include <algorithm>
+#include <chrono>
+#include <cstring>
+
 #include "rtp_llm/cpp/cache/BlockPool.h"
 #include "rtp_llm/cpp/cache/BlockPoolConfigHelper.h"
 #include "rtp_llm/cpp/cache/connector/memory/MemoryAsyncContext.h"
 #include "rtp_llm/cpp/cache/connector/Meta.h"
-#include "rtp_llm/cpp/cache/KVCacheAllocator.h"
-#include "rtp_llm/models_py/bindings/core/ExecOps.h"
+#include "rtp_llm/cpp/cache/allocator/KVCacheAllocator.h"
 #include "rtp_llm/models_py/bindings/NoBlockCopy.h"
 #include "rtp_llm/cpp/utils/Logger.h"
 #include "rtp_llm/cpp/metrics/RtpLLMMetrics.h"
 #include "rtp_llm/cpp/utils/ProfilingScope.h"
+#include "rtp_llm/cpp/utils/StringUtil.h"
+#include "rtp_llm/cpp/utils/TimeUtil.h"
 
 namespace rtp_llm {
-
 // When set on MultiCopyParams, execNoBlockCopy may try CUDA split scatter/gather (SplitKvCacheCopy; not on PPU).
-// Eligibility for the fast path is decided only by enable_memory_cache_sm_copy; splitKvMultiCopy falls back if layout
-// mismatches.
+// This legacy SM-copy path is only used for non typed layer-region layouts.
 static void applySplitKvMultiCopyFieldsIfEligible(bool enable_sm_copy, const CacheConfig& cfg, MultiCopyParams& out) {
     if (!enable_sm_copy) {
         return;
@@ -25,17 +28,62 @@ static void applySplitKvMultiCopyFieldsIfEligible(bool enable_sm_copy, const Cac
     out.split_kv_scale_stride_bytes = cfg.kv_scale_stride_bytes;
 }
 
+static void
+appendBatchedMemoryCopyTile(void* dst, const void* src, size_t bytes, std::vector<BatchedMemoryCopyTile>& tiles) {
+    if (bytes > 0) {
+        tiles.push_back(BatchedMemoryCopyTile{dst, src, bytes});
+    }
+}
+
+static void
+appendStagedMemoryCopyTile(void* gpu, size_t host_offset, size_t bytes, std::vector<StagedMemoryCopyTile>& tiles) {
+    if (gpu != nullptr && bytes > 0) {
+        tiles.push_back(StagedMemoryCopyTile{gpu, host_offset, bytes});
+    }
+}
+
+static void appendStagedMemoryCopyHostSegment(void*                                     host,
+                                              size_t                                    host_offset,
+                                              size_t                                    bytes,
+                                              std::vector<StagedMemoryCopyHostSegment>& segments) {
+    if (host == nullptr || bytes == 0) {
+        return;
+    }
+    if (!segments.empty()) {
+        auto& prev = segments.back();
+        if (static_cast<char*>(prev.host) + prev.bytes == host && prev.host_offset + prev.bytes == host_offset) {
+            prev.bytes += bytes;
+            return;
+        }
+    }
+    segments.push_back(StagedMemoryCopyHostSegment{host, host_offset, bytes});
+}
+
+static size_t alignUp(size_t value, size_t alignment) {
+    RTP_LLM_CHECK_WITH_INFO(alignment != 0, "alignment must not be zero");
+    return ((value + alignment - 1) / alignment) * alignment;
+}
+
 KVCacheMemoryConnector::KVCacheMemoryConnector(const CacheConfig&                       cache_config,
                                                const KVCacheConfig&                     kv_cache_config,
+                                               const ParallelismConfig&                 parallelism_config,
                                                const std::shared_ptr<KVCacheAllocator>& allocator,
                                                const std::vector<std::string>&          tp_addrs,
                                                const kmonitor::MetricsReporterPtr&      metrics_reporter):
     cache_config_(cache_config),
     kv_cache_config_(kv_cache_config),
+    parallelism_config_(parallelism_config),
     allocator_(allocator),
     tp_addrs_(tp_addrs),
     metrics_reporter_(metrics_reporter) {}
 
+KVCacheMemoryConnector::KVCacheMemoryConnector(const CacheConfig&                       cache_config,
+                                               const KVCacheConfig&                     kv_cache_config,
+                                               const std::shared_ptr<KVCacheAllocator>& allocator,
+                                               const std::vector<std::string>&          tp_addrs,
+                                               const kmonitor::MetricsReporterPtr&      metrics_reporter):
+    KVCacheMemoryConnector(cache_config, kv_cache_config, ParallelismConfig{}, allocator, tp_addrs, metrics_reporter) {}
+
 KVCacheMemoryConnector::~KVCacheMemoryConnector() {
     RTP_LLM_LOG_INFO("KVCacheMemoryConnector destructor");
     stop_.store(true);
@@ -50,6 +98,17 @@ KVCacheMemoryConnector::~KVCacheMemoryConnector() {
     broadcast_manager_.reset();
     block_pool_.reset();
     block_cache_.reset();
+    complete_pool_.reset();
+    incomplete_pool_.reset();
+    {
+        std::lock_guard<std::mutex> lock(staged_copy_scratch_mutex_);
+        for (auto& [_, scratch] : staged_copy_scratch_by_device_) {
+            if (scratch) {
+                releaseStagedMemoryCopyScratch(*scratch);
+            }
+        }
+        staged_copy_scratch_by_device_.clear();
+    }
 }
 
 bool KVCacheMemoryConnector::init() {
@@ -61,7 +120,7 @@ bool KVCacheMemoryConnector::init() {
     checkLayerBlockStrideBytes();
 
     initBlockPool();
-    block_cache_ = std::make_shared<MemoryBlockCache>();
+    block_cache_ = std::make_shared<MemoryDiskBlockCache>();
 
     broadcast_manager_ = std::make_shared<BroadcastManager>(tp_addrs_);
     RTP_LLM_CHECK_WITH_INFO(broadcast_manager_->init(), "init failed, broadcast manager init failed");
@@ -76,16 +135,27 @@ bool KVCacheMemoryConnector::init() {
 }
 
 void KVCacheMemoryConnector::checkLayerBlockStrideBytes() const {
-    const size_t layer_num          = cache_config_.layer_all_num;
-    const auto&  layer_block_stride = cache_config_.layer_to_block_stride_bytes;
-    RTP_LLM_CHECK_WITH_INFO(layer_block_stride.size() == layer_num,
-                            "layer block stride size must equal to layer num, got=%zu need=%zu",
-                            layer_block_stride.size(),
-                            layer_num);
-    for (size_t i = 0; i < layer_num; ++i) {
-        RTP_LLM_CHECK_WITH_INFO(
-            layer_block_stride[i] > 0, "invalid block stride bytes at layer=%zu: %d", i, layer_block_stride[i]);
+    const auto slots = layerTagSlots();
+    RTP_LLM_CHECK_WITH_INFO(!slots.empty(), "layer-attn slots must not be empty");
+    for (const auto& slot : slots) {
+        RTP_LLM_CHECK_WITH_INFO(slot.stride_bytes > 0,
+                                "invalid block stride bytes at layer=%d tag=%s group=%d: %zu",
+                                slot.layer_id,
+                                slot.tag.c_str(),
+                                slot.group_id,
+                                slot.stride_bytes);
+    }
+}
+
+bool KVCacheMemoryConnector::isDualPool() const {
+    return complete_pool_ != nullptr;
+}
+
+bool KVCacheMemoryConnector::isFullOnlySlot(const LayerTagSlot& slot) const {
+    if (slot.group_id < 0 || slot.group_id >= cache_config_.groupNums()) {
+        return true;
     }
+    return cache_config_.typeForGroup(static_cast<size_t>(slot.group_id)) == CacheGroupType::FULL;
 }
 
 void KVCacheMemoryConnector::initBlockPool() {
@@ -94,15 +164,139 @@ void KVCacheMemoryConnector::initBlockPool() {
                             "init block pool failed, memory size is invalid, memory size: %ld MB",
                             memory_cache_size_mb);
 
-    const auto& layer_block_stride = cache_config_.layer_to_block_stride_bytes;
+    const auto slots = layerTagSlots();
+
+    size_t total_block_size     = 0;
+    size_t full_only_block_size = 0;
+    for (const auto& slot : slots) {
+        total_block_size += slot.stride_bytes;
+        if (isFullOnlySlot(slot)) {
+            full_only_block_size += slot.stride_bytes;
+        }
+    }
+    RTP_LLM_CHECK_WITH_INFO(total_block_size > 0, "block size is invalid: %zu", total_block_size);
 
-    // block_size here means "one cache-key across all layers" total bytes (kv + scale).
-    // Use per-layer block strides so NULL_BLOCK_IDX layers still occupy space in merged layout.
-    size_t block_size = std::accumulate(layer_block_stride.begin(), layer_block_stride.end(), 0);
-    RTP_LLM_CHECK_WITH_INFO(block_size > 0, "block size is invalid: %zu", block_size);
+    const bool use_dual =
+        hasTypedLayerTagSlots(slots) && full_only_block_size > 0 && full_only_block_size < total_block_size;
 
-    block_pool_ = createBlockPool(block_size, memory_cache_size_mb);
-    RTP_LLM_CHECK_WITH_INFO(block_pool_ != nullptr, "init block pool failed, create block pool failed");
+    if (!use_dual) {
+        block_pool_ = createBlockPool(total_block_size, memory_cache_size_mb);
+        RTP_LLM_CHECK_WITH_INFO(block_pool_ != nullptr, "init block pool failed, create block pool failed");
+        return;
+    }
+
+    complete_block_size_   = total_block_size;
+    incomplete_block_size_ = full_only_block_size;
+
+    const int    step        = std::max(1, cache_config_.linear_step);
+    const size_t total_bytes = static_cast<size_t>(memory_cache_size_mb) * 1024ULL * 1024ULL;
+
+    size_t complete_block_num;
+    size_t incomplete_block_num;
+    if (step > 1) {
+        const size_t effective_block_bytes =
+            complete_block_size_ + incomplete_block_size_ * static_cast<size_t>(step - 1);
+        RTP_LLM_CHECK_WITH_INFO(effective_block_bytes > 0, "effective block bytes is zero");
+        complete_block_num   = total_bytes / effective_block_bytes;
+        incomplete_block_num = complete_block_num * static_cast<size_t>(step - 1);
+    } else {
+        complete_block_num   = total_bytes / complete_block_size_;
+        incomplete_block_num = 0;
+    }
+    RTP_LLM_CHECK_WITH_INFO(complete_block_num > 0,
+                            "pool_size_mb=%ld too small for complete_block_size=%zu",
+                            memory_cache_size_mb,
+                            complete_block_size_);
+
+    RTP_LLM_LOG_INFO(
+        "dual pool init: complete_size=%zu complete_num=%zu incomplete_size=%zu incomplete_num=%zu step=%d",
+        complete_block_size_,
+        complete_block_num,
+        incomplete_block_size_,
+        incomplete_block_num,
+        step);
+
+    auto make_pool = [](size_t block_size, size_t block_num) -> std::shared_ptr<BlockPool> {
+        if (block_num == 0) {
+            return nullptr;
+        }
+        RTP_LLM_LOG_INFO("create memory block pool, block num: %zu, block size: %zu", block_num, block_size);
+        const auto pool_config = BlockPoolConfigHelper::createConfig(
+            /*layer_num=*/1, static_cast<uint32_t>(block_num), static_cast<uint32_t>(block_size), rtp_llm::TYPE_INT8);
+        auto pool = std::make_shared<BlockPool>(pool_config, AllocationType::HOST);
+        RTP_LLM_CHECK_WITH_INFO(pool->init(), "memory block pool init failed, block size: %zu", block_size);
+        return pool;
+    };
+
+    complete_pool_ = make_pool(complete_block_size_, complete_block_num);
+    RTP_LLM_CHECK_WITH_INFO(complete_pool_ != nullptr, "init complete pool failed");
+    if (incomplete_block_num > 0) {
+        incomplete_pool_ = make_pool(incomplete_block_size_, incomplete_block_num);
+        RTP_LLM_CHECK_WITH_INFO(incomplete_pool_ != nullptr, "init incomplete pool failed");
+    }
+}
+
+size_t KVCacheMemoryConnector::memoryCacheBlockSizeBytes() const {
+    const auto slots      = layerTagSlots();
+    size_t     block_size = 0;
+    for (const auto& slot : slots) {
+        block_size += slot.stride_bytes;
+    }
+    return block_size;
+}
+
+std::vector<KVCacheMemoryConnector::LayerTagSlot> KVCacheMemoryConnector::layerTagSlots() const {
+    std::vector<LayerTagSlot> slots;
+    const size_t                 layer_num = cache_config_.layer_all_num;
+
+    auto group_stride = [this](int gid, int layer_id) -> size_t {
+        if (gid >= 0 && gid < cache_config_.groupNums()) {
+            const size_t kv_stride    = cache_config_.kvBlockStrideBytesForGroup(static_cast<size_t>(gid));
+            const size_t scale_stride = cache_config_.kvScaleStrideBytesForGroup(static_cast<size_t>(gid));
+            if (kv_stride + scale_stride > 0) {
+                return kv_stride + scale_stride;
+            }
+        }
+        if (layer_id >= 0 && static_cast<size_t>(layer_id) < cache_config_.layer_to_block_stride_bytes.size()) {
+            return static_cast<size_t>(cache_config_.layer_to_block_stride_bytes[static_cast<size_t>(layer_id)]);
+        }
+        return cache_config_.kv_block_stride_bytes + cache_config_.kv_scale_stride_bytes;
+    };
+
+    const auto layer_group_ids = cache_config_.layerGroupIdsSnapshot();
+    for (size_t layer = 0; layer < layer_num; ++layer) {
+        if (layer < layer_group_ids.size()) {
+            for (int gid : layer_group_ids[layer]) {
+                if (gid < 0) {
+                    continue;
+                }
+                const auto policy = cache_config_.policyForGroup(static_cast<size_t>(gid));
+                if (policy.reuse_policy == CacheReusePolicy::NON_REUSABLE) {
+                    continue;
+                }
+                const std::string tag = gid < cache_config_.groupNums() ?
+                                            cache_config_.tagForGroup(static_cast<size_t>(gid)) :
+                                            std::string("group_") + std::to_string(gid);
+                slots.push_back(LayerTagSlot{static_cast<int>(layer),
+                                                tag,
+                                                gid,
+                                                group_stride(gid, static_cast<int>(layer))});
+            }
+        }
+    }
+    return slots;
+}
+
+bool KVCacheMemoryConnector::hasTypedLayerTagSlots(const std::vector<LayerTagSlot>& slots) const {
+    if (slots.size() != cache_config_.layer_all_num) {
+        return true;
+    }
+    for (size_t i = 0; i < slots.size(); ++i) {
+        if (slots[i].layer_id != static_cast<int>(i) || slots[i].tag != "default") {
+            return true;
+        }
+    }
+    return false;
 }
 
 std::shared_ptr<AsyncMatchContext> KVCacheMemoryConnector::asyncMatch(const std::shared_ptr<KVCacheResource>& resource,
@@ -115,16 +309,19 @@ std::shared_ptr<AsyncMatchContext> KVCacheMemoryConnector::asyncMatch(const std:
     }
 
     const auto& cache_keys = resource->cacheKeys();
-    // do not match last block, whether it is aligned or not, otherwise may cause core dump in computing ops.
+    // Do not match the last key.  It is either a real partial tail or a
+    // connector-level dummy tail used to preserve the same contract after CP
+    // Page-RR remap.
     const auto cache_keys_size = cache_keys.empty() ? 0 : cache_keys.size() - 1;
     if (cache_keys_size == 0) {
         RTP_LLM_LOG_DEBUG("async match skip, cache keys is empty");
         return nullptr;
     }
 
-    const auto& layer_block_ids = resource->layerBlocks();
-    if (!checkLayerBlocks(layer_block_ids, cache_keys_size)) {
-        RTP_LLM_LOG_WARNING("async match failed, invalid layer_block_ids, cache_keys_size=%zu", cache_keys_size);
+    const auto slots                = layerTagSlots();
+    const auto layer_attn_block_ids = resourceLayerRegionBlocks(*resource, slots);
+    if (!checkLayerRegionBlocks(layer_attn_block_ids, slots, cache_keys_size)) {
+        RTP_LLM_LOG_WARNING("async match failed, invalid layer_attn_block_ids, cache_keys_size=%zu", cache_keys_size);
         return nullptr;
     }
 
@@ -154,7 +351,8 @@ std::shared_ptr<AsyncMatchContext> KVCacheMemoryConnector::asyncMatch(const std:
         if (isNullBlockIdx(match_result.matched_index)) {
             break;  // only continuous prefix
         }
-        if (match_result.is_complete && gpuBlocksAllValid(layer_block_ids, i)) {
+        const bool gpu_blocks_all_valid = gpuBlocksAllValid(layer_attn_block_ids, slots, i);
+        if (match_result.is_complete && gpu_blocks_all_valid) {
             matched_num = i + 1;
         }
     }
@@ -163,15 +361,29 @@ std::shared_ptr<AsyncMatchContext> KVCacheMemoryConnector::asyncMatch(const std:
         RTP_LLM_LOG_DEBUG("not matched cache in memory, cache keys size: %zu, already_reuse_num: %zu",
                           cache_keys_size,
                           already_reuse_num);
-        reportMatchMetrics(/*success=*/false, timer.done_us(), cache_keys_size, matched_num);
+        reportMatchMetrics(/*success=*/true, timer.done_us(), cache_keys_size, matched_num);
+        return nullptr;
+    }
+    const int start_read_block_index = static_cast<int>(already_reuse_num);
+    const int read_block_num         = static_cast<int>(matched_num - already_reuse_num);
+    auto      copy_plan =
+        buildCopyPlanForRead(cache_keys, layer_attn_block_ids, slots, start_read_block_index, read_block_num);
+    if (!copy_plan || copy_plan->copy_infos.empty()) {
+        RTP_LLM_LOG_DEBUG(
+            "memory cache match dropped because read copy plan is empty, already_reuse=%zu matched=%zu cache_keys=%zu",
+            already_reuse_num,
+            matched_num,
+            cache_keys_size);
+        reportMatchMetrics(/*success=*/false, timer.done_us(), cache_keys_size, already_reuse_num);
         return nullptr;
     }
-    RTP_LLM_LOG_INFO("memory cache matched blocks: already_reuse=%zu matched=%zu cache_keys=%zu",
-                     already_reuse_num,
-                     matched_num,
-                     cache_keys_size);
+
+    RTP_LLM_LOG_DEBUG("memory cache matched blocks: already_reuse=%zu matched=%zu cache_keys=%zu",
+                      already_reuse_num,
+                      matched_num,
+                      cache_keys_size);
     reportMatchMetrics(/*success=*/true, timer.done_us(), cache_keys_size, matched_num);
-    return std::make_shared<MemoryAsyncMatchContext>(matched_num);
+    return std::make_shared<MemoryAsyncMatchContext>(matched_num, start_read_block_index, read_block_num, copy_plan);
 }
 
 bool KVCacheMemoryConnector::gpuBlocksAllValid(const LayerBlockIds& layer_block_ids, size_t key_index) const {
@@ -184,6 +396,31 @@ bool KVCacheMemoryConnector::gpuBlocksAllValid(const LayerBlockIds& layer_block_
     return true;
 }
 
+bool KVCacheMemoryConnector::gpuBlocksAllValid(const LayerAttnBlockIds&            layer_attn_block_ids,
+                                               const std::vector<LayerTagSlot>& slots,
+                                               size_t                              key_index) const {
+    for (const auto& slot : slots) {
+        const auto layer = static_cast<size_t>(slot.layer_id);
+        const auto attn  = static_cast<size_t>(slot.group_id);
+        if (layer >= layer_attn_block_ids.size() || attn >= layer_attn_block_ids[layer].size()
+            || layer_attn_block_ids[layer][attn] == nullptr) {
+            return false;
+        }
+        const auto& blocks = layer_attn_block_ids[layer][attn]->blocks();
+        if (key_index >= blocks.size() || isNullBlockIdx(blocks[key_index])) {
+            return false;
+        }
+    }
+    return true;
+}
+
+CacheGroupPolicy KVCacheMemoryConnector::groupPolicyForSlot(const LayerTagSlot& slot) const {
+    if (slot.group_id < 0 || slot.group_id >= cache_config_.groupNums()) {
+        return CacheGroupPolicy{};
+    }
+    return cache_config_.policyForGroup(static_cast<size_t>(slot.group_id));
+}
+
 std::shared_ptr<AsyncContext> KVCacheMemoryConnector::asyncRead(const std::shared_ptr<KVCacheResource>&   resource,
                                                                 const std::shared_ptr<Meta>&              meta,
                                                                 const std::shared_ptr<AsyncMatchContext>& match_context,
@@ -200,8 +437,9 @@ std::shared_ptr<AsyncContext> KVCacheMemoryConnector::asyncRead(const std::share
 
     autil::ScopedTime2 timer;
 
-    const auto& layer_block_ids = resource->layerBlocks();
-    if (!checkLayerBlocks(layer_block_ids, cache_keys_size)) {
+    const auto slots                = layerTagSlots();
+    const auto layer_attn_block_ids = resourceLayerRegionBlocks(*resource, slots);
+    if (!checkLayerRegionBlocks(layer_attn_block_ids, slots, cache_keys_size)) {
         reportReadMetrics(false, timer.done_us(), cache_keys_size, 0);
         return nullptr;
     }
@@ -217,7 +455,26 @@ std::shared_ptr<AsyncContext> KVCacheMemoryConnector::asyncRead(const std::share
         return nullptr;
     }
 
-    auto copy_plan = buildCopyPlanForRead(cache_keys, layer_block_ids, start_read_block_index, read_block_num);
+    std::shared_ptr<CopyPlan> copy_plan;
+    auto                      memory_match_context = std::dynamic_pointer_cast<MemoryAsyncMatchContext>(match_context);
+    if (memory_match_context && memory_match_context->readCopyPlan()) {
+        if (memory_match_context->startReadBlockIndex() == start_read_block_index
+            && memory_match_context->readBlockNum() == read_block_num) {
+            copy_plan = std::static_pointer_cast<CopyPlan>(memory_match_context->readCopyPlan());
+            memory_match_context->clearReadCopyPlan();
+        } else {
+            RTP_LLM_LOG_WARNING(
+                "async read ignored read copy plan because range mismatched, plan_start=%d plan_num=%d read_start=%d read_num=%d",
+                memory_match_context->startReadBlockIndex(),
+                memory_match_context->readBlockNum(),
+                start_read_block_index,
+                read_block_num);
+        }
+    }
+    if (!copy_plan) {
+        copy_plan =
+            buildCopyPlanForRead(cache_keys, layer_attn_block_ids, slots, start_read_block_index, read_block_num);
+    }
     if (!copy_plan || copy_plan->copy_infos.empty()) {
         reportReadMetrics(false, timer.done_us(), cache_keys_size, 0);
         return nullptr;
@@ -229,11 +486,12 @@ std::shared_ptr<AsyncContext> KVCacheMemoryConnector::asyncRead(const std::share
         if (success) {
             resource->setMemoryReuseBlockNum(read_block_num);
             for (const auto& copy_info : copy_plan->copy_infos) {
-                const auto removed_item = block_cache_->removeIfMatch(copy_info.cache_key, copy_info.mem_block);
+                const auto removed_item = block_cache_->removeIfMatch(
+                    copy_info.cache_key, CacheBackingType::MEMORY, copy_info.mem_block, /*disk_slot=*/-1);
                 if (!removed_item.has_value()) {
                     continue;
                 }
-                freeBlocks({removed_item->block_index}, /*cache_free=*/true);
+                releaseCacheBacking(*removed_item);
             }
             RTP_LLM_LOG_INFO("memory cache read success: read_blocks=%d released_blocks=%zu total_blocks=%zu",
                              read_block_num,
@@ -254,31 +512,40 @@ std::shared_ptr<AsyncContext> KVCacheMemoryConnector::asyncRead(const std::share
     return context;
 }
 
-std::shared_ptr<KVCacheMemoryConnector::CopyPlan> KVCacheMemoryConnector::buildCopyPlanForRead(
-    const CacheKeysType& cache_keys, const LayerBlockIds& layer_block_ids, int start_index, int read_num) {
+std::shared_ptr<KVCacheMemoryConnector::CopyPlan>
+KVCacheMemoryConnector::buildCopyPlanForRead(const CacheKeysType&                cache_keys,
+                                             const LayerAttnBlockIds&            layer_attn_block_ids,
+                                             const std::vector<LayerTagSlot>& slots,
+                                             int                                 start_index,
+                                             int                                 read_num) {
     std::vector<CopyInfoPerKey> copy_infos;
-    const auto                  layer_num = cache_config_.layer_all_num;
-    bool                        success   = true;
+    bool                        success = true;
 
     for (int i = start_index; i < start_index + read_num; ++i) {
         const auto cache_key    = cache_keys.at(i);
-        const auto match_result = block_cache_->match(static_cast<CacheKeyType>(cache_key));
+        const auto match_result = block_cache_->matchAndMarkInFlight(static_cast<CacheKeyType>(cache_key));
         if (isNullBlockIdx(match_result.matched_index)) {
             RTP_LLM_LOG_WARNING("build copy plan for read failed, cache key not found, cache key: %ld", cache_key);
             success = false;
             break;
         }
         // 每次都加引用的原因是为了确保match到的block不会被释放(避免在写时malloc如果cache满弹出该block)
-        referenceBlocks({match_result.matched_index}, /*cache_ref=*/false);
+        auto source_pool = memoryPoolFor(blockKindFromComplete(match_result.is_complete));
+        if (!source_pool) {
+            RTP_LLM_LOG_WARNING("build copy plan for read failed, missing memory pool, cache key: %ld", cache_key);
+            success = false;
+            break;
+        }
+        referenceBlocksInPool(source_pool, {match_result.matched_index}, /*cache_ref=*/false);
 
         CopyInfoPerKey copy_info;
         copy_info.cache_key = cache_key;
         copy_info.mem_block = match_result.matched_index;
-        copy_info.gpu_blocks.reserve(layer_num);
-        for (size_t layer = 0; layer < layer_num; ++layer) {
-            // Do NOT skip NULL_BLOCK_IDX here. The merged memory block layout requires reserving
-            // per-layer stride even when this layer has no gpu block (-1).
-            copy_info.gpu_blocks.push_back(layer_block_ids.at(layer)->blocks().at(i));
+        copy_info.gpu_blocks.reserve(slots.size());
+        for (const auto& slot : slots) {
+            const auto layer = static_cast<size_t>(slot.layer_id);
+            const auto attn  = static_cast<size_t>(slot.group_id);
+            copy_info.gpu_blocks.push_back(layer_attn_block_ids.at(layer).at(attn)->blocks().at(i));
         }
         copy_info.is_complete = match_result.is_complete;
         copy_infos.emplace_back(std::move(copy_info));
@@ -315,13 +582,16 @@ std::shared_ptr<AsyncContext> KVCacheMemoryConnector::asyncWrite(const std::shar
 
     autil::ScopedTime2 timer;
 
-    const auto& layer_block_ids = resource->layerBlocks();
-    if (!checkLayerBlocks(layer_block_ids, cache_keys_size)) {
+    const auto slots                = layerTagSlots();
+    const auto layer_attn_block_ids = resourceLayerRegionBlocks(*resource, slots);
+    if (!checkLayerRegionBlocks(layer_attn_block_ids, slots, cache_keys_size)) {
+        RTP_LLM_LOG_WARNING("async write failed, invalid layer_attn_block_ids, cache_keys_size=%zu resource_keys=%zu",
+                            cache_keys_size,
+                            cache_keys.size());
         reportWriteMetrics(false, timer.done_us(), cache_keys_size, 0);
         return nullptr;
     }
 
-    // 计算内存中已存在的前缀长度
     size_t mem_matched_num = 0;
     for (; mem_matched_num < cache_keys_size; ++mem_matched_num) {
         if (!block_cache_->contains(static_cast<CacheKeyType>(cache_keys[mem_matched_num]))) {
@@ -339,28 +609,32 @@ std::shared_ptr<AsyncContext> KVCacheMemoryConnector::asyncWrite(const std::shar
 
     bool no_need_write = false;
     auto copy_plan     = buildCopyPlanForWrite(
-        cache_keys, layer_block_ids, mem_matched_num, cache_keys_size - mem_matched_num, no_need_write);
+        cache_keys, layer_attn_block_ids, slots, mem_matched_num, cache_keys_size - mem_matched_num, no_need_write);
     if (!copy_plan || copy_plan->copy_infos.empty()) {
+        RTP_LLM_LOG_DEBUG(
+            "async write skip, no copy plan, cache_keys=%zu write_start=%zu write_num=%zu no_need_write=%d",
+            cache_keys_size,
+            mem_matched_num,
+            cache_keys_size - mem_matched_num,
+            no_need_write);
         reportWriteMetrics(no_need_write, timer.done_us(), static_cast<int64_t>(cache_keys_size), 0);
         return nullptr;
     }
 
     auto write_done =
         [copy_plan, resource_copy = resource, timer, total_block_num = cache_keys_size, this](bool success) mutable {
-            RTP_LLM_LOG_DEBUG("async write done, success: %d", success);
+            RTP_LLM_LOG_DEBUG("memory cache write done: success=%d write_blocks=%zu total_blocks=%zu",
+                              success,
+                              copy_plan ? copy_plan->copy_infos.size() : 0,
+                              total_block_num);
 
             if (success) {
-                for (const auto& copy_info : copy_plan->copy_infos) {
-                    MemoryBlockCache::CacheItem item;
-                    item.cache_key   = copy_info.cache_key;
-                    item.block_index = copy_info.mem_block;
-                    item.is_resident = false;
-                    item.is_complete = copy_info.is_complete;
-                    putToCache(item);
+                for (auto& copy_info : copy_plan->copy_infos) {
+                    putToCache(copy_info);
                 }
-                // reset resource to decrease block ref count in destructor
-                resource_copy.reset();
             }
+            // reset resource to decrease block ref count in destructor
+            resource_copy.reset();
             const int64_t write_block_num = success ? static_cast<int64_t>(copy_plan->copy_infos.size()) : 0;
             // reset copy plan to release memory block refs
             copy_plan.reset();
@@ -377,12 +651,12 @@ std::shared_ptr<AsyncContext> KVCacheMemoryConnector::asyncWrite(const std::shar
 }
 
 std::shared_ptr<KVCacheMemoryConnector::CopyPlan>
-KVCacheMemoryConnector::buildCopyPlanForWrite(const CacheKeysType& cache_keys,
-                                              const LayerBlockIds& layer_block_ids,
-                                              int                  start_index,
-                                              int                  write_num,
-                                              bool&                no_need_write) {
-    const auto                  layer_num = cache_config_.layer_all_num;
+KVCacheMemoryConnector::buildCopyPlanForWrite(const CacheKeysType&                cache_keys,
+                                              const LayerAttnBlockIds&            layer_attn_block_ids,
+                                              const std::vector<LayerTagSlot>& slots,
+                                              int                                 start_index,
+                                              int                                 write_num,
+                                              bool&                               no_need_write) {
     std::vector<CopyInfoPerKey> copy_infos;
     copy_infos.reserve(write_num);
 
@@ -395,11 +669,13 @@ KVCacheMemoryConnector::buildCopyPlanForWrite(const CacheKeysType& cache_keys,
     for (int i = start_index; i < start_index + write_num; ++i) {
         const auto                cache_key = cache_keys.at(i);
         std::vector<BlockIdxType> gpu_blocks;
-        gpu_blocks.reserve(layer_num);
+        gpu_blocks.reserve(slots.size());
         size_t null_block_num = 0;
-        for (size_t layer = 0; layer < layer_num; ++layer) {
-            const int gpu_block_idx = layer_block_ids.at(layer)->blocks().at(i);
-            // Do NOT skip NULL_BLOCK_IDX here. We must keep per-layer stride slots in the merged big block.
+        for (const auto& slot : slots) {
+            const auto layer         = static_cast<size_t>(slot.layer_id);
+            const auto attn          = static_cast<size_t>(slot.group_id);
+            const int  gpu_block_idx = layer_attn_block_ids.at(layer).at(attn)->blocks().at(i);
+            // Do NOT skip NULL_BLOCK_IDX here. We must keep per-layer+attn stride slots in the merged big block.
             if (isNullBlockIdx(gpu_block_idx)) {
                 ++null_block_num;
             }
@@ -422,6 +698,8 @@ KVCacheMemoryConnector::buildCopyPlanForWrite(const CacheKeysType& cache_keys,
     // ensure the final written key is complete
     no_need_write = last_complete_index < start_index;
     if (no_need_write) {
+        RTP_LLM_LOG_DEBUG(
+            "build copy plan for write found no complete key, start=%d write_num=%d", start_index, write_num);
         return nullptr;
     }
 
@@ -429,15 +707,24 @@ KVCacheMemoryConnector::buildCopyPlanForWrite(const CacheKeysType& cache_keys,
     const size_t keep_cnt = static_cast<size_t>(last_complete_index - start_index + 1);
     copy_infos.resize(keep_cnt);
 
-    std::vector<BlockIdxType> mem_blocks;
-    if (!mallocBlocks(copy_infos.size(), mem_blocks)) {
-        RTP_LLM_LOG_WARNING("build copy plan for write failed, malloc blocks failed, need blocks: %zu",
+    if (isDualPool() && !incomplete_pool_) {
+        const auto before = copy_infos.size();
+        copy_infos.erase(
+            std::remove_if(copy_infos.begin(), copy_infos.end(), [](const auto& ci) { return !ci.is_complete; }),
+            copy_infos.end());
+        if (copy_infos.size() != before) {
+            RTP_LLM_LOG_DEBUG("build copy plan for write skip incomplete blocks because incomplete pool is disabled, "
+                              "before=%zu after=%zu",
+                              before,
+                              copy_infos.size());
+        }
+    }
+
+    if (!allocateBackingsForWrite(copy_infos)) {
+        RTP_LLM_LOG_WARNING("build copy plan for write failed, allocate backing failed, need blocks: %zu",
                             copy_infos.size());
         return nullptr;
     }
-    for (size_t i = 0; i < copy_infos.size(); ++i) {
-        copy_infos[i].mem_block = mem_blocks[i];
-    }
 
     // free blocks in destructor
     auto plan = createCopyPlan(copy_infos, CopyDirection::D2H);
@@ -450,12 +737,13 @@ KVCacheMemoryConnector::createCopyPlan(const std::vector<CopyInfoPerKey>& copy_i
     plan->copy_infos = copy_infos;
     plan->direction  = direction;
     auto deleter     = [this](CopyPlan* plan) {
-        std::vector<BlockIdxType> blocks;
-        blocks.reserve(plan->copy_infos.size());
         for (const auto& copy_info : plan->copy_infos) {
-            blocks.push_back(copy_info.mem_block);
+            releaseRequestBacking(copy_info);
+            if (plan->direction == CopyDirection::H2D) {
+                block_cache_->releaseInFlight(
+                    copy_info.cache_key, CacheBackingType::MEMORY, copy_info.mem_block, /*disk_slot=*/-1);
+            }
         }
-        freeBlocks(blocks, /*cache_free=*/false);
         delete plan;
     };
     return std::shared_ptr<CopyPlan>(plan, deleter);
@@ -466,9 +754,11 @@ bool KVCacheMemoryConnector::startCopyAsync(const std::shared_ptr<MemoryAsyncCon
     if (stop_.load()) {
         return false;
     }
-    auto code = wait_done_thread_pool_->pushTask([this, context, copy_plan]() mutable {
-        auto send_result = sendCopyPlan(copy_plan);
+    auto task_copy_plan = copy_plan;
+    auto code           = wait_done_thread_pool_->pushTask([this, context, task_copy_plan]() mutable {
+        auto send_result = sendCopyPlan(task_copy_plan);
         context->setBroadcastResult(send_result);
+        task_copy_plan.reset();
         context->waitDone();
     });
     if (code != autil::ThreadPoolBase::ERROR_NONE) {
@@ -486,11 +776,18 @@ KVCacheMemoryConnector::sendCopyPlan(const std::shared_ptr<CopyPlan>& copy_plan)
     for (const auto& copy_info : copy_plan->copy_infos) {
         auto* item = mem_req.add_copy_items();
         item->set_mem_block(copy_info.mem_block);
+        item->set_is_complete(copy_info.is_complete);
+        item->set_backing_type(MemoryOperationRequestPB::MEMORY);
         for (const auto& block : copy_info.gpu_blocks) {
             item->add_gpu_blocks(block);
         }
     }
 
+    return sendMemoryRequest(mem_req, copyPlanTimeoutMs(copy_plan));
+}
+
+std::shared_ptr<BroadcastResult<FunctionRequestPB, FunctionResponsePB>>
+KVCacheMemoryConnector::sendMemoryRequest(const MemoryOperationRequestPB& mem_req, int64_t timeout_ms) const {
     std::vector<FunctionRequestPB> requests;
     requests.reserve(broadcast_manager_->workerNum());
     for (size_t i = 0; i < broadcast_manager_->workerNum(); ++i) {
@@ -498,15 +795,13 @@ KVCacheMemoryConnector::sendCopyPlan(const std::shared_ptr<CopyPlan>& copy_plan)
         req.mutable_mem_request()->CopyFrom(mem_req);
         requests.emplace_back(std::move(req));
     }
-
     auto rpc_call = [](const std::shared_ptr<RpcService::Stub>&    stub,
                        const std::shared_ptr<grpc::ClientContext>& context,
                        const FunctionRequestPB&                    request,
                        grpc::CompletionQueue*                      completion_queue) {
         return stub->AsyncExecuteFunction(context.get(), request, completion_queue);
     };
-    return broadcast_manager_->broadcast<FunctionRequestPB, FunctionResponsePB>(
-        requests, kv_cache_config_.memory_cache_sync_timeout_ms, rpc_call);
+    return broadcast_manager_->broadcast<FunctionRequestPB, FunctionResponsePB>(requests, timeout_ms, rpc_call);
 }
 
 void KVCacheMemoryConnector::printCopyPlan(const std::shared_ptr<CopyPlan>& copy_plan) const {
@@ -515,7 +810,8 @@ void KVCacheMemoryConnector::printCopyPlan(const std::shared_ptr<CopyPlan>& copy
         << ", copy infos size: " << copy_plan->copy_infos.size() << "\n";
     for (int i = 0; i < copy_plan->copy_infos.size(); ++i) {
         const auto& copy_info = copy_plan->copy_infos.at(i);
-        oss << "copy info " << i << ": cache key: " << copy_info.cache_key << ", mem block: " << copy_info.mem_block
+        oss << "copy info " << i << ": cache key: " << copy_info.cache_key
+            << ", mem block: " << copy_info.mem_block
             << ", gpu layer blocks: [";
         for (const auto& gpu_block : copy_info.gpu_blocks) {
             oss << gpu_block << ", ";
@@ -528,45 +824,329 @@ void KVCacheMemoryConnector::printCopyPlan(const std::shared_ptr<CopyPlan>& copy
 bool KVCacheMemoryConnector::copyCache(const MemoryOperationRequestPB& request, MemoryOperationResponsePB& response) {
     RTP_LLM_PROFILE_FUNCTION();
     autil::ScopedTime2 timer;
-    const auto         copy_direction =
-        (request.copy_direction() == MemoryOperationRequestPB::H2D) ? CopyDirection::H2D : CopyDirection::D2H;
+    CopyDirection copy_direction = CopyDirection::D2H;
+    if (request.copy_direction() == MemoryOperationRequestPB::H2D) {
+        copy_direction = CopyDirection::H2D;
+    }
+    const auto slots           = layerTagSlots();
+    const bool has_typed_slots = hasTypedLayerTagSlots(slots);
+
+    if (request.copy_items_size() == 0) {
+        RTP_LLM_LOG_WARNING("copy cache failed, copy_items is empty");
+        response.set_success(false);
+        reportCopyMetrics(false, timer.done_us(), copy_direction);
+        return false;
+    }
+
+    for (int i = 0; i < request.copy_items_size(); ++i) {
+        const auto& item = request.copy_items(i);
+        if (!validateCopyItemBacking(item)) {
+            response.set_success(false);
+            reportCopyMetrics(false, timer.done_us(), copy_direction);
+            return false;
+        }
+    }
+
+    if (tryCopyCacheWithStagedMemoryCopy(request, copy_direction, slots)) {
+        response.set_success(true);
+        reportCopyMetrics(true, timer.done_us(), copy_direction);
+        return true;
+    }
+    if (cache_config_.use_typed_cache_regions && cache_config_.use_opaque_kv_cache_store) {
+        RTP_LLM_LOG_WARNING("typed opaque memory copy failed for typed layout");
+        response.set_success(false);
+        reportCopyMetrics(false, timer.done_us(), copy_direction);
+        return false;
+    }
+    if (has_typed_slots && tryCopyCacheWithBatchedMemoryCopy(request, copy_direction, slots)) {
+        response.set_success(true);
+        reportCopyMetrics(true, timer.done_us(), copy_direction);
+        return true;
+    }
+
+    if (!copyMemoryItemsGeneric(request, copy_direction, slots)) {
+        response.set_success(false);
+        reportCopyMetrics(false, timer.done_us(), copy_direction);
+        return false;
+    }
+
+    response.set_success(true);
+    reportCopyMetrics(true, timer.done_us(), copy_direction);
+    return true;
+}
+
+bool KVCacheMemoryConnector::validateCopyItemBacking(const MemoryOperationRequestPB::CopyItem& item) const {
+    if (item.backing_type() != MemoryOperationRequestPB::MEMORY) {
+        RTP_LLM_LOG_WARNING("copy item has unsupported backing_type=%d", static_cast<int>(item.backing_type()));
+        return false;
+    }
+    return true;
+}
 
+bool KVCacheMemoryConnector::copyMemoryItemsGeneric(const MemoryOperationRequestPB&     request,
+                                                    CopyDirection                       direction,
+                                                    const std::vector<LayerTagSlot>& slots) {
     std::vector<torch::Tensor> dst_buffers;
     std::vector<torch::Tensor> src_buffers;
     for (int i = 0; i < request.copy_items_size(); ++i) {
-        const auto&                     item      = request.copy_items(i);
+        const auto& item = request.copy_items(i);
+        if (item.backing_type() != MemoryOperationRequestPB::MEMORY) {
+            continue;
+        }
         const auto                      mem_block = static_cast<BlockIdxType>(item.mem_block());
         const std::vector<BlockIdxType> gpu_blocks(item.gpu_blocks().begin(), item.gpu_blocks().end());
 
-        if (!prepareCopyBuffers(mem_block, gpu_blocks, copy_direction, dst_buffers, src_buffers)) {
-            RTP_LLM_LOG_WARNING("copy cache failed, prepare copy buffers failed, mem_block=%d, direction=%s",
+        if (!prepareCopyBuffers(mem_block, gpu_blocks, direction, item.is_complete(), dst_buffers, src_buffers)) {
+            RTP_LLM_LOG_WARNING("copy cache failed, prepare memory copy buffers failed, mem_block=%d, direction=%s",
                                 mem_block,
-                                copy_direction == CopyDirection::H2D ? "H2D" : "D2H");
-            response.set_success(false);
-            reportCopyMetrics(false, timer.done_us(), copy_direction);
+                                direction == CopyDirection::H2D ? "H2D" : "D2H");
             return false;
         }
     }
 
     if (!dst_buffers.empty()) {
         MultiCopyParams mc{dst_buffers, src_buffers};
-        applySplitKvMultiCopyFieldsIfEligible(kv_cache_config_.enable_memory_cache_sm_copy, cache_config_, mc);
+        const bool      can_use_split_kv_copy = !hasTypedLayerTagSlots(slots);
+        applySplitKvMultiCopyFieldsIfEligible(
+            kv_cache_config_.enable_memory_cache_sm_copy && can_use_split_kv_copy, cache_config_, mc);
         execNoBlockCopy(mc);
     }
+    return true;
+}
 
-    response.set_success(true);
-    reportCopyMetrics(true, timer.done_us(), copy_direction);
+bool KVCacheMemoryConnector::tryCopyCacheWithStagedMemoryCopy(const MemoryOperationRequestPB&     request,
+                                                              CopyDirection                       direction,
+                                                              const std::vector<LayerTagSlot>& slots) {
+    RTP_LLM_PROFILE_SCOPE("reuse_cache.memory.copy.plan_staged");
+    if (!isDualPool() && block_pool_ == nullptr) {
+        return false;
+    }
+    if (isDualPool() && !complete_pool_) {
+        return false;
+    }
+    if (allocator_ == nullptr) {
+        return false;
+    }
+
+    StagedMemoryCopyParams params;
+    params.direction =
+        direction == CopyDirection::H2D ? StagedMemoryCopyDirection::H2D : StagedMemoryCopyDirection::D2H;
+    size_t logical_rows  = 0;
+    size_t payload_bytes = 0;
+
+    for (int i = 0; i < request.copy_items_size(); ++i) {
+        const auto&                     item      = request.copy_items(i);
+        const auto                      mem_block = static_cast<BlockIdxType>(item.mem_block());
+        const std::vector<BlockIdxType> gpu_blocks(item.gpu_blocks().begin(), item.gpu_blocks().end());
+        const bool                      item_is_complete = item.is_complete();
+
+        if (isNullBlockIdx(mem_block) || gpu_blocks.size() != slots.size()) {
+            return false;
+        }
+
+        auto& pool_ref = isDualPool() ? (item_is_complete ? complete_pool_ : incomplete_pool_) : block_pool_;
+        if (!pool_ref) {
+            return false;
+        }
+        auto mem_buffers = pool_ref->convertIndexToBuffer(/*layer_id=*/0, mem_block);
+        if (mem_buffers.size() != 1u || mem_buffers[0].addr == nullptr || mem_buffers[0].size_bytes == 0
+            || mem_buffers[0].is_cuda) {
+            return false;
+        }
+        const auto& mem_buffer = mem_buffers[0];
+        auto*       mem_addr   = static_cast<char*>(mem_buffer.addr);
+
+        size_t byte_off = 0;
+        for (size_t slot_idx = 0; slot_idx < slots.size(); ++slot_idx) {
+            const auto& slot         = slots[slot_idx];
+            const auto  gpu_block    = gpu_blocks.at(slot_idx);
+            const auto  layer_stride = slot.stride_bytes;
+
+            if (!item_is_complete && !isFullOnlySlot(slot)) {
+                continue;
+            }
+
+            if (isNullBlockIdx(gpu_block)) {
+                byte_off += layer_stride;
+                continue;
+            }
+
+            const auto gpu_buffers      = allocator_->convertIndexToBuffer(slot.layer_id, slot.group_id, gpu_block);
+            size_t     within_layer_off = 0;
+            for (const auto& gpu_buffer : gpu_buffers) {
+                if (gpu_buffer.addr == nullptr || gpu_buffer.size_bytes == 0) {
+                    within_layer_off += gpu_buffer.size_bytes;
+                    continue;
+                }
+                if (within_layer_off + gpu_buffer.size_bytes > layer_stride
+                    || byte_off + within_layer_off + gpu_buffer.size_bytes > mem_buffer.size_bytes) {
+                    return false;
+                }
+                auto* host_addr = mem_addr + byte_off + within_layer_off;
+                if (!gpu_buffer.is_cuda) {
+                    return false;
+                }
+                if (params.device_index < 0) {
+                    params.device_index = gpu_buffer.device_index;
+                } else if (params.device_index != gpu_buffer.device_index) {
+                    return false;
+                }
+
+                // The SM copy kernels vectorize with int4/int2. Keep every staged tile aligned so compact
+                // staging does not trade fewer memcpy calls for misaligned vector accesses.
+                constexpr size_t kStagedTileAlignment = 16;
+                const size_t     staging_offset       = alignUp(params.host_bytes, kStagedTileAlignment);
+                params.host_bytes                     = staging_offset;
+                appendStagedMemoryCopyHostSegment(
+                    host_addr, staging_offset, gpu_buffer.size_bytes, params.host_segments);
+                appendStagedMemoryCopyTile(gpu_buffer.addr, staging_offset, gpu_buffer.size_bytes, params.tiles);
+                params.host_bytes += gpu_buffer.size_bytes;
+                ++logical_rows;
+                payload_bytes += gpu_buffer.size_bytes;
+                within_layer_off += gpu_buffer.size_bytes;
+            }
+            byte_off += layer_stride;
+        }
+    }
+
+    if (params.tiles.empty()) {
+        return true;
+    }
+
+    RTP_LLM_LOG_DEBUG("cuda staged memory copy, direction=%s, rows=%zu, tiles=%zu, bytes=%zu, span=%zu, device=%d",
+                      direction == CopyDirection::H2D ? "H2D" : "D2H",
+                      logical_rows,
+                      params.tiles.size(),
+                      payload_bytes,
+                      params.host_bytes,
+                      params.device_index);
+    RTP_LLM_PROFILE_SCOPE("reuse_cache.memory.copy.exec_staged");
+    std::lock_guard<std::mutex> scratch_lock(staged_copy_scratch_mutex_);
+    if (!execStagedMemoryCopy(params, &stagedCopyScratchForDevice(params.device_index))) {
+        return false;
+    }
     return true;
 }
 
+StagedMemoryCopyScratch& KVCacheMemoryConnector::stagedCopyScratchForDevice(int device_index) {
+    auto& scratch = staged_copy_scratch_by_device_[device_index];
+    if (!scratch) {
+        scratch = std::make_unique<StagedMemoryCopyScratch>();
+    }
+    return *scratch;
+}
+
+bool KVCacheMemoryConnector::tryCopyCacheWithBatchedMemoryCopy(const MemoryOperationRequestPB&     request,
+                                                               CopyDirection                       direction,
+                                                               const std::vector<LayerTagSlot>& slots) {
+    RTP_LLM_PROFILE_SCOPE("reuse_cache.memory.copy.plan_batch");
+    if (!isDualPool() && block_pool_ == nullptr) {
+        return false;
+    }
+    if (isDualPool() && !complete_pool_) {
+        return false;
+    }
+    if (allocator_ == nullptr) {
+        return false;
+    }
+
+    BatchedMemoryCopyParams params;
+    size_t                  logical_rows  = 0;
+    size_t                  payload_bytes = 0;
+
+    for (int i = 0; i < request.copy_items_size(); ++i) {
+        const auto&                     item      = request.copy_items(i);
+        const auto                      mem_block = static_cast<BlockIdxType>(item.mem_block());
+        const std::vector<BlockIdxType> gpu_blocks(item.gpu_blocks().begin(), item.gpu_blocks().end());
+        const bool                      item_is_complete = item.is_complete();
+
+        if (isNullBlockIdx(mem_block) || gpu_blocks.size() != slots.size()) {
+            return false;
+        }
+
+        auto& pool_ref = isDualPool() ? (item_is_complete ? complete_pool_ : incomplete_pool_) : block_pool_;
+        if (!pool_ref) {
+            return false;
+        }
+        auto mem_buffers = pool_ref->convertIndexToBuffer(/*layer_id=*/0, mem_block);
+        if (mem_buffers.size() != 1u || mem_buffers[0].addr == nullptr || mem_buffers[0].size_bytes == 0) {
+            return false;
+        }
+        const auto& mem_buffer = mem_buffers[0];
+
+        size_t byte_off = 0;
+        for (size_t slot_idx = 0; slot_idx < slots.size(); ++slot_idx) {
+            const auto& slot         = slots[slot_idx];
+            const auto  gpu_block    = gpu_blocks.at(slot_idx);
+            const auto  layer_stride = slot.stride_bytes;
+
+            if (!item_is_complete && !isFullOnlySlot(slot)) {
+                continue;
+            }
+
+            if (isNullBlockIdx(gpu_block)) {
+                byte_off += layer_stride;
+                continue;
+            }
+
+            const auto gpu_buffers      = allocator_->convertIndexToBuffer(slot.layer_id, slot.group_id, gpu_block);
+            size_t     within_layer_off = 0;
+            for (const auto& gpu_buffer : gpu_buffers) {
+                if (gpu_buffer.addr == nullptr || gpu_buffer.size_bytes == 0) {
+                    within_layer_off += gpu_buffer.size_bytes;
+                    continue;
+                }
+                if (!gpu_buffer.is_cuda) {
+                    return false;
+                }
+                if (within_layer_off + gpu_buffer.size_bytes > layer_stride
+                    || byte_off + within_layer_off + gpu_buffer.size_bytes > mem_buffer.size_bytes) {
+                    return false;
+                }
+                if (params.device_index < 0) {
+                    params.device_index = gpu_buffer.device_index;
+                } else if (params.device_index != gpu_buffer.device_index) {
+                    return false;
+                }
+
+                auto* mem_addr = static_cast<void*>(static_cast<char*>(mem_buffer.addr) + byte_off + within_layer_off);
+                if (direction == CopyDirection::H2D) {
+                    appendBatchedMemoryCopyTile(gpu_buffer.addr, mem_addr, gpu_buffer.size_bytes, params.tiles);
+                } else {
+                    appendBatchedMemoryCopyTile(mem_addr, gpu_buffer.addr, gpu_buffer.size_bytes, params.tiles);
+                }
+                ++logical_rows;
+                payload_bytes += gpu_buffer.size_bytes;
+                within_layer_off += gpu_buffer.size_bytes;
+            }
+            byte_off += layer_stride;
+        }
+    }
+
+    if (params.tiles.empty()) {
+        return true;
+    }
+
+    RTP_LLM_LOG_DEBUG("cuda memcpy batch, direction=%s, rows=%zu, tiles=%zu, bytes=%zu, device=%d",
+                      direction == CopyDirection::H2D ? "H2D" : "D2H",
+                      logical_rows,
+                      params.tiles.size(),
+                      payload_bytes,
+                      params.device_index);
+    RTP_LLM_PROFILE_SCOPE("reuse_cache.memory.copy.exec_batch");
+    return execBatchedMemoryCopy(params);
+}
+
 bool KVCacheMemoryConnector::prepareCopyBuffers(BlockIdxType                     mem_block,
                                                 const std::vector<BlockIdxType>& gpu_blocks,
                                                 CopyDirection                    direction,
+                                                bool                             is_complete,
                                                 std::vector<torch::Tensor>&      dst,
                                                 std::vector<torch::Tensor>&      src) {
     RTP_LLM_CHECK_WITH_INFO(mem_block != NULL_BLOCK_IDX, "mem block is null");
-    RTP_LLM_CHECK_WITH_INFO(block_pool_ != nullptr, "block pool is null");
-    auto mem_buffers = block_pool_->convertIndexToBuffer(/*layer_id=*/0, mem_block);
+    auto& pool_ref = isDualPool() ? (is_complete ? complete_pool_ : incomplete_pool_) : block_pool_;
+    RTP_LLM_CHECK_WITH_INFO(pool_ref != nullptr, "block pool is null");
+    auto mem_buffers = pool_ref->convertIndexToBuffer(/*layer_id=*/0, mem_block);
     if (mem_buffers.empty()) {
         RTP_LLM_LOG_WARNING("prepare copy buffers failed, mem buffers are empty, block=%d, direction=%s",
                             mem_block,
@@ -583,29 +1163,36 @@ bool KVCacheMemoryConnector::prepareCopyBuffers(BlockIdxType
                             mem_block,
                             direction == CopyDirection::H2D ? "H2D" : "D2H");
 
-    const size_t layer_num = cache_config_.layer_all_num;
-    RTP_LLM_CHECK_WITH_INFO(gpu_blocks.size() == layer_num,
-                            "gpu_blocks must contain all layers, got=%zu need=%zu",
+    const auto slots = layerTagSlots();
+    RTP_LLM_CHECK_WITH_INFO(gpu_blocks.size() == slots.size(),
+                            "gpu_blocks must contain all layer-attn slots, got=%zu need=%zu",
                             gpu_blocks.size(),
-                            layer_num);
+                            slots.size());
 
     size_t byte_off = 0;
-    for (int layer = 0; layer < layer_num; ++layer) {
-        const auto gpu_block    = gpu_blocks.at(layer);
-        const auto layer_stride = cache_config_.layer_to_block_stride_bytes[layer];
+    for (size_t slot_idx = 0; slot_idx < slots.size(); ++slot_idx) {
+        const auto& slot         = slots[slot_idx];
+        const auto  gpu_block    = gpu_blocks.at(slot_idx);
+        const auto  layer_stride = slot.stride_bytes;
+
+        if (!is_complete && !isFullOnlySlot(slot)) {
+            continue;
+        }
 
         if (isNullBlockIdx(gpu_block)) {
             byte_off += layer_stride;
             continue;
         }
 
-        const auto gpu_buffers      = allocator_->convertIndexToBuffer(layer, gpu_block);
+        const auto gpu_buffers      = allocator_->convertIndexToBuffer(slot.layer_id, slot.group_id, gpu_block);
         size_t     within_layer_off = 0;
         for (const auto& gpu_buffer : gpu_buffers) {
             if (within_layer_off + gpu_buffer.size_bytes > layer_stride) {
                 RTP_LLM_LOG_WARNING("prepare copy buffers failed, gpu buffer overflow: "
-                                    "layer=%zu byte_off=%zu within_layer_off=%zu gpu_buffer_size=%zu",
-                                    layer,
+                                    "layer=%d tag=%s group=%d byte_off=%zu within_layer_off=%zu gpu_buffer_size=%zu",
+                                    slot.layer_id,
+                                    slot.tag.c_str(),
+                                    slot.group_id,
                                     byte_off,
                                     within_layer_off,
                                     gpu_buffer.size_bytes);
@@ -687,33 +1274,45 @@ bool KVCacheMemoryConnector::checkLayerBlocks(const LayerBlockIds& layer_block_i
     return true;
 }
 
-bool KVCacheMemoryConnector::mallocBlocks(size_t need_blocks, std::vector<BlockIdxType>& malloced_blocks) {
-    RTP_LLM_PROFILE_FUNCTION();
-    if (need_blocks == 0) {
-        RTP_LLM_LOG_WARNING("malloc memory blocks failed, need blocks cannot be 0");
-        return false;
+LayerAttnBlockIds KVCacheMemoryConnector::resourceLayerRegionBlocks(const KVCacheResource&              resource,
+                                                                    const std::vector<LayerTagSlot>& slots) const {
+    if (!resource.layerGroupBlocks().empty()) {
+        return resource.layerGroupBlocks();
     }
 
-    // make sure `eusure + malloc` is atomic
-    std::unique_lock<std::mutex> lock(malloc_mutex_);
+    return {};
+}
 
-    if (!ensureEnoughFreeBlocks(need_blocks)) {
-        RTP_LLM_LOG_WARNING(
-            "malloc memory blocks failed, ensure enough free blocks failed, need blocks: %zu, free blocks: %zu",
-            need_blocks,
-            block_pool_->freeBlocksNum());
+bool KVCacheMemoryConnector::checkLayerRegionBlocks(const LayerAttnBlockIds&            layer_attn_block_ids,
+                                                    const std::vector<LayerTagSlot>& slots,
+                                                    size_t                              required_len) const {
+    if (layer_attn_block_ids.empty()) {
+        RTP_LLM_LOG_WARNING("check layer-attn blocks failed, layer_attn_block_ids is empty (required_len=%zu)",
+                            required_len);
         return false;
     }
-
-    auto blocks = block_pool_->malloc(need_blocks);
-    if (blocks.size() != need_blocks) {
-        RTP_LLM_LOG_WARNING("malloc memory blocks failed, malloc failed, need blocks: %zu, allocated blocks: %zu",
-                            need_blocks,
-                            blocks.size());
-        freeBlocks(blocks, /*cache_free=*/false);
-        return false;
+    for (const auto& slot : slots) {
+        const auto layer = static_cast<size_t>(slot.layer_id);
+        const auto attn  = static_cast<size_t>(slot.group_id);
+        if (layer >= layer_attn_block_ids.size() || attn >= layer_attn_block_ids[layer].size()
+            || layer_attn_block_ids[layer][attn] == nullptr) {
+            RTP_LLM_LOG_WARNING("check layer-group blocks failed, missing slot layer=%d tag=%s group=%d",
+                                slot.layer_id,
+                                slot.tag.c_str(),
+                                slot.group_id);
+            return false;
+        }
+        if (layer_attn_block_ids[layer][attn]->blocksNum() < required_len) {
+            RTP_LLM_LOG_WARNING(
+                "check layer-group blocks failed, blocksNum is less than required_len, layer=%d tag=%s group=%d blocksNum=%zu required_len=%zu",
+                slot.layer_id,
+                slot.tag.c_str(),
+                slot.group_id,
+                layer_attn_block_ids[layer][attn]->blocksNum(),
+                required_len);
+            return false;
+        }
     }
-    malloced_blocks = std::move(blocks);
     return true;
 }
 
@@ -751,6 +1350,86 @@ void KVCacheMemoryConnector::referenceBlocks(const std::vector<BlockIdxType>& bl
     }
 }
 
+bool KVCacheMemoryConnector::allocateBackingsForWrite(std::vector<CopyInfoPerKey>& copy_infos) {
+    std::unique_lock<std::mutex> lock(malloc_mutex_);
+    std::vector<size_t>          allocated_indices;
+    allocated_indices.reserve(copy_infos.size());
+    for (size_t i = 0; i < copy_infos.size(); ++i) {
+        if (!allocateOneBacking(copy_infos[i])) {
+            for (const auto idx : allocated_indices) {
+                releaseRequestBacking(copy_infos[idx]);
+            }
+            return false;
+        }
+        allocated_indices.push_back(i);
+    }
+    return true;
+}
+
+bool KVCacheMemoryConnector::allocateOneBacking(CopyInfoPerKey& copy_info) {
+    const auto   kind      = blockKindFromComplete(copy_info.is_complete);
+    BlockIdxType mem_block = NULL_BLOCK_IDX;
+    if (tryMallocMemoryBlock(kind, mem_block)) {
+        copy_info.mem_block = mem_block;
+        return true;
+    }
+
+    while (true) {
+        auto evicted = block_cache_->popOldestEvictable(kind);
+        if (!evicted.has_value()) {
+            return false;
+        }
+        reportEvictionLifetime(kind, evicted->backing_type, evicted->created_time_us);
+        releaseCacheBacking(*evicted);
+        if (tryMallocMemoryBlock(kind, mem_block)) {
+            copy_info.mem_block = mem_block;
+            return true;
+        }
+    }
+}
+
+bool KVCacheMemoryConnector::tryMallocMemoryBlock(CacheBlockKind kind, BlockIdxType& block) {
+    block     = NULL_BLOCK_IDX;
+    auto pool = memoryPoolFor(kind);
+    if (pool == nullptr || pool->freeBlocksNum() == 0) {
+        return false;
+    }
+    auto blocks = pool->malloc(1);
+    if (blocks.size() != 1) {
+        return false;
+    }
+    block = blocks[0];
+    return true;
+}
+
+void KVCacheMemoryConnector::releaseRequestBacking(const CopyInfoPerKey& copy_info) {
+    auto pool = memoryPoolFor(blockKindFromComplete(copy_info.is_complete));
+    if (pool) {
+        freeBlocksFromPool(pool, {copy_info.mem_block}, /*cache_free=*/false);
+    }
+}
+
+void KVCacheMemoryConnector::releaseCacheBacking(const MemoryDiskBlockCache::CacheItem& item) {
+    auto pool = memoryPoolFor(blockKindFromComplete(item.is_complete));
+    if (pool) {
+        freeBlocksFromPool(pool, {item.block_index}, /*cache_free=*/true);
+    }
+}
+
+void KVCacheMemoryConnector::referenceCacheBacking(const MemoryDiskBlockCache::CacheItem& item) {
+    auto pool = memoryPoolFor(blockKindFromComplete(item.is_complete));
+    if (pool) {
+        referenceBlocksInPool(pool, {item.block_index}, /*cache_ref=*/true);
+    }
+}
+
+std::shared_ptr<BlockPool> KVCacheMemoryConnector::memoryPoolFor(CacheBlockKind kind) const {
+    if (!isDualPool()) {
+        return block_pool_;
+    }
+    return kind == CacheBlockKind::COMPLETE ? complete_pool_ : incomplete_pool_;
+}
+
 std::shared_ptr<BlockPool> KVCacheMemoryConnector::createBlockPool(size_t block_size, size_t pool_size_mb) const {
     RTP_LLM_CHECK_WITH_INFO(pool_size_mb > 0, "pool size must be > 0");
     const int64_t block_num = pool_size_mb * 1024 * 1024 / static_cast<int64_t>(block_size);
@@ -769,40 +1448,80 @@ std::shared_ptr<BlockPool> KVCacheMemoryConnector::createBlockPool(size_t block_
 
 std::string KVCacheMemoryConnector::blockPoolDebugString() const {
     std::stringstream oss;
-    oss << "total blocks num: " << block_pool_->totalBlocksNum()
-        << ", free blocks num: " << block_pool_->freeBlocksNum()
-        << ", available blocks num: " << block_pool_->availableBlocksNum();
+    if (isDualPool()) {
+        oss << "complete pool: total=" << complete_pool_->totalBlocksNum()
+            << " free=" << complete_pool_->freeBlocksNum() << " available=" << complete_pool_->availableBlocksNum();
+        if (incomplete_pool_) {
+            oss << " | incomplete pool: total=" << incomplete_pool_->totalBlocksNum()
+                << " free=" << incomplete_pool_->freeBlocksNum()
+                << " available=" << incomplete_pool_->availableBlocksNum();
+        }
+    } else {
+        oss << "total blocks num: " << block_pool_->totalBlocksNum()
+            << ", free blocks num: " << block_pool_->freeBlocksNum()
+            << ", available blocks num: " << block_pool_->availableBlocksNum();
+    }
     return oss.str();
 }
 
 void KVCacheMemoryConnector::putToCache(const MemoryBlockCache::CacheItem& item) {
     RTP_LLM_PROFILE_FUNCTION();
-    if (auto [success, popped_item_opt] = block_cache_->put(item); success) {
-        RTP_LLM_LOG_DEBUG("write cache, cache key: %ld, block index: %d, block size: %zu",
-                          item.cache_key,
-                          item.block_index,
-                          item.block_size);
-        referenceBlocks({item.block_index}, /*cache_ref=*/true);
-        if (popped_item_opt.has_value()) {
-            const auto popped_item = popped_item_opt.value();
-            freeBlocks({popped_item.block_index}, /*cache_free=*/true);
-        }
+    MemoryDiskBlockCache::CacheItem new_item;
+    new_item.cache_key    = item.cache_key;
+    new_item.backing_type = CacheBackingType::MEMORY;
+    new_item.block_index  = item.block_index;
+    new_item.disk_slot    = -1;
+    new_item.block_size   = item.block_size;
+    new_item.is_resident  = item.is_resident;
+    new_item.is_complete  = item.is_complete;
+    putToCache(new_item, /*already_has_cache_ref=*/false);
+}
+
+void KVCacheMemoryConnector::putToCache(CopyInfoPerKey& copy_info) {
+    const auto                      kind = blockKindFromComplete(copy_info.is_complete);
+    MemoryDiskBlockCache::CacheItem item;
+    item.cache_key    = copy_info.cache_key;
+    item.backing_type = CacheBackingType::MEMORY;
+    item.block_index  = copy_info.mem_block;
+    item.disk_slot    = -1;
+    item.block_size = isDualPool() ?
+                          (kind == CacheBlockKind::COMPLETE ? complete_block_size_ : incomplete_block_size_) :
+                          memoryCacheBlockSizeBytes();
+    item.is_resident = false;
+    item.is_complete = copy_info.is_complete;
+
+    // Add cache ref. The request ref will be released by the CopyPlan deleter.
+    if (!putToCache(item, /*already_has_cache_ref=*/false)) {
+        return;
     }
 }
 
-// this function is called under lock
-bool KVCacheMemoryConnector::ensureEnoughFreeBlocks(size_t need_blocks) {
+bool KVCacheMemoryConnector::putToCache(const MemoryDiskBlockCache::CacheItem& item, bool already_has_cache_ref) {
     RTP_LLM_PROFILE_FUNCTION();
-    auto free_blocks = block_pool_->freeBlocksNum();
-    if (free_blocks >= need_blocks) {
-        return true;
+    if (!already_has_cache_ref) {
+        referenceCacheBacking(item);
     }
-    const auto need_evict_blocks = need_blocks - free_blocks;
-    const auto evict_blocks      = block_cache_->pop(need_evict_blocks);
-    if (!evict_blocks.empty()) {
-        freeBlocks(evict_blocks, /*cache_free=*/true);
+    auto [success, popped_item_opt] = block_cache_->putCommitted(item);
+    if (!success) {
+        releaseCacheBacking(item);
+        return false;
+    }
+
+    RTP_LLM_LOG_DEBUG("write cache, cache key: %ld, backing: %d, block index: %d, disk slot: %d, block size: %zu",
+                      item.cache_key,
+                      static_cast<int>(item.backing_type),
+                      item.block_index,
+                      item.disk_slot,
+                      item.block_size);
+    if (popped_item_opt.has_value()) {
+        const auto popped_item = popped_item_opt.value();
+        releaseCacheBacking(popped_item);
     }
-    return block_pool_->freeBlocksNum() >= need_blocks;
+    return true;
+}
+
+int64_t KVCacheMemoryConnector::copyPlanTimeoutMs(const std::shared_ptr<CopyPlan>& copy_plan) const {
+    return kv_cache_config_.memory_cache_sync_timeout_ms;
 }
 
 std::vector<CacheKeyType> KVCacheMemoryConnector::cacheKeys() const {
@@ -819,10 +1538,11 @@ void KVCacheMemoryConnector::reportMatchMetrics(bool    success,
     }
 
     RtpLLMMemoryCacheMatchMetricsCollector collector;
+    const int64_t tokens_per_block = cacheKeyTokensPerBlockForMetrics();
     collector.failed        = !success;
     collector.latency_us    = latency_us;
-    collector.input_token   = input_block_num * cache_config_.seq_size_per_block;
-    collector.matched_token = matched_block_num * cache_config_.seq_size_per_block;
+    collector.input_token   = input_block_num * tokens_per_block;
+    collector.matched_token = matched_block_num * tokens_per_block;
 
     metrics_reporter_->report<RtpLLMMemoryCacheMetrics, RtpLLMMemoryCacheMatchMetricsCollector>(nullptr, &collector);
 }
@@ -836,10 +1556,11 @@ void KVCacheMemoryConnector::reportReadMetrics(bool    success,
     }
 
     RtpLLMMemoryCacheReadMetricsCollector collector;
+    const int64_t tokens_per_block = cacheKeyTokensPerBlockForMetrics();
     collector.failed      = !success;
     collector.latency_us  = latency_us;
-    collector.input_token = input_block_num * cache_config_.seq_size_per_block;
-    collector.read_token  = read_block_num * cache_config_.seq_size_per_block;
+    collector.input_token = input_block_num * tokens_per_block;
+    collector.read_token  = read_block_num * tokens_per_block;
 
     metrics_reporter_->report<RtpLLMMemoryCacheMetrics, RtpLLMMemoryCacheReadMetricsCollector>(nullptr, &collector);
 }
@@ -853,10 +1574,11 @@ void KVCacheMemoryConnector::reportWriteMetrics(bool    success,
     }
 
     RtpLLMMemoryCacheWriteMetricsCollector collector;
+    const int64_t tokens_per_block = cacheKeyTokensPerBlockForMetrics();
     collector.failed      = !success;
     collector.latency_us  = latency_us;
-    collector.input_token = input_block_num * cache_config_.seq_size_per_block;
-    collector.write_token = write_block_num * cache_config_.seq_size_per_block;
+    collector.input_token = input_block_num * tokens_per_block;
+    collector.write_token = write_block_num * tokens_per_block;
 
     metrics_reporter_->report<RtpLLMMemoryCacheMetrics, RtpLLMMemoryCacheWriteMetricsCollector>(nullptr, &collector);
 }
@@ -874,27 +1596,172 @@ void KVCacheMemoryConnector::reportCopyMetrics(bool success, int64_t latency_us,
     metrics_reporter_->report<RtpLLMMemoryCacheMetrics, RtpLLMMemoryCacheCopyMetricsCollector>(nullptr, &collector);
 }
 
+int KVCacheMemoryConnector::cpSizeForMetrics() const {
+    const auto& cp_cfg = parallelism_config_.prefill_cp_config;
+    if (!cp_cfg.kv_cache_sharded) {
+        return 1;
+    }
+    if (parallelism_config_.tp_size > 1) {
+        return static_cast<int>(parallelism_config_.tp_size);
+    }
+    if (parallelism_config_.role_type == RoleType::DECODE && cp_cfg.is_prefill_enabled()
+        && cp_cfg.prefill_cp_size > 1) {
+        return static_cast<int>(cp_cfg.prefill_cp_size);
+    }
+    return 1;
+}
+
+int KVCacheMemoryConnector::cacheKeyTokensPerBlockForMetrics() const {
+    return static_cast<int>(cache_config_.seq_size_per_block) * cpSizeForMetrics();
+}
+
+void KVCacheMemoryConnector::reportEvictionLifetime(CacheBlockKind kind,
+                                                    CacheBackingType backing_type,
+                                                    int64_t          created_time_us) {
+    if (!metrics_reporter_ || created_time_us <= 0) {
+        return;
+    }
+    RtpLLMCacheEvictionMetricsCollector collector;
+    collector.lifetime_ms = std::max<int64_t>(0, (currentTimeUs() - created_time_us) / 1000);
+    kmonitor::MetricsTags tags("scope", "memory");
+    tags.AddTag("kind", cacheBlockKindName(kind));
+    tags.AddTag("backing", "memory");
+    metrics_reporter_->report<RtpLLMCacheEvictionMetrics, RtpLLMCacheEvictionMetricsCollector>(&tags, &collector);
+}
+
 void KVCacheMemoryConnector::reportMetricsLoop() {
     while (!stop_.load()) {
         if (metrics_reporter_) {
-            if (!block_pool_) {
-                std::this_thread::sleep_for(std::chrono::seconds(1));
-                continue;
+            const auto item_num = block_cache_ ? block_cache_->size() : 0;
+            if (isDualPool()) {
+                if (!complete_pool_) {
+                    std::this_thread::sleep_for(std::chrono::seconds(1));
+                    continue;
+                }
+                const auto total =
+                    complete_pool_->totalBlocksNum() + (incomplete_pool_ ? incomplete_pool_->totalBlocksNum() : 0);
+                const auto free =
+                    complete_pool_->freeBlocksNum() + (incomplete_pool_ ? incomplete_pool_->freeBlocksNum() : 0);
+                const auto avail = complete_pool_->availableBlocksNum()
+                                   + (incomplete_pool_ ? incomplete_pool_->availableBlocksNum() : 0);
+
+                RtpLLMMemoryCacheStatusMetricsCollector collector;
+                collector.item_num            = static_cast<int64_t>(item_num);
+                collector.total_block_num     = total;
+                collector.allocated_block_num = total - free;
+                collector.available_block_num = avail;
+                collector.used_ratio =
+                    total == 0 ? 0.0f : static_cast<float>(100.0 * (total - avail) / static_cast<double>(total));
+                metrics_reporter_->report<RtpLLMMemoryCacheMetrics, RtpLLMMemoryCacheStatusMetricsCollector>(
+                    nullptr, &collector);
+            } else {
+                if (!block_pool_) {
+                    std::this_thread::sleep_for(std::chrono::seconds(1));
+                    continue;
+                }
+                const auto total_blocks     = block_pool_->totalBlocksNum();
+                const auto free_blocks      = block_pool_->freeBlocksNum();
+                const auto available_blocks = block_pool_->availableBlocksNum();
+
+                RtpLLMMemoryCacheStatusMetricsCollector collector;
+                collector.item_num            = static_cast<int64_t>(item_num);
+                collector.total_block_num     = total_blocks;
+                collector.allocated_block_num = total_blocks - free_blocks;
+                collector.available_block_num = available_blocks;
+                collector.used_ratio          = total_blocks == 0 ?
+                                                    0.0f :
+                                                    static_cast<float>(100.0 * (total_blocks - available_blocks)
+                                                                       / static_cast<double>(total_blocks));
+                metrics_reporter_->report<RtpLLMMemoryCacheMetrics, RtpLLMMemoryCacheStatusMetricsCollector>(
+                    nullptr, &collector);
             }
 
-            const auto total_blocks     = block_pool_->totalBlocksNum();
-            const auto free_blocks      = block_pool_->freeBlocksNum();
-            const auto available_blocks = block_pool_->availableBlocksNum();
+        }
+        std::this_thread::sleep_for(std::chrono::seconds(1));
+    }
+}
 
-            RtpLLMMemoryCacheStatusMetricsCollector collector;
-            collector.total_block_num     = total_blocks;
-            collector.allocated_block_num = total_blocks - free_blocks;
-            collector.available_block_num = available_blocks;
+bool KVCacheMemoryConnector::mallocBlocksFromPool(const std::shared_ptr<BlockPool>&        pool,
+                                                  const std::shared_ptr<MemoryBlockCache>& cache,
+                                                  size_t                                   need_blocks,
+                                                  std::vector<BlockIdxType>&               malloced_blocks) {
+    RTP_LLM_PROFILE_FUNCTION();
+    if (need_blocks == 0) {
+        return true;
+    }
+    std::unique_lock<std::mutex> lock(malloc_mutex_);
+    if (!ensureEnoughFreeBlocksInPool(pool, cache, need_blocks)) {
+        RTP_LLM_LOG_WARNING("malloc blocks from pool failed, need=%zu free=%zu", need_blocks, pool->freeBlocksNum());
+        return false;
+    }
+    auto blocks = pool->malloc(need_blocks);
+    if (blocks.size() != need_blocks) {
+        RTP_LLM_LOG_WARNING("malloc blocks from pool failed, need=%zu got=%zu", need_blocks, blocks.size());
+        freeBlocksFromPool(pool, std::vector<BlockIdxType>(blocks.begin(), blocks.end()), false);
+        return false;
+    }
+    malloced_blocks.insert(malloced_blocks.end(), blocks.begin(), blocks.end());
+    return true;
+}
 
-            metrics_reporter_->report<RtpLLMMemoryCacheMetrics, RtpLLMMemoryCacheStatusMetricsCollector>(nullptr,
-                                                                                                         &collector);
+bool KVCacheMemoryConnector::freeBlocksFromPool(const std::shared_ptr<BlockPool>& pool,
+                                                const std::vector<BlockIdxType>&  blocks,
+                                                bool                              cache_free) {
+    std::vector<int> need_free;
+    need_free.reserve(blocks.size());
+    for (const auto& b : blocks) {
+        if (!isNullBlockIdx(b)) {
+            need_free.push_back(static_cast<int>(b));
+        }
+    }
+    if (need_free.empty()) {
+        return true;
+    }
+    RTP_LLM_CHECK_WITH_INFO(pool != nullptr, "pool is null");
+    if (cache_free) {
+        pool->blockCacheFree(need_free);
+    } else {
+        pool->requestFree(need_free);
+    }
+    return true;
+}
+
+void KVCacheMemoryConnector::referenceBlocksInPool(const std::shared_ptr<BlockPool>& pool,
+                                                   const std::vector<BlockIdxType>&  blocks,
+                                                   bool                              cache_ref) {
+    RTP_LLM_CHECK_WITH_INFO(pool != nullptr, "pool is null");
+    if (cache_ref) {
+        pool->blockCacheReference(blocks);
+    } else {
+        pool->requestReference(blocks);
+    }
+}
+
+bool KVCacheMemoryConnector::ensureEnoughFreeBlocksInPool(const std::shared_ptr<BlockPool>&        pool,
+                                                          const std::shared_ptr<MemoryBlockCache>& cache,
+                                                          size_t                                   need_blocks) {
+    RTP_LLM_PROFILE_FUNCTION();
+    auto free_blocks = pool->freeBlocksNum();
+    if (free_blocks >= need_blocks) {
+        return true;
+    }
+    const auto need_evict = need_blocks - free_blocks;
+    const auto evicted    = cache->pop(need_evict);
+    if (!evicted.empty()) {
+        freeBlocksFromPool(pool, evicted, true);
+    }
+    return pool->freeBlocksNum() >= need_blocks;
+}
+
+void KVCacheMemoryConnector::putToCacheInPool(const std::shared_ptr<BlockPool>&        pool,
+                                              const std::shared_ptr<MemoryBlockCache>& cache,
+                                              const MemoryBlockCache::CacheItem&       item) {
+    RTP_LLM_PROFILE_FUNCTION();
+    if (auto [success, popped_item_opt] = cache->put(item); success) {
+        referenceBlocksInPool(pool, {item.block_index}, true);
+        if (popped_item_opt.has_value()) {
+            freeBlocksFromPool(pool, {popped_item_opt->block_index}, true);
         }
-        std::this_thread::sleep_for(std::chrono::seconds(1));
     }
 }
 
diff --git a/rtp_llm/cpp/cache/connector/memory/KVCacheMemoryConnector.h b/rtp_llm/cpp/cache/connector/memory/KVCacheMemoryConnector.h
index 73f20ee99d..ffc0ae10f6 100644
--- a/rtp_llm/cpp/cache/connector/memory/KVCacheMemoryConnector.h
+++ b/rtp_llm/cpp/cache/connector/memory/KVCacheMemoryConnector.h
@@ -2,6 +2,7 @@
 
 #include <atomic>
 #include <condition_variable>
+#include <cstdint>
 #include <map>
 #include <memory>
 #include <mutex>
@@ -13,6 +14,7 @@
 #include "rtp_llm/cpp/cache/CacheConfig.h"
 #include "rtp_llm/cpp/cache/connector/KVCacheConnector.h"
 #include "rtp_llm/cpp/cache/connector/memory/MemoryBlockCache.h"
+#include "rtp_llm/cpp/cache/connector/memory/MemoryDiskBlockCache.h"
 #include "rtp_llm/cpp/cache/Types.h"
 #include "rtp_llm/cpp/config/ConfigModules.h"
 #include "rtp_llm/cpp/model_rpc/BroadcastManager.h"
@@ -24,9 +26,16 @@ class BlockPool;
 class BroadcastManager;
 class KVCacheAllocator;
 class MemoryAsyncContext;
+struct StagedMemoryCopyScratch;
 
 class KVCacheMemoryConnector: public KVCacheConnector {
 public:
+    KVCacheMemoryConnector(const CacheConfig&                       cache_config,
+                           const KVCacheConfig&                     kv_cache_config,
+                           const ParallelismConfig&                 parallelism_config,
+                           const std::shared_ptr<KVCacheAllocator>& allocator,
+                           const std::vector<std::string>&          tp_addrs,
+                           const kmonitor::MetricsReporterPtr&      metrics_reporter = nullptr);
     KVCacheMemoryConnector(const CacheConfig&                       cache_config,
                            const KVCacheConfig&                     kv_cache_config,
                            const std::shared_ptr<KVCacheAllocator>& allocator,
@@ -56,9 +65,17 @@ class KVCacheMemoryConnector: public KVCacheConnector {
     std::vector<CacheKeyType> cacheKeys() const;
 
 private:
+    struct LayerTagSlot {
+        int         layer_id{-1};
+        std::string tag;
+        int         group_id{-1};
+        size_t      stride_bytes{0};
+    };
     struct CopyInfoPerKey {
         CacheKeyType              cache_key{0};
+        CacheBlockKind            kind{CacheBlockKind::COMPLETE};
         BlockIdxType              mem_block{NULL_BLOCK_IDX};
+        size_t                    block_size{0};
         std::vector<BlockIdxType> gpu_blocks;
         bool                      is_complete{true};
     };
@@ -71,65 +88,132 @@ class KVCacheMemoryConnector: public KVCacheConnector {
         CopyDirection               direction;
     };
 
-    std::shared_ptr<CopyPlan> buildCopyPlanForRead(const CacheKeysType& cache_keys,
-                                                   const LayerBlockIds& layer_block_ids,
-                                                   int                  start_index,
-                                                   int                  read_num);
-    std::shared_ptr<CopyPlan> buildCopyPlanForWrite(const CacheKeysType& cache_keys,
-                                                    const LayerBlockIds& layer_block_ids,
-                                                    int                  start_index,
-                                                    int                  write_num,
-                                                    bool&                no_need_write);
+    std::shared_ptr<CopyPlan> buildCopyPlanForRead(const CacheKeysType&                cache_keys,
+                                                   const LayerAttnBlockIds&            layer_attn_block_ids,
+                                                   const std::vector<LayerTagSlot>& slots,
+                                                   int                                 start_index,
+                                                   int                                 read_num);
+    std::shared_ptr<CopyPlan> buildCopyPlanForWrite(const CacheKeysType&                cache_keys,
+                                                    const LayerAttnBlockIds&            layer_attn_block_ids,
+                                                    const std::vector<LayerTagSlot>& slots,
+                                                    int                                 start_index,
+                                                    int                                 write_num,
+                                                    bool&                               no_need_write);
     std::shared_ptr<CopyPlan> createCopyPlan(const std::vector<CopyInfoPerKey>& copy_infos,
                                              const CopyDirection&               direction);
     bool startCopyAsync(const std::shared_ptr<MemoryAsyncContext>& context, const std::shared_ptr<CopyPlan>& copy_plan);
     std::shared_ptr<BroadcastResult<FunctionRequestPB, FunctionResponsePB>>
          sendCopyPlan(const std::shared_ptr<CopyPlan>& copy_plan) const;
+    std::shared_ptr<BroadcastResult<FunctionRequestPB, FunctionResponsePB>>
+         sendMemoryRequest(const MemoryOperationRequestPB& mem_req, int64_t timeout_ms) const;
     void printCopyPlan(const std::shared_ptr<CopyPlan>& copy_plan) const;
 
-    bool prepareCopyBuffers(BlockIdxType                     mem_block,
-                            const std::vector<BlockIdxType>& gpu_blocks,
-                            CopyDirection                    direction,
-                            std::vector<torch::Tensor>&      dst,
-                            std::vector<torch::Tensor>&      src);
-    bool appendCopyBytesToBuffers(const BlockInfo&            mem_block,
-                                  const BlockInfo&            gpu_block,
-                                  size_t                      byte_off,
-                                  CopyDirection               direction,
-                                  std::vector<torch::Tensor>& dst,
-                                  std::vector<torch::Tensor>& src);
-
-    void checkLayerBlockStrideBytes() const;
-    bool checkLayerBlocks(const LayerBlockIds& layer_block_ids, size_t required_len) const;
-    bool gpuBlocksAllValid(const LayerBlockIds& layer_block_ids, size_t key_index) const;
-
-    bool mallocBlocks(size_t need_blocks, std::vector<BlockIdxType>& malloced_blocks);
+    bool                     prepareCopyBuffers(BlockIdxType                     mem_block,
+                                                const std::vector<BlockIdxType>& gpu_blocks,
+                                                CopyDirection                    direction,
+                                                bool                             is_complete,
+                                                std::vector<torch::Tensor>&      dst,
+                                                std::vector<torch::Tensor>&      src);
+    bool                     tryCopyCacheWithBatchedMemoryCopy(const MemoryOperationRequestPB&     request,
+                                                               CopyDirection                       direction,
+                                                               const std::vector<LayerTagSlot>& slots);
+    bool                     tryCopyCacheWithStagedMemoryCopy(const MemoryOperationRequestPB&     request,
+                                                              CopyDirection                       direction,
+                                                              const std::vector<LayerTagSlot>& slots);
+    StagedMemoryCopyScratch& stagedCopyScratchForDevice(int device_index);
+    bool                     appendCopyBytesToBuffers(const BlockInfo&            mem_block,
+                                                      const BlockInfo&            gpu_block,
+                                                      size_t                      byte_off,
+                                                      CopyDirection               direction,
+                                                      std::vector<torch::Tensor>& dst,
+                                                      std::vector<torch::Tensor>& src);
+    bool                     copyMemoryItemsGeneric(const MemoryOperationRequestPB&     request,
+                                                    CopyDirection                       direction,
+                                                    const std::vector<LayerTagSlot>& slots);
+    bool                     validateCopyItemBacking(const MemoryOperationRequestPB::CopyItem& item) const;
+
+    void                         checkLayerBlockStrideBytes() const;
+    std::vector<LayerTagSlot> layerTagSlots() const;
+    bool                         hasTypedLayerTagSlots(const std::vector<LayerTagSlot>& slots) const;
+    bool                         checkLayerBlocks(const LayerBlockIds& layer_block_ids, size_t required_len) const;
+    LayerAttnBlockIds            resourceLayerRegionBlocks(const KVCacheResource&                resource,
+                                                           const std::vector<LayerTagSlot>& slots) const;
+    bool                         checkLayerRegionBlocks(const LayerAttnBlockIds&            layer_attn_block_ids,
+                                                        const std::vector<LayerTagSlot>& slots,
+                                                        size_t                              required_len) const;
+    bool                         gpuBlocksAllValid(const LayerBlockIds& layer_block_ids, size_t key_index) const;
+    bool                         gpuBlocksAllValid(const LayerAttnBlockIds&            layer_attn_block_ids,
+                                                   const std::vector<LayerTagSlot>& slots,
+                                                   size_t                              key_index) const;
+    CacheGroupPolicy             groupPolicyForSlot(const LayerTagSlot& slot) const;
+
     bool freeBlocks(const std::vector<BlockIdxType>& blocks, bool cache_free = true);
     void referenceBlocks(const std::vector<BlockIdxType>& blocks, bool cache_ref = true);
-    bool ensureEnoughFreeBlocks(size_t need_blocks);
+    bool allocateBackingsForWrite(std::vector<CopyInfoPerKey>& copy_infos);
+    bool allocateOneBacking(CopyInfoPerKey& copy_info);
+    bool tryMallocMemoryBlock(CacheBlockKind kind, BlockIdxType& block);
+    void releaseRequestBacking(const CopyInfoPerKey& copy_info);
+    void releaseCacheBacking(const MemoryDiskBlockCache::CacheItem& item);
+    void referenceCacheBacking(const MemoryDiskBlockCache::CacheItem& item);
+    std::shared_ptr<BlockPool> memoryPoolFor(CacheBlockKind kind) const;
+
+    bool isDualPool() const;
+    bool isFullOnlySlot(const LayerTagSlot& slot) const;
+    bool mallocBlocksFromPool(const std::shared_ptr<BlockPool>&        pool,
+                              const std::shared_ptr<MemoryBlockCache>& cache,
+                              size_t                                   need_blocks,
+                              std::vector<BlockIdxType>&               malloced_blocks);
+    bool freeBlocksFromPool(const std::shared_ptr<BlockPool>& pool,
+                            const std::vector<BlockIdxType>&  blocks,
+                            bool                              cache_free);
+    void referenceBlocksInPool(const std::shared_ptr<BlockPool>& pool,
+                               const std::vector<BlockIdxType>&  blocks,
+                               bool                              cache_ref);
+    bool ensureEnoughFreeBlocksInPool(const std::shared_ptr<BlockPool>&        pool,
+                                      const std::shared_ptr<MemoryBlockCache>& cache,
+                                      size_t                                   need_blocks);
+    void putToCacheInPool(const std::shared_ptr<BlockPool>&        pool,
+                          const std::shared_ptr<MemoryBlockCache>& cache,
+                          const MemoryBlockCache::CacheItem&       item);
 
     void                       initBlockPool();
+    int64_t                    copyPlanTimeoutMs(const std::shared_ptr<CopyPlan>& copy_plan) const;
     std::shared_ptr<BlockPool> createBlockPool(size_t block_size, size_t pool_size_mb) const;
     std::string                blockPoolDebugString() const;
+    size_t                     memoryCacheBlockSizeBytes() const;
     void                       putToCache(const MemoryBlockCache::CacheItem& item);
+    void                       putToCache(CopyInfoPerKey& copy_info);
+    bool                       putToCache(const MemoryDiskBlockCache::CacheItem& item,
+                                          bool                                   already_has_cache_ref = false);
 
     void reportMatchMetrics(bool success, int64_t latency_us, int64_t input_block_num, int64_t matched_block_num);
     void reportReadMetrics(bool success, int64_t latency_us, int64_t input_block_num, int64_t read_block_num);
     void reportWriteMetrics(bool success, int64_t latency_us, int64_t input_block_num, int64_t write_block_num);
     void reportCopyMetrics(bool success, int64_t latency_us, CopyDirection direction);
+    int  cpSizeForMetrics() const;
+    int  cacheKeyTokensPerBlockForMetrics() const;
+    void reportEvictionLifetime(CacheBlockKind kind, CacheBackingType backing_type, int64_t created_time_us);
     void reportMetricsLoop();
 
 private:
     const CacheConfig&                cache_config_;
     const KVCacheConfig&              kv_cache_config_;
+    const ParallelismConfig           parallelism_config_;
     std::shared_ptr<KVCacheAllocator> allocator_;
     const std::vector<std::string>    tp_addrs_;
 
-    std::shared_ptr<BlockPool> block_pool_;
-    mutable std::mutex                         malloc_mutex_;
-    std::shared_ptr<MemoryBlockCache>          block_cache_;
-    std::shared_ptr<BroadcastManager>          broadcast_manager_;
-    std::shared_ptr<autil::LockFreeThreadPool> wait_done_thread_pool_;
+    std::shared_ptr<BlockPool>                              block_pool_;
+    mutable std::mutex                                      malloc_mutex_;
+    mutable std::mutex                                      staged_copy_scratch_mutex_;
+    std::map<int, std::unique_ptr<StagedMemoryCopyScratch>> staged_copy_scratch_by_device_;
+    std::shared_ptr<MemoryDiskBlockCache>                   block_cache_;
+    std::shared_ptr<BroadcastManager>                       broadcast_manager_;
+    std::shared_ptr<autil::LockFreeThreadPool>              wait_done_thread_pool_;
+
+    std::shared_ptr<BlockPool> complete_pool_;
+    std::shared_ptr<BlockPool> incomplete_pool_;
+    size_t                     complete_block_size_{0};
+    size_t                     incomplete_block_size_{0};
 
     // metrics reporter
     kmonitor::MetricsReporterPtr metrics_reporter_;
diff --git a/rtp_llm/cpp/cache/connector/memory/MemoryAsyncContext.cc b/rtp_llm/cpp/cache/connector/memory/MemoryAsyncContext.cc
index 0412e1f285..f142899807 100644
--- a/rtp_llm/cpp/cache/connector/memory/MemoryAsyncContext.cc
+++ b/rtp_llm/cpp/cache/connector/memory/MemoryAsyncContext.cc
@@ -20,13 +20,29 @@ size_t MemoryAsyncMatchContext::matchedBlockCount() const {
     return matched_block_count_;
 }
 
+int MemoryAsyncMatchContext::startReadBlockIndex() const {
+    return start_read_block_index_;
+}
+
+int MemoryAsyncMatchContext::readBlockNum() const {
+    return read_block_num_;
+}
+
+std::shared_ptr<void> MemoryAsyncMatchContext::readCopyPlan() const {
+    return read_copy_plan_;
+}
+
+void MemoryAsyncMatchContext::clearReadCopyPlan() {
+    read_copy_plan_.reset();
+}
+
 // ----------------------------- MemoryAsyncContext ---------------------------------
 
 bool MemoryAsyncContext::done() const {
     return already_done_.load();
 }
 
-bool MemoryAsyncContext::success() const {
+bool MemoryAsyncContext::successLocked() const {
     if (!broadcast_result_ || !broadcast_result_->success()) {
         return false;
     }
@@ -39,22 +55,58 @@ bool MemoryAsyncContext::success() const {
     return true;
 }
 
+bool MemoryAsyncContext::success() const {
+    std::lock_guard<std::mutex> lock(mutex_);
+    return successLocked();
+}
+
 void MemoryAsyncContext::waitDone() {
-    if (done()) {
-        return;
+    std::shared_ptr<BroadcastResult<FunctionRequestPB, FunctionResponsePB>> result;
+    {
+        std::unique_lock<std::mutex> lock(mutex_);
+        cv_.wait(lock, [this]() { return result_ready_ || already_done_.load(); });
+        if (already_done_.load()) {
+            return;
+        }
+        if (finalizing_) {
+            cv_.wait(lock, [this]() { return already_done_.load(); });
+            return;
+        }
+        finalizing_ = true;
+        result      = broadcast_result_;
+    }
+
+    if (result) {
+        result->waitDone();
     }
-    if (broadcast_result_) {
-        broadcast_result_->waitDone();
+
+    bool ok = false;
+    std::function<void(bool)> done_callback;
+    {
+        std::lock_guard<std::mutex> lock(mutex_);
+        ok            = successLocked();
+        done_callback = std::move(done_callback_);
     }
-    if (done_callback_) {
-        done_callback_(success());
+    if (done_callback) {
+        done_callback(ok);
     }
-    already_done_.store(true);
+
+    {
+        std::lock_guard<std::mutex> lock(mutex_);
+        already_done_.store(true);
+        finalizing_ = false;
+    }
+    cv_.notify_all();
 }
 
 void MemoryAsyncContext::setBroadcastResult(
     const std::shared_ptr<BroadcastResult<FunctionRequestPB, FunctionResponsePB>>& result) {
-    broadcast_result_ = result;
+    {
+        std::lock_guard<std::mutex> lock(mutex_);
+        broadcast_result_ = result;
+        result_ready_     = true;
+    }
+    cv_.notify_all();
 }
 
-}  // namespace rtp_llm
\ No newline at end of file
+}  // namespace rtp_llm
diff --git a/rtp_llm/cpp/cache/connector/memory/MemoryAsyncContext.h b/rtp_llm/cpp/cache/connector/memory/MemoryAsyncContext.h
index 4d45c9cf41..6fc57f8219 100644
--- a/rtp_llm/cpp/cache/connector/memory/MemoryAsyncContext.h
+++ b/rtp_llm/cpp/cache/connector/memory/MemoryAsyncContext.h
@@ -1,8 +1,11 @@
 #pragma once
 
 #include <atomic>
+#include <condition_variable>
 #include <functional>
 #include <memory>
+#include <mutex>
+#include <utility>
 
 #include "rtp_llm/cpp/cache/connector/AsyncContext.h"
 #include "rtp_llm/cpp/model_rpc/BroadcastManager.h"
@@ -12,7 +15,14 @@ namespace rtp_llm {
 // 用于 memory connector match
 class MemoryAsyncMatchContext: public AsyncMatchContext {
 public:
-    explicit MemoryAsyncMatchContext(size_t matched_block_count): matched_block_count_(matched_block_count) {}
+    explicit MemoryAsyncMatchContext(size_t                matched_block_count,
+                                     int                   start_read_block_index = -1,
+                                     int                   read_block_num         = 0,
+                                     std::shared_ptr<void> read_copy_plan         = nullptr):
+        matched_block_count_(matched_block_count),
+        start_read_block_index_(start_read_block_index),
+        read_block_num_(read_block_num),
+        read_copy_plan_(std::move(read_copy_plan)) {}
     ~MemoryAsyncMatchContext() override = default;
 
 public:
@@ -20,9 +30,16 @@ class MemoryAsyncMatchContext: public AsyncMatchContext {
     bool   done() const override;
     bool   success() const override;
     size_t matchedBlockCount() const override;
+    int    startReadBlockIndex() const;
+    int    readBlockNum() const;
+    std::shared_ptr<void> readCopyPlan() const;
+    void                  clearReadCopyPlan();
 
 private:
-    size_t matched_block_count_{0};
+    size_t                matched_block_count_{0};
+    int                   start_read_block_index_{-1};
+    int                   read_block_num_{0};
+    std::shared_ptr<void> read_copy_plan_;
 };
 
 // 用于 memory connector read/write
@@ -38,8 +55,15 @@ class MemoryAsyncContext: public AsyncContext {
     void setBroadcastResult(const std::shared_ptr<BroadcastResult<FunctionRequestPB, FunctionResponsePB>>& result);
 
 private:
+    bool successLocked() const;
+
+private:
+    mutable std::mutex                                                      mutex_;
+    std::condition_variable                                                 cv_;
     std::shared_ptr<BroadcastResult<FunctionRequestPB, FunctionResponsePB>> broadcast_result_;
     std::function<void(bool)>                                               done_callback_;
+    bool                                                                    result_ready_{false};
+    bool                                                                    finalizing_{false};
     std::atomic<bool>                                                       already_done_{false};
 };
 
diff --git a/rtp_llm/cpp/cache/connector/memory/MemoryDiskBlockCache.cc b/rtp_llm/cpp/cache/connector/memory/MemoryDiskBlockCache.cc
new file mode 100644
index 0000000000..374b0cee62
--- /dev/null
+++ b/rtp_llm/cpp/cache/connector/memory/MemoryDiskBlockCache.cc
@@ -0,0 +1,327 @@
+#include "rtp_llm/cpp/cache/connector/memory/MemoryDiskBlockCache.h"
+
+#include <algorithm>
+#include <mutex>
+
+#include "rtp_llm/cpp/utils/AssertUtils.h"
+#include "rtp_llm/cpp/utils/Logger.h"
+#include "rtp_llm/cpp/utils/ProfilingScope.h"
+#include "rtp_llm/cpp/utils/TimeUtil.h"
+
+namespace rtp_llm {
+
+MemoryDiskBlockCache::MatchResult MemoryDiskBlockCache::match(CacheKeyType cache_key) {
+    RTP_LLM_PROFILE_FUNCTION();
+    std::unique_lock<std::shared_mutex> lock(mutex_);
+    auto                                it = items_.find(cache_key);
+    if (it == items_.end()) {
+        return {};
+    }
+    touchLocked(it->second);
+    const auto& item = it->second;
+    return {item.backing_type, item.block_index, item.disk_slot, item.block_size, item.is_complete};
+}
+
+MemoryDiskBlockCache::MatchResult MemoryDiskBlockCache::matchAndMarkInFlight(CacheKeyType cache_key) {
+    RTP_LLM_PROFILE_FUNCTION();
+    std::unique_lock<std::shared_mutex> lock(mutex_);
+    auto                                it = items_.find(cache_key);
+    if (it == items_.end()) {
+        return {};
+    }
+    touchLocked(it->second);
+    it->second.in_flight_ref++;
+    const auto& item = it->second;
+    return {item.backing_type, item.block_index, item.disk_slot, item.block_size, item.is_complete};
+}
+
+bool MemoryDiskBlockCache::contains(CacheKeyType cache_key) const {
+    std::shared_lock<std::shared_mutex> lock(mutex_);
+    return items_.find(cache_key) != items_.end();
+}
+
+std::pair<bool, std::optional<MemoryDiskBlockCache::CacheItem>>
+MemoryDiskBlockCache::putCommitted(const CacheItem& input_item) {
+    RTP_LLM_PROFILE_FUNCTION();
+    RTP_LLM_CHECK_WITH_INFO(validItem(input_item), "invalid cache item backing fields");
+
+    std::unique_lock<std::shared_mutex> lock(mutex_);
+    auto                                item = input_item;
+    item.in_flight_ref                       = 0;
+
+    auto existing = items_.find(item.cache_key);
+    if (existing != items_.end()) {
+        touchLocked(existing->second);
+        if (!existing->second.is_complete && item.is_complete) {
+            if (existing->second.in_flight_ref > 0) {
+                return {false, std::nullopt};
+            }
+            auto old_item = existing->second;
+            eraseEvictKeyLocked(existing->second);
+            item.last_access_seq = ++access_seq_;
+            item.created_time_us = item.created_time_us > 0 ? item.created_time_us : currentTimeUs();
+            existing->second     = item;
+            insertEvictKeyLocked(existing->second);
+            return {true, old_item};
+        }
+        return {false, std::nullopt};
+    }
+
+    item.last_access_seq = ++access_seq_;
+    item.created_time_us = item.created_time_us > 0 ? item.created_time_us : currentTimeUs();
+    auto [it, inserted]  = items_.emplace(item.cache_key, item);
+    (void)inserted;
+    insertEvictKeyLocked(it->second);
+    return {true, std::nullopt};
+}
+
+std::optional<MemoryDiskBlockCache::CacheItem> MemoryDiskBlockCache::removeIfMatch(CacheKeyType     cache_key,
+                                                                                   CacheBackingType backing_type,
+                                                                                   BlockIdxType expected_block_index,
+                                                                                   int32_t      expected_disk_slot) {
+    std::unique_lock<std::shared_mutex> lock(mutex_);
+    auto                                it = items_.find(cache_key);
+    if (it == items_.end() || it->second.backing_type != backing_type) {
+        return std::nullopt;
+    }
+    if (backing_type == CacheBackingType::MEMORY && it->second.block_index != expected_block_index) {
+        return std::nullopt;
+    }
+    if (backing_type == CacheBackingType::DISK && it->second.disk_slot != expected_disk_slot) {
+        return std::nullopt;
+    }
+    auto removed_item = it->second;
+    eraseEvictKeyLocked(it->second);
+    items_.erase(it);
+    return removed_item;
+}
+
+std::pair<bool, std::optional<MemoryBlockCache::CacheItem>>
+MemoryDiskBlockCache::put(const MemoryBlockCache::CacheItem& input_item) {
+    CacheItem item;
+    item.cache_key    = input_item.cache_key;
+    item.backing_type = CacheBackingType::MEMORY;
+    item.block_index  = input_item.block_index;
+    item.disk_slot    = -1;
+    item.block_size   = input_item.block_size;
+    item.is_resident  = input_item.is_resident;
+    item.is_complete  = input_item.is_complete;
+    auto [ok, popped] = putCommitted(item);
+    if (!popped.has_value()) {
+        return {ok, std::nullopt};
+    }
+    return {ok, toMemoryCacheItem(*popped)};
+}
+
+std::optional<MemoryBlockCache::CacheItem> MemoryDiskBlockCache::remove(CacheKeyType cache_key) {
+    std::unique_lock<std::shared_mutex> lock(mutex_);
+    auto                                it = items_.find(cache_key);
+    if (it == items_.end()) {
+        return std::nullopt;
+    }
+    auto removed_item = it->second;
+    eraseEvictKeyLocked(it->second);
+    items_.erase(it);
+    return toMemoryCacheItem(removed_item);
+}
+
+std::optional<MemoryBlockCache::CacheItem> MemoryDiskBlockCache::removeIfMatch(CacheKeyType cache_key,
+                                                                               BlockIdxType expected_block_index) {
+    auto removed = removeIfMatch(cache_key, CacheBackingType::MEMORY, expected_block_index, -1);
+    if (!removed.has_value()) {
+        return std::nullopt;
+    }
+    return toMemoryCacheItem(*removed);
+}
+
+std::optional<MemoryDiskBlockCache::CacheItem> MemoryDiskBlockCache::popOldestEvictable() {
+    std::unique_lock<std::shared_mutex> lock(mutex_);
+    std::optional<CacheItem>            selected;
+    auto                                consider = [&selected](const std::optional<CacheItem>& candidate) {
+        if (!candidate.has_value()) {
+            return;
+        }
+        if (!selected.has_value() || candidate->last_access_seq < selected->last_access_seq) {
+            selected = candidate;
+        }
+    };
+    consider(oldestFromSetLocked(memory_complete_lru_));
+    consider(oldestFromSetLocked(memory_incomplete_lru_));
+    consider(oldestFromSetLocked(disk_complete_lru_));
+    consider(oldestFromSetLocked(disk_incomplete_lru_));
+    if (!selected.has_value()) {
+        return std::nullopt;
+    }
+    auto it = items_.find(selected->cache_key);
+    if (it != items_.end()) {
+        eraseEvictKeyLocked(it->second);
+        items_.erase(it);
+    }
+    return selected;
+}
+
+std::optional<MemoryDiskBlockCache::CacheItem> MemoryDiskBlockCache::popOldestEvictable(CacheBlockKind kind) {
+    std::unique_lock<std::shared_mutex> lock(mutex_);
+    return popOldestEvictableLocked(kind);
+}
+
+std::optional<MemoryDiskBlockCache::CacheItem> MemoryDiskBlockCache::popOldestEvictableLocked(CacheBlockKind kind) {
+    auto memory_item = oldestFromSetLocked(lruSetLocked(CacheBackingType::MEMORY, kind));
+    auto disk_item   = oldestFromSetLocked(lruSetLocked(CacheBackingType::DISK, kind));
+    if (!memory_item.has_value()) {
+        if (!disk_item.has_value()) {
+            return std::nullopt;
+        }
+        auto it = items_.find(disk_item->cache_key);
+        if (it != items_.end()) {
+            eraseEvictKeyLocked(it->second);
+            items_.erase(it);
+        }
+        return disk_item;
+    }
+    if (!disk_item.has_value() || memory_item->last_access_seq <= disk_item->last_access_seq) {
+        auto it = items_.find(memory_item->cache_key);
+        if (it != items_.end()) {
+            eraseEvictKeyLocked(it->second);
+            items_.erase(it);
+        }
+        return memory_item;
+    }
+    auto it = items_.find(disk_item->cache_key);
+    if (it != items_.end()) {
+        eraseEvictKeyLocked(it->second);
+        items_.erase(it);
+    }
+    return disk_item;
+}
+
+bool MemoryDiskBlockCache::markInFlight(CacheKeyType     cache_key,
+                                        CacheBackingType backing_type,
+                                        BlockIdxType     block_index,
+                                        int32_t          disk_slot) {
+    std::unique_lock<std::shared_mutex> lock(mutex_);
+    auto                                it = items_.find(cache_key);
+    if (it == items_.end() || it->second.backing_type != backing_type) {
+        return false;
+    }
+    if (backing_type == CacheBackingType::MEMORY && it->second.block_index != block_index) {
+        return false;
+    }
+    if (backing_type == CacheBackingType::DISK && it->second.disk_slot != disk_slot) {
+        return false;
+    }
+    it->second.in_flight_ref++;
+    return true;
+}
+
+void MemoryDiskBlockCache::releaseInFlight(CacheKeyType     cache_key,
+                                           CacheBackingType backing_type,
+                                           BlockIdxType     block_index,
+                                           int32_t          disk_slot) {
+    std::unique_lock<std::shared_mutex> lock(mutex_);
+    auto                                it = items_.find(cache_key);
+    if (it == items_.end() || it->second.backing_type != backing_type) {
+        return;
+    }
+    if (backing_type == CacheBackingType::MEMORY && it->second.block_index != block_index) {
+        return;
+    }
+    if (backing_type == CacheBackingType::DISK && it->second.disk_slot != disk_slot) {
+        return;
+    }
+    if (it->second.in_flight_ref > 0) {
+        it->second.in_flight_ref--;
+    }
+}
+
+bool MemoryDiskBlockCache::empty() const {
+    std::shared_lock<std::shared_mutex> lock(mutex_);
+    return items_.empty();
+}
+
+size_t MemoryDiskBlockCache::size() const {
+    std::shared_lock<std::shared_mutex> lock(mutex_);
+    return items_.size();
+}
+
+std::vector<CacheKeyType> MemoryDiskBlockCache::cacheKeys() const {
+    std::shared_lock<std::shared_mutex> lock(mutex_);
+    std::vector<CacheItem>              values;
+    values.reserve(items_.size());
+    for (const auto& [_, item] : items_) {
+        values.push_back(item);
+    }
+    std::sort(values.begin(), values.end(), [](const CacheItem& lhs, const CacheItem& rhs) {
+        return lhs.last_access_seq > rhs.last_access_seq;
+    });
+    std::vector<CacheKeyType> keys;
+    keys.reserve(values.size());
+    for (const auto& item : values) {
+        keys.push_back(item.cache_key);
+    }
+    return keys;
+}
+
+bool MemoryDiskBlockCache::validItem(const CacheItem& item) const {
+    if (item.backing_type == CacheBackingType::MEMORY) {
+        return !isNullBlockIdx(item.block_index) && item.disk_slot < 0;
+    }
+    if (item.backing_type == CacheBackingType::DISK) {
+        return isNullBlockIdx(item.block_index) && item.disk_slot >= 0;
+    }
+    return false;
+}
+
+MemoryBlockCache::CacheItem MemoryDiskBlockCache::toMemoryCacheItem(const CacheItem& item) {
+    MemoryBlockCache::CacheItem memory_item;
+    memory_item.cache_key   = item.cache_key;
+    memory_item.block_index = item.block_index;
+    memory_item.block_size  = item.block_size;
+    memory_item.is_resident = item.is_resident;
+    memory_item.is_complete = item.is_complete;
+    return memory_item;
+}
+
+void MemoryDiskBlockCache::insertEvictKeyLocked(const CacheItem& item) {
+    auto& eviction_set = lruSetLocked(item.backing_type, blockKindFromComplete(item.is_complete));
+    eviction_set.insert(EvictKey{item.last_access_seq, item.cache_key});
+}
+
+void MemoryDiskBlockCache::eraseEvictKeyLocked(const CacheItem& item) {
+    auto& eviction_set = lruSetLocked(item.backing_type, blockKindFromComplete(item.is_complete));
+    eviction_set.erase(EvictKey{item.last_access_seq, item.cache_key});
+}
+
+void MemoryDiskBlockCache::touchLocked(CacheItem& item) {
+    eraseEvictKeyLocked(item);
+    item.last_access_seq = ++access_seq_;
+    insertEvictKeyLocked(item);
+}
+
+std::optional<MemoryDiskBlockCache::CacheItem>
+MemoryDiskBlockCache::oldestFromSetLocked(std::set<EvictKey>& eviction_set) {
+    for (auto evict_it = eviction_set.begin(); evict_it != eviction_set.end();) {
+        const auto key = *evict_it;
+        auto       it  = items_.find(key.cache_key);
+        if (it == items_.end() || it->second.last_access_seq != key.last_access_seq) {
+            evict_it = eviction_set.erase(evict_it);
+            continue;
+        }
+        if (it->second.is_resident || it->second.in_flight_ref > 0) {
+            ++evict_it;
+            continue;
+        }
+        return it->second;
+    }
+    return std::nullopt;
+}
+
+std::set<MemoryDiskBlockCache::EvictKey>& MemoryDiskBlockCache::lruSetLocked(CacheBackingType backing_type,
+                                                                             CacheBlockKind   kind) {
+    if (backing_type == CacheBackingType::MEMORY) {
+        return kind == CacheBlockKind::COMPLETE ? memory_complete_lru_ : memory_incomplete_lru_;
+    }
+    return kind == CacheBlockKind::COMPLETE ? disk_complete_lru_ : disk_incomplete_lru_;
+}
+
+}  // namespace rtp_llm
diff --git a/rtp_llm/cpp/cache/connector/memory/MemoryDiskBlockCache.h b/rtp_llm/cpp/cache/connector/memory/MemoryDiskBlockCache.h
new file mode 100644
index 0000000000..52998c4b63
--- /dev/null
+++ b/rtp_llm/cpp/cache/connector/memory/MemoryDiskBlockCache.h
@@ -0,0 +1,105 @@
+#pragma once
+
+#include <cstdint>
+#include <memory>
+#include <optional>
+#include <set>
+#include <shared_mutex>
+#include <unordered_map>
+#include <vector>
+
+#include "rtp_llm/cpp/cache/Types.h"
+#include "rtp_llm/cpp/cache/KVCacheResource.h"
+#include "rtp_llm/cpp/cache/connector/memory/CacheBlockKind.h"
+#include "rtp_llm/cpp/cache/connector/memory/MemoryBlockCache.h"
+
+namespace rtp_llm {
+
+enum class CacheBackingType {
+    MEMORY = 0,
+    DISK   = 1,
+};
+
+class MemoryDiskBlockCache {
+public:
+    struct CacheItem {
+        CacheKeyType     cache_key{0};
+        CacheBackingType backing_type{CacheBackingType::MEMORY};
+        BlockIdxType     block_index{NULL_BLOCK_IDX};
+        int32_t          disk_slot{-1};
+        size_t           block_size{0};
+        bool             is_resident{false};
+        bool             is_complete{true};
+        uint64_t         last_access_seq{0};
+        int64_t          created_time_us{0};
+        uint32_t         in_flight_ref{0};
+    };
+
+    struct MatchResult {
+        CacheBackingType backing_type{CacheBackingType::MEMORY};
+        BlockIdxType     matched_index{NULL_BLOCK_IDX};
+        int32_t          disk_slot{-1};
+        size_t           block_size{0};
+        bool             is_complete{false};
+    };
+
+public:
+    MatchResult match(CacheKeyType cache_key);
+    MatchResult matchAndMarkInFlight(CacheKeyType cache_key);
+    bool        contains(CacheKeyType cache_key) const;
+
+    std::pair<bool, std::optional<CacheItem>>                   putCommitted(const CacheItem& item);
+    std::optional<CacheItem>                                    removeIfMatch(CacheKeyType     cache_key,
+                                                                              CacheBackingType backing_type,
+                                                                              BlockIdxType     expected_block_index,
+                                                                              int32_t          expected_disk_slot);
+    std::pair<bool, std::optional<MemoryBlockCache::CacheItem>> put(const MemoryBlockCache::CacheItem& item);
+    std::optional<MemoryBlockCache::CacheItem>                  remove(CacheKeyType cache_key);
+    std::optional<MemoryBlockCache::CacheItem> removeIfMatch(CacheKeyType cache_key, BlockIdxType expected_block_index);
+    std::optional<CacheItem>                   popOldestEvictable();
+    std::optional<CacheItem>                   popOldestEvictable(CacheBlockKind kind);
+
+    bool
+    markInFlight(CacheKeyType cache_key, CacheBackingType backing_type, BlockIdxType block_index, int32_t disk_slot);
+    void
+    releaseInFlight(CacheKeyType cache_key, CacheBackingType backing_type, BlockIdxType block_index, int32_t disk_slot);
+
+    bool                      empty() const;
+    size_t                    size() const;
+    std::vector<CacheKeyType> cacheKeys() const;
+
+private:
+    struct EvictKey {
+        uint64_t     last_access_seq{0};
+        CacheKeyType cache_key{0};
+
+        bool operator<(const EvictKey& other) const {
+            if (last_access_seq != other.last_access_seq) {
+                return last_access_seq < other.last_access_seq;
+            }
+            return cache_key < other.cache_key;
+        }
+    };
+
+    bool                               validItem(const CacheItem& item) const;
+    static MemoryBlockCache::CacheItem toMemoryCacheItem(const CacheItem& item);
+    void                               insertEvictKeyLocked(const CacheItem& item);
+    void                               eraseEvictKeyLocked(const CacheItem& item);
+    void                               touchLocked(CacheItem& item);
+    std::optional<CacheItem>           oldestFromSetLocked(std::set<EvictKey>& eviction_set);
+    std::optional<CacheItem>           popOldestEvictableLocked(CacheBlockKind kind);
+    std::set<EvictKey>&                lruSetLocked(CacheBackingType backing_type, CacheBlockKind kind);
+
+private:
+    mutable std::shared_mutex                   mutex_;
+    std::unordered_map<CacheKeyType, CacheItem> items_;
+    std::set<EvictKey>                          memory_complete_lru_;
+    std::set<EvictKey>                          memory_incomplete_lru_;
+    std::set<EvictKey>                          disk_complete_lru_;
+    std::set<EvictKey>                          disk_incomplete_lru_;
+    uint64_t                                    access_seq_{0};
+};
+
+using MemoryDiskBlockCachePtr = std::shared_ptr<MemoryDiskBlockCache>;
+
+}  // namespace rtp_llm
diff --git a/rtp_llm/cpp/cache/connector/memory/PrefixTreeMemoryBlockCache.cc b/rtp_llm/cpp/cache/connector/memory/PrefixTreeMemoryBlockCache.cc
new file mode 100644
index 0000000000..4b745d37f0
--- /dev/null
+++ b/rtp_llm/cpp/cache/connector/memory/PrefixTreeMemoryBlockCache.cc
@@ -0,0 +1,768 @@
+#include "rtp_llm/cpp/cache/connector/memory/PrefixTreeMemoryBlockCache.h"
+
+#include <algorithm>
+#include <mutex>
+
+#include "rtp_llm/cpp/utils/AssertUtils.h"
+#include "rtp_llm/cpp/utils/TimeUtil.h"
+
+namespace rtp_llm {
+
+size_t PrefixTreeMemoryBlockCache::kindIndex(CacheBlockKind kind) {
+    RTP_LLM_CHECK_WITH_INFO(validKind(kind), "invalid prefix-tree memory kind %d", static_cast<int>(kind));
+    return kind == CacheBlockKind::COMPRESSED_KV ? 0 : 1;
+}
+
+bool PrefixTreeMemoryBlockCache::validKind(CacheBlockKind kind) {
+    return kind == CacheBlockKind::COMPRESSED_KV || kind == CacheBlockKind::STATE_SWA_KV;
+}
+
+bool PrefixTreeMemoryBlockCache::slotMaskCovers(const std::vector<uint8_t>& stored,
+                                                const std::vector<uint8_t>& required) {
+    for (size_t i = 0; i < required.size(); ++i) {
+        if (required[i] == 0) {
+            continue;
+        }
+        if (i >= stored.size() || stored[i] == 0) {
+            return false;
+        }
+    }
+    return true;
+}
+
+bool PrefixTreeMemoryBlockCache::contains(CacheKeyType cache_key, CacheBlockKind kind) const {
+    static const std::vector<uint8_t> empty_required_mask;
+    return contains(cache_key, kind, empty_required_mask);
+}
+
+bool PrefixTreeMemoryBlockCache::contains(CacheKeyType                 cache_key,
+                                          CacheBlockKind               kind,
+                                          const std::vector<uint8_t>& required_slot_mask) const {
+    std::shared_lock<std::shared_mutex> lock(mutex_);
+    auto                                it = nodes_.find(cache_key);
+    if (it == nodes_.end() || !validKind(kind)) {
+        return false;
+    }
+    const auto& state = it->second.kinds[kindIndex(kind)];
+    return state.has_value && !state.detached && slotMaskCovers(state.slot_valid_mask, required_slot_mask);
+}
+
+PrefixTreeMemoryBlockCache::MatchResult
+PrefixTreeMemoryBlockCache::match(CacheKeyType cache_key, CacheBlockKind kind) {
+    static const std::vector<uint8_t> empty_required_mask;
+    return match(cache_key, kind, empty_required_mask);
+}
+
+PrefixTreeMemoryBlockCache::MatchResult
+PrefixTreeMemoryBlockCache::match(CacheKeyType                 cache_key,
+                                  CacheBlockKind               kind,
+                                  const std::vector<uint8_t>& required_slot_mask) {
+    std::unique_lock<std::shared_mutex> lock(mutex_);
+    auto                                it = nodes_.find(cache_key);
+    if (it == nodes_.end() || !validKind(kind)) {
+        return {};
+    }
+    auto& state = it->second.kinds[kindIndex(kind)];
+    if (!state.has_value || state.detached || !slotMaskCovers(state.slot_valid_mask, required_slot_mask)) {
+        return {};
+    }
+    touchLocked(it->second, kind);
+    return {true,
+            state.backing_type,
+            state.block_index,
+            state.disk_slot,
+            state.block_size,
+            state.generation,
+            state.created_time_us,
+            state.slot_valid_mask};
+}
+
+PrefixTreeMemoryBlockCache::MatchResult
+PrefixTreeMemoryBlockCache::matchAndMarkInFlight(CacheKeyType cache_key, CacheBlockKind kind) {
+    static const std::vector<uint8_t> empty_required_mask;
+    return matchAndMarkInFlight(cache_key, kind, empty_required_mask);
+}
+
+PrefixTreeMemoryBlockCache::MatchResult
+PrefixTreeMemoryBlockCache::matchAndMarkInFlight(CacheKeyType                 cache_key,
+                                                 CacheBlockKind               kind,
+                                                 const std::vector<uint8_t>& required_slot_mask) {
+    std::unique_lock<std::shared_mutex> lock(mutex_);
+    auto                                it = nodes_.find(cache_key);
+    if (it == nodes_.end() || !validKind(kind)) {
+        return {};
+    }
+    auto& state = it->second.kinds[kindIndex(kind)];
+    if (!state.has_value || state.detached || !slotMaskCovers(state.slot_valid_mask, required_slot_mask)) {
+        return {};
+    }
+    touchLocked(it->second, kind);
+    state.in_flight_ref++;
+    eraseEvictKeyLocked(it->second, kind);
+    return {true,
+            state.backing_type,
+            state.block_index,
+            state.disk_slot,
+            state.block_size,
+            state.generation,
+            state.created_time_us,
+            state.slot_valid_mask};
+}
+
+std::pair<bool, std::optional<PrefixTreeMemoryBlockCache::CacheItem>>
+PrefixTreeMemoryBlockCache::putCommitted(CacheKeyType            cache_key,
+                                          const BlockDependency&  dependency,
+                                          const CacheItem&        input_item) {
+    RTP_LLM_CHECK_WITH_INFO(validKind(input_item.kind), "invalid prefix-tree memory kind");
+    RTP_LLM_CHECK_WITH_INFO(input_item.cache_key == cache_key, "cache key mismatch");
+    std::unique_lock<std::shared_mutex> lock(mutex_);
+    auto&                               node  = upsertNodeLocked(cache_key, dependency);
+    auto&                               state = node.kinds[kindIndex(input_item.kind)];
+    std::optional<CacheItem> old_item;
+    if (state.has_value && !state.detached) {
+        if (slotMaskCovers(state.slot_valid_mask, input_item.slot_valid_mask)) {
+            return {false, std::nullopt};
+        }
+        if (!slotMaskCovers(input_item.slot_valid_mask, state.slot_valid_mask)) {
+            return {false, std::nullopt};
+        }
+        old_item = toItemLocked(node, input_item.kind);
+        eraseEvictKeyLocked(node, input_item.kind);
+        if (state.in_flight_ref > 0 && old_item.has_value()) {
+            node.retired_items[kindIndex(input_item.kind)].push_back(RetiredItem{*old_item, state.in_flight_ref});
+            old_item.reset();
+        }
+    } else {
+        incrementAncestorsLocked(cache_key, input_item.kind);
+    }
+
+    state.has_value       = true;
+    state.detached        = false;
+    state.backing_type    = input_item.backing_type;
+    state.block_index     = input_item.block_index;
+    state.disk_slot       = input_item.disk_slot;
+    state.block_size      = input_item.block_size;
+    state.is_resident     = input_item.is_resident;
+    state.generation      = ++generation_seq_;
+    state.last_access_seq = ++access_seq_;
+    state.created_time_us = input_item.created_time_us > 0 ? input_item.created_time_us : currentTimeUs();
+    state.in_flight_ref   = 0;
+    state.slot_valid_mask = input_item.slot_valid_mask;
+    insertEvictKeyLocked(node, input_item.kind);
+    return {true, old_item};
+}
+
+std::optional<PrefixTreeMemoryBlockCache::CacheItem>
+PrefixTreeMemoryBlockCache::detachIfMatch(CacheKeyType     cache_key,
+                                           CacheBlockKind   kind,
+                                           CacheBackingType backing_type,
+                                           BlockIdxType     expected_block_index,
+                                           int32_t          expected_disk_slot,
+                                           uint64_t         expected_generation) {
+    std::unique_lock<std::shared_mutex> lock(mutex_);
+    auto                                it = nodes_.find(cache_key);
+    if (it == nodes_.end() || !validKind(kind)) {
+        return std::nullopt;
+    }
+    auto& state = it->second.kinds[kindIndex(kind)];
+    if (!state.has_value || state.detached || state.backing_type != backing_type
+        || state.generation != expected_generation) {
+        return std::nullopt;
+    }
+    if (backing_type == CacheBackingType::MEMORY && state.block_index != expected_block_index) {
+        return std::nullopt;
+    }
+    if (backing_type == CacheBackingType::DISK && state.disk_slot != expected_disk_slot) {
+        return std::nullopt;
+    }
+    auto item = toItemLocked(it->second, kind);
+    if (!item.has_value()) {
+        return std::nullopt;
+    }
+    eraseEvictKeyLocked(it->second, kind);
+    state.detached = true;
+    decrementAncestorsLocked(cache_key, kind);
+    const auto descendant_ref_count = state.subtree_ref_count;
+    if (state.in_flight_ref == 0) {
+        state = KindState{};
+        state.subtree_ref_count = descendant_ref_count;
+        pruneLocked(cache_key);
+        return item;
+    }
+    it->second.retired_items[kindIndex(kind)].push_back(RetiredItem{*item, state.in_flight_ref});
+    state = KindState{};
+    state.subtree_ref_count = descendant_ref_count;
+    pruneLocked(cache_key);
+    return std::nullopt;
+}
+
+std::optional<PrefixTreeMemoryBlockCache::CacheItem>
+PrefixTreeMemoryBlockCache::releaseInFlight(CacheKeyType     cache_key,
+                                            CacheBlockKind   kind,
+                                            CacheBackingType backing_type,
+                                            BlockIdxType     block_index,
+                                            int32_t          disk_slot,
+                                            uint64_t         generation) {
+    std::unique_lock<std::shared_mutex> lock(mutex_);
+    auto                                it = nodes_.find(cache_key);
+    if (it == nodes_.end() || !validKind(kind)) {
+        return std::nullopt;
+    }
+    auto& state = it->second.kinds[kindIndex(kind)];
+    if (!state.has_value || state.backing_type != backing_type || state.generation != generation) {
+        auto& retired_items = it->second.retired_items[kindIndex(kind)];
+        for (auto retired_it = retired_items.begin(); retired_it != retired_items.end(); ++retired_it) {
+            auto& item = retired_it->item;
+            if (item.backing_type != backing_type || item.generation != generation) {
+                continue;
+            }
+            if (backing_type == CacheBackingType::MEMORY && item.block_index != block_index) {
+                continue;
+            }
+            if (backing_type == CacheBackingType::DISK && item.disk_slot != disk_slot) {
+                continue;
+            }
+            if (retired_it->in_flight_ref > 0) {
+                retired_it->in_flight_ref--;
+            }
+            if (retired_it->in_flight_ref == 0) {
+                auto released = item;
+                retired_items.erase(retired_it);
+                pruneLocked(cache_key);
+                return released;
+            }
+            return std::nullopt;
+        }
+        return std::nullopt;
+    }
+    if (backing_type == CacheBackingType::MEMORY && state.block_index != block_index) {
+        return std::nullopt;
+    }
+    if (backing_type == CacheBackingType::DISK && state.disk_slot != disk_slot) {
+        return std::nullopt;
+    }
+    if (state.in_flight_ref > 0) {
+        state.in_flight_ref--;
+    }
+    if (state.detached && state.in_flight_ref == 0) {
+        auto released = toItemLocked(it->second, kind);
+        state = KindState{};
+        pruneLocked(cache_key);
+        return released;
+    } else if (!state.detached && state.in_flight_ref == 0) {
+        refreshEvictKeyLocked(it->second, kind);
+    }
+    return std::nullopt;
+}
+
+std::optional<PrefixTreeMemoryBlockCache::CacheItem>
+PrefixTreeMemoryBlockCache::popOldestEvictable(CacheBlockKind kind) {
+    std::unique_lock<std::shared_mutex> lock(mutex_);
+    if (!validKind(kind)) {
+        return std::nullopt;
+    }
+    auto& lru = leaf_lru_[kindIndex(kind)];
+    for (auto it = lru.begin(); it != lru.end();) {
+        auto node_it = nodes_.find(it->cache_key);
+        if (node_it == nodes_.end()) {
+            it = lru.erase(it);
+            continue;
+        }
+        auto& state = node_it->second.kinds[kindIndex(kind)];
+        if (!state.has_value || state.detached || state.last_access_seq != it->last_access_seq
+            || state.generation != it->generation) {
+            it = lru.erase(it);
+            continue;
+        }
+        if (state.is_resident || state.in_flight_ref > 0 || !isKindLeafLocked(node_it->second, kind)) {
+            ++it;
+            continue;
+        }
+        auto item = toItemLocked(node_it->second, kind);
+        it        = lru.erase(it);
+        state     = KindState{};
+        decrementAncestorsLocked(item->cache_key, kind);
+        pruneLocked(item->cache_key);
+        return item;
+    }
+    return std::nullopt;
+}
+
+std::optional<PrefixTreeMemoryBlockCache::CacheItem>
+PrefixTreeMemoryBlockCache::popOldestEvictable(CacheBlockKind kind, CacheBackingType backing_type) {
+    std::unique_lock<std::shared_mutex> lock(mutex_);
+    if (!validKind(kind)) {
+        return std::nullopt;
+    }
+    auto& lru = leaf_lru_[kindIndex(kind)];
+    for (auto it = lru.begin(); it != lru.end();) {
+        auto node_it = nodes_.find(it->cache_key);
+        if (node_it == nodes_.end()) {
+            it = lru.erase(it);
+            continue;
+        }
+        auto& state = node_it->second.kinds[kindIndex(kind)];
+        if (!state.has_value || state.detached || state.last_access_seq != it->last_access_seq
+            || state.generation != it->generation) {
+            it = lru.erase(it);
+            continue;
+        }
+        if (state.backing_type != backing_type || state.is_resident || state.in_flight_ref > 0
+            || !isKindLeafLocked(node_it->second, kind)) {
+            ++it;
+            continue;
+        }
+        auto item = toItemLocked(node_it->second, kind);
+        it        = lru.erase(it);
+        state     = KindState{};
+        decrementAncestorsLocked(item->cache_key, kind);
+        pruneLocked(item->cache_key);
+        return item;
+    }
+    return std::nullopt;
+}
+
+std::vector<PrefixTreeMemoryBlockCache::CacheItem>
+PrefixTreeMemoryBlockCache::popOldestStateOrChainEvictable(CacheBackingType backing_type) {
+    std::unique_lock<std::shared_mutex> lock(mutex_);
+    std::vector<CacheKeyType>           leaf_keys;
+    const auto&                         state_lru = leaf_lru_[kindIndex(CacheBlockKind::STATE_SWA_KV)];
+    leaf_keys.reserve(state_lru.size());
+    for (const auto& evict_key : state_lru) {
+        leaf_keys.push_back(evict_key.cache_key);
+    }
+
+    for (const auto leaf_key : leaf_keys) {
+        auto item = popStateOnlyFromChainLocked(leaf_key, backing_type);
+        if (item.has_value()) {
+            return {*item};
+        }
+    }
+    for (const auto leaf_key : leaf_keys) {
+        auto items = popChainLocked(leaf_key, backing_type);
+        if (!items.empty()) {
+            return items;
+        }
+    }
+    return {};
+}
+
+std::vector<CacheKeyType> PrefixTreeMemoryBlockCache::cacheKeys() const {
+    std::shared_lock<std::shared_mutex> lock(mutex_);
+    std::vector<std::pair<uint64_t, CacheKeyType>> entries;
+    for (const auto& [key, node] : nodes_) {
+        uint64_t latest = 0;
+        for (const auto& state : node.kinds) {
+            if (state.has_value && !state.detached) {
+                latest = std::max(latest, state.last_access_seq);
+            }
+        }
+        if (latest > 0) {
+            entries.emplace_back(latest, key);
+        }
+    }
+    std::sort(entries.begin(), entries.end(), [](const auto& lhs, const auto& rhs) {
+        if (lhs.first != rhs.first) {
+            return lhs.first > rhs.first;
+        }
+        return lhs.second < rhs.second;
+    });
+    std::vector<CacheKeyType> keys;
+    keys.reserve(entries.size());
+    for (const auto& [_, key] : entries) {
+        keys.push_back(key);
+    }
+    return keys;
+}
+
+std::vector<CacheKeyType> PrefixTreeMemoryBlockCache::cacheKeysUnorderedForStatus() const {
+    std::shared_lock<std::shared_mutex> lock(mutex_);
+    std::vector<CacheKeyType>           keys;
+    keys.reserve(nodes_.size());
+    for (const auto& [key, node] : nodes_) {
+        for (const auto& state : node.kinds) {
+            if (state.has_value && !state.detached) {
+                keys.push_back(key);
+                break;
+            }
+        }
+    }
+    return keys;
+}
+
+size_t PrefixTreeMemoryBlockCache::size() const {
+    std::shared_lock<std::shared_mutex> lock(mutex_);
+    size_t count = 0;
+    for (const auto& [_, node] : nodes_) {
+        for (const auto& state : node.kinds) {
+            if (state.has_value && !state.detached) {
+                ++count;
+            }
+        }
+    }
+    return count;
+}
+
+PrefixTreeMemoryBlockCache::Node&
+PrefixTreeMemoryBlockCache::upsertNodeLocked(CacheKeyType cache_key, const BlockDependency& dependency) {
+    auto it = nodes_.find(cache_key);
+    if (it == nodes_.end()) {
+        Node node;
+        node.cache_key  = cache_key;
+        node.parent_key = dependency.parent_key;
+        node.has_parent = dependency.has_parent && dependency.parent_key != cache_key;
+        node.ordinal    = dependency.ordinal;
+        auto [inserted_it, _] = nodes_.emplace(cache_key, std::move(node));
+        it = inserted_it;
+    } else {
+        if (it->second.has_parent
+            && (it->second.parent_key != dependency.parent_key || !dependency.has_parent
+                || dependency.parent_key == cache_key)) {
+            auto old_parent_it = nodes_.find(it->second.parent_key);
+            if (old_parent_it != nodes_.end()) {
+                subtractSubtreeRefsFromAncestorsLocked(old_parent_it->first, it->second);
+                old_parent_it->second.children.erase(cache_key);
+            } else {
+                detachPendingChildLocked(it->second.parent_key, cache_key);
+            }
+        }
+        it->second.parent_key = dependency.parent_key;
+        it->second.has_parent = dependency.has_parent && dependency.parent_key != cache_key;
+        it->second.ordinal    = dependency.ordinal;
+    }
+    if (it->second.has_parent) {
+        auto parent_it = nodes_.find(it->second.parent_key);
+        if (parent_it != nodes_.end()) {
+            auto [_, inserted] = parent_it->second.children.insert(cache_key);
+            if (inserted) {
+                detachPendingChildLocked(it->second.parent_key, cache_key);
+                addSubtreeRefsToAncestorsLocked(parent_it->first, it->second);
+            }
+        } else {
+            pending_children_by_parent_[it->second.parent_key].insert(cache_key);
+        }
+    }
+    attachPendingChildrenLocked(it->second);
+    return it->second;
+}
+
+void PrefixTreeMemoryBlockCache::incrementAncestorsLocked(CacheKeyType cache_key, CacheBlockKind kind) {
+    CacheKeyType cur = cache_key;
+    while (true) {
+        auto it = nodes_.find(cur);
+        if (it == nodes_.end()) {
+            break;
+        }
+        it->second.kinds[kindIndex(kind)].subtree_ref_count++;
+        refreshEvictKeyLocked(it->second, kind);
+        if (!it->second.has_parent) {
+            break;
+        }
+        cur = it->second.parent_key;
+    }
+}
+
+void PrefixTreeMemoryBlockCache::decrementAncestorsLocked(CacheKeyType cache_key, CacheBlockKind kind) {
+    CacheKeyType cur = cache_key;
+    while (true) {
+        auto it = nodes_.find(cur);
+        if (it == nodes_.end()) {
+            break;
+        }
+        auto& count = it->second.kinds[kindIndex(kind)].subtree_ref_count;
+        if (count > 0) {
+            count--;
+        }
+        refreshEvictKeyLocked(it->second, kind);
+        if (!it->second.has_parent) {
+            break;
+        }
+        cur = it->second.parent_key;
+    }
+}
+
+void PrefixTreeMemoryBlockCache::addSubtreeRefsToAncestorsLocked(CacheKeyType ancestor_key, const Node& child) {
+    CacheKeyType cur = ancestor_key;
+    while (true) {
+        auto it = nodes_.find(cur);
+        if (it == nodes_.end()) {
+            break;
+        }
+        for (size_t kind_idx = 0; kind_idx < kKindCount; ++kind_idx) {
+            const auto delta = child.kinds[kind_idx].subtree_ref_count;
+            if (delta == 0) {
+                continue;
+            }
+            auto kind = kind_idx == 0 ? CacheBlockKind::COMPRESSED_KV : CacheBlockKind::STATE_SWA_KV;
+            eraseEvictKeyLocked(it->second, kind);
+            it->second.kinds[kind_idx].subtree_ref_count += delta;
+            insertEvictKeyLocked(it->second, kind);
+        }
+        if (!it->second.has_parent) {
+            break;
+        }
+        cur = it->second.parent_key;
+    }
+}
+
+void PrefixTreeMemoryBlockCache::subtractSubtreeRefsFromAncestorsLocked(CacheKeyType ancestor_key, const Node& child) {
+    CacheKeyType cur = ancestor_key;
+    while (true) {
+        auto it = nodes_.find(cur);
+        if (it == nodes_.end()) {
+            break;
+        }
+        for (size_t kind_idx = 0; kind_idx < kKindCount; ++kind_idx) {
+            const auto delta = child.kinds[kind_idx].subtree_ref_count;
+            if (delta == 0) {
+                continue;
+            }
+            auto kind = kind_idx == 0 ? CacheBlockKind::COMPRESSED_KV : CacheBlockKind::STATE_SWA_KV;
+            eraseEvictKeyLocked(it->second, kind);
+            auto& count = it->second.kinds[kind_idx].subtree_ref_count;
+            count       = count > delta ? count - delta : 0;
+            insertEvictKeyLocked(it->second, kind);
+        }
+        if (!it->second.has_parent) {
+            break;
+        }
+        cur = it->second.parent_key;
+    }
+}
+
+void PrefixTreeMemoryBlockCache::detachPendingChildLocked(CacheKeyType parent_key, CacheKeyType child_key) {
+    auto pending_it = pending_children_by_parent_.find(parent_key);
+    if (pending_it == pending_children_by_parent_.end()) {
+        return;
+    }
+    pending_it->second.erase(child_key);
+    if (pending_it->second.empty()) {
+        pending_children_by_parent_.erase(pending_it);
+    }
+}
+
+void PrefixTreeMemoryBlockCache::attachPendingChildrenLocked(Node& node) {
+    auto pending_it = pending_children_by_parent_.find(node.cache_key);
+    if (pending_it == pending_children_by_parent_.end()) {
+        return;
+    }
+    auto pending_children = std::move(pending_it->second);
+    pending_children_by_parent_.erase(pending_it);
+    for (const auto child_key : pending_children) {
+        auto child_it = nodes_.find(child_key);
+        if (child_it == nodes_.end() || !child_it->second.has_parent || child_it->second.parent_key != node.cache_key) {
+            continue;
+        }
+        auto [_, inserted] = node.children.insert(child_key);
+        if (inserted) {
+            addSubtreeRefsToAncestorsLocked(node.cache_key, child_it->second);
+        }
+    }
+}
+
+void PrefixTreeMemoryBlockCache::touchLocked(Node& node, CacheBlockKind kind) {
+    eraseEvictKeyLocked(node, kind);
+    auto& state = node.kinds[kindIndex(kind)];
+    state.last_access_seq = ++access_seq_;
+    insertEvictKeyLocked(node, kind);
+}
+
+void PrefixTreeMemoryBlockCache::insertEvictKeyLocked(const Node& node, CacheBlockKind kind) {
+    const auto& state = node.kinds[kindIndex(kind)];
+    if (!state.has_value || state.detached || state.is_resident || state.in_flight_ref > 0
+        || !isKindLeafLocked(node, kind)) {
+        return;
+    }
+    leaf_lru_[kindIndex(kind)].insert(EvictKey{state.last_access_seq, node.cache_key, state.generation});
+}
+
+void PrefixTreeMemoryBlockCache::eraseEvictKeyLocked(const Node& node, CacheBlockKind kind) {
+    const auto& state = node.kinds[kindIndex(kind)];
+    leaf_lru_[kindIndex(kind)].erase(EvictKey{state.last_access_seq, node.cache_key, state.generation});
+}
+
+void PrefixTreeMemoryBlockCache::refreshEvictKeyLocked(const Node& node, CacheBlockKind kind) {
+    eraseEvictKeyLocked(node, kind);
+    insertEvictKeyLocked(node, kind);
+}
+
+void PrefixTreeMemoryBlockCache::pruneLocked(CacheKeyType cache_key) {
+    auto it = nodes_.find(cache_key);
+    while (it != nodes_.end()) {
+        bool has_state = false;
+        for (const auto& state : it->second.kinds) {
+            if (state.has_value) {
+                has_state = true;
+                break;
+            }
+        }
+        if (!has_state) {
+            for (const auto& retired_items : it->second.retired_items) {
+                if (!retired_items.empty()) {
+                    has_state = true;
+                    break;
+                }
+            }
+        }
+        if (has_state || !it->second.children.empty()) {
+            break;
+        }
+        const bool has_parent = it->second.has_parent;
+        const auto parent_key = it->second.parent_key;
+        nodes_.erase(it);
+        if (!has_parent) {
+            break;
+        }
+        auto parent_it = nodes_.find(parent_key);
+        if (parent_it == nodes_.end()) {
+            detachPendingChildLocked(parent_key, cache_key);
+            break;
+        }
+        parent_it->second.children.erase(cache_key);
+        cache_key = parent_key;
+        it        = parent_it;
+    }
+}
+
+std::optional<PrefixTreeMemoryBlockCache::CacheItem>
+PrefixTreeMemoryBlockCache::toItemLocked(const Node& node, CacheBlockKind kind) const {
+    if (!validKind(kind)) {
+        return std::nullopt;
+    }
+    const auto& state = node.kinds[kindIndex(kind)];
+    if (!state.has_value) {
+        return std::nullopt;
+    }
+    return CacheItem{
+        node.cache_key, kind, state.backing_type, state.block_index, state.disk_slot, state.block_size,
+        state.is_resident, state.generation, state.created_time_us, state.slot_valid_mask};
+}
+
+bool PrefixTreeMemoryBlockCache::isKindLeafLocked(const Node& node, CacheBlockKind kind) const {
+    const auto& state = node.kinds[kindIndex(kind)];
+    if (!state.has_value || state.detached) {
+        return false;
+    }
+    return state.subtree_ref_count <= 1;
+}
+
+std::optional<PrefixTreeMemoryBlockCache::CacheItem>
+PrefixTreeMemoryBlockCache::popStateOnlyFromChainLocked(const CacheKeyType& leaf_key, CacheBackingType backing_type) {
+    auto leaf_it = nodes_.find(leaf_key);
+    if (leaf_it == nodes_.end()) {
+        return std::nullopt;
+    }
+    std::vector<CacheKeyType> chain;
+    CacheKeyType              cur = leaf_key;
+    while (true) {
+        auto node_it = nodes_.find(cur);
+        if (node_it == nodes_.end()) {
+            break;
+        }
+        chain.push_back(cur);
+        if (!node_it->second.has_parent) {
+            break;
+        }
+        auto parent_it = nodes_.find(node_it->second.parent_key);
+        if (parent_it == nodes_.end() || parent_it->second.children.size() != 1) {
+            break;
+        }
+        cur = parent_it->first;
+    }
+    if (chain.size() <= 1) {
+        return std::nullopt;
+    }
+    for (size_t idx = 1; idx < chain.size(); ++idx) {
+        auto node_it = nodes_.find(chain[idx]);
+        if (node_it == nodes_.end()) {
+            continue;
+        }
+        auto& state = node_it->second.kinds[kindIndex(CacheBlockKind::STATE_SWA_KV)];
+        if (!state.has_value || state.detached || state.backing_type != backing_type || state.is_resident
+            || state.in_flight_ref > 0) {
+            continue;
+	        }
+	        auto item = toItemLocked(node_it->second, CacheBlockKind::STATE_SWA_KV);
+	        eraseEvictKeyLocked(node_it->second, CacheBlockKind::STATE_SWA_KV);
+	        state.detached = true;
+	        decrementAncestorsLocked(item->cache_key, CacheBlockKind::STATE_SWA_KV);
+	        const auto descendant_ref_count = state.subtree_ref_count;
+	        state                           = KindState{};
+	        state.subtree_ref_count         = descendant_ref_count;
+        pruneLocked(item->cache_key);
+        return item;
+    }
+    return std::nullopt;
+}
+
+std::vector<PrefixTreeMemoryBlockCache::CacheItem>
+PrefixTreeMemoryBlockCache::popChainLocked(const CacheKeyType& leaf_key, CacheBackingType backing_type) {
+    std::vector<CacheItem> items;
+    auto                  leaf_it = nodes_.find(leaf_key);
+    if (leaf_it == nodes_.end()) {
+        return items;
+    }
+    std::vector<CacheKeyType> chain;
+    CacheKeyType              cur = leaf_key;
+    while (true) {
+        auto node_it = nodes_.find(cur);
+        if (node_it == nodes_.end()) {
+            break;
+        }
+        chain.push_back(cur);
+        if (!node_it->second.has_parent) {
+            break;
+        }
+        auto parent_it = nodes_.find(node_it->second.parent_key);
+        if (parent_it == nodes_.end() || parent_it->second.children.size() != 1) {
+            break;
+        }
+        cur = parent_it->first;
+    }
+
+    bool has_target_state = false;
+    for (const auto key : chain) {
+        auto node_it = nodes_.find(key);
+        if (node_it == nodes_.end()) {
+            continue;
+        }
+        const auto& state = node_it->second.kinds[kindIndex(CacheBlockKind::STATE_SWA_KV)];
+        if (state.has_value && !state.detached && state.backing_type == backing_type && !state.is_resident
+            && state.in_flight_ref == 0) {
+            has_target_state = true;
+            break;
+        }
+    }
+    if (!has_target_state) {
+        return items;
+    }
+
+    for (auto chain_it = chain.begin(); chain_it != chain.end(); ++chain_it) {
+        auto node_it = nodes_.find(*chain_it);
+        if (node_it == nodes_.end()) {
+            continue;
+        }
+        for (auto kind : {CacheBlockKind::COMPRESSED_KV, CacheBlockKind::STATE_SWA_KV}) {
+            auto& state = node_it->second.kinds[kindIndex(kind)];
+            if (!state.has_value || state.detached || state.backing_type != backing_type || state.is_resident
+                || state.in_flight_ref > 0) {
+                continue;
+	            }
+	            auto item = toItemLocked(node_it->second, kind);
+	            if (!item.has_value()) {
+	                continue;
+	            }
+	            eraseEvictKeyLocked(node_it->second, kind);
+	            state.detached = true;
+	            decrementAncestorsLocked(item->cache_key, kind);
+	            const auto descendant_ref_count = state.subtree_ref_count;
+	            state                           = KindState{};
+	            state.subtree_ref_count         = descendant_ref_count;
+            items.push_back(*item);
+        }
+        pruneLocked(*chain_it);
+    }
+    return items;
+}
+
+}  // namespace rtp_llm
diff --git a/rtp_llm/cpp/cache/connector/memory/PrefixTreeMemoryBlockCache.h b/rtp_llm/cpp/cache/connector/memory/PrefixTreeMemoryBlockCache.h
new file mode 100644
index 0000000000..4cb1db4fad
--- /dev/null
+++ b/rtp_llm/cpp/cache/connector/memory/PrefixTreeMemoryBlockCache.h
@@ -0,0 +1,158 @@
+#pragma once
+
+#include <array>
+#include <cstdint>
+#include <memory>
+#include <optional>
+#include <set>
+#include <shared_mutex>
+#include <unordered_map>
+#include <unordered_set>
+#include <vector>
+
+#include "rtp_llm/cpp/cache/KVCacheResource.h"
+#include "rtp_llm/cpp/cache/connector/memory/CacheBlockKind.h"
+#include "rtp_llm/cpp/cache/connector/memory/MemoryDiskBlockCache.h"
+
+namespace rtp_llm {
+
+class PrefixTreeMemoryBlockCache {
+public:
+    static constexpr size_t kKindCount = 2;
+
+    struct KindState {
+        bool             has_value{false};
+        CacheBackingType backing_type{CacheBackingType::MEMORY};
+        BlockIdxType     block_index{NULL_BLOCK_IDX};
+        int32_t          disk_slot{-1};
+        size_t           block_size{0};
+        bool             is_resident{false};
+        bool             detached{false};
+        uint64_t         generation{0};
+        uint64_t         last_access_seq{0};
+        int64_t          created_time_us{0};
+        uint32_t         in_flight_ref{0};
+        uint32_t         subtree_ref_count{0};
+        std::vector<uint8_t> slot_valid_mask;
+    };
+
+    struct CacheItem {
+        CacheKeyType     cache_key{0};
+        CacheBlockKind   kind{CacheBlockKind::COMPRESSED_KV};
+        CacheBackingType backing_type{CacheBackingType::MEMORY};
+        BlockIdxType     block_index{NULL_BLOCK_IDX};
+        int32_t          disk_slot{-1};
+        size_t           block_size{0};
+        bool             is_resident{false};
+        uint64_t         generation{0};
+        int64_t          created_time_us{0};
+        std::vector<uint8_t> slot_valid_mask;
+    };
+
+    struct MatchResult {
+        bool             found{false};
+        CacheBackingType backing_type{CacheBackingType::MEMORY};
+        BlockIdxType     block_index{NULL_BLOCK_IDX};
+        int32_t          disk_slot{-1};
+        size_t           block_size{0};
+        uint64_t         generation{0};
+        int64_t          created_time_us{0};
+        std::vector<uint8_t> slot_valid_mask;
+    };
+
+    bool contains(CacheKeyType cache_key, CacheBlockKind kind) const;
+    bool contains(CacheKeyType cache_key, CacheBlockKind kind, const std::vector<uint8_t>& required_slot_mask) const;
+    MatchResult match(CacheKeyType cache_key, CacheBlockKind kind);
+    MatchResult match(CacheKeyType cache_key, CacheBlockKind kind, const std::vector<uint8_t>& required_slot_mask);
+    MatchResult matchAndMarkInFlight(CacheKeyType cache_key, CacheBlockKind kind);
+    MatchResult matchAndMarkInFlight(CacheKeyType                 cache_key,
+                                     CacheBlockKind               kind,
+                                     const std::vector<uint8_t>& required_slot_mask);
+
+    std::pair<bool, std::optional<CacheItem>>
+    putCommitted(CacheKeyType cache_key, const BlockDependency& dependency, const CacheItem& item);
+    std::optional<CacheItem> detachIfMatch(CacheKeyType     cache_key,
+                                           CacheBlockKind   kind,
+                                           CacheBackingType backing_type,
+                                           BlockIdxType     expected_block_index,
+                                           int32_t          expected_disk_slot,
+                                           uint64_t         expected_generation);
+    std::optional<CacheItem> releaseInFlight(CacheKeyType     cache_key,
+                                             CacheBlockKind   kind,
+                                             CacheBackingType backing_type,
+                                             BlockIdxType     block_index,
+                                             int32_t          disk_slot,
+                                             uint64_t         generation);
+
+    std::optional<CacheItem> popOldestEvictable(CacheBlockKind kind);
+    std::optional<CacheItem> popOldestEvictable(CacheBlockKind kind, CacheBackingType backing_type);
+    std::vector<CacheItem>   popOldestStateOrChainEvictable(CacheBackingType backing_type);
+    std::vector<CacheKeyType> cacheKeys() const;
+    std::vector<CacheKeyType> cacheKeysUnorderedForStatus() const;
+    size_t size() const;
+
+private:
+    struct RetiredItem {
+        CacheItem item;
+        uint32_t  in_flight_ref{0};
+    };
+
+    struct Node {
+        CacheKeyType cache_key{0};
+        CacheKeyType parent_key{0};
+        bool         has_parent{false};
+        uint32_t     ordinal{0};
+        std::unordered_set<CacheKeyType> children;
+        std::array<KindState, kKindCount> kinds;
+        std::array<std::vector<RetiredItem>, kKindCount> retired_items;
+    };
+
+    struct EvictKey {
+        uint64_t     last_access_seq{0};
+        CacheKeyType cache_key{0};
+        uint64_t     generation{0};
+
+        bool operator<(const EvictKey& other) const {
+            if (last_access_seq != other.last_access_seq) {
+                return last_access_seq < other.last_access_seq;
+            }
+            if (cache_key != other.cache_key) {
+                return cache_key < other.cache_key;
+            }
+            return generation < other.generation;
+        }
+    };
+
+    static size_t kindIndex(CacheBlockKind kind);
+    static bool   validKind(CacheBlockKind kind);
+    static bool   slotMaskCovers(const std::vector<uint8_t>& stored, const std::vector<uint8_t>& required);
+
+    Node& upsertNodeLocked(CacheKeyType cache_key, const BlockDependency& dependency);
+    void  incrementAncestorsLocked(CacheKeyType cache_key, CacheBlockKind kind);
+    void  decrementAncestorsLocked(CacheKeyType cache_key, CacheBlockKind kind);
+    void  addSubtreeRefsToAncestorsLocked(CacheKeyType ancestor_key, const Node& child);
+    void  subtractSubtreeRefsFromAncestorsLocked(CacheKeyType ancestor_key, const Node& child);
+    void  detachPendingChildLocked(CacheKeyType parent_key, CacheKeyType child_key);
+    void  attachPendingChildrenLocked(Node& node);
+    void  touchLocked(Node& node, CacheBlockKind kind);
+    void  insertEvictKeyLocked(const Node& node, CacheBlockKind kind);
+    void  eraseEvictKeyLocked(const Node& node, CacheBlockKind kind);
+    void  refreshEvictKeyLocked(const Node& node, CacheBlockKind kind);
+    void  pruneLocked(CacheKeyType cache_key);
+    std::optional<CacheItem> toItemLocked(const Node& node, CacheBlockKind kind) const;
+    bool isKindLeafLocked(const Node& node, CacheBlockKind kind) const;
+    std::optional<CacheItem> popStateOnlyFromChainLocked(const CacheKeyType& leaf_key, CacheBackingType backing_type);
+    std::vector<CacheItem>   popChainLocked(const CacheKeyType& leaf_key, CacheBackingType backing_type);
+
+private:
+    mutable std::shared_mutex mutex_;
+    std::unordered_map<CacheKeyType, Node> nodes_;
+    std::unordered_map<CacheKeyType, std::unordered_set<CacheKeyType>> pending_children_by_parent_;
+    std::array<std::set<EvictKey>, kKindCount> leaf_lru_;
+    uint64_t access_seq_{0};
+    uint64_t generation_seq_{0};
+};
+
+using PrefixTreeMemoryBlockCachePtr = std::shared_ptr<PrefixTreeMemoryBlockCache>;
+
+}  // namespace rtp_llm
diff --git a/rtp_llm/cpp/cache/connector/memory/test/BUILD b/rtp_llm/cpp/cache/connector/memory/test/BUILD
index 473bb33475..bc8b5c2050 100644
--- a/rtp_llm/cpp/cache/connector/memory/test/BUILD
+++ b/rtp_llm/cpp/cache/connector/memory/test/BUILD
@@ -7,6 +7,7 @@ test_deps = [
     "//rtp_llm/models_py/bindings/cuda/ops:cuda_impl",
     "//rtp_llm/models_py/bindings/core:exec_ops_test_lib",
     "//rtp_llm/models_py/bindings/cuda:no_block_copy",
+    "//rtp_llm/cpp/cache:kv_cache_allocator",
     "//rtp_llm/cpp/config:config_modules",
     "//rtp_llm/cpp/config:model_config",
     "//rtp_llm/cpp/utils:core_utils",
@@ -36,6 +37,31 @@ cc_test(
     exec_properties = {'gpu':'H20'},
 )
 
+cc_test(
+    name = "memory_connector_batch_copy_test",
+    srcs = [
+        "KVCacheBatchedMemoryCopyTest.cc",
+    ],
+    data = [],
+    copts = test_copts,
+    deps = [
+        "//rtp_llm/cpp/cache:block_pool",
+        "//rtp_llm/cpp/cache:cache_core",
+        "//rtp_llm/cpp/cache:kv_cache_allocator_hdr",
+        "//rtp_llm/cpp/cache/connector/memory:memory_connector",
+        "//rtp_llm/cpp/cache/test:cache_config_test_utils",
+        "//rtp_llm/cpp/config:config_modules",
+        "//rtp_llm/cpp/config:model_config",
+        "//rtp_llm/cpp/utils:core_utils",
+        "//rtp_llm/models_py/bindings/cuda:no_block_copy",
+        "@com_google_googletest//:gtest",
+        "@local_config_cuda//cuda:cuda_headers",
+        "@local_config_cuda//cuda:cudart",
+    ] + torch_deps(),
+    env = {},
+    exec_properties = {'gpu':'H20'},
+)
+
 cc_test(
     name = "memory_block_cache_test",
     srcs = [
@@ -50,6 +76,48 @@ cc_test(
     exec_properties = {'gpu':'H20'},
 )
 
+cc_test(
+    name = "memory_disk_block_cache_test",
+    srcs = [
+        "MemoryDiskBlockCacheTest.cc",
+    ],
+    data = [],
+    copts = test_copts,
+    deps = test_deps + [
+        "//rtp_llm/cpp/cache/connector/memory:memory_connector",
+    ],
+    env = {},
+    exec_properties = {'gpu':'H20'},
+)
+
+cc_test(
+    name = "prefix_tree_memory_block_cache_test",
+    srcs = [
+        "PrefixTreeMemoryBlockCacheTest.cc",
+    ],
+    data = [],
+    copts = test_copts,
+    deps = test_deps + [
+        "//rtp_llm/cpp/cache/connector/memory:memory_connector",
+    ],
+    env = {},
+    exec_properties = {'gpu':'H20'},
+)
+
+cc_test(
+    name = "disk_block_pool_test",
+    srcs = [
+        "DiskBlockPoolTest.cc",
+    ],
+    data = [],
+    copts = test_copts,
+    deps = test_deps + [
+        "//rtp_llm/cpp/cache/connector/memory:memory_connector",
+    ],
+    env = {},
+    exec_properties = {'gpu':'H20'},
+)
+
 cc_test(
     name = "memory_async_context_test",
     srcs = [
@@ -62,4 +130,4 @@ cc_test(
     ],
     env = {},
     exec_properties = {'gpu':'H20'},
-)
\ No newline at end of file
+)
diff --git a/rtp_llm/cpp/cache/connector/memory/test/DiskBlockPoolTest.cc b/rtp_llm/cpp/cache/connector/memory/test/DiskBlockPoolTest.cc
new file mode 100644
index 0000000000..384e2af160
--- /dev/null
+++ b/rtp_llm/cpp/cache/connector/memory/test/DiskBlockPoolTest.cc
@@ -0,0 +1,193 @@
+#include "gtest/gtest.h"
+
+#include <dirent.h>
+#include <fcntl.h>
+#include <sys/stat.h>
+#include <unistd.h>
+
+#include <cstdio>
+#include <cstring>
+#include <string>
+#include <vector>
+
+#include "rtp_llm/cpp/cache/connector/memory/DiskBlockPool.h"
+
+namespace rtp_llm::test {
+namespace {
+
+class TempDir {
+public:
+    TempDir() {
+        char tmpl[] = "/tmp/rtp_disk_pool_test_XXXXXX";
+        auto path   = ::mkdtemp(tmpl);
+        EXPECT_NE(path, nullptr);
+        if (path != nullptr) {
+            path_ = path;
+        }
+    }
+    ~TempDir() {
+        if (path_.empty()) {
+            return;
+        }
+        const auto work_dir = path_ + "/rtp_llm_disk_kv";
+        if (auto* dir = ::opendir(work_dir.c_str())) {
+            while (auto* entry = ::readdir(dir)) {
+                const std::string name(entry->d_name);
+                if (name == "." || name == "..") {
+                    continue;
+                }
+                ::unlink((work_dir + "/" + name).c_str());
+            }
+            ::closedir(dir);
+        }
+        ::rmdir(work_dir.c_str());
+        ::rmdir(path_.c_str());
+    }
+    const std::string& path() const {
+        return path_;
+    }
+
+private:
+    std::string path_;
+};
+
+DiskBlockPoolConfig makeConfig(const std::string& path, size_t disk_size_bytes = 3 * 4096) {
+    DiskBlockPoolConfig config;
+    config.work_dir         = path;
+    config.local_rank       = 0;
+    config.world_rank       = 0;
+    config.disk_size_bytes  = disk_size_bytes;
+    config.block_size_bytes = 1024;
+    config.buffered_io      = true;
+    config.pool_kind        = CacheBlockKind::COMPLETE;
+    return config;
+}
+
+}  // namespace
+
+TEST(DiskBlockPoolTest, InitPreallocatesFileAndCleansStaleFiles) {
+    TempDir temp_dir;
+    ASSERT_FALSE(temp_dir.path().empty());
+
+    const auto work_dir = temp_dir.path() + "/rtp_llm_disk_kv";
+    ASSERT_EQ(::mkdir(work_dir.c_str(), 0755), 0);
+    const auto stale = work_dir + "/rank_stale.kv";
+    int        fd    = ::open(stale.c_str(), O_CREAT | O_WRONLY, 0600);
+    ASSERT_GE(fd, 0);
+    ::close(fd);
+    ASSERT_EQ(::access(stale.c_str(), F_OK), 0);
+
+    DiskMountGuard guard;
+    ASSERT_TRUE(guard.init(temp_dir.path()));
+
+    DiskBlockPool pool(makeConfig(guard.workDir()));
+    ASSERT_TRUE(pool.init());
+    EXPECT_EQ(::access(stale.c_str(), F_OK), -1);
+    EXPECT_EQ(::access(pool.filePath().c_str(), F_OK), 0);
+    EXPECT_NE(pool.filePath().find("rank_0_world_0_complete.kv"), std::string::npos);
+    EXPECT_EQ(pool.totalSlots(), 3u);
+    EXPECT_EQ(pool.freeSlots(), 3u);
+}
+
+TEST(DiskBlockPoolTest, InitFailsWhenMountPathDoesNotExist) {
+    TempDir temp_dir;
+    ASSERT_FALSE(temp_dir.path().empty());
+
+    DiskMountGuard guard;
+    EXPECT_FALSE(guard.init(temp_dir.path() + "/missing_mount"));
+}
+
+TEST(DiskBlockPoolTest, MountGuardAllowsTwoPoolsOnSameMountWithoutDeletingFirst) {
+    TempDir temp_dir;
+    ASSERT_FALSE(temp_dir.path().empty());
+
+    DiskMountGuard guard;
+    ASSERT_TRUE(guard.init(temp_dir.path()));
+
+    DiskBlockPool complete_pool(makeConfig(guard.workDir()));
+    ASSERT_TRUE(complete_pool.init());
+    ASSERT_EQ(::access(complete_pool.filePath().c_str(), F_OK), 0);
+
+    auto incomplete_cfg       = makeConfig(guard.workDir(), 6 * 4096);
+    incomplete_cfg.pool_kind  = CacheBlockKind::INCOMPLETE;
+    incomplete_cfg.local_rank = 0;
+    incomplete_cfg.world_rank = 0;
+    DiskBlockPool incomplete_pool(incomplete_cfg);
+    ASSERT_TRUE(incomplete_pool.init());
+
+    EXPECT_EQ(::access(complete_pool.filePath().c_str(), F_OK), 0);
+    EXPECT_EQ(::access(incomplete_pool.filePath().c_str(), F_OK), 0);
+    EXPECT_NE(complete_pool.filePath(), incomplete_pool.filePath());
+}
+
+TEST(DiskBlockPoolTest, ReserveCommitAbortAndFreeSlots) {
+    TempDir       temp_dir;
+    DiskMountGuard guard;
+    ASSERT_TRUE(guard.init(temp_dir.path()));
+    DiskBlockPool pool(makeConfig(guard.workDir()));
+    ASSERT_TRUE(pool.init());
+
+    auto slot = pool.malloc();
+    ASSERT_TRUE(slot.has_value());
+    EXPECT_EQ(pool.freeSlots(), 2u);
+
+    pool.blockCacheReference(*slot);
+    pool.requestFree(*slot);
+    EXPECT_EQ(pool.freeSlots(), 2u);
+    EXPECT_EQ(pool.availableSlots(), 3u);
+
+    pool.blockCacheFree(*slot);
+    EXPECT_EQ(pool.freeSlots(), 3u);
+}
+
+TEST(DiskBlockPoolTest, RequestRefPreventsReuseUntilReleased) {
+    TempDir       temp_dir;
+    DiskMountGuard guard;
+    ASSERT_TRUE(guard.init(temp_dir.path()));
+    DiskBlockPool pool(makeConfig(guard.workDir()));
+    ASSERT_TRUE(pool.init());
+
+    auto slot = pool.malloc();
+    ASSERT_TRUE(slot.has_value());
+    pool.blockCacheReference(*slot);
+    pool.requestReference(*slot);
+
+    pool.blockCacheFree(*slot);
+    pool.requestFree(*slot);
+    EXPECT_EQ(pool.freeSlots(), 2u);
+
+    pool.requestFree(*slot);
+    EXPECT_EQ(pool.freeSlots(), 3u);
+}
+
+TEST(DiskBlockPoolTest, ReadWriteFullSlot) {
+    TempDir       temp_dir;
+    DiskMountGuard guard;
+    ASSERT_TRUE(guard.init(temp_dir.path()));
+    DiskBlockPool pool(makeConfig(guard.workDir()));
+    ASSERT_TRUE(pool.init());
+
+    auto slot = pool.malloc();
+    ASSERT_TRUE(slot.has_value());
+    std::vector<unsigned char> write_buf(pool.slotStrideBytes(), 0x5a);
+    std::vector<unsigned char> read_buf(pool.slotStrideBytes(), 0);
+
+    ASSERT_TRUE(pool.write(*slot, write_buf.data(), write_buf.size()));
+    ASSERT_TRUE(pool.read(*slot, read_buf.data(), read_buf.size()));
+    EXPECT_EQ(read_buf, write_buf);
+    EXPECT_EQ(pool.writeBytes(), write_buf.size());
+    EXPECT_EQ(pool.readBytes(), read_buf.size());
+}
+
+TEST(DiskBlockPoolTest, FullPoolReturnsNullopt) {
+    TempDir       temp_dir;
+    DiskMountGuard guard;
+    ASSERT_TRUE(guard.init(temp_dir.path()));
+    DiskBlockPool pool(makeConfig(guard.workDir(), 2 * 4096));
+    ASSERT_TRUE(pool.init());
+    ASSERT_TRUE(pool.malloc().has_value());
+    ASSERT_TRUE(pool.malloc().has_value());
+    EXPECT_FALSE(pool.malloc().has_value());
+}
+
+}  // namespace rtp_llm::test
diff --git a/rtp_llm/cpp/cache/connector/memory/test/KVCacheBatchedMemoryCopyTest.cc b/rtp_llm/cpp/cache/connector/memory/test/KVCacheBatchedMemoryCopyTest.cc
new file mode 100644
index 0000000000..9b4eebc35b
--- /dev/null
+++ b/rtp_llm/cpp/cache/connector/memory/test/KVCacheBatchedMemoryCopyTest.cc
@@ -0,0 +1,469 @@
+// Copyright (c) RTP-LLM
+
+#include <cstring>
+#include <map>
+#include <memory>
+#include <set>
+#include <utility>
+#include <vector>
+
+#include <cuda_runtime.h>
+#include <torch/torch.h>
+#include "gtest/gtest.h"
+
+#include "rtp_llm/cpp/cache/BlockPool.h"
+#include "rtp_llm/cpp/cache/config_creator/CacheConfigCreator.h"
+#include "rtp_llm/cpp/cache/config_creator/HybridPoolConfigCreator.h"
+#include "rtp_llm/cpp/cache/allocator/KVCacheAllocator.h"
+#include "rtp_llm/cpp/cache/spec/MHAKVCacheSpec.h"
+#include "rtp_llm/cpp/cache/connector/memory/KVCacheMemoryConnector.h"
+#include "rtp_llm/cpp/config/ConfigModules.h"
+#include "rtp_llm/cpp/config/ModelConfig.h"
+#include "rtp_llm/cpp/utils/Logger.h"
+#include "rtp_llm/models_py/bindings/core/OpData.h"
+
+namespace rtp_llm {
+
+void execBatchCopy(const BatchCopyParams&) {}
+
+}  // namespace rtp_llm
+
+namespace rtp_llm::test {
+namespace {
+
+CacheConfig makeCompactDsv4TypedMemoryCopyConfig(bool use_flash) {
+    CacheConfig config;
+    config.dtype                       = rtp_llm::DataType::TYPE_UINT8;
+    config.layer_num                   = use_flash ? 43 : 61;
+    config.layer_all_num               = config.layer_num;
+    config.block_num                   = 512;
+    config.seq_size_per_block          = 256;
+    config.kernel_seq_size_per_block   = 256;
+    config.use_independent_block_pools = true;
+    config.use_typed_cache_regions     = true;
+    config.use_opaque_kv_cache_store   = true;
+    config.is_sparse                   = true;
+
+    constexpr size_t kDsv4PoolNum      = 7;
+    const std::vector<std::string> group_tags = {
+        "csa_kv", "hca_kv", "indexer_kv", "indexer_state", "csa_state", "hca_state", "swa_kv"};
+    const std::vector<CacheGroupType> group_types = {CacheGroupType::FULL,
+                                                     CacheGroupType::FULL,
+                                                     CacheGroupType::FULL,
+                                                     CacheGroupType::SWA,
+                                                     CacheGroupType::SWA,
+                                                     CacheGroupType::SWA,
+                                                     CacheGroupType::SWA};
+    std::vector<CacheGroupPolicy> group_policies;
+    group_policies.reserve(kDsv4PoolNum);
+    for (const auto type : group_types) {
+        group_policies.push_back(defaultCacheGroupPolicy(type));
+    }
+    group_policies[5].reuse_policy         = CacheReusePolicy::NON_REUSABLE;
+    group_policies[5].active_tail_blocks   = 1;
+    group_policies[5].validate_tail_blocks = false;
+    for (size_t gid : {3u, 4u, 5u, 6u}) {
+        group_policies[gid].evict_policy = CacheEvictPolicy::INDEPENDENT;
+    }
+    const std::vector<size_t> group_kv_block_stride_bytes = {64, 16, 32, 48, 80, 40, 96};
+    const std::vector<size_t> group_kv_scale_stride_bytes(kDsv4PoolNum, 0);
+    config.group_seq_size_per_block    = std::vector<size_t>(kDsv4PoolNum, config.seq_size_per_block);
+    const std::vector<uint32_t> group_block_nums(kDsv4PoolNum, config.block_num);
+    std::vector<std::vector<int>> layers_by_group(kDsv4PoolNum);
+    config.layer_to_block_stride_bytes = std::vector<int>(config.layer_all_num, 0);
+
+    auto make_spec = [&](size_t gid) -> KVCacheSpecPtr {
+        if (group_types[gid] == CacheGroupType::FULL) {
+            auto spec                = std::make_shared<CompressedKVCacheSpec>();
+            spec->type               = KVCacheSpecType::OpaqueKV;
+            spec->dtype              = config.dtype;
+            spec->store_dtype        = config.dtype;
+            spec->entry_elems        = static_cast<uint32_t>(group_kv_block_stride_bytes[gid]);
+            spec->entries_per_block  = 1;
+            spec->seq_size_per_block = static_cast<uint32_t>(config.seq_size_per_block);
+            spec->tag                = group_tags[gid];
+            return spec;
+        }
+        auto spec                = std::make_shared<FixedStateCacheSpec>();
+        spec->type               = KVCacheSpecType::OpaqueState;
+        spec->dtype              = config.dtype;
+        spec->store_dtype        = config.dtype;
+        spec->state_dim          = static_cast<uint32_t>(group_kv_block_stride_bytes[gid]);
+        spec->entries_per_block  = 1;
+        spec->seq_size_per_block = static_cast<uint32_t>(config.seq_size_per_block);
+        spec->tag                = group_tags[gid];
+        return spec;
+    };
+
+    auto add_tag = [&](size_t layer, const std::string& tag, int gid) {
+        (void)tag;
+        layers_by_group[static_cast<size_t>(gid)].push_back(static_cast<int>(layer));
+    };
+
+    for (size_t layer = 0; layer < config.layer_all_num; ++layer) {
+        const bool is_csa = layer >= 2 && layer % 2 == 0;
+        const bool is_hca = use_flash ? (layer >= 2 && layer % 2 == 1) : (!is_csa);
+        if (is_csa) {
+            add_tag(layer, "csa_kv", 0);
+            add_tag(layer, "indexer_kv", 2);
+            add_tag(layer, "indexer_state", 3);
+            add_tag(layer, "csa_state", 4);
+        } else if (is_hca) {
+            add_tag(layer, "hca_kv", 1);
+            add_tag(layer, "hca_state", 5);
+        }
+        add_tag(layer, "swa_kv", 6);
+    }
+
+    std::vector<KVCacheSpecPtr> specs;
+    specs.reserve(kDsv4PoolNum);
+    for (size_t gid = 0; gid < kDsv4PoolNum; ++gid) {
+        specs.push_back(make_spec(gid));
+    }
+    config.fromGroupedSpecs(specs, layers_by_group, group_types, group_tags);
+    config.setGroupPolicies(group_policies);
+    config.setGroupBlockLayout(group_block_nums, group_kv_block_stride_bytes, group_kv_scale_stride_bytes);
+    return config;
+}
+
+char copyTag(size_t index) {
+    return static_cast<char>(33 + (index % 90));
+}
+
+size_t sumBlockInfosBytes(const std::vector<BlockInfo>& infos) {
+    size_t total = 0;
+    for (const auto& b : infos) {
+        if (b.addr && b.size_bytes > 0) {
+            total += b.size_bytes;
+        }
+    }
+    return total;
+}
+
+void setBlockBytes(const BlockInfo& b, size_t byte_offset, size_t byte_len, char c) {
+    ASSERT_NE(b.addr, nullptr);
+    ASSERT_LE(byte_offset + byte_len, b.size_bytes);
+    auto* addr = static_cast<char*>(b.addr) + byte_offset;
+    if (b.is_cuda) {
+        const auto rc = cudaMemset(addr, c, byte_len);
+        ASSERT_EQ(rc, cudaSuccess) << cudaGetErrorString(rc);
+        const auto sync_rc = cudaDeviceSynchronize();
+        ASSERT_EQ(sync_rc, cudaSuccess) << cudaGetErrorString(sync_rc);
+    } else {
+        memset(addr, c, byte_len);
+    }
+}
+
+void verifyBlockBytesEq(const BlockInfo& b, size_t byte_offset, size_t byte_len, char expected) {
+    ASSERT_NE(b.addr, nullptr);
+    ASSERT_LE(byte_offset + byte_len, b.size_bytes);
+    auto* addr = static_cast<const char*>(b.addr) + byte_offset;
+
+    std::vector<unsigned char> data(byte_len, 0);
+    if (b.is_cuda) {
+        const auto rc = cudaMemcpy(data.data(), addr, byte_len, cudaMemcpyDeviceToHost);
+        ASSERT_EQ(rc, cudaSuccess) << cudaGetErrorString(rc);
+    } else {
+        memcpy(data.data(), addr, byte_len);
+    }
+    size_t mismatch = 0;
+    for (; mismatch < byte_len; ++mismatch) {
+        if (data[mismatch] != static_cast<unsigned char>(expected)) {
+            break;
+        }
+    }
+    ASSERT_EQ(mismatch, byte_len) << "mismatch at byte offset " << mismatch << " expect '" << expected << "' got 0x"
+                                  << std::hex << static_cast<int>(data[mismatch]) << std::dec;
+}
+
+void setBlockInfosContent(const std::vector<BlockInfo>& infos, char c) {
+    for (const auto& b : infos) {
+        if (b.addr && b.size_bytes > 0) {
+            setBlockBytes(b, /*byte_offset=*/0, b.size_bytes, c);
+        }
+    }
+}
+
+void verifyBlockInfosContent(const std::vector<BlockInfo>& infos, char c) {
+    for (const auto& b : infos) {
+        if (b.addr && b.size_bytes > 0) {
+            verifyBlockBytesEq(b, /*byte_offset=*/0, b.size_bytes, c);
+        }
+    }
+}
+
+class FakeTypedKVCacheAllocator: public KVCacheAllocator {
+public:
+    explicit FakeTypedKVCacheAllocator(const CacheConfig&          config,
+                                       size_t                      payload_gap_bytes = 0,
+                                       std::set<int>               host_groups        = {}):
+        KVCacheAllocator(config, AllocationType::DEVICE),
+        host_groups_(std::move(host_groups)),
+        payload_gap_bytes_(payload_gap_bytes) {
+        const auto cuda_options = torch::TensorOptions().dtype(torch::kUInt8).device(torch::kCUDA);
+        const auto host_options = torch::TensorOptions().dtype(torch::kUInt8).device(torch::kCPU);
+        const auto layer_group_ids = config.layerGroupIdsSnapshot();
+        const auto kv_strides      = config.groupKvBlockStrideBytesSnapshot();
+        const auto scale_strides   = config.groupKvScaleStrideBytesSnapshot();
+        for (int layer = 0; layer < static_cast<int>(config.layer_all_num); ++layer) {
+            if (static_cast<size_t>(layer) >= layer_group_ids.size()) {
+                continue;
+            }
+            const auto& layer_groups = layer_group_ids[static_cast<size_t>(layer)];
+            for (const int gid : layer_groups) {
+                if (gid < 0 || static_cast<size_t>(gid) >= kv_strides.size()) {
+                    continue;
+                }
+                const size_t stride = kv_strides[static_cast<size_t>(gid)]
+                                      + (static_cast<size_t>(gid) < scale_strides.size() ?
+                                             scale_strides[static_cast<size_t>(gid)] :
+                                             0);
+                if (stride == 0) {
+                    continue;
+                }
+                const bool host_group = host_groups_.count(gid) > 0;
+                auto       tensor = torch::empty({static_cast<int64_t>(config.block_num), static_cast<int64_t>(stride)},
+                                           host_group ? host_options : cuda_options);
+                if (host_group) {
+                    tensor = tensor.pin_memory();
+                }
+                tensors_[key(layer, gid)] = std::move(tensor);
+                strides_[key(layer, gid)] = stride;
+            }
+        }
+    }
+
+    void free(const FreeInfo&) override {}
+    void insertIntoCache(const InsertInfo&) override {}
+
+    BlockAddrInfo convertIndexToAddr(int layer_id, int block_id) const override {
+        return convertIndexToAddr(layer_id, 0, block_id);
+    }
+
+    BlockAddrInfo convertIndexToAddr(int layer_id, int group_id, int block_id) const override {
+        const auto buffers = convertIndexToBuffer(layer_id, group_id, block_id);
+        return buffers.empty() ? BlockAddrInfo{} : BlockAddrInfo{buffers[0].addr, nullptr};
+    }
+
+    std::vector<BlockInfo> convertIndexToBuffer(int layer_id, int block_id) const override {
+        return convertIndexToBuffer(layer_id, 0, block_id);
+    }
+
+    std::vector<BlockInfo> convertIndexToBuffer(int layer_id, int block_id, int, int) const override {
+        return convertIndexToBuffer(layer_id, block_id);
+    }
+
+    std::vector<BlockInfo>
+    convertIndexToBuffer(int layer_id, int group_id, int block_id) const override {
+        const auto k         = key(layer_id, group_id);
+        const auto tensor_it = tensors_.find(k);
+        const auto stride_it = strides_.find(k);
+        if (tensor_it == tensors_.end() || stride_it == strides_.end() || block_id < 0
+            || static_cast<uint32_t>(block_id) >= config_.block_num) {
+            return {};
+        }
+        const auto& tensor       = tensor_it->second;
+        const auto  stride       = stride_it->second;
+        auto*       addr         = static_cast<char*>(tensor.data_ptr()) + static_cast<size_t>(block_id) * stride;
+        const auto  payload_size = payload_gap_bytes_ < stride ? stride - payload_gap_bytes_ : stride;
+        return {BlockInfo{
+            /*is_cuda=*/tensor.is_cuda(),
+            /*device_index=*/tensor.is_cuda() ? static_cast<int32_t>(tensor.get_device()) : -1,
+            /*scalar_type=*/static_cast<int32_t>(tensor.scalar_type()),
+            /*addr=*/addr,
+            /*size_bytes=*/payload_size,
+        }};
+    }
+
+    std::vector<BlockInfo>
+    convertIndexToBuffer(int layer_id, int group_id, int block_id, int, int) const override {
+        return convertIndexToBuffer(layer_id, group_id, block_id);
+    }
+
+    std::shared_ptr<KVCacheResource> incrKVCacheRef(const KVCacheResource&, const CacheKeysType&, bool) override {
+        return nullptr;
+    }
+
+    CacheLayerLayout allLayerCacheBase() const override {
+        return {};
+    }
+
+    bool
+    updateKVBlock(const BatchKVCacheResourcePtr&, const std::vector<int>&, bool, std::vector<BlockIdPair>&) override {
+        return false;
+    }
+
+    int seqSizePerBlock() const override {
+        return static_cast<int>(config_.seq_size_per_block);
+    }
+
+    int singleBatchNeedBlocks(const BatchKVCacheResourcePtr&, int, int) const override {
+        return 0;
+    }
+
+private:
+    static std::pair<int, int> key(int layer_id, int group_id) {
+        return {layer_id, group_id};
+    }
+
+    bool doInit() override {
+        return true;
+    }
+
+    MallocResult incrMalloc(const MallocInfo&) override {
+        return {false, 0};
+    }
+
+    MallocResult initMallocForCommonLen(const MallocInfo&) override {
+        return {false, 0};
+    }
+
+    int getNeedBlocks(const MallocInfo&) const override {
+        return 0;
+    }
+
+    void decrKVCacheRef(const KVCacheResource&, bool) override {}
+
+    std::map<std::pair<int, int>, torch::Tensor> tensors_;
+    std::map<std::pair<int, int>, size_t>        strides_;
+    std::set<int>                                host_groups_;
+    size_t                                       payload_gap_bytes_ = 0;
+};
+
+}  // namespace
+
+void runDsv4TypedStagedCopyRoundTrip(const std::set<int>& host_groups) {
+    const auto set_device_rc = cudaSetDevice(0);
+    ASSERT_EQ(set_device_rc, cudaSuccess) << cudaGetErrorString(set_device_rc);
+
+    auto config = makeCompactDsv4TypedMemoryCopyConfig(/*use_flash=*/true);
+
+    KVCacheConfig kv_config;
+    kv_config.memory_cache_size_mb         = 64;
+    kv_config.memory_cache_sync_timeout_ms = 1000;
+    kv_config.enable_prefix_tree_memory_cache = false;
+
+    auto allocator = std::make_shared<FakeTypedKVCacheAllocator>(config, /*payload_gap_bytes=*/8, host_groups);
+
+    std::vector<std::string> server_addrs = {"127.0.0.1:1"};
+    auto connector = std::make_shared<KVCacheMemoryConnector>(config, kv_config, allocator, server_addrs);
+    ASSERT_TRUE(connector->init());
+    auto memory_pool = connector->isDualPool() ? connector->complete_pool_ : connector->block_pool_;
+    ASSERT_NE(memory_pool, nullptr);
+
+    const auto slots = connector->layerTagSlots();
+    ASSERT_TRUE(connector->hasTypedLayerTagSlots(slots));
+    ASSERT_GT(slots.size(), config.layer_all_num);
+
+    auto mem_blocks = memory_pool->malloc(2);
+    ASSERT_EQ(mem_blocks.size(), 2u);
+    const std::vector<BlockIdxType> request_mem_blocks{static_cast<BlockIdxType>(mem_blocks[1]),
+                                                       static_cast<BlockIdxType>(mem_blocks[0])};
+
+    MemoryOperationRequestPB               req;
+    std::vector<std::vector<BlockIdxType>> gpu_block_sets(request_mem_blocks.size(),
+                                                          std::vector<BlockIdxType>(slots.size(), NULL_BLOCK_IDX));
+    BlockIdxType                           next_gpu_block = 1;
+    for (auto& gpu_blocks : gpu_block_sets) {
+        for (auto& gpu_block : gpu_blocks) {
+            gpu_block = next_gpu_block++;
+        }
+    }
+    ASSERT_LT(next_gpu_block, static_cast<BlockIdxType>(config.block_num));
+    ASSERT_EQ(gpu_block_sets.size(), request_mem_blocks.size());
+    for (size_t block_idx = 0; block_idx < request_mem_blocks.size(); ++block_idx) {
+        auto* item = req.add_copy_items();
+        item->set_mem_block(request_mem_blocks[block_idx]);
+        item->set_is_complete(true);
+        ASSERT_EQ(gpu_block_sets[block_idx].size(), slots.size());
+        for (const auto block : gpu_block_sets[block_idx]) {
+            item->add_gpu_blocks(block);
+        }
+    }
+
+    for (size_t block_idx = 0; block_idx < request_mem_blocks.size(); ++block_idx) {
+        const auto mem_bufs = memory_pool->convertIndexToBuffer(0, request_mem_blocks[block_idx]);
+        ASSERT_EQ(mem_bufs.size(), 1u);
+        const auto& mem_buffer = mem_bufs[0];
+        ASSERT_NE(mem_buffer.addr, nullptr);
+        setBlockBytes(mem_buffer, /*byte_offset=*/0, mem_buffer.size_bytes, '#');
+
+        size_t byte_off = 0;
+        for (size_t i = 0; i < slots.size(); ++i) {
+            const auto& slot = slots[i];
+            const char  tag  = copyTag(block_idx * slots.size() + i);
+            const auto  gpu_bufs =
+                allocator->convertIndexToBuffer(slot.layer_id, slot.group_id, gpu_block_sets[block_idx][i]);
+            ASSERT_GT(sumBlockInfosBytes(gpu_bufs), 0u);
+            ASSERT_LE(sumBlockInfosBytes(gpu_bufs), slot.stride_bytes);
+            setBlockInfosContent(gpu_bufs, tag);
+            setBlockBytes(mem_buffer, byte_off, sumBlockInfosBytes(gpu_bufs), 0);
+            byte_off += slot.stride_bytes;
+        }
+    }
+
+    ASSERT_TRUE(connector->tryCopyCacheWithStagedMemoryCopy(req, KVCacheMemoryConnector::CopyDirection::D2H, slots));
+
+    for (size_t block_idx = 0; block_idx < request_mem_blocks.size(); ++block_idx) {
+        const auto mem_bufs = memory_pool->convertIndexToBuffer(0, request_mem_blocks[block_idx]);
+        ASSERT_EQ(mem_bufs.size(), 1u);
+        const auto& mem_buffer = mem_bufs[0];
+
+        size_t byte_off = 0;
+        for (size_t i = 0; i < slots.size(); ++i) {
+            const auto& slot = slots[i];
+            const auto  gpu_bufs =
+                allocator->convertIndexToBuffer(slot.layer_id, slot.group_id, gpu_block_sets[block_idx][i]);
+            verifyBlockBytesEq(
+                mem_buffer, byte_off, sumBlockInfosBytes(gpu_bufs), copyTag(block_idx * slots.size() + i));
+            if (slot.stride_bytes > sumBlockInfosBytes(gpu_bufs)) {
+                verifyBlockBytesEq(mem_buffer,
+                                   byte_off + sumBlockInfosBytes(gpu_bufs),
+                                   slot.stride_bytes - sumBlockInfosBytes(gpu_bufs),
+                                   '#');
+            }
+            byte_off += slot.stride_bytes;
+        }
+    }
+
+    for (size_t block_idx = 0; block_idx < request_mem_blocks.size(); ++block_idx) {
+        const auto mem_bufs = memory_pool->convertIndexToBuffer(0, request_mem_blocks[block_idx]);
+        ASSERT_EQ(mem_bufs.size(), 1u);
+        const auto& mem_buffer = mem_bufs[0];
+
+        size_t byte_off = 0;
+        for (size_t i = 0; i < slots.size(); ++i) {
+            const auto& slot = slots[i];
+            const char  tag  = copyTag(1000 + block_idx * slots.size() + i);
+            const auto  gpu_bufs =
+                allocator->convertIndexToBuffer(slot.layer_id, slot.group_id, gpu_block_sets[block_idx][i]);
+            setBlockInfosContent(gpu_bufs, 0);
+            setBlockBytes(mem_buffer, byte_off, sumBlockInfosBytes(gpu_bufs), tag);
+            byte_off += slot.stride_bytes;
+        }
+    }
+
+    ASSERT_TRUE(connector->tryCopyCacheWithStagedMemoryCopy(req, KVCacheMemoryConnector::CopyDirection::H2D, slots));
+
+    for (size_t block_idx = 0; block_idx < request_mem_blocks.size(); ++block_idx) {
+        for (size_t i = 0; i < slots.size(); ++i) {
+            const auto& slot = slots[i];
+            const auto  gpu_bufs =
+                allocator->convertIndexToBuffer(slot.layer_id, slot.group_id, gpu_block_sets[block_idx][i]);
+            verifyBlockInfosContent(gpu_bufs, copyTag(1000 + block_idx * slots.size() + i));
+        }
+    }
+}
+
+TEST(KVCacheBatchedMemoryCopyTest, Dsv4TypedLayoutUsesStagedCopyForD2HAndH2D) {
+    runDsv4TypedStagedCopyRoundTrip({});
+}
+
+}  // namespace rtp_llm::test
+
+int main(int argc, char** argv) {
+    rtp_llm::initLogger();
+    ::testing::InitGoogleTest(&argc, argv);
+    return RUN_ALL_TESTS();
+}
+
diff --git a/rtp_llm/cpp/cache/connector/memory/test/KVCacheMemoryConnectorTest.cc b/rtp_llm/cpp/cache/connector/memory/test/KVCacheMemoryConnectorTest.cc
index ea2a74b2ed..d771373244 100644
--- a/rtp_llm/cpp/cache/connector/memory/test/KVCacheMemoryConnectorTest.cc
+++ b/rtp_llm/cpp/cache/connector/memory/test/KVCacheMemoryConnectorTest.cc
@@ -2,7 +2,13 @@
 
 #include <csignal>
 #include <chrono>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
 #include <execinfo.h>
+#include <map>
+#include <numeric>
+#include <string>
 #include <thread>
 #include <unistd.h>
 
@@ -15,9 +21,11 @@
 #include "rtp_llm/cpp/cache/connector/memory/MemoryAsyncContext.h"
 #include "rtp_llm/cpp/cache/connector/memory/MemoryBlockCache.h"
 #include "rtp_llm/cpp/cache/connector/memory/test/mock/TestRpcService.h"
-#include "rtp_llm/cpp/cache/KVCacheAllocator.h"
-#include "rtp_llm/cpp/cache/MLAKVCacheSpec.h"
-#include "rtp_llm/cpp/cache/SingleTypeKVCacheAllocator.h"
+#include "rtp_llm/cpp/cache/allocator/KVCacheAllocator.h"
+#include "rtp_llm/cpp/cache/spec/MLAKVCacheSpec.h"
+#include "rtp_llm/cpp/cache/spec/OpaqueKVCacheSpec.h"
+#include "rtp_llm/cpp/cache/allocator/SingleTypeKVCacheAllocator.h"
+#include "rtp_llm/cpp/cache/allocator/HybridTypeKVCacheAllocator.h"
 #include "rtp_llm/models_py/bindings/cuda/cuda_host_utils.h"
 #include "rtp_llm/models_py/bindings/core/ExecOps.h"
 #include "rtp_llm/cpp/model_rpc/proto/model_rpc_service.pb.h"
@@ -25,6 +33,7 @@
 #include "rtp_llm/cpp/config/ConfigModules.h"
 #include "rtp_llm/cpp/config/EplbConfig.h"
 #include "rtp_llm/cpp/config/ModelConfig.h"
+#include "rtp_llm/cpp/config/StaticConfig.h"
 
 namespace rtp_llm::test {
 
@@ -58,6 +67,38 @@ struct CrashHandlerInstaller {
 
 static CrashHandlerInstaller g_crash_handler_installer;
 
+void initResourceGroupsForConfig(KVCacheResource& resource, const CacheConfig& config) {
+    resource.initGroups(config.groupNums(),
+                        static_cast<int>(config.layer_all_num),
+                        config.layerGroupIdsSnapshot(),
+                        /*kernel_blocks_per_kv_block=*/1,
+                        config.groupTypesSnapshot());
+}
+
+void setGroupStridesForConfig(CacheConfig& config,
+                              const std::vector<size_t>& kv_block_stride_bytes,
+                              const std::vector<size_t>& kv_scale_stride_bytes) {
+    std::vector<uint32_t> block_nums = config.groupBlockNumsSnapshot();
+    if (block_nums.empty()) {
+        block_nums.assign(static_cast<size_t>(config.groupNums()), config.block_num);
+    }
+    config.setGroupBlockLayout(block_nums, kv_block_stride_bytes, kv_scale_stride_bytes);
+}
+
+void makeConfigUseZeroStrideSpec(CacheConfig& config) {
+    auto spec                = std::make_shared<FixedStateCacheSpec>();
+    spec->type               = KVCacheSpecType::OpaqueState;
+    spec->dtype              = config.dtype;
+    spec->store_dtype        = config.dtype;
+    spec->state_dim          = 0;
+    spec->entries_per_block  = 1;
+    spec->seq_size_per_block = static_cast<uint32_t>(config.seq_size_per_block);
+    std::vector<int> layer_ids(static_cast<size_t>(config.layer_all_num));
+    std::iota(layer_ids.begin(), layer_ids.end(), 0);
+    config.fromGroupedSpecs({spec}, {layer_ids}, {CacheGroupType::FULL}, {"default"});
+    setGroupStridesForConfig(config, {0}, {0});
+}
+
 }  // namespace
 
 // Test-local helper struct. Business code no longer exposes a LayerBlock type.
@@ -99,6 +140,8 @@ class TestReadMeta: public rtp_llm::Meta {
 class KVCacheMemoryConnectorTest: public ::testing::Test {
 protected:
     void SetUp() override {
+        old_core_dump_on_exception_                  = StaticConfig::user_ft_core_dump_on_exception;
+        StaticConfig::user_ft_core_dump_on_exception = false;
         createDevice();
 
         cache_config_ = createMockCacheConfig();
@@ -113,7 +156,9 @@ class KVCacheMemoryConnectorTest: public ::testing::Test {
         ASSERT_TRUE(connector_->init());
     }
 
-    void TearDown() override {}
+    void TearDown() override {
+        StaticConfig::user_ft_core_dump_on_exception = old_core_dump_on_exception_;
+    }
 
     CacheConfig                                 cache_config_;
     KVCacheConfig                               kv_cache_config_;
@@ -121,6 +166,7 @@ class KVCacheMemoryConnectorTest: public ::testing::Test {
     std::shared_ptr<KVCacheMemoryConnector>     connector_;
     std::vector<std::unique_ptr<TestRpcServer>> servers_;
     std::vector<std::string>                    server_addrs_;
+    bool                                        old_core_dump_on_exception_{false};
 
 private:
     void createDevice() const {
@@ -145,14 +191,12 @@ class KVCacheMemoryConnectorTest: public ::testing::Test {
         kv_cache_config_.memory_cache_sync_timeout_ms = kTestMemoryCacheSyncTimeout;
 
         auto mha_spec       = std::make_shared<MHAKVCacheSpec>();
-        mha_spec->layer_num = layer_num;
         // mha_spec->block_nums         = block_num;
         mha_spec->local_head_num_kv  = 8;
         mha_spec->size_per_head      = 128;
         mha_spec->seq_size_per_block = seq_size_per_block;
         mha_spec->dtype              = mha_dtype;
         mha_spec->type               = KVCacheSpecType::MultiHeadAttention;
-        config.cache_specs.push_back(mha_spec);
         // Keep CacheConfig sizes consistent with current business definition (see CacheConfig.h):
         // - kv_block_stride_bytes / kv_scale_stride_bytes are "per-layer" strides for one logical block
         // - kv_block_size_bytes / kv_scale_size_bytes are "all layers" totals for one logical block
@@ -172,10 +216,7 @@ class KVCacheMemoryConnectorTest: public ::testing::Test {
         for (int i = 0; i < layer_num; ++i) {
             layer_ids[i] = i;
         }
-        config.layer_ids.push_back(layer_ids);
-        // SingleTypeKVCacheAllocator::init() expects global_layer_ids[0] to exist.
-        // In these unit tests we only have one "model group", so keep it consistent with layer_ids.
-        config.global_layer_ids.push_back(layer_ids);
+        config.fromGroupedSpecs({mha_spec}, {layer_ids}, {CacheGroupType::FULL}, {"default"});
 
         return config;
     }
@@ -217,6 +258,7 @@ class KVCacheMemoryConnectorTest: public ::testing::Test {
         auto* addr = static_cast<char*>(b.addr) + byte_offset;
         if (b.is_cuda) {
             check_cuda_value(cudaMemset(addr, c, byte_len));
+            check_cuda_value(cudaDeviceSynchronize());
         } else {
             memset(addr, c, byte_len);
         }
@@ -395,6 +437,8 @@ class KVCacheMemoryConnectorTest: public ::testing::Test {
             item->add_gpu_blocks(blocks[layer]);
         }
         item->set_mem_block(static_cast<int>(mem_block_index));
+        item->set_is_complete(true);
+        item->set_backing_type(MemoryOperationRequestPB::MEMORY);
     }
     LayerBlockIds makeLayerBlockIds(const std::vector<std::vector<BlockIdxType>>& per_layer_block_indices,
                                     size_t                                        cache_keys_num) const {
@@ -418,9 +462,18 @@ class KVCacheMemoryConnectorTest: public ::testing::Test {
     makeCacheResource(const CacheKeysType&                          cache_keys,
                       const std::vector<std::vector<BlockIdxType>>& per_layer_block_indices,
                       size_t                                        reuse_len = 0) const {
-        auto res             = std::make_shared<KVCacheResource>();
-        res->cache_keys      = cache_keys;
+        auto res                  = std::make_shared<KVCacheResource>();
+        res->initGroups(1,
+                        static_cast<int>(cache_config_.layer_num),
+                        std::vector<std::vector<int>>(cache_config_.layer_num, std::vector<int>{0}),
+                        /*kernel_blocks_per_kv_block=*/1,
+                        {});
+        res->cache_keys = cache_keys;
         res->layer_block_ids = makeLayerBlockIds(per_layer_block_indices, cache_keys.size());
+        res->layer_group_block_ids.assign(cache_config_.layer_num, std::vector<std::shared_ptr<BlockIds>>(1, nullptr));
+        for (size_t layer = 0; layer < cache_config_.layer_num; ++layer) {
+            res->layer_group_block_ids[layer][0] = res->layer_block_ids[layer];
+        }
         // reuse_len in these tests means "GPU already-reused prefix length".
         // KVCacheResource::reuseBlockNum() is derived from (device + memory + remote),
         // so set device reuse here to make asyncMatch/asyncRead semantics consistent.
@@ -556,10 +609,14 @@ TEST_F(KVCacheMemoryConnectorTest, init_ReturnFalse_WhenMemoryCacheSyncTimeoutMs
 }
 
 TEST_F(KVCacheMemoryConnectorTest, init_ReturnFalse_WhenBlockSizeBytesZero) {
-    // NOTE: business code no longer validates `block_size_bytes` for memory cache block size.
-    // `init()` validates `layer_to_block_stride_bytes` instead.
     auto cfg = cache_config_;
     cfg.layer_to_block_stride_bytes.clear();
+    makeConfigUseZeroStrideSpec(cfg);
+    cfg.kv_block_stride_bytes = 0;
+    cfg.kv_scale_stride_bytes = 0;
+    cfg.kv_block_size_bytes   = 0;
+    cfg.kv_scale_size_bytes   = 0;
+    cfg.block_size_bytes      = 0;
 
     auto kv_cfg                         = kv_cache_config_;
     kv_cfg.memory_cache_size_mb         = 64;
@@ -573,6 +630,9 @@ TEST_F(KVCacheMemoryConnectorTest, init_ReturnFalse_WhenPoolTooSmallForBlockSize
     auto cfg = cache_config_;
     // Make sure pool_size_mb * 1MB / total_stride_bytes == 0 -> createBlockPool() should fail with CHECK.
     cfg.layer_to_block_stride_bytes.assign(static_cast<size_t>(cfg.layer_num), 1024 * 1024);  // 1MB per layer
+    setGroupStridesForConfig(cfg,
+                             std::vector<size_t>(static_cast<size_t>(cfg.groupNums()), 1024 * 1024),
+                             std::vector<size_t>(static_cast<size_t>(cfg.groupNums()), 0));
 
     auto kv_cfg                         = kv_cache_config_;
     kv_cfg.memory_cache_size_mb         = 1;     // 1MB
@@ -602,10 +662,14 @@ TEST_F(KVCacheMemoryConnectorTest, initBlockPool_Throw_WhenMemoryCacheSizeMbZero
 }
 
 TEST_F(KVCacheMemoryConnectorTest, initBlockPool_Throw_WhenBlockSizeBytesZero) {
-    // NOTE: business code no longer validates `block_size_bytes` for memory cache block size.
-    // `initBlockPool()` validates `layer_to_block_stride_bytes` instead.
     auto cfg = cache_config_;
     cfg.layer_to_block_stride_bytes.clear();
+    makeConfigUseZeroStrideSpec(cfg);
+    cfg.kv_block_stride_bytes = 0;
+    cfg.kv_scale_stride_bytes = 0;
+    cfg.kv_block_size_bytes   = 0;
+    cfg.kv_scale_size_bytes   = 0;
+    cfg.block_size_bytes      = 0;
 
     auto kv_cfg                         = kv_cache_config_;
     kv_cfg.memory_cache_size_mb         = 64;
@@ -620,6 +684,9 @@ TEST_F(KVCacheMemoryConnectorTest, initBlockPool_Throw_WhenCreateBlockPoolFails)
     // Force createBlockPool() to compute block_num=0:
     // block_num = pool_size_mb * 1MB / total_stride_bytes.
     cfg.layer_to_block_stride_bytes.assign(static_cast<size_t>(cfg.layer_num), 1024 * 1024);  // 1MB per layer
+    setGroupStridesForConfig(cfg,
+                             std::vector<size_t>(static_cast<size_t>(cfg.groupNums()), 1024 * 1024),
+                             std::vector<size_t>(static_cast<size_t>(cfg.groupNums()), 0));
 
     auto kv_cfg                         = kv_cache_config_;
     kv_cfg.memory_cache_size_mb         = 1;     // 1MB
@@ -640,6 +707,54 @@ TEST_F(KVCacheMemoryConnectorTest, initBlockPool_ReturnTrue_AndRegistersPool) {
     ASSERT_NE(pool, nullptr);
 }
 
+TEST_F(KVCacheMemoryConnectorTest, buildCopyPlanForWrite_UsesLayerAndGroupSlots) {
+    auto cfg          = cache_config_;
+    cfg.layer_num     = 1;
+    cfg.layer_all_num = 1;
+    auto spec = cfg.specForGroup(0);
+    cfg.fromGroupedSpecs({spec, spec}, {{0}, {0}}, {CacheGroupType::FULL, CacheGroupType::FULL}, {"csa_kv", "swa_kv"});
+    setGroupStridesForConfig(cfg, {16, 32}, {0, 0});
+    cfg.layer_to_block_stride_bytes = {999};
+
+    auto kv_cfg                         = kv_cache_config_;
+    kv_cfg.memory_cache_size_mb         = 64;
+    kv_cfg.memory_cache_sync_timeout_ms = 1000;
+    auto conn          = std::make_shared<KVCacheMemoryConnector>(cfg, kv_cfg, allocator_, server_addrs_);
+    conn->block_cache_ = std::make_shared<MemoryDiskBlockCache>();
+    ASSERT_NO_THROW(conn->initBlockPool());
+
+    auto slots = conn->layerTagSlots();
+    ASSERT_EQ(slots.size(), 2u);
+    EXPECT_EQ(slots[0].layer_id, 0);
+    EXPECT_EQ(slots[0].tag, "csa_kv");
+    EXPECT_EQ(slots[0].group_id, 0);
+    EXPECT_EQ(slots[0].stride_bytes, 16u);
+    EXPECT_EQ(slots[1].layer_id, 0);
+    EXPECT_EQ(slots[1].tag, "swa_kv");
+    EXPECT_EQ(slots[1].group_id, 1);
+    EXPECT_EQ(slots[1].stride_bytes, 32u);
+
+    auto resource         = std::make_shared<KVCacheResource>();
+    resource->cacheKeys() = {101, 102, 103};
+    initResourceGroupsForConfig(*resource, cfg);
+    resource->mutableBlockIds(/*group_id=*/0).assign({11, 12, 13});
+    resource->mutableBlockIds(/*group_id=*/1).assign({21, NULL_BLOCK_IDX, 23});
+
+    bool no_need_write = true;
+    auto plan          = conn->buildCopyPlanForWrite(
+        resource->cacheKeys(), resource->layerGroupBlocks(), slots, /*start_index=*/0, /*write_num=*/3, no_need_write);
+
+    ASSERT_NE(plan, nullptr);
+    EXPECT_FALSE(no_need_write);
+    ASSERT_EQ(plan->copy_infos.size(), 3u);
+    EXPECT_TRUE(plan->copy_infos[0].is_complete);
+    EXPECT_FALSE(plan->copy_infos[1].is_complete);
+    EXPECT_TRUE(plan->copy_infos[2].is_complete);
+    EXPECT_EQ(plan->copy_infos[0].gpu_blocks, (std::vector<BlockIdxType>{11, 21}));
+    EXPECT_EQ(plan->copy_infos[1].gpu_blocks, (std::vector<BlockIdxType>{12, NULL_BLOCK_IDX}));
+    EXPECT_EQ(plan->copy_infos[2].gpu_blocks, (std::vector<BlockIdxType>{13, 23}));
+}
+
 TEST_F(KVCacheMemoryConnectorTest, asyncMatch_ReturnNull_WhenGpuReuseLenGEKeysSize) {
     const size_t                           N = 3;
     CacheKeysType                          cache_keys{70001, 70002, 70003};
@@ -855,6 +970,47 @@ TEST_F(KVCacheMemoryConnectorTest, asyncRead_ReturnNull_WhenPlanEmpty) {
     EXPECT_EQ(ctx, nullptr);
 }
 
+TEST_F(KVCacheMemoryConnectorTest, asyncRead_Success_WhenCacheEntryRemovedAfterMatch) {
+    // asyncMatch should pin the matched memory blocks so asyncRead can still use them
+    // even if another request consumes and removes the cache entries before read starts.
+    CacheKeysType cache_keys{21001, 21002, 21003};
+
+    const size_t mem_size = memoryCacheBlockBytes();
+    ASSERT_GT(mem_size, 0u);
+    auto pool = ensureBlockPool(mem_size);
+    ASSERT_NE(pool, nullptr);
+
+    auto block_indices = putItemsToCache(cache_keys, mem_size);
+    ASSERT_EQ(block_indices.size(), cache_keys.size());
+
+    std::vector<std::vector<BlockIdxType>> lbs_vec{
+        {1, 2, 3},
+        {1, 2, 3},
+        {1, 2, 3},
+        {1, 2, 3},
+    };
+    auto res  = makeCacheResource(cache_keys, lbs_vec);
+    auto meta = std::make_shared<TestReadMeta>(/*enable_memory_cache=*/true, /*enable_remote_cache=*/false, "");
+
+    auto match_ctx = connector_->asyncMatch(res, meta);
+    ASSERT_NE(match_ctx, nullptr);
+    const int start_read_block_index = static_cast<int>(res->reuseBlockNum());
+    const int read_block_num         = static_cast<int>(match_ctx->matchedBlockCount()) - start_read_block_index;
+    ASSERT_GT(read_block_num, 0);
+
+    for (int i = start_read_block_index; i < start_read_block_index + read_block_num; ++i) {
+        auto removed = connector_->block_cache_->remove(cache_keys[i]);
+        ASSERT_TRUE(removed.has_value());
+        pool->blockCacheFree({removed->block_index});
+    }
+
+    auto ctx = connector_->asyncRead(res, meta, match_ctx, start_read_block_index, read_block_num);
+    ASSERT_NE(ctx, nullptr);
+    ASSERT_TRUE(waitUntilDone(ctx));
+    EXPECT_TRUE(ctx->success());
+    EXPECT_EQ(res->memoryReuseBlockNum(), static_cast<size_t>(read_block_num));
+}
+
 TEST_F(KVCacheMemoryConnectorTest, asyncRead_Success_IncrementsReuseLen_ByMatchedPrefix) {
     // 初始 reuse_len=1, 内存全部命中 => mem_match_len=3，最终 reuse_len=3
     CacheKeysType cache_keys{40001, 40002, 40003};
@@ -866,10 +1022,10 @@ TEST_F(KVCacheMemoryConnectorTest, asyncRead_Success_IncrementsReuseLen_ByMatche
     ASSERT_EQ(block_indices.size(), cache_keys.size());
 
     std::vector<std::vector<BlockIdxType>> lbs_vec{
-        {101, 102, 103},  // layer0
-        {201, 202, 203},  // layer1
-        {301, 302, 303},  // layer2
-        {401, 402, 403},  // layer3
+        {1, 2, 3},  // layer0
+        {1, 2, 3},  // layer1
+        {1, 2, 3},  // layer2
+        {1, 2, 3},  // layer3
     };
     auto res = makeCacheResource(cache_keys, lbs_vec, 1);
 
@@ -901,10 +1057,10 @@ TEST_F(KVCacheMemoryConnectorTest, asyncRead_Success_RemovesLoadedBlocksFromMemo
     ASSERT_LT(pool->freeBlocksNum(), free_before);
 
     std::vector<std::vector<BlockIdxType>> lbs_vec{
-        {111, 112, 113},
-        {211, 212, 213},
-        {311, 312, 313},
-        {411, 412, 413},
+        {1, 2, 3},
+        {1, 2, 3},
+        {1, 2, 3},
+        {1, 2, 3},
     };
     auto res = makeCacheResource(cache_keys, lbs_vec, /*reuse_len=*/1);
 
@@ -938,10 +1094,10 @@ TEST_F(KVCacheMemoryConnectorTest, asyncRead_Success_DoesNotRemoveUpgradedBlock)
     ASSERT_EQ(block_indices.size(), cache_keys.size());
 
     std::vector<std::vector<BlockIdxType>> lbs_vec{
-        {111, 112, 113},
-        {211, 212, 213},
-        {311, 312, 313},
-        {411, 412, 413},
+        {1, 2, 3},
+        {1, 2, 3},
+        {1, 2, 3},
+        {1, 2, 3},
     };
     auto res = makeCacheResource(cache_keys, lbs_vec, /*reuse_len=*/1);
 
@@ -952,7 +1108,7 @@ TEST_F(KVCacheMemoryConnectorTest, asyncRead_Success_DoesNotRemoveUpgradedBlock)
     const int read_num  = static_cast<int>(match_ctx->matchedBlockCount()) - reuse_num;
     ASSERT_GT(read_num, 0);
 
-    // Start async read — buildCopyPlanForRead captures old block indices in the copy plan.
+    // asyncMatch captured old block indices in the read copy plan; asyncRead consumes that plan.
     auto ctx = connector_->asyncRead(res, meta, match_ctx, reuse_num, read_num);
     ASSERT_NE(ctx, nullptr);
 
@@ -1010,7 +1166,7 @@ TEST_F(KVCacheMemoryConnectorTest, asyncRead_FailureOnMemResponse_NoReuseLenIncr
     auto block_indices = putItemsToCache(cache_keys, mem_size);
     ASSERT_EQ(block_indices.size(), cache_keys.size());
 
-    std::vector<std::vector<BlockIdxType>> lbs_vec{{11, 12}, {21, 22}, {31, 32}, {41, 42}};
+    std::vector<std::vector<BlockIdxType>> lbs_vec{{1, 2}, {3, 4}, {5, 6}, {7, 8}};
     auto                                   res = makeCacheResource(cache_keys, lbs_vec);
 
     auto meta      = std::make_shared<TestReadMeta>(/*enable_memory_cache=*/true, /*enable_remote_cache=*/false, "");
@@ -1060,7 +1216,7 @@ TEST_F(KVCacheMemoryConnectorTest, asyncRead_FailureOnRpcStatus_NoReuseLenIncrem
     auto block_indices = putItemsToCache(cache_keys, mem_size);
     ASSERT_EQ(block_indices.size(), cache_keys.size());
 
-    std::vector<std::vector<BlockIdxType>> lbs_vec{{31, 32}, {41, 42}, {51, 52}, {61, 62}};
+    std::vector<std::vector<BlockIdxType>> lbs_vec{{1, 2}, {3, 4}, {5, 6}, {7, 8}};
     auto                                   res = makeCacheResource(cache_keys, lbs_vec);
 
     auto meta      = std::make_shared<TestReadMeta>(/*enable_memory_cache=*/true, /*enable_remote_cache=*/false, "");
@@ -1131,6 +1287,8 @@ TEST_F(KVCacheMemoryConnectorTest, asyncWrite_InvalidInputs_ReturnNullOrThrow) {
     // empty layer_block_ids
     auto res_empty_lbs = makeCacheResource(/*cache_keys=*/{1}, /*lbs=*/{{1}});
     res_empty_lbs->layer_block_ids.clear();
+    res_empty_lbs->layer_group_block_ids.clear();
+    res_empty_lbs->group_block_ids.clear();
     auto ctx2 = connector_->asyncWrite(res_empty_lbs, meta);
     EXPECT_EQ(ctx2, nullptr);
 }
@@ -1602,6 +1760,15 @@ TEST_F(KVCacheMemoryConnectorTest, copyCache_ReturnFalse_CountMismatch) {
     EXPECT_THROW((void)connector_->copyCache(req, resp), rtp_llm::RTPException);
 }
 
+TEST_F(KVCacheMemoryConnectorTest, copyCache_ReturnFalse_EmptyCopyItems) {
+    MemoryOperationRequestPB req;
+    req.set_copy_direction(MemoryOperationRequestPB::H2D);
+
+    MemoryOperationResponsePB resp;
+    EXPECT_FALSE(connector_->copyCache(req, resp));
+    EXPECT_FALSE(resp.success());
+}
+
 TEST_F(KVCacheMemoryConnectorTest, copyCache_ReturnFalse_InvalidMemBlock) {
     const int    layer_id      = 0;
     const int    gpu_block_idx = 1;
@@ -1701,7 +1868,6 @@ TEST_F(KVCacheMemoryConnectorTest, copyCache_ReturnTrue_H2D_SplitKvScale_NoBlock
 
     auto mla_spec                = std::make_shared<rtp_llm::MLAKVCacheSpec>();
     mla_spec->type               = rtp_llm::KVCacheSpecType::MultiHeadLatentAttention;
-    mla_spec->layer_num          = static_cast<uint32_t>(kLayerNum);
     mla_spec->local_head_num_kv  = 1;
     mla_spec->seq_size_per_block = kSeqPerBlock;
     mla_spec->kv_lora_rank       = 512;
@@ -1715,7 +1881,6 @@ TEST_F(KVCacheMemoryConnectorTest, copyCache_ReturnTrue_H2D_SplitKvScale_NoBlock
     cache_config_.use_mla               = true;
     cache_config_.is_sparse             = false;
     cache_config_.dtype                 = mla_spec->dtype;
-    cache_config_.cache_specs           = {mla_spec};
     cache_config_.kv_block_stride_bytes = kKvBytesPerTok * kSeqPerBlock;
     cache_config_.kv_scale_stride_bytes = kScaleBytesPerTok * kSeqPerBlock;
     cache_config_.kv_block_size_bytes   = static_cast<size_t>(kLayerNum) * cache_config_.kv_block_stride_bytes;
@@ -1728,10 +1893,8 @@ TEST_F(KVCacheMemoryConnectorTest, copyCache_ReturnTrue_H2D_SplitKvScale_NoBlock
     for (int i = 0; i < kLayerNum; ++i) {
         layer_ids[i] = i;
     }
-    cache_config_.layer_ids.clear();
-    cache_config_.global_layer_ids.clear();
-    cache_config_.layer_ids.push_back(layer_ids);
-    cache_config_.global_layer_ids.push_back(layer_ids);
+    cache_config_.fromGroupedSpecs({mla_spec}, {layer_ids}, {CacheGroupType::FULL}, {"default"});
+    setGroupStridesForConfig(cache_config_, {cache_config_.kv_block_stride_bytes}, {cache_config_.kv_scale_stride_bytes});
 
     ASSERT_EQ(mla_spec->block_size_bytes(), cache_config_.kv_block_stride_bytes);
 
@@ -1853,6 +2016,8 @@ TEST_F(KVCacheMemoryConnectorTest, copyCache_ReturnTrue_D2H_SingleLayer) {
         item->add_gpu_blocks(l == layer_id ? gpu_block_idx : NULL_BLOCK_IDX);
     }
     item->set_mem_block(mem_block_index);
+    item->set_is_complete(true);
+    item->set_backing_type(MemoryOperationRequestPB::MEMORY);
     req.set_copy_direction(MemoryOperationRequestPB::D2H);
 
     MemoryOperationResponsePB resp;
@@ -1968,6 +2133,412 @@ TEST_F(KVCacheMemoryConnectorTest, copyCache_D2H_MultiLayer_ValidatesByteOffsets
     }
 }
 
+// ============================== Dual-pool tests ==============================
+
+class KVCacheMemoryConnectorDualPoolTest: public ::testing::Test {
+protected:
+    void SetUp() override {
+        old_core_dump_on_exception_                  = StaticConfig::user_ft_core_dump_on_exception;
+        StaticConfig::user_ft_core_dump_on_exception = false;
+        createDevice();
+        startRpcServer(4);
+    }
+
+    void TearDown() override {
+        StaticConfig::user_ft_core_dump_on_exception = old_core_dump_on_exception_;
+    }
+
+    bool old_core_dump_on_exception_{false};
+
+    CacheConfig
+    createHybridCacheConfig(int layer_num = 4, int block_num = 10, int seq_size_per_block = 8, int linear_step = 4) {
+        constexpr int kTestMemoryCacheSizeMb      = 64;
+        constexpr int kTestMemoryCacheSyncTimeout = 1000;
+
+        CacheConfig config;
+        config.layer_num                              = layer_num;
+        config.layer_all_num                          = layer_num;
+        config.block_num                              = block_num;
+        config.seq_size_per_block                     = seq_size_per_block;
+        config.linear_step                            = linear_step;
+        config.group_layer_num                        = layer_num;
+        kv_cache_config_.memory_cache_size_mb         = kTestMemoryCacheSizeMb;
+        kv_cache_config_.memory_cache_sync_timeout_ms = kTestMemoryCacheSyncTimeout;
+
+        auto full_spec                = std::make_shared<MHAKVCacheSpec>();
+        full_spec->local_head_num_kv  = 4;
+        full_spec->size_per_head      = 64;
+        full_spec->seq_size_per_block = seq_size_per_block;
+        full_spec->dtype              = rtp_llm::DataType::TYPE_FP16;
+
+        auto swa_spec                = std::make_shared<MHAKVCacheSpec>();
+        swa_spec->local_head_num_kv  = 4;
+        swa_spec->size_per_head      = 64;
+        swa_spec->seq_size_per_block = seq_size_per_block;
+        swa_spec->dtype              = rtp_llm::DataType::TYPE_FP16;
+
+        const size_t full_stride           = full_spec->block_size_bytes();
+        const size_t swa_stride            = swa_spec->block_size_bytes();
+
+        config.dtype                 = full_spec->dtype;
+        config.kv_block_stride_bytes = std::max(full_stride, swa_stride);
+        config.kv_scale_stride_bytes = 0;
+        config.kv_block_size_bytes   = static_cast<size_t>(layer_num) * full_stride;
+        config.kv_scale_size_bytes   = 0;
+        config.block_size_bytes      = config.kv_block_size_bytes;
+
+        std::vector<int> full_layer_ids(layer_num);
+        std::vector<int> swa_layer_ids(layer_num);
+        for (int i = 0; i < layer_num; ++i) {
+            full_layer_ids[i] = i;
+            swa_layer_ids[i]  = i;
+        }
+        config.fromGroupedSpecs({full_spec, swa_spec},
+                                {full_layer_ids, swa_layer_ids},
+                                {CacheGroupType::FULL, CacheGroupType::SWA},
+                                {"default", "swa_kv"});
+        setGroupStridesForConfig(config, {full_stride, swa_stride}, {0, 0});
+        config.layer_to_block_stride_bytes.assign(layer_num, static_cast<int>(full_stride));
+
+        config.use_independent_block_pools = true;
+
+        return config;
+    }
+
+    std::shared_ptr<KVCacheMemoryConnector> createConnector(const CacheConfig& cfg) {
+        auto conn = std::make_shared<KVCacheMemoryConnector>(cfg, kv_cache_config_, allocator_, server_addrs_);
+        EXPECT_TRUE(conn->init());
+        return conn;
+    }
+
+    std::shared_ptr<KVCacheResource> makeHybridResource(const CacheConfig&                            cfg,
+                                                        const CacheKeysType&                          cache_keys,
+                                                        const std::vector<std::vector<BlockIdxType>>& full_blocks,
+                                                        const std::vector<std::vector<BlockIdxType>>& swa_blocks,
+                                                        size_t reuse_len = 0) const {
+        auto         res       = std::make_shared<KVCacheResource>();
+        const size_t layer_num = static_cast<size_t>(cfg.layer_all_num);
+
+        initResourceGroupsForConfig(*res, cfg);
+
+        res->resizeBlocks(static_cast<int>(cache_keys.size()), NULL_BLOCK_IDX);
+
+        for (size_t l = 0; l < layer_num; ++l) {
+            if (l < full_blocks.size()) {
+                BlockIndicesType padded(cache_keys.size(), NULL_BLOCK_IDX);
+                for (size_t k = 0; k < std::min(cache_keys.size(), full_blocks[l].size()); ++k) {
+                    padded[k] = full_blocks[l][k];
+                }
+                res->mutableBlockIds(static_cast<int>(l), 0).assign(padded);
+            }
+            if (l < swa_blocks.size()) {
+                BlockIndicesType padded(cache_keys.size(), NULL_BLOCK_IDX);
+                for (size_t k = 0; k < std::min(cache_keys.size(), swa_blocks[l].size()); ++k) {
+                    padded[k] = swa_blocks[l][k];
+                }
+                res->mutableBlockIds(static_cast<int>(l), 1).assign(padded);
+            }
+        }
+
+        res->cacheKeys() = cache_keys;
+        res->setDeviceReuseBlockNum(reuse_len);
+        res->setLastBlockAligned(true);
+        return res;
+    }
+
+    bool waitUntilDone(const std::shared_ptr<rtp_llm::AsyncContext>& ctx, int timeout_ms = 3000) const {
+        if (!ctx) {
+            return false;
+        }
+        const auto deadline = std::chrono::steady_clock::now() + std::chrono::milliseconds(timeout_ms);
+        while (std::chrono::steady_clock::now() < deadline) {
+            if (ctx->done()) {
+                return true;
+            }
+            std::this_thread::sleep_for(std::chrono::milliseconds(1));
+        }
+        return ctx->done();
+    }
+
+    KVCacheConfig                               kv_cache_config_;
+    std::shared_ptr<KVCacheAllocator>           allocator_;
+    std::vector<std::unique_ptr<TestRpcServer>> servers_;
+    std::vector<std::string>                    server_addrs_;
+
+private:
+    void createDevice() const {
+        initRuntime(/*device_id=*/0,
+                    /*trace_memory=*/false,
+                    /*enable_comm_overlap=*/false,
+                    MlaOpsType::AUTO);
+    }
+    void startRpcServer(int server_num) {
+        for (int i = 0; i < server_num; ++i) {
+            auto service = std::make_unique<TestRpcService>();
+            auto server  = std::make_unique<TestRpcServer>(std::move(service));
+            ASSERT_TRUE(server->start());
+            server_addrs_.push_back("127.0.0.1:" + std::to_string(server->listenPort()));
+            servers_.push_back(std::move(server));
+        }
+    }
+};
+
+TEST_F(KVCacheMemoryConnectorDualPoolTest, Init_CreatesDualPools) {
+    auto cfg   = createHybridCacheConfig();
+    allocator_ = std::make_shared<HybridTypeKVCacheAllocator>(cfg, AllocationType::DEVICE);
+    ASSERT_TRUE(allocator_->init());
+    auto conn = createConnector(cfg);
+
+    EXPECT_TRUE(conn->isDualPool());
+    EXPECT_NE(conn->complete_pool_, nullptr);
+    EXPECT_NE(conn->incomplete_pool_, nullptr);
+    EXPECT_EQ(conn->block_pool_, nullptr);
+    EXPECT_NE(conn->block_cache_, nullptr);
+    EXPECT_GT(conn->complete_block_size_, 0u);
+    EXPECT_GT(conn->incomplete_block_size_, 0u);
+    EXPECT_GT(conn->complete_block_size_, conn->incomplete_block_size_);
+}
+
+TEST_F(KVCacheMemoryConnectorDualPoolTest, Init_PureFullUsesSinglePool) {
+    // Pure FULL config: no typed slots, should use single pool
+    CacheConfig config;
+    config.layer_num                              = 4;
+    config.layer_all_num                          = 4;
+    config.block_num                              = 10;
+    config.seq_size_per_block                     = 8;
+    kv_cache_config_.memory_cache_size_mb         = 64;
+    kv_cache_config_.memory_cache_sync_timeout_ms = 1000;
+
+    auto spec                    = std::make_shared<MHAKVCacheSpec>();
+    spec->local_head_num_kv      = 8;
+    spec->size_per_head          = 128;
+    spec->seq_size_per_block     = 8;
+    spec->dtype                  = rtp_llm::DataType::TYPE_FP16;
+    config.dtype                 = spec->dtype;
+    config.kv_block_stride_bytes = spec->block_size_bytes();
+    config.kv_scale_stride_bytes = spec->scale_block_size_bytes();
+    config.kv_block_size_bytes   = 4UL * config.kv_block_stride_bytes;
+    config.kv_scale_size_bytes   = 4UL * config.kv_scale_stride_bytes;
+    config.block_size_bytes      = config.kv_block_size_bytes + config.kv_scale_size_bytes;
+    const size_t per_layer       = config.kv_block_stride_bytes + config.kv_scale_stride_bytes;
+    config.layer_to_block_stride_bytes.assign(4, static_cast<int>(per_layer));
+    std::vector<int> ids = {0, 1, 2, 3};
+    config.fromGroupedSpecs({spec}, {ids}, {CacheGroupType::FULL}, {"default"});
+
+    allocator_ = std::make_shared<SingleTypeKVCacheAllocator>(config, AllocationType::DEVICE);
+    ASSERT_TRUE(allocator_->init());
+    auto conn = createConnector(config);
+
+    EXPECT_FALSE(conn->isDualPool());
+    EXPECT_NE(conn->block_pool_, nullptr);
+    EXPECT_NE(conn->block_cache_, nullptr);
+    EXPECT_EQ(conn->complete_pool_, nullptr);
+    EXPECT_EQ(conn->incomplete_pool_, nullptr);
+}
+
+TEST_F(KVCacheMemoryConnectorDualPoolTest, AsyncMatch_AdvancesOnlyOnCompleteHit) {
+    auto cfg = createHybridCacheConfig(/*layer_num=*/2, /*block_num=*/10, /*seq_size_per_block=*/8, /*linear_step=*/4);
+    allocator_ = std::make_shared<HybridTypeKVCacheAllocator>(cfg, AllocationType::DEVICE);
+    ASSERT_TRUE(allocator_->init());
+    auto conn = createConnector(cfg);
+    ASSERT_TRUE(conn->isDualPool());
+
+    // 4 cache keys (3 real + 1 tail dummy that won't be matched)
+    CacheKeysType cache_keys{90001, 90002, 90003, 90999};
+    // FULL blocks: all valid (non-null) for both layers
+    std::vector<std::vector<BlockIdxType>> full_blocks{{1, 1, 1, 1}, {2, 2, 2, 2}};
+    // SWA blocks: key0 NULL (incomplete), key1 valid (complete), key2 NULL (incomplete)
+    std::vector<std::vector<BlockIdxType>> swa_blocks{{NULL_BLOCK_IDX, 1, NULL_BLOCK_IDX, 1},
+                                                      {NULL_BLOCK_IDX, 2, NULL_BLOCK_IDX, 2}};
+    auto                                   res = makeHybridResource(cfg, cache_keys, full_blocks, swa_blocks);
+
+    // Populate caches directly
+    {
+        // key0: incomplete (SWA NULL) → incomplete cache
+        auto inc_blks = conn->incomplete_pool_->malloc(2);
+        ASSERT_EQ(inc_blks.size(), 2u);
+        MemoryDiskBlockCache::CacheItem item0;
+        item0.cache_key    = cache_keys[0];
+        item0.backing_type = CacheBackingType::MEMORY;
+        item0.block_index  = static_cast<BlockIdxType>(inc_blks[0]);
+        item0.is_complete  = false;
+        conn->block_cache_->putCommitted(item0);
+        conn->incomplete_pool_->blockCacheReference({static_cast<BlockIdxType>(inc_blks[0])});
+        conn->incomplete_pool_->requestFree({inc_blks[0]});
+
+        // key2: incomplete → incomplete cache
+        MemoryDiskBlockCache::CacheItem item2;
+        item2.cache_key    = cache_keys[2];
+        item2.backing_type = CacheBackingType::MEMORY;
+        item2.block_index  = static_cast<BlockIdxType>(inc_blks[1]);
+        item2.is_complete  = false;
+        conn->block_cache_->putCommitted(item2);
+        conn->incomplete_pool_->blockCacheReference({static_cast<BlockIdxType>(inc_blks[1])});
+        conn->incomplete_pool_->requestFree({inc_blks[1]});
+
+        // key1: complete → complete cache
+        auto comp_blks = conn->complete_pool_->malloc(1);
+        ASSERT_EQ(comp_blks.size(), 1u);
+        MemoryDiskBlockCache::CacheItem item1;
+        item1.cache_key    = cache_keys[1];
+        item1.backing_type = CacheBackingType::MEMORY;
+        item1.block_index  = static_cast<BlockIdxType>(comp_blks[0]);
+        item1.is_complete  = true;
+        conn->block_cache_->putCommitted(item1);
+        conn->complete_pool_->blockCacheReference({static_cast<BlockIdxType>(comp_blks[0])});
+        conn->complete_pool_->requestFree({comp_blks[0]});
+    }
+
+    auto meta      = std::make_shared<TestReadMeta>(true);
+    auto match_ctx = conn->asyncMatch(res, meta);
+    ASSERT_NE(match_ctx, nullptr);
+    // key0: incomplete hit → scan continues, matched_num stays 0
+    // key1: complete hit + all GPU valid → matched_num = 2
+    // key2: incomplete hit → scan continues, matched_num stays 2
+    // Result: matched_num = 2
+    EXPECT_EQ(match_ctx->matchedBlockCount(), 2u);
+}
+
+TEST_F(KVCacheMemoryConnectorDualPoolTest, AsyncMatch_StopsOnDoubleMiss) {
+    auto cfg = createHybridCacheConfig(/*layer_num=*/2, /*block_num=*/10, /*seq_size_per_block=*/8, /*linear_step=*/4);
+    allocator_ = std::make_shared<HybridTypeKVCacheAllocator>(cfg, AllocationType::DEVICE);
+    ASSERT_TRUE(allocator_->init());
+    auto conn = createConnector(cfg);
+    ASSERT_TRUE(conn->isDualPool());
+
+    CacheKeysType                          cache_keys{92001, 92002, 92003, 92999};
+    std::vector<std::vector<BlockIdxType>> full_blocks{{1, 1, 1, 1}, {2, 2, 2, 2}};
+    std::vector<std::vector<BlockIdxType>> swa_blocks{{1, 1, 1, 1}, {2, 2, 2, 2}};
+    auto                                   res = makeHybridResource(cfg, cache_keys, full_blocks, swa_blocks);
+
+    // Put key0 as complete, skip key1 (gap), key2 as complete
+    auto blks = conn->complete_pool_->malloc(2);
+    ASSERT_EQ(blks.size(), 2u);
+    MemoryDiskBlockCache::CacheItem item0;
+    item0.cache_key    = cache_keys[0];
+    item0.backing_type = CacheBackingType::MEMORY;
+    item0.block_index  = static_cast<BlockIdxType>(blks[0]);
+    item0.is_complete  = true;
+    conn->block_cache_->putCommitted(item0);
+    conn->complete_pool_->blockCacheReference({static_cast<BlockIdxType>(blks[0])});
+    conn->complete_pool_->requestFree({blks[0]});
+
+    MemoryDiskBlockCache::CacheItem item2;
+    item2.cache_key    = cache_keys[2];
+    item2.backing_type = CacheBackingType::MEMORY;
+    item2.block_index  = static_cast<BlockIdxType>(blks[1]);
+    item2.is_complete  = true;
+    conn->block_cache_->putCommitted(item2);
+    conn->complete_pool_->blockCacheReference({static_cast<BlockIdxType>(blks[1])});
+    conn->complete_pool_->requestFree({blks[1]});
+
+    auto meta      = std::make_shared<TestReadMeta>(true);
+    auto match_ctx = conn->asyncMatch(res, meta);
+    ASSERT_NE(match_ctx, nullptr);
+    // key0 hit → matched=1, key1 miss in both caches → break
+    EXPECT_EQ(match_ctx->matchedBlockCount(), 1u);
+}
+
+TEST_F(KVCacheMemoryConnectorDualPoolTest, PoolSizing_JointCalculation) {
+    auto cfg = createHybridCacheConfig(/*layer_num=*/2, /*block_num=*/10, /*seq_size_per_block=*/8, /*linear_step=*/4);
+    allocator_ = std::make_shared<HybridTypeKVCacheAllocator>(cfg, AllocationType::DEVICE);
+    ASSERT_TRUE(allocator_->init());
+    auto conn = createConnector(cfg);
+    ASSERT_TRUE(conn->isDualPool());
+
+    // With linear_step=4: incomplete_num = complete_num * 3.
+    // totalBlocksNum() reports allocatable blocks and excludes reserved block 0.
+    const auto complete_total   = conn->complete_pool_->totalBlocksNum();
+    const auto incomplete_total = conn->incomplete_pool_->totalBlocksNum();
+    EXPECT_GT(complete_total, 0u);
+    EXPECT_GT(incomplete_total, 0u);
+    EXPECT_EQ(incomplete_total, (complete_total + 1) * 3 - 1);
+}
+
+TEST_F(KVCacheMemoryConnectorDualPoolTest, BuildCopyPlanForWrite_SkipsIncompleteWhenIncompletePoolDisabled) {
+    auto cfg = createHybridCacheConfig(/*layer_num=*/2, /*block_num=*/10, /*seq_size_per_block=*/8, /*linear_step=*/1);
+    allocator_ = std::make_shared<HybridTypeKVCacheAllocator>(cfg, AllocationType::DEVICE);
+    ASSERT_TRUE(allocator_->init());
+    auto conn = createConnector(cfg);
+    ASSERT_TRUE(conn->isDualPool());
+    ASSERT_NE(conn->complete_pool_, nullptr);
+    ASSERT_EQ(conn->incomplete_pool_, nullptr);
+
+    CacheKeysType                          cache_keys{93001, 93002, 93003};
+    std::vector<std::vector<BlockIdxType>> full_blocks{{1, 1, 1}, {2, 2, 2}};
+    std::vector<std::vector<BlockIdxType>> swa_blocks{{NULL_BLOCK_IDX, 1, NULL_BLOCK_IDX},
+                                                      {NULL_BLOCK_IDX, 2, NULL_BLOCK_IDX}};
+    auto                                   res   = makeHybridResource(cfg, cache_keys, full_blocks, swa_blocks);
+    auto                                   slots = conn->layerTagSlots();
+
+    bool no_need_write = true;
+    auto plan          = conn->buildCopyPlanForWrite(
+        res->cacheKeys(), res->layerGroupBlocks(), slots, /*start_index=*/0, /*write_num=*/3, no_need_write);
+
+    ASSERT_NE(plan, nullptr);
+    EXPECT_FALSE(no_need_write);
+    ASSERT_EQ(plan->copy_infos.size(), 1u);
+    EXPECT_EQ(plan->copy_infos[0].cache_key, cache_keys[1]);
+    EXPECT_TRUE(plan->copy_infos[0].is_complete);
+}
+
+TEST_F(KVCacheMemoryConnectorDualPoolTest, CacheKeys_MergesBothCaches) {
+    auto cfg   = createHybridCacheConfig(/*layer_num=*/2);
+    allocator_ = std::make_shared<HybridTypeKVCacheAllocator>(cfg, AllocationType::DEVICE);
+    ASSERT_TRUE(allocator_->init());
+    auto conn = createConnector(cfg);
+    ASSERT_TRUE(conn->isDualPool());
+
+    // Put complete/incomplete items into the unified backing cache.
+    auto comp_blks = conn->complete_pool_->malloc(1);
+    ASSERT_EQ(comp_blks.size(), 1u);
+    MemoryDiskBlockCache::CacheItem item1;
+    item1.cache_key    = 100;
+    item1.backing_type = CacheBackingType::MEMORY;
+    item1.block_index  = static_cast<BlockIdxType>(comp_blks[0]);
+    item1.is_complete  = true;
+    conn->block_cache_->putCommitted(item1);
+    conn->complete_pool_->blockCacheReference({static_cast<BlockIdxType>(comp_blks[0])});
+    conn->complete_pool_->requestFree({comp_blks[0]});
+
+    auto inc_blks = conn->incomplete_pool_->malloc(1);
+    ASSERT_EQ(inc_blks.size(), 1u);
+    MemoryDiskBlockCache::CacheItem item2;
+    item2.cache_key    = 200;
+    item2.backing_type = CacheBackingType::MEMORY;
+    item2.block_index  = static_cast<BlockIdxType>(inc_blks[0]);
+    item2.is_complete  = false;
+    conn->block_cache_->putCommitted(item2);
+    conn->incomplete_pool_->blockCacheReference({static_cast<BlockIdxType>(inc_blks[0])});
+    conn->incomplete_pool_->requestFree({inc_blks[0]});
+
+    auto keys = conn->cacheKeys();
+    EXPECT_EQ(keys.size(), 2u);
+    bool has_100 = std::find(keys.begin(), keys.end(), 100) != keys.end();
+    bool has_200 = std::find(keys.begin(), keys.end(), 200) != keys.end();
+    EXPECT_TRUE(has_100);
+    EXPECT_TRUE(has_200);
+}
+
+TEST_F(KVCacheMemoryConnectorDualPoolTest, Init_IncompletePoolTracksCompletePoolByStep) {
+    const int linear_step = 4;
+    const int layer_num   = 4;
+    const int block_num   = 10;
+    const int spb         = 8;
+
+    auto cfg = createHybridCacheConfig(layer_num, block_num, spb, linear_step);
+    allocator_ = std::make_shared<HybridTypeKVCacheAllocator>(cfg, AllocationType::DEVICE);
+    ASSERT_TRUE(allocator_->init());
+    auto conn = createConnector(cfg);
+    ASSERT_TRUE(conn->isDualPool());
+
+    const size_t incomplete = conn->incomplete_pool_->totalBlocksNum();
+    const size_t complete   = conn->complete_pool_->totalBlocksNum();
+    // BlockPool reserves block 0 in each pool, while initBlockPool sizes the
+    // incomplete pool from the complete pool's configured block_num.
+    EXPECT_EQ(incomplete, (complete + 1) * static_cast<size_t>(linear_step - 1) - 1);
+}
+
 }  // namespace rtp_llm::test
 
 int main(int argc, char** argv) {
diff --git a/rtp_llm/cpp/cache/connector/memory/test/MemoryAsyncContextTest.cc b/rtp_llm/cpp/cache/connector/memory/test/MemoryAsyncContextTest.cc
index 3dd7bdf630..fad3b27b7f 100644
--- a/rtp_llm/cpp/cache/connector/memory/test/MemoryAsyncContextTest.cc
+++ b/rtp_llm/cpp/cache/connector/memory/test/MemoryAsyncContextTest.cc
@@ -131,9 +131,10 @@ TEST_F(MemoryAsyncContextTest, waitDone_ReturnVoid_WhenBroadcastResultNullAndCal
     EXPECT_TRUE(ctx->done());
 }
 
-TEST_F(MemoryAsyncContextTest, waitDone_ReturnsImmediately_WhenBroadcastResultNotSet_ThenCallbackOnce) {
+TEST_F(MemoryAsyncContextTest, waitDone_BlocksUntilBroadcastResultSet_ThenCallbackOnce) {
     std::atomic<int>  callback_cnt{0};
     std::atomic<bool> last_ok{true};
+    std::atomic<bool> wait_returned{false};
     auto              cb = [&](bool ok) {
         callback_cnt.fetch_add(1);
         last_ok.store(ok);
@@ -142,17 +143,24 @@ TEST_F(MemoryAsyncContextTest, waitDone_ReturnsImmediately_WhenBroadcastResultNo
     auto ctx = std::make_shared<rtp_llm::MemoryAsyncContext>(cb);
     EXPECT_FALSE(ctx->done());
 
-    std::thread t([&]() { ctx->waitDone(); });
+    std::thread t([&]() {
+        ctx->waitDone();
+        wait_returned.store(true);
+    });
     std::this_thread::sleep_for(std::chrono::milliseconds(10));
-    // 如果 broadcast_result_ 还没设置，waitDone() 不会阻塞，而是按失败处理并回调一次。
-    EXPECT_TRUE(ctx->done());
-    EXPECT_EQ(callback_cnt.load(), 1);
-    EXPECT_FALSE(last_ok.load());
+    // waitDone() must not finalize before startCopyAsync() publishes its BroadcastResult.
+    EXPECT_FALSE(wait_returned.load());
+    EXPECT_FALSE(ctx->done());
+    EXPECT_EQ(callback_cnt.load(), 0);
 
-    ctx->setBroadcastResult(nullptr);
+    // Empty worker contexts => BroadcastResult::waitDone() returns immediately and marks success.
+    auto result = std::make_shared<MemoryBroadcastResultT>(std::vector<std::shared_ptr<MemoryWorkerCtxT>>{});
+    ctx->setBroadcastResult(result);
     t.join();
     EXPECT_TRUE(ctx->done());
+    EXPECT_TRUE(wait_returned.load());
     EXPECT_EQ(callback_cnt.load(), 1);
+    EXPECT_TRUE(last_ok.load());
 }
 
 TEST_F(MemoryAsyncContextTest, waitDone_ReturnVoid_WhenBroadcastResultNonNullAndCallbackReceivesSuccess) {
@@ -218,10 +226,40 @@ TEST_F(MemoryAsyncContextTest, waitDone_IsIdempotent_CallbackOnlyOnce) {
     EXPECT_TRUE(last_ok);
 }
 
+TEST_F(MemoryAsyncContextTest, waitDone_ConcurrentCallersFinalizeOnce) {
+    std::atomic<int>  callback_cnt{0};
+    std::atomic<bool> last_ok{false};
+    auto              cb = [&](bool ok) {
+        callback_cnt.fetch_add(1);
+        last_ok.store(ok);
+    };
+
+    auto result = std::make_shared<MemoryBroadcastResultT>(std::vector<std::shared_ptr<MemoryWorkerCtxT>>{});
+    auto ctx    = std::make_shared<rtp_llm::MemoryAsyncContext>(cb);
+
+    std::vector<std::thread> waiters;
+    for (size_t i = 0; i < 8; ++i) {
+        waiters.emplace_back([&]() { ctx->waitDone(); });
+    }
+    std::this_thread::sleep_for(std::chrono::milliseconds(10));
+    EXPECT_FALSE(ctx->done());
+    EXPECT_EQ(callback_cnt.load(), 0);
+
+    ctx->setBroadcastResult(result);
+    for (auto& waiter : waiters) {
+        waiter.join();
+    }
+
+    EXPECT_TRUE(ctx->done());
+    EXPECT_TRUE(ctx->success());
+    EXPECT_EQ(callback_cnt.load(), 1);
+    EXPECT_TRUE(last_ok.load());
+}
+
 }  // namespace rtp_llm::test
 
 int main(int argc, char** argv) {
     rtp_llm::initLogger();
     ::testing::InitGoogleTest(&argc, argv);
     return RUN_ALL_TESTS();
-}
\ No newline at end of file
+}
diff --git a/rtp_llm/cpp/cache/connector/memory/test/MemoryDiskBlockCacheTest.cc b/rtp_llm/cpp/cache/connector/memory/test/MemoryDiskBlockCacheTest.cc
new file mode 100644
index 0000000000..e95a59ed9c
--- /dev/null
+++ b/rtp_llm/cpp/cache/connector/memory/test/MemoryDiskBlockCacheTest.cc
@@ -0,0 +1,179 @@
+#include "gtest/gtest.h"
+
+#include "rtp_llm/cpp/cache/connector/memory/MemoryDiskBlockCache.h"
+
+namespace rtp_llm::test {
+namespace {
+
+MemoryDiskBlockCache::CacheItem memoryItem(CacheKeyType key, BlockIdxType block, bool complete = true) {
+    MemoryDiskBlockCache::CacheItem item;
+    item.cache_key    = key;
+    item.backing_type = CacheBackingType::MEMORY;
+    item.block_index  = block;
+    item.disk_slot    = -1;
+    item.is_complete  = complete;
+    return item;
+}
+
+MemoryDiskBlockCache::CacheItem diskItem(CacheKeyType key, int32_t slot, bool complete = true) {
+    MemoryDiskBlockCache::CacheItem item;
+    item.cache_key    = key;
+    item.backing_type = CacheBackingType::DISK;
+    item.block_index  = NULL_BLOCK_IDX;
+    item.disk_slot    = slot;
+    item.is_complete  = complete;
+    return item;
+}
+
+}  // namespace
+
+TEST(MemoryDiskBlockCacheTest, ContainsAndMatchMemoryAndDisk) {
+    MemoryDiskBlockCache cache;
+    ASSERT_TRUE(cache.putCommitted(memoryItem(1, 10)).first);
+    ASSERT_TRUE(cache.putCommitted(diskItem(2, 20)).first);
+
+    EXPECT_TRUE(cache.contains(1));
+    EXPECT_TRUE(cache.contains(2));
+
+    auto mem = cache.match(1);
+    EXPECT_EQ(mem.backing_type, CacheBackingType::MEMORY);
+    EXPECT_EQ(mem.matched_index, 10);
+
+    auto disk = cache.match(2);
+    EXPECT_EQ(disk.backing_type, CacheBackingType::DISK);
+    EXPECT_EQ(disk.disk_slot, 20);
+}
+
+TEST(MemoryDiskBlockCacheTest, SharedAccessSeqEvictsOldestAcrossBackings) {
+    MemoryDiskBlockCache cache;
+    ASSERT_TRUE(cache.putCommitted(memoryItem(1, 10)).first);
+    ASSERT_TRUE(cache.putCommitted(diskItem(2, 20)).first);
+
+    ASSERT_FALSE(isNullBlockIdx(cache.match(1).matched_index));
+
+    auto evicted = cache.popOldestEvictable();
+    ASSERT_TRUE(evicted.has_value());
+    EXPECT_EQ(evicted->cache_key, 2);
+    EXPECT_EQ(evicted->backing_type, CacheBackingType::DISK);
+}
+
+TEST(MemoryDiskBlockCacheTest, KindAwareEvictionOnlyPopsRequestedKind) {
+    MemoryDiskBlockCache cache;
+    ASSERT_TRUE(cache.putCommitted(memoryItem(1, 10, false)).first);
+    ASSERT_TRUE(cache.putCommitted(diskItem(2, 20, false)).first);
+    ASSERT_TRUE(cache.putCommitted(memoryItem(3, 30, true)).first);
+
+    auto evicted = cache.popOldestEvictable(CacheBlockKind::COMPLETE);
+    ASSERT_TRUE(evicted.has_value());
+    EXPECT_EQ(evicted->cache_key, 3);
+    EXPECT_TRUE(evicted->is_complete);
+
+    evicted = cache.popOldestEvictable(CacheBlockKind::INCOMPLETE);
+    ASSERT_TRUE(evicted.has_value());
+    EXPECT_EQ(evicted->cache_key, 1);
+    EXPECT_FALSE(evicted->is_complete);
+}
+
+TEST(MemoryDiskBlockCacheTest, KindAwareEvictionChoosesOldestAcrossMemoryAndDiskForSameKind) {
+    MemoryDiskBlockCache cache;
+    ASSERT_TRUE(cache.putCommitted(memoryItem(1, 10, true)).first);
+    ASSERT_TRUE(cache.putCommitted(diskItem(2, 20, true)).first);
+    ASSERT_FALSE(isNullBlockIdx(cache.match(1).matched_index));
+
+    auto evicted = cache.popOldestEvictable(CacheBlockKind::COMPLETE);
+    ASSERT_TRUE(evicted.has_value());
+    EXPECT_EQ(evicted->cache_key, 2);
+    EXPECT_EQ(evicted->backing_type, CacheBackingType::DISK);
+}
+
+TEST(MemoryDiskBlockCacheTest, ContainsDoesNotUpdateRecency) {
+    MemoryDiskBlockCache cache;
+    ASSERT_TRUE(cache.putCommitted(memoryItem(1, 10)).first);
+    ASSERT_TRUE(cache.putCommitted(diskItem(2, 20)).first);
+    ASSERT_TRUE(cache.contains(1));
+
+    auto evicted = cache.popOldestEvictable();
+    ASSERT_TRUE(evicted.has_value());
+    EXPECT_EQ(evicted->cache_key, 1);
+}
+
+TEST(MemoryDiskBlockCacheTest, PartialToCompleteCanUpgradeAcrossBacking) {
+    MemoryDiskBlockCache cache;
+    ASSERT_TRUE(cache.putCommitted(memoryItem(1, 10, false)).first);
+
+    auto [ok, popped] = cache.putCommitted(diskItem(1, 20, true));
+    ASSERT_TRUE(ok);
+    ASSERT_TRUE(popped.has_value());
+    EXPECT_EQ(popped->backing_type, CacheBackingType::MEMORY);
+    EXPECT_EQ(popped->block_index, 10);
+
+    auto match = cache.match(1);
+    EXPECT_EQ(match.backing_type, CacheBackingType::DISK);
+    EXPECT_EQ(match.disk_slot, 20);
+    EXPECT_TRUE(match.is_complete);
+}
+
+TEST(MemoryDiskBlockCacheTest, PartialToCompleteDoesNotReplaceInFlightItem) {
+    MemoryDiskBlockCache cache;
+    ASSERT_TRUE(cache.putCommitted(memoryItem(1, 10, false)).first);
+
+    auto in_flight = cache.matchAndMarkInFlight(1);
+    EXPECT_EQ(in_flight.backing_type, CacheBackingType::MEMORY);
+    EXPECT_EQ(in_flight.matched_index, 10);
+
+    auto [ok, popped] = cache.putCommitted(diskItem(1, 20, true));
+    EXPECT_FALSE(ok);
+    EXPECT_FALSE(popped.has_value());
+
+    auto match = cache.match(1);
+    EXPECT_EQ(match.backing_type, CacheBackingType::MEMORY);
+    EXPECT_EQ(match.matched_index, 10);
+    EXPECT_FALSE(match.is_complete);
+}
+
+TEST(MemoryDiskBlockCacheTest, InFlightEntryIsNotEvictable) {
+    MemoryDiskBlockCache cache;
+    ASSERT_TRUE(cache.putCommitted(memoryItem(1, 10)).first);
+    ASSERT_TRUE(cache.putCommitted(diskItem(2, 20)).first);
+    ASSERT_TRUE(cache.markInFlight(1, CacheBackingType::MEMORY, 10, -1));
+
+    auto evicted = cache.popOldestEvictable();
+    ASSERT_TRUE(evicted.has_value());
+    EXPECT_EQ(evicted->cache_key, 2);
+
+    cache.releaseInFlight(1, CacheBackingType::MEMORY, 10, -1);
+    evicted = cache.popOldestEvictable();
+    ASSERT_TRUE(evicted.has_value());
+    EXPECT_EQ(evicted->cache_key, 1);
+}
+
+TEST(MemoryDiskBlockCacheTest, MatchAndMarkInFlightPreventsEviction) {
+    MemoryDiskBlockCache cache;
+    ASSERT_TRUE(cache.putCommitted(memoryItem(1, 10)).first);
+    ASSERT_TRUE(cache.putCommitted(diskItem(2, 20)).first);
+
+    auto match = cache.matchAndMarkInFlight(1);
+    EXPECT_EQ(match.backing_type, CacheBackingType::MEMORY);
+    EXPECT_EQ(match.matched_index, 10);
+
+    auto evicted = cache.popOldestEvictable();
+    ASSERT_TRUE(evicted.has_value());
+    EXPECT_EQ(evicted->cache_key, 2);
+
+    cache.releaseInFlight(1, CacheBackingType::MEMORY, 10, -1);
+    evicted = cache.popOldestEvictable();
+    ASSERT_TRUE(evicted.has_value());
+    EXPECT_EQ(evicted->cache_key, 1);
+}
+
+TEST(MemoryDiskBlockCacheTest, RemoveIfMatchChecksBackingAndSlot) {
+    MemoryDiskBlockCache cache;
+    ASSERT_TRUE(cache.putCommitted(diskItem(2, 20)).first);
+
+    EXPECT_FALSE(cache.removeIfMatch(2, CacheBackingType::DISK, NULL_BLOCK_IDX, 21).has_value());
+    auto removed = cache.removeIfMatch(2, CacheBackingType::DISK, NULL_BLOCK_IDX, 20);
+    ASSERT_TRUE(removed.has_value());
+    EXPECT_FALSE(cache.contains(2));
+}
+
+}  // namespace rtp_llm::test
diff --git a/rtp_llm/cpp/cache/connector/memory/test/PrefixTreeMemoryBlockCacheTest.cc b/rtp_llm/cpp/cache/connector/memory/test/PrefixTreeMemoryBlockCacheTest.cc
new file mode 100644
index 0000000000..8ee579204a
--- /dev/null
+++ b/rtp_llm/cpp/cache/connector/memory/test/PrefixTreeMemoryBlockCacheTest.cc
@@ -0,0 +1,691 @@
+#include "gtest/gtest.h"
+
+#include <utility>
+#include <vector>
+
+#include "rtp_llm/cpp/cache/connector/memory/PrefixTreeMemoryBlockCache.h"
+
+namespace rtp_llm::test {
+namespace {
+
+BlockDependency rootDep(uint32_t ordinal = 0) {
+    BlockDependency dep;
+    dep.ordinal = ordinal;
+    return dep;
+}
+
+BlockDependency childDep(CacheKeyType parent, uint32_t ordinal) {
+    BlockDependency dep;
+    dep.has_parent = true;
+    dep.parent_key = parent;
+    dep.ordinal    = ordinal;
+    return dep;
+}
+
+PrefixTreeMemoryBlockCache::CacheItem item(CacheKeyType           key,
+                                           CacheBlockKind         kind,
+                                           BlockIdxType           block,
+                                           std::vector<uint8_t>   slot_valid_mask = {},
+                                           bool                   is_resident = false) {
+    PrefixTreeMemoryBlockCache::CacheItem item;
+    item.cache_key    = key;
+    item.kind         = kind;
+    item.backing_type = CacheBackingType::MEMORY;
+    item.block_index  = block;
+    item.disk_slot    = -1;
+    item.block_size   = 1024;
+    item.is_resident  = is_resident;
+    item.slot_valid_mask = std::move(slot_valid_mask);
+    return item;
+}
+
+PrefixTreeMemoryBlockCache::CacheItem diskItem(CacheKeyType         key,
+                                               CacheBlockKind       kind,
+                                               int32_t              disk_slot,
+                                               std::vector<uint8_t> slot_valid_mask = {}) {
+    auto result          = item(key, kind, NULL_BLOCK_IDX, std::move(slot_valid_mask));
+    result.backing_type  = CacheBackingType::DISK;
+    result.block_index   = NULL_BLOCK_IDX;
+    result.disk_slot     = disk_slot;
+    return result;
+}
+
+}  // namespace
+
+TEST(PrefixTreeMemoryBlockCacheTest, ContainsAndMatchAreKindAware) {
+    PrefixTreeMemoryBlockCache cache;
+    ASSERT_TRUE(cache.putCommitted(1, rootDep(), item(1, CacheBlockKind::COMPRESSED_KV, 11)).first);
+    ASSERT_TRUE(cache.putCommitted(1, rootDep(), item(1, CacheBlockKind::STATE_SWA_KV, 12)).first);
+
+    EXPECT_TRUE(cache.contains(1, CacheBlockKind::COMPRESSED_KV));
+    EXPECT_TRUE(cache.contains(1, CacheBlockKind::STATE_SWA_KV));
+
+    auto compressed = cache.match(1, CacheBlockKind::COMPRESSED_KV);
+    ASSERT_TRUE(compressed.found);
+    EXPECT_EQ(compressed.block_index, 11);
+
+    auto state = cache.match(1, CacheBlockKind::STATE_SWA_KV);
+    ASSERT_TRUE(state.found);
+    EXPECT_EQ(state.block_index, 12);
+}
+
+TEST(PrefixTreeMemoryBlockCacheTest, DuplicateKindDoesNotBlockMissingOtherKind) {
+    PrefixTreeMemoryBlockCache cache;
+    ASSERT_TRUE(cache.putCommitted(1, rootDep(), item(1, CacheBlockKind::COMPRESSED_KV, 11)).first);
+
+    auto duplicate = cache.putCommitted(1, rootDep(), item(1, CacheBlockKind::COMPRESSED_KV, 13));
+    EXPECT_FALSE(duplicate.first);
+    EXPECT_FALSE(duplicate.second.has_value());
+
+    auto missing_kind = cache.putCommitted(1, rootDep(), item(1, CacheBlockKind::STATE_SWA_KV, 12));
+    EXPECT_TRUE(missing_kind.first);
+    EXPECT_FALSE(missing_kind.second.has_value());
+    EXPECT_EQ(cache.match(1, CacheBlockKind::COMPRESSED_KV).block_index, 11);
+    EXPECT_EQ(cache.match(1, CacheBlockKind::STATE_SWA_KV).block_index, 12);
+}
+
+TEST(PrefixTreeMemoryBlockCacheTest, SlotMaskMustCoverRequestedSlots) {
+    PrefixTreeMemoryBlockCache cache;
+    ASSERT_TRUE(cache.putCommitted(1,
+                                   rootDep(),
+                                   item(1,
+                                        CacheBlockKind::STATE_SWA_KV,
+                                        11,
+                                        /*slot_valid_mask=*/std::vector<uint8_t>{0, 1, 0}))
+                    .first);
+
+    EXPECT_TRUE(cache.contains(1, CacheBlockKind::STATE_SWA_KV, std::vector<uint8_t>{0, 1, 0}));
+    EXPECT_TRUE(cache.match(1, CacheBlockKind::STATE_SWA_KV, std::vector<uint8_t>{0, 1, 0}).found);
+    EXPECT_FALSE(cache.contains(1, CacheBlockKind::STATE_SWA_KV, std::vector<uint8_t>{1, 1, 0}));
+    EXPECT_FALSE(cache.match(1, CacheBlockKind::STATE_SWA_KV, std::vector<uint8_t>{1, 1, 0}).found);
+}
+
+TEST(PrefixTreeMemoryBlockCacheTest, WiderSlotMaskReplacesNarrowerBacking) {
+    PrefixTreeMemoryBlockCache cache;
+    ASSERT_TRUE(cache.putCommitted(1,
+                                   rootDep(),
+                                   item(1,
+                                        CacheBlockKind::STATE_SWA_KV,
+                                        11,
+                                        /*slot_valid_mask=*/std::vector<uint8_t>{0, 1, 0}))
+                    .first);
+
+    auto replacement = cache.putCommitted(1,
+                                          rootDep(),
+                                          item(1,
+                                               CacheBlockKind::STATE_SWA_KV,
+                                               12,
+                                               /*slot_valid_mask=*/std::vector<uint8_t>{1, 1, 0}));
+    ASSERT_TRUE(replacement.first);
+    ASSERT_TRUE(replacement.second.has_value());
+    EXPECT_EQ(replacement.second->block_index, 11);
+
+    auto matched = cache.match(1, CacheBlockKind::STATE_SWA_KV, std::vector<uint8_t>{1, 1, 0});
+    ASSERT_TRUE(matched.found);
+    EXPECT_EQ(matched.block_index, 12);
+}
+
+TEST(PrefixTreeMemoryBlockCacheTest, NonCoveringSlotMaskDoesNotReplaceBacking) {
+    PrefixTreeMemoryBlockCache cache;
+    ASSERT_TRUE(cache.putCommitted(1,
+                                   rootDep(),
+                                   item(1,
+                                        CacheBlockKind::STATE_SWA_KV,
+                                        11,
+                                        /*slot_valid_mask=*/std::vector<uint8_t>{0, 1, 0}))
+                    .first);
+
+    auto replacement = cache.putCommitted(1,
+                                          rootDep(),
+                                          item(1,
+                                               CacheBlockKind::STATE_SWA_KV,
+                                               12,
+                                               /*slot_valid_mask=*/std::vector<uint8_t>{1, 0, 0}));
+    EXPECT_FALSE(replacement.first);
+    EXPECT_FALSE(replacement.second.has_value());
+
+    EXPECT_TRUE(cache.contains(1, CacheBlockKind::STATE_SWA_KV, std::vector<uint8_t>{0, 1, 0}));
+    EXPECT_FALSE(cache.contains(1, CacheBlockKind::STATE_SWA_KV, std::vector<uint8_t>{1, 1, 0}));
+    EXPECT_EQ(cache.match(1, CacheBlockKind::STATE_SWA_KV, std::vector<uint8_t>{0, 1, 0}).block_index, 11);
+}
+
+TEST(PrefixTreeMemoryBlockCacheTest, SameSlotMaskDuplicateDoesNotReplaceBacking) {
+    PrefixTreeMemoryBlockCache cache;
+    ASSERT_TRUE(cache.putCommitted(1,
+                                   rootDep(),
+                                   item(1,
+                                        CacheBlockKind::STATE_SWA_KV,
+                                        11,
+                                        /*slot_valid_mask=*/std::vector<uint8_t>{0, 1, 0}))
+                    .first);
+
+    auto duplicate = cache.putCommitted(1,
+                                        rootDep(),
+                                        item(1,
+                                             CacheBlockKind::STATE_SWA_KV,
+                                             12,
+                                             /*slot_valid_mask=*/std::vector<uint8_t>{0, 1, 0}));
+    EXPECT_FALSE(duplicate.first);
+    EXPECT_FALSE(duplicate.second.has_value());
+    EXPECT_EQ(cache.match(1, CacheBlockKind::STATE_SWA_KV, std::vector<uint8_t>{0, 1, 0}).block_index, 11);
+}
+
+TEST(PrefixTreeMemoryBlockCacheTest, MarkInFlightRejectsNonCoveringMask) {
+    PrefixTreeMemoryBlockCache cache;
+    ASSERT_TRUE(cache.putCommitted(1,
+                                   rootDep(),
+                                   item(1,
+                                        CacheBlockKind::STATE_SWA_KV,
+                                        11,
+                                        /*slot_valid_mask=*/std::vector<uint8_t>{0, 1, 0}))
+                    .first);
+
+    auto in_flight = cache.matchAndMarkInFlight(1, CacheBlockKind::STATE_SWA_KV, std::vector<uint8_t>{1, 1, 0});
+    EXPECT_FALSE(in_flight.found);
+
+    auto evicted = cache.popOldestEvictable(CacheBlockKind::STATE_SWA_KV);
+    ASSERT_TRUE(evicted.has_value());
+    EXPECT_EQ(evicted->block_index, 11);
+}
+
+TEST(PrefixTreeMemoryBlockCacheTest, InFlightCanBeReplacedByCoveringBacking) {
+    PrefixTreeMemoryBlockCache cache;
+    ASSERT_TRUE(cache.putCommitted(1,
+                                   rootDep(),
+                                   item(1,
+                                        CacheBlockKind::STATE_SWA_KV,
+                                        11,
+                                        /*slot_valid_mask=*/std::vector<uint8_t>{0, 1, 0}))
+                    .first);
+
+    auto in_flight = cache.matchAndMarkInFlight(1, CacheBlockKind::STATE_SWA_KV, std::vector<uint8_t>{0, 1, 0});
+    ASSERT_TRUE(in_flight.found);
+
+    auto replacement = cache.putCommitted(1,
+                                          rootDep(),
+                                          item(1,
+                                               CacheBlockKind::STATE_SWA_KV,
+                                               12,
+                                               /*slot_valid_mask=*/std::vector<uint8_t>{1, 1, 0}));
+    ASSERT_TRUE(replacement.first);
+    EXPECT_FALSE(replacement.second.has_value());
+
+    auto matched = cache.match(1, CacheBlockKind::STATE_SWA_KV, std::vector<uint8_t>{1, 1, 0});
+    ASSERT_TRUE(matched.found);
+    EXPECT_EQ(matched.block_index, 12);
+
+    auto retired = cache.releaseInFlight(1,
+                                         CacheBlockKind::STATE_SWA_KV,
+                                         CacheBackingType::MEMORY,
+                                         in_flight.block_index,
+                                         in_flight.disk_slot,
+                                         in_flight.generation);
+    ASSERT_TRUE(retired.has_value());
+    EXPECT_EQ(retired->block_index, 11);
+    matched = cache.match(1, CacheBlockKind::STATE_SWA_KV, std::vector<uint8_t>{1, 1, 0});
+    ASSERT_TRUE(matched.found);
+    EXPECT_EQ(matched.block_index, 12);
+}
+
+TEST(PrefixTreeMemoryBlockCacheTest, RetiredItemRequiresAllInFlightReleases) {
+    PrefixTreeMemoryBlockCache cache;
+    ASSERT_TRUE(cache.putCommitted(1,
+                                   rootDep(),
+                                   item(1,
+                                        CacheBlockKind::STATE_SWA_KV,
+                                        11,
+                                        /*slot_valid_mask=*/std::vector<uint8_t>{0, 1, 0}))
+                    .first);
+
+    auto first = cache.matchAndMarkInFlight(1, CacheBlockKind::STATE_SWA_KV, std::vector<uint8_t>{0, 1, 0});
+    auto second = cache.matchAndMarkInFlight(1, CacheBlockKind::STATE_SWA_KV, std::vector<uint8_t>{0, 1, 0});
+    ASSERT_TRUE(first.found);
+    ASSERT_TRUE(second.found);
+
+    auto replacement = cache.putCommitted(1,
+                                          rootDep(),
+                                          item(1,
+                                               CacheBlockKind::STATE_SWA_KV,
+                                               12,
+                                               /*slot_valid_mask=*/std::vector<uint8_t>{1, 1, 0}));
+    ASSERT_TRUE(replacement.first);
+    EXPECT_FALSE(replacement.second.has_value());
+
+    auto retired = cache.releaseInFlight(1,
+                                         CacheBlockKind::STATE_SWA_KV,
+                                         CacheBackingType::MEMORY,
+                                         first.block_index,
+                                         first.disk_slot,
+                                         first.generation);
+    EXPECT_FALSE(retired.has_value());
+
+    retired = cache.releaseInFlight(1,
+                                    CacheBlockKind::STATE_SWA_KV,
+                                    CacheBackingType::MEMORY,
+                                    second.block_index,
+                                    second.disk_slot,
+                                    second.generation);
+    ASSERT_TRUE(retired.has_value());
+    EXPECT_EQ(retired->block_index, 11);
+    EXPECT_EQ(cache.match(1, CacheBlockKind::STATE_SWA_KV, std::vector<uint8_t>{1, 1, 0}).block_index, 12);
+}
+
+TEST(PrefixTreeMemoryBlockCacheTest, MultipleRetiredItemsReleaseOutOfOrder) {
+    PrefixTreeMemoryBlockCache cache;
+    ASSERT_TRUE(cache.putCommitted(1,
+                                   rootDep(),
+                                   item(1,
+                                        CacheBlockKind::STATE_SWA_KV,
+                                        11,
+                                        /*slot_valid_mask=*/std::vector<uint8_t>{1, 0, 0}))
+                    .first);
+    auto old_in_flight = cache.matchAndMarkInFlight(1, CacheBlockKind::STATE_SWA_KV, std::vector<uint8_t>{1, 0, 0});
+    ASSERT_TRUE(old_in_flight.found);
+
+    ASSERT_TRUE(cache.putCommitted(1,
+                                   rootDep(),
+                                   item(1,
+                                        CacheBlockKind::STATE_SWA_KV,
+                                        12,
+                                        /*slot_valid_mask=*/std::vector<uint8_t>{1, 1, 0}))
+                    .first);
+    auto middle_in_flight = cache.matchAndMarkInFlight(1, CacheBlockKind::STATE_SWA_KV, std::vector<uint8_t>{1, 1, 0});
+    ASSERT_TRUE(middle_in_flight.found);
+
+    ASSERT_TRUE(cache.putCommitted(1,
+                                   rootDep(),
+                                   item(1,
+                                        CacheBlockKind::STATE_SWA_KV,
+                                        13,
+                                        /*slot_valid_mask=*/std::vector<uint8_t>{1, 1, 1}))
+                    .first);
+
+    auto retired = cache.releaseInFlight(1,
+                                         CacheBlockKind::STATE_SWA_KV,
+                                         CacheBackingType::MEMORY,
+                                         middle_in_flight.block_index,
+                                         middle_in_flight.disk_slot,
+                                         middle_in_flight.generation);
+    ASSERT_TRUE(retired.has_value());
+    EXPECT_EQ(retired->block_index, 12);
+
+    retired = cache.releaseInFlight(1,
+                                    CacheBlockKind::STATE_SWA_KV,
+                                    CacheBackingType::MEMORY,
+                                    old_in_flight.block_index,
+                                    old_in_flight.disk_slot,
+                                    old_in_flight.generation);
+    ASSERT_TRUE(retired.has_value());
+    EXPECT_EQ(retired->block_index, 11);
+    EXPECT_EQ(cache.match(1, CacheBlockKind::STATE_SWA_KV, std::vector<uint8_t>{1, 1, 1}).block_index, 13);
+}
+
+TEST(PrefixTreeMemoryBlockCacheTest, EvictionIsPerKindAndStopsAtBranchPoint) {
+    PrefixTreeMemoryBlockCache cache;
+    ASSERT_TRUE(cache.putCommitted(1, rootDep(0), item(1, CacheBlockKind::COMPRESSED_KV, 11)).first);
+    ASSERT_TRUE(cache.putCommitted(2, childDep(1, 1), item(2, CacheBlockKind::COMPRESSED_KV, 12)).first);
+    ASSERT_TRUE(cache.putCommitted(3, childDep(1, 2), item(3, CacheBlockKind::COMPRESSED_KV, 13)).first);
+
+    auto evicted = cache.popOldestEvictable(CacheBlockKind::COMPRESSED_KV);
+    ASSERT_TRUE(evicted.has_value());
+    EXPECT_EQ(evicted->cache_key, 2);
+    EXPECT_FALSE(cache.contains(2, CacheBlockKind::COMPRESSED_KV));
+    EXPECT_TRUE(cache.contains(1, CacheBlockKind::COMPRESSED_KV));
+    EXPECT_TRUE(cache.contains(3, CacheBlockKind::COMPRESSED_KV));
+}
+
+TEST(PrefixTreeMemoryBlockCacheTest, PrefixTreeLinksChildInsertedBeforeParent) {
+    PrefixTreeMemoryBlockCache cache;
+    ASSERT_TRUE(cache.putCommitted(2, childDep(1, 1), item(2, CacheBlockKind::COMPRESSED_KV, 12)).first);
+    ASSERT_TRUE(cache.putCommitted(1, rootDep(0), item(1, CacheBlockKind::COMPRESSED_KV, 11)).first);
+
+    auto evicted = cache.popOldestEvictable(CacheBlockKind::COMPRESSED_KV);
+    ASSERT_TRUE(evicted.has_value());
+    EXPECT_EQ(evicted->cache_key, 2);
+    EXPECT_FALSE(cache.contains(2, CacheBlockKind::COMPRESSED_KV));
+    EXPECT_TRUE(cache.contains(1, CacheBlockKind::COMPRESSED_KV));
+
+    evicted = cache.popOldestEvictable(CacheBlockKind::COMPRESSED_KV);
+    ASSERT_TRUE(evicted.has_value());
+    EXPECT_EQ(evicted->cache_key, 1);
+    EXPECT_FALSE(cache.contains(1, CacheBlockKind::COMPRESSED_KV));
+}
+
+TEST(PrefixTreeMemoryBlockCacheTest, ReparentMovesSubtreeRefFromOldToNewParent) {
+    PrefixTreeMemoryBlockCache cache;
+    ASSERT_TRUE(cache.putCommitted(1, rootDep(0), item(1, CacheBlockKind::COMPRESSED_KV, 11)).first);
+    ASSERT_TRUE(cache.putCommitted(3, rootDep(0), item(3, CacheBlockKind::COMPRESSED_KV, 13)).first);
+    ASSERT_TRUE(cache.putCommitted(2, childDep(1, 1), item(2, CacheBlockKind::COMPRESSED_KV, 12)).first);
+
+    auto reparent = cache.putCommitted(2, childDep(3, 1), item(2, CacheBlockKind::COMPRESSED_KV, 14));
+    EXPECT_FALSE(reparent.first);
+    EXPECT_FALSE(reparent.second.has_value());
+
+    auto evicted = cache.popOldestEvictable(CacheBlockKind::COMPRESSED_KV);
+    ASSERT_TRUE(evicted.has_value());
+    EXPECT_EQ(evicted->cache_key, 1);
+
+    evicted = cache.popOldestEvictable(CacheBlockKind::COMPRESSED_KV);
+    ASSERT_TRUE(evicted.has_value());
+    EXPECT_EQ(evicted->cache_key, 2);
+
+    evicted = cache.popOldestEvictable(CacheBlockKind::COMPRESSED_KV);
+    ASSERT_TRUE(evicted.has_value());
+    EXPECT_EQ(evicted->cache_key, 3);
+}
+
+TEST(PrefixTreeMemoryBlockCacheTest, MultipleOrphanChildrenAttachOnParentInsert) {
+    PrefixTreeMemoryBlockCache cache;
+    ASSERT_TRUE(cache.putCommitted(2, childDep(1, 1), item(2, CacheBlockKind::COMPRESSED_KV, 12)).first);
+    ASSERT_TRUE(cache.putCommitted(3, childDep(1, 2), item(3, CacheBlockKind::COMPRESSED_KV, 13)).first);
+    ASSERT_TRUE(cache.putCommitted(1, rootDep(0), item(1, CacheBlockKind::COMPRESSED_KV, 11)).first);
+
+    auto evicted = cache.popOldestEvictable(CacheBlockKind::COMPRESSED_KV);
+    ASSERT_TRUE(evicted.has_value());
+    EXPECT_EQ(evicted->cache_key, 2);
+    EXPECT_TRUE(cache.contains(1, CacheBlockKind::COMPRESSED_KV));
+    EXPECT_TRUE(cache.contains(3, CacheBlockKind::COMPRESSED_KV));
+
+    evicted = cache.popOldestEvictable(CacheBlockKind::COMPRESSED_KV);
+    ASSERT_TRUE(evicted.has_value());
+    EXPECT_EQ(evicted->cache_key, 3);
+
+    evicted = cache.popOldestEvictable(CacheBlockKind::COMPRESSED_KV);
+    ASSERT_TRUE(evicted.has_value());
+    EXPECT_EQ(evicted->cache_key, 1);
+}
+
+TEST(PrefixTreeMemoryBlockCacheTest, ReparentPendingOrphanMovesPendingEntry) {
+    PrefixTreeMemoryBlockCache cache;
+    ASSERT_TRUE(cache.putCommitted(2, childDep(1, 1), item(2, CacheBlockKind::COMPRESSED_KV, 12)).first);
+    auto reparent = cache.putCommitted(2, childDep(3, 1), item(2, CacheBlockKind::COMPRESSED_KV, 14));
+    EXPECT_FALSE(reparent.first);
+    ASSERT_TRUE(cache.putCommitted(3, rootDep(0), item(3, CacheBlockKind::COMPRESSED_KV, 13)).first);
+    ASSERT_TRUE(cache.putCommitted(1, rootDep(0), item(1, CacheBlockKind::COMPRESSED_KV, 11)).first);
+
+    auto evicted = cache.popOldestEvictable(CacheBlockKind::COMPRESSED_KV);
+    ASSERT_TRUE(evicted.has_value());
+    EXPECT_EQ(evicted->cache_key, 2);
+
+    evicted = cache.popOldestEvictable(CacheBlockKind::COMPRESSED_KV);
+    ASSERT_TRUE(evicted.has_value());
+    EXPECT_EQ(evicted->cache_key, 3);
+
+    evicted = cache.popOldestEvictable(CacheBlockKind::COMPRESSED_KV);
+    ASSERT_TRUE(evicted.has_value());
+    EXPECT_EQ(evicted->cache_key, 1);
+}
+
+TEST(PrefixTreeMemoryBlockCacheTest, BranchParentBecomesEvictableAfterAllChildrenGone) {
+    PrefixTreeMemoryBlockCache cache;
+    ASSERT_TRUE(cache.putCommitted(1, rootDep(0), item(1, CacheBlockKind::COMPRESSED_KV, 11)).first);
+    ASSERT_TRUE(cache.putCommitted(2, childDep(1, 1), item(2, CacheBlockKind::COMPRESSED_KV, 12)).first);
+    ASSERT_TRUE(cache.putCommitted(3, childDep(1, 2), item(3, CacheBlockKind::COMPRESSED_KV, 13)).first);
+
+    auto evicted = cache.popOldestEvictable(CacheBlockKind::COMPRESSED_KV);
+    ASSERT_TRUE(evicted.has_value());
+    EXPECT_EQ(evicted->cache_key, 2);
+
+    evicted = cache.popOldestEvictable(CacheBlockKind::COMPRESSED_KV);
+    ASSERT_TRUE(evicted.has_value());
+    EXPECT_EQ(evicted->cache_key, 3);
+
+    evicted = cache.popOldestEvictable(CacheBlockKind::COMPRESSED_KV);
+    ASSERT_TRUE(evicted.has_value());
+    EXPECT_EQ(evicted->cache_key, 1);
+}
+
+TEST(PrefixTreeMemoryBlockCacheTest, ResidentItemIsMatchableButNeverEvictable) {
+    PrefixTreeMemoryBlockCache cache;
+    ASSERT_TRUE(cache.putCommitted(1,
+                                   rootDep(0),
+                                   item(1,
+                                        CacheBlockKind::COMPRESSED_KV,
+                                        11,
+                                        /*slot_valid_mask=*/{},
+                                        /*is_resident=*/true))
+                    .first);
+
+    EXPECT_TRUE(cache.match(1, CacheBlockKind::COMPRESSED_KV).found);
+    EXPECT_FALSE(cache.popOldestEvictable(CacheBlockKind::COMPRESSED_KV).has_value());
+}
+
+TEST(PrefixTreeMemoryBlockCacheTest, ParentDetachPreservesChildLeafAccounting) {
+    PrefixTreeMemoryBlockCache cache;
+    ASSERT_TRUE(cache.putCommitted(1, rootDep(0), item(1, CacheBlockKind::COMPRESSED_KV, 11)).first);
+    ASSERT_TRUE(cache.putCommitted(2, childDep(1, 1), item(2, CacheBlockKind::COMPRESSED_KV, 12)).first);
+
+    auto parent = cache.matchAndMarkInFlight(1, CacheBlockKind::COMPRESSED_KV);
+    ASSERT_TRUE(parent.found);
+    EXPECT_FALSE(cache.detachIfMatch(1,
+                                     CacheBlockKind::COMPRESSED_KV,
+                                     CacheBackingType::MEMORY,
+                                     parent.block_index,
+                                     parent.disk_slot,
+                                     parent.generation)
+                     .has_value());
+    auto retired_parent = cache.releaseInFlight(1,
+                                                CacheBlockKind::COMPRESSED_KV,
+                                                CacheBackingType::MEMORY,
+                                                parent.block_index,
+                                                parent.disk_slot,
+                                                parent.generation);
+    ASSERT_TRUE(retired_parent.has_value());
+
+    ASSERT_TRUE(cache.putCommitted(1, rootDep(0), item(1, CacheBlockKind::COMPRESSED_KV, 13)).first);
+
+    auto evicted = cache.popOldestEvictable(CacheBlockKind::COMPRESSED_KV);
+    ASSERT_TRUE(evicted.has_value());
+    EXPECT_EQ(evicted->cache_key, 2);
+    EXPECT_TRUE(cache.contains(1, CacheBlockKind::COMPRESSED_KV));
+    EXPECT_FALSE(cache.contains(2, CacheBlockKind::COMPRESSED_KV));
+}
+
+TEST(PrefixTreeMemoryBlockCacheTest, KindLeafAccountingIsIndependent) {
+    PrefixTreeMemoryBlockCache cache;
+    ASSERT_TRUE(cache.putCommitted(1, rootDep(0), item(1, CacheBlockKind::COMPRESSED_KV, 11)).first);
+    ASSERT_TRUE(cache.putCommitted(2, childDep(1, 1), item(2, CacheBlockKind::COMPRESSED_KV, 12)).first);
+    ASSERT_TRUE(cache.putCommitted(1, rootDep(0), item(1, CacheBlockKind::STATE_SWA_KV, 21)).first);
+
+    auto evicted = cache.popOldestEvictable(CacheBlockKind::STATE_SWA_KV);
+    ASSERT_TRUE(evicted.has_value());
+    EXPECT_EQ(evicted->cache_key, 1);
+    EXPECT_FALSE(cache.contains(1, CacheBlockKind::STATE_SWA_KV));
+    EXPECT_TRUE(cache.contains(1, CacheBlockKind::COMPRESSED_KV));
+    EXPECT_TRUE(cache.contains(2, CacheBlockKind::COMPRESSED_KV));
+}
+
+TEST(PrefixTreeMemoryBlockCacheTest, DetachThenReplaceDoesNotReturnDetachedBackingAgain) {
+    PrefixTreeMemoryBlockCache cache;
+    ASSERT_TRUE(cache.putCommitted(1, rootDep(), item(1, CacheBlockKind::COMPRESSED_KV, 11)).first);
+
+    auto in_flight = cache.matchAndMarkInFlight(1, CacheBlockKind::COMPRESSED_KV);
+    ASSERT_TRUE(in_flight.found);
+    ASSERT_EQ(in_flight.block_index, 11);
+
+    auto detached = cache.detachIfMatch(1,
+                                        CacheBlockKind::COMPRESSED_KV,
+                                        CacheBackingType::MEMORY,
+                                        in_flight.block_index,
+                                        in_flight.disk_slot,
+                                        in_flight.generation);
+    EXPECT_FALSE(detached.has_value());
+
+    auto replacement = cache.putCommitted(1, rootDep(), item(1, CacheBlockKind::COMPRESSED_KV, 12));
+    EXPECT_TRUE(replacement.first);
+    EXPECT_FALSE(replacement.second.has_value());
+
+    auto retired = cache.releaseInFlight(1,
+                                         CacheBlockKind::COMPRESSED_KV,
+                                         CacheBackingType::MEMORY,
+                                         in_flight.block_index,
+                                         in_flight.disk_slot,
+                                         in_flight.generation);
+    ASSERT_TRUE(retired.has_value());
+    EXPECT_EQ(retired->block_index, 11);
+    auto matched = cache.match(1, CacheBlockKind::COMPRESSED_KV);
+    ASSERT_TRUE(matched.found);
+    EXPECT_EQ(matched.block_index, 12);
+}
+
+TEST(PrefixTreeMemoryBlockCacheTest, DetachPrunesEmptyLeafButKeepsStructuralParent) {
+    PrefixTreeMemoryBlockCache cache;
+    ASSERT_TRUE(cache.putCommitted(1, rootDep(0), item(1, CacheBlockKind::COMPRESSED_KV, 11)).first);
+    ASSERT_TRUE(cache.putCommitted(2, childDep(1, 1), item(2, CacheBlockKind::COMPRESSED_KV, 12)).first);
+
+    auto child = cache.matchAndMarkInFlight(2, CacheBlockKind::COMPRESSED_KV);
+    ASSERT_TRUE(child.found);
+    auto detached_child = cache.detachIfMatch(2,
+                                              CacheBlockKind::COMPRESSED_KV,
+                                              CacheBackingType::MEMORY,
+                                              child.block_index,
+                                              child.disk_slot,
+                                              child.generation);
+    EXPECT_FALSE(detached_child.has_value());
+    detached_child = cache.releaseInFlight(2,
+                                           CacheBlockKind::COMPRESSED_KV,
+                                           CacheBackingType::MEMORY,
+                                           child.block_index,
+                                           child.disk_slot,
+                                           child.generation);
+    ASSERT_TRUE(detached_child.has_value());
+
+    EXPECT_FALSE(cache.contains(2, CacheBlockKind::COMPRESSED_KV));
+    EXPECT_TRUE(cache.contains(1, CacheBlockKind::COMPRESSED_KV));
+    EXPECT_EQ(cache.cacheKeys(), (CacheKeysType{1}));
+    auto status_keys = cache.cacheKeysUnorderedForStatus();
+    std::sort(status_keys.begin(), status_keys.end());
+    EXPECT_EQ(status_keys, (CacheKeysType{1}));
+}
+
+TEST(PrefixTreeMemoryBlockCacheTest, StatusCacheKeysAreUnorderedAndDeduplicated) {
+    PrefixTreeMemoryBlockCache cache;
+    ASSERT_TRUE(cache.putCommitted(1, rootDep(0), item(1, CacheBlockKind::COMPRESSED_KV, 11)).first);
+    ASSERT_TRUE(cache.putCommitted(1, rootDep(0), item(1, CacheBlockKind::STATE_SWA_KV, 12)).first);
+    ASSERT_TRUE(cache.putCommitted(2, childDep(1, 1), item(2, CacheBlockKind::COMPRESSED_KV, 21)).first);
+    ASSERT_TRUE(cache.putCommitted(3, childDep(2, 2), item(3, CacheBlockKind::STATE_SWA_KV, 31)).first);
+
+    auto status_keys = cache.cacheKeysUnorderedForStatus();
+    std::sort(status_keys.begin(), status_keys.end());
+
+    EXPECT_EQ(status_keys, (CacheKeysType{1, 2, 3}));
+    EXPECT_EQ(status_keys.size(), cache.cacheKeys().size());
+}
+
+TEST(PrefixTreeMemoryBlockCacheTest, ParentBecomesEvictableAfterChildDetachEvenAfterTouchWhileNonLeaf) {
+    PrefixTreeMemoryBlockCache cache;
+    ASSERT_TRUE(cache.putCommitted(1, rootDep(0), item(1, CacheBlockKind::COMPRESSED_KV, 11)).first);
+    ASSERT_TRUE(cache.putCommitted(2, childDep(1, 1), item(2, CacheBlockKind::COMPRESSED_KV, 12)).first);
+
+    ASSERT_TRUE(cache.match(1, CacheBlockKind::COMPRESSED_KV).found);
+
+    auto child = cache.matchAndMarkInFlight(2, CacheBlockKind::COMPRESSED_KV);
+    ASSERT_TRUE(child.found);
+    auto detached_child = cache.detachIfMatch(2,
+                                              CacheBlockKind::COMPRESSED_KV,
+                                              CacheBackingType::MEMORY,
+                                              child.block_index,
+                                              child.disk_slot,
+                                              child.generation);
+    EXPECT_FALSE(detached_child.has_value());
+    detached_child = cache.releaseInFlight(2,
+                                           CacheBlockKind::COMPRESSED_KV,
+                                           CacheBackingType::MEMORY,
+                                           child.block_index,
+                                           child.disk_slot,
+                                           child.generation);
+    ASSERT_TRUE(detached_child.has_value());
+
+    auto evicted = cache.popOldestEvictable(CacheBlockKind::COMPRESSED_KV);
+    ASSERT_TRUE(evicted.has_value());
+    EXPECT_EQ(evicted->cache_key, 1);
+}
+
+TEST(PrefixTreeMemoryBlockCacheTest, InFlightReleaseRestoresEvictability) {
+    PrefixTreeMemoryBlockCache cache;
+    ASSERT_TRUE(cache.putCommitted(1, rootDep(), item(1, CacheBlockKind::COMPRESSED_KV, 11)).first);
+
+    auto in_flight = cache.matchAndMarkInFlight(1, CacheBlockKind::COMPRESSED_KV);
+    ASSERT_TRUE(in_flight.found);
+    EXPECT_FALSE(cache.popOldestEvictable(CacheBlockKind::COMPRESSED_KV).has_value());
+
+    cache.releaseInFlight(1,
+                          CacheBlockKind::COMPRESSED_KV,
+                          CacheBackingType::MEMORY,
+                          in_flight.block_index,
+                          in_flight.disk_slot,
+                          in_flight.generation);
+    auto evicted = cache.popOldestEvictable(CacheBlockKind::COMPRESSED_KV);
+    ASSERT_TRUE(evicted.has_value());
+    EXPECT_EQ(evicted->cache_key, 1);
+}
+
+TEST(PrefixTreeMemoryBlockCacheTest, DiskBackingMatchesAndEvictsByBacking) {
+    PrefixTreeMemoryBlockCache cache;
+    ASSERT_TRUE(cache.putCommitted(1, rootDep(), diskItem(1, CacheBlockKind::COMPRESSED_KV, 7)).first);
+    ASSERT_TRUE(cache.putCommitted(2, rootDep(), item(2, CacheBlockKind::COMPRESSED_KV, 22)).first);
+
+    auto matched = cache.matchAndMarkInFlight(1, CacheBlockKind::COMPRESSED_KV);
+    ASSERT_TRUE(matched.found);
+    EXPECT_EQ(matched.backing_type, CacheBackingType::DISK);
+    EXPECT_EQ(matched.disk_slot, 7);
+    EXPECT_FALSE(cache.popOldestEvictable(CacheBlockKind::COMPRESSED_KV, CacheBackingType::DISK).has_value());
+
+    auto released = cache.releaseInFlight(1,
+                                          CacheBlockKind::COMPRESSED_KV,
+                                          CacheBackingType::DISK,
+                                          matched.block_index,
+                                          matched.disk_slot,
+                                          matched.generation);
+    EXPECT_FALSE(released.has_value());
+
+    auto evicted_disk = cache.popOldestEvictable(CacheBlockKind::COMPRESSED_KV, CacheBackingType::DISK);
+    ASSERT_TRUE(evicted_disk.has_value());
+    EXPECT_EQ(evicted_disk->cache_key, 1);
+    EXPECT_EQ(evicted_disk->disk_slot, 7);
+
+    auto evicted_mem = cache.popOldestEvictable(CacheBlockKind::COMPRESSED_KV, CacheBackingType::MEMORY);
+    ASSERT_TRUE(evicted_mem.has_value());
+    EXPECT_EQ(evicted_mem->cache_key, 2);
+    EXPECT_EQ(evicted_mem->block_index, 22);
+}
+
+TEST(PrefixTreeMemoryBlockCacheTest, StateIndependentEvictionDropsDeepestNonTailState) {
+    PrefixTreeMemoryBlockCache cache;
+    ASSERT_TRUE(cache.putCommitted(1, rootDep(), item(1, CacheBlockKind::COMPRESSED_KV, 11)).first);
+    ASSERT_TRUE(cache.putCommitted(1, rootDep(), item(1, CacheBlockKind::STATE_SWA_KV, 101)).first);
+    ASSERT_TRUE(cache.putCommitted(2, childDep(1, 1), item(2, CacheBlockKind::COMPRESSED_KV, 12)).first);
+    ASSERT_TRUE(cache.putCommitted(2, childDep(1, 1), item(2, CacheBlockKind::STATE_SWA_KV, 102)).first);
+    ASSERT_TRUE(cache.putCommitted(3, childDep(2, 2), item(3, CacheBlockKind::COMPRESSED_KV, 13)).first);
+    ASSERT_TRUE(cache.putCommitted(3, childDep(2, 2), item(3, CacheBlockKind::STATE_SWA_KV, 103)).first);
+
+    auto evicted = cache.popOldestStateOrChainEvictable(CacheBackingType::MEMORY);
+
+    ASSERT_EQ(evicted.size(), 1u);
+    EXPECT_EQ(evicted[0].cache_key, 2);
+    EXPECT_EQ(evicted[0].kind, CacheBlockKind::STATE_SWA_KV);
+    EXPECT_EQ(evicted[0].block_index, 102);
+    EXPECT_TRUE(cache.contains(2, CacheBlockKind::COMPRESSED_KV));
+    EXPECT_FALSE(cache.contains(2, CacheBlockKind::STATE_SWA_KV));
+    EXPECT_TRUE(cache.contains(3, CacheBlockKind::STATE_SWA_KV));
+}
+
+TEST(PrefixTreeMemoryBlockCacheTest, StateIndependentEvictionFallsBackToWholeChain) {
+    PrefixTreeMemoryBlockCache cache;
+    ASSERT_TRUE(cache.putCommitted(1, rootDep(), item(1, CacheBlockKind::COMPRESSED_KV, 11)).first);
+    ASSERT_TRUE(cache.putCommitted(2, childDep(1, 1), item(2, CacheBlockKind::COMPRESSED_KV, 12)).first);
+    ASSERT_TRUE(cache.putCommitted(2, childDep(1, 1), item(2, CacheBlockKind::STATE_SWA_KV, 102)).first);
+
+    auto evicted = cache.popOldestStateOrChainEvictable(CacheBackingType::MEMORY);
+
+    ASSERT_EQ(evicted.size(), 3u);
+    EXPECT_EQ(evicted[0].cache_key, 2);
+    EXPECT_EQ(evicted[0].kind, CacheBlockKind::COMPRESSED_KV);
+    EXPECT_EQ(evicted[1].cache_key, 2);
+    EXPECT_EQ(evicted[1].kind, CacheBlockKind::STATE_SWA_KV);
+    EXPECT_EQ(evicted[2].cache_key, 1);
+    EXPECT_EQ(evicted[2].kind, CacheBlockKind::COMPRESSED_KV);
+    EXPECT_EQ(cache.size(), 0u);
+}
+
+}  // namespace rtp_llm::test
diff --git a/rtp_llm/cpp/cache/connector/memory/test/mock/MockKVCacheMemoryConnector.h b/rtp_llm/cpp/cache/connector/memory/test/mock/MockKVCacheMemoryConnector.h
index a092c752e9..f33d587f67 100644
--- a/rtp_llm/cpp/cache/connector/memory/test/mock/MockKVCacheMemoryConnector.h
+++ b/rtp_llm/cpp/cache/connector/memory/test/mock/MockKVCacheMemoryConnector.h
@@ -8,6 +8,14 @@ namespace rtp_llm {
 
 class MockKVCacheMemoryConnector: public KVCacheMemoryConnector {
 public:
+    MockKVCacheMemoryConnector(const CacheConfig&                       cache_config,
+                               const KVCacheConfig&                     kv_cache_config,
+                               const ParallelismConfig&                 parallelism_config,
+                               const std::shared_ptr<KVCacheAllocator>& allocator,
+                               const std::vector<std::string>&          worker_addrs,
+                               const kmonitor::MetricsReporterPtr&      metrics_reporter):
+        KVCacheMemoryConnector(
+            cache_config, kv_cache_config, parallelism_config, allocator, worker_addrs, metrics_reporter) {}
     MockKVCacheMemoryConnector(const CacheConfig&                       cache_config,
                                const KVCacheConfig&                     kv_cache_config,
                                const std::shared_ptr<KVCacheAllocator>& allocator,
diff --git a/rtp_llm/cpp/cache/connector/p2p/LayerBlockConverterImpl.h b/rtp_llm/cpp/cache/connector/p2p/LayerBlockConverterImpl.h
index 360bf1c0bd..84b597a343 100644
--- a/rtp_llm/cpp/cache/connector/p2p/LayerBlockConverterImpl.h
+++ b/rtp_llm/cpp/cache/connector/p2p/LayerBlockConverterImpl.h
@@ -3,7 +3,7 @@
 #include <torch/extension.h>
 
 #include "rtp_llm/cpp/cache/connector/p2p/LayerBlockConverter.h"
-#include "rtp_llm/cpp/cache/KVCacheAllocator.h"
+#include "rtp_llm/cpp/cache/allocator/KVCacheAllocator.h"
 #include "rtp_llm/cpp/cache/BlockInfo.h"
 
 namespace rtp_llm {
diff --git a/rtp_llm/cpp/cache/connector/p2p/test/P2PBroadcastClientTest.cc b/rtp_llm/cpp/cache/connector/p2p/test/P2PBroadcastClientTest.cc
index 9c6d6a3084..aa8f4cacce 100644
--- a/rtp_llm/cpp/cache/connector/p2p/test/P2PBroadcastClientTest.cc
+++ b/rtp_llm/cpp/cache/connector/p2p/test/P2PBroadcastClientTest.cc
@@ -100,14 +100,9 @@ TEST_F(P2PBroadcastClientTest, Broadcast_ReturnNotNull_AllRequestsSuccess) {
 }
 
 TEST_F(P2PBroadcastClientTest, Broadcast_ReturnNotNull_Timeout) {
-    // 设置服务器延迟响应
-    for (auto& server : servers_) {
-        server->service()->setSleepMillis(200);
-    }
-
     std::string unique_key  = "test_broadcast_timeout";
     int64_t     request_id  = 1002;
-    int64_t     deadline_ms = currentTimeMs() + 10;  // 很短的超时时间
+    int64_t     deadline_ms = currentTimeMs() - 1;
 
     std::vector<std::shared_ptr<LayerCacheBuffer>> layer_cache_buffers;
     layer_cache_buffers.push_back(createLayerCacheBuffer(0, 2));
@@ -115,18 +110,13 @@ TEST_F(P2PBroadcastClientTest, Broadcast_ReturnNotNull_Timeout) {
     std::vector<std::pair<std::string, uint32_t>> decode_transfer_servers;
     decode_transfer_servers.push_back({"127.0.0.1", 12345});
 
-    // 执行 broadcast
     auto result = client_->broadcast(request_id,
                                      layer_cache_buffers,
                                      decode_transfer_servers,
                                      unique_key,
                                      deadline_ms,
                                      P2PConnectorBroadcastType::READ);
-
-    ASSERT_NE(result, nullptr);
-
-    // broadcast gRPC 超时时 BroadcastManager::waitDone 抛 RTPException
-    EXPECT_THROW(waitDone(result, 500), RTPException);
+    EXPECT_EQ(result, nullptr);
 }
 
 TEST_F(P2PBroadcastClientTest, Broadcast_ReturnNotNull_PartialResponseFailed) {
diff --git a/rtp_llm/cpp/cache/connector/p2p/test/P2PConnectorSchedulerTest.cc b/rtp_llm/cpp/cache/connector/p2p/test/P2PConnectorSchedulerTest.cc
index 3a2ffe6842..c4c398632d 100644
--- a/rtp_llm/cpp/cache/connector/p2p/test/P2PConnectorSchedulerTest.cc
+++ b/rtp_llm/cpp/cache/connector/p2p/test/P2PConnectorSchedulerTest.cc
@@ -51,11 +51,11 @@ class P2PConnectorSchedulerTest: public ::testing::Test {
     // 创建有效的 KVCacheResource（使用 initGroups + groupBlocks/blocks/cacheKeys 公开 API）
     KVCacheResourcePtr createValidKVCacheResource(int num_layers = 2, int blocks_per_layer = 2) {
         auto             resource = std::make_shared<KVCacheResource>();
-        std::vector<int> layer_to_group(num_layers);
+        std::vector<std::vector<int>> layer_group_ids(num_layers);
         for (int i = 0; i < num_layers; ++i) {
-            layer_to_group[i] = i;
+            layer_group_ids[i] = {i};
         }
-        resource->initGroups(num_layers, num_layers, layer_to_group);
+        resource->initGroups(num_layers, num_layers, layer_group_ids);
 
         for (int layer_id = 0; layer_id < num_layers; ++layer_id) {
             for (int i = 0; i < blocks_per_layer; ++i) {
@@ -188,23 +188,18 @@ TEST_F(P2PConnectorSchedulerTest, HandleRead_ReturnError_BroadcastPartialFailed)
     }
 }
 
-// 测试: broadcast worker 慢于 gRPC deadline，checkDone 路径抛 RTPException
-TEST_F(P2PConnectorSchedulerTest, HandleRead_ThrowException_BroadcastTimeout) {
-    for (auto& server : tp_broadcast_servers_) {
-        server->service()->setSleepMillis(500);  // 延迟 500ms
-        break;
-    }
-
+// 测试: broadcast 已超过 deadline，返回超时错误
+TEST_F(P2PConnectorSchedulerTest, HandleRead_ReturnError_BroadcastTimeout) {
     auto valid_resource = createValidKVCacheResource(2, 2);
 
     std::vector<std::pair<std::string, uint32_t>> decode_transfer_servers;
     decode_transfer_servers.push_back({"127.0.0.1", 12345});
 
-    auto deadline_ms = currentTimeMs() + 50;
+    auto deadline_ms = currentTimeMs() - 1;
 
-    EXPECT_THROW(
-        scheduler_->sendKVCache(valid_resource, "test_broadcast_timeout", 1004, decode_transfer_servers, deadline_ms),
-        RTPException);
+    auto error_info =
+        scheduler_->sendKVCache(valid_resource, "test_broadcast_timeout", 1004, decode_transfer_servers, deadline_ms);
+    EXPECT_TRUE(error_info.hasError());
 }
 
 // 测试: handleRead 被 client 取消, 返回失败
@@ -420,21 +415,14 @@ TEST_F(P2PConnectorSchedulerTest, AsyncRead_ReturnFalse_PrefillTimeout) {
     EXPECT_EQ(prefill_server_->service()->getStartLoadCallCount(), 1);
 }
 
-// 测试: broadcast worker 慢于 gRPC deadline，checkDone 抛 RTPException
-TEST_F(P2PConnectorSchedulerTest, AsyncRead_ThrowException_BroadcastTimeout) {
-    tp_broadcast_servers_[0]->service()->setSleepMillis(500);
-
-    scheduler_->stopChecker();
-
+// 测试: async read 已超过 deadline，返回超时错误
+TEST_F(P2PConnectorSchedulerTest, AsyncRead_ReturnError_BroadcastTimeout) {
     auto resource = createValidKVCacheResource(2, 2);
-    auto meta     = createMockMeta(2008, "test_async_read_broadcast_timeout", currentTimeMs() + 50);
+    auto meta     = createMockMeta(2008, "test_async_read_broadcast_timeout", currentTimeMs() - 1);
 
     auto result = scheduler_->asyncRead(resource, meta, {0, -1});
-    ASSERT_TRUE(result.ok());
-    auto async_context = result.context;
-    ASSERT_NE(async_context, nullptr);
-
-    EXPECT_THROW(waitAsyncContextDone(async_context, 500, /*check_done=*/true), RTPException);
+    EXPECT_FALSE(result.ok());
+    EXPECT_EQ(result.context, nullptr);
 }
 
 // 测试: asyncread prefill 失败, 取消broadcast
@@ -509,22 +497,16 @@ TEST_F(P2PConnectorSchedulerTest, AsyncRead_CancelPrefill_WhenBroadcastFailed) {
     // 服务端可能已经开始处理请求，所以这里不验证取消是否成功
 }
 
-// Prefill：worker 极慢导致 gRPC DEADLINE_EXCEEDED 时抛 RTPException（与 BroadcastManager 行为一致）
-TEST_F(P2PConnectorSchedulerTest, SendKVCache_ThrowException_WhenBroadcastExceedsDeadline) {
-    for (auto& server : tp_broadcast_servers_) {
-        server->service()->setSleepMillis(120000);
-        server->service()->setP2PResponseSuccess(true);
-    }
-
+// Prefill：已超过 deadline 时返回 broadcast 失败错误
+TEST_F(P2PConnectorSchedulerTest, SendKVCache_ReturnError_WhenBroadcastExceedsDeadline) {
     auto                                          valid_resource = createValidKVCacheResource(2, 2);
     std::vector<std::pair<std::string, uint32_t>> decode_transfer_servers;
     decode_transfer_servers.push_back({"127.0.0.1", 12345});
 
-    const int64_t deadline_ms = currentTimeMs() + 80;
-    EXPECT_THROW(
-        scheduler_->sendKVCache(
-            valid_resource, "test_prefill_broadcast_past_deadline", 4006, decode_transfer_servers, deadline_ms),
-        RTPException);
+    const int64_t deadline_ms = currentTimeMs() - 1;
+    auto          error_info  = scheduler_->sendKVCache(
+        valid_resource, "test_prefill_broadcast_past_deadline", 4006, decode_transfer_servers, deadline_ms);
+    EXPECT_TRUE(error_info.hasError());
 }
 
 // StartLoad 返回 TRANSFER_NOT_DONE 且 hold_ms>0：checkDone 进入保留窗口，done 仍为 false 且 needCancel 为 false；hold
diff --git a/rtp_llm/cpp/cache/connector/p2p/test/P2PConnectorTest.cc b/rtp_llm/cpp/cache/connector/p2p/test/P2PConnectorTest.cc
index 252456f600..fb663efe76 100644
--- a/rtp_llm/cpp/cache/connector/p2p/test/P2PConnectorTest.cc
+++ b/rtp_llm/cpp/cache/connector/p2p/test/P2PConnectorTest.cc
@@ -79,11 +79,11 @@ class P2PConnectorTest: public ::testing::Test {
     // 创建有效的 KVCacheResource（使用 initGroups + groupBlocks/blocks/cacheKeys 公开 API）
     KVCacheResourcePtr createValidKVCacheResource(int num_layers = 2, int blocks_per_layer = 2) {
         auto             resource = std::make_shared<KVCacheResource>();
-        std::vector<int> layer_to_group(num_layers);
+        std::vector<std::vector<int>> layer_group_ids(num_layers);
         for (int i = 0; i < num_layers; ++i) {
-            layer_to_group[i] = i;
+            layer_group_ids[i] = {i};
         }
-        resource->initGroups(num_layers, num_layers, layer_to_group);
+        resource->initGroups(num_layers, num_layers, layer_group_ids);
 
         for (int layer_id = 0; layer_id < num_layers; ++layer_id) {
             for (int i = 0; i < blocks_per_layer; ++i) {
diff --git a/rtp_llm/cpp/cache/connector/p2p/test/P2PConnectorWorkerTest.cc b/rtp_llm/cpp/cache/connector/p2p/test/P2PConnectorWorkerTest.cc
index e50d151bac..4e7849df85 100644
--- a/rtp_llm/cpp/cache/connector/p2p/test/P2PConnectorWorkerTest.cc
+++ b/rtp_llm/cpp/cache/connector/p2p/test/P2PConnectorWorkerTest.cc
@@ -17,7 +17,7 @@
 #include "rtp_llm/cpp/cache/connector/p2p/ComputedLayerCacheBuffer.h"
 #include "rtp_llm/cpp/utils/ErrorCode.h"
 #include "rtp_llm/cpp/utils/TimeUtil.h"
-#include "rtp_llm/cpp/cache/KVCacheAllocator.h"
+#include "rtp_llm/cpp/cache/allocator/KVCacheAllocator.h"
 #include "rtp_llm/cpp/cache/BatchKVCacheResource.h"
 namespace rtp_llm {
 
@@ -279,11 +279,11 @@ class P2PConnectorWorkerTest: public ::testing::Test {
     KVCacheResourcePtr createKVCacheResource(int layer_id, int num_blocks = 2) {
         auto             resource  = std::make_shared<KVCacheResource>();
         int              layer_num = static_cast<int>(worker_config_.layer_all_num);
-        std::vector<int> layer_to_group(layer_num);
+        std::vector<std::vector<int>> layer_group_ids(layer_num);
         for (int i = 0; i < layer_num; ++i) {
-            layer_to_group[i] = i;
+            layer_group_ids[i] = {i};
         }
-        resource->initGroups(layer_num, layer_num, layer_to_group);
+        resource->initGroups(layer_num, layer_num, layer_group_ids);
 
         for (int i = 0; i < layer_num; ++i) {
             if (i == layer_id) {
@@ -1006,11 +1006,11 @@ class LayerCacheBufferUtilTest: public ::testing::Test {
 protected:
     KVCacheResourcePtr createResource(int num_layers, int blocks_per_layer) {
         auto             resource = std::make_shared<KVCacheResource>();
-        std::vector<int> layer_to_group(num_layers);
+        std::vector<std::vector<int>> layer_group_ids(num_layers);
         for (int i = 0; i < num_layers; ++i) {
-            layer_to_group[i] = i;
+            layer_group_ids[i] = {i};
         }
-        resource->initGroups(num_layers, num_layers, layer_to_group);
+        resource->initGroups(num_layers, num_layers, layer_group_ids);
         for (int layer = 0; layer < num_layers; ++layer) {
             for (int i = 0; i < blocks_per_layer; ++i) {
                 resource->mutableBlockIds(layer).add({i});
diff --git a/rtp_llm/cpp/cache/connector/remote_connector/GroupPolicy.cc b/rtp_llm/cpp/cache/connector/remote_connector/GroupPolicy.cc
index a500c2419f..f445f36f9f 100644
--- a/rtp_llm/cpp/cache/connector/remote_connector/GroupPolicy.cc
+++ b/rtp_llm/cpp/cache/connector/remote_connector/GroupPolicy.cc
@@ -5,7 +5,7 @@
 #include "autil/EnvUtil.h"
 #include "rtp_llm/cpp/cache/connector/remote_connector/GroupPolicy.h"
 #include "rtp_llm/cpp/cache/Types.h"
-#include "rtp_llm/cpp/cache/KVCacheAllocator.h"
+#include "rtp_llm/cpp/cache/allocator/KVCacheAllocator.h"
 #include "rtp_llm/cpp/utils/Logger.h"
 
 namespace rtp_llm {
@@ -70,27 +70,32 @@ bool DefaultLayerGroupPolicy::init() {
     }
     const auto  layer_layout       = allocator_->allLayerCacheBase();
     uint64_t    group_name_bithash = 1;
-    const auto& layer_to_groups    = layer_layout.layer_to_groups;
-    for (int layer = 0; layer < static_cast<int>(layer_to_groups.size()); ++layer) {
-        const int group_idx     = layer_to_groups.at(layer);
-        bool      is_full_group = false;
-        if (full_group_ids_.find(group_idx) != full_group_ids_.end()) {
-            is_full_group = true;
+    const auto& layer_group_ids    = layer_layout.layer_to_group_ids;
+    for (int layer = 0; layer < static_cast<int>(layer_group_ids.size()); ++layer) {
+        if (layer_group_ids.at(layer).empty()) {
+            RTP_LLM_LOG_ERROR("layer [%d] has no cache group id", layer);
+            return false;
         }
-        if (!is_full_group) {
-            if (other_group_ids_.find(group_idx) == other_group_ids_.end()) {
-                RTP_LLM_LOG_ERROR("not find valid group id, [%d]", group_idx);
-                return false;
+        for (const int group_idx : layer_group_ids.at(layer)) {
+            bool is_full_group = false;
+            if (full_group_ids_.find(group_idx) != full_group_ids_.end()) {
+                is_full_group = true;
             }
+            if (!is_full_group) {
+                if (other_group_ids_.find(group_idx) == other_group_ids_.end()) {
+                    RTP_LLM_LOG_ERROR("not find valid group id, [%d]", group_idx);
+                    return false;
+                }
+            }
+            if (groups_.count(group_idx) == 0) {
+                std::string group_name         = is_full_group ? ("F" + std::to_string(group_idx)) :
+                                                                 (GetOtherGroupPrefixName() + std::to_string(group_idx));
+                groups_[group_idx]             = Group{is_full_group, group_name_bithash, group_name};
+                group_to_layer_ids_[group_idx] = {};
+                group_name_bithash <<= 1;
+            }
+            group_to_layer_ids_.at(group_idx).push_back(layer);
         }
-        if (groups_.count(group_idx) == 0) {
-            std::string group_name         = is_full_group ? ("F" + std::to_string(group_idx)) :
-                                                             (GetOtherGroupPrefixName() + std::to_string(group_idx));
-            groups_[group_idx]             = Group{is_full_group, group_name_bithash, group_name};
-            group_to_layer_ids_[group_idx] = {};
-            group_name_bithash <<= 1;
-        }
-        group_to_layer_ids_.at(group_idx).push_back(layer);
     }
     if (groups_.size() > 64) {
         RTP_LLM_LOG_ERROR("not support bigger than 64 groups");
@@ -158,17 +163,20 @@ bool DefaultLayerGroupPolicy::genBlockBuffers(const std::vector<int32_t>&     gr
         iovs.reserve(layer_ids.size() * 2);
         for (size_t j = 0; j < layer_ids.size(); ++j) {
             // if support scale, block_infos: {kv_info, scale_info}
-            const auto& block_infos = allocator_->convertIndexToBuffer(layer_ids[j], block_ids[i]);
+            const auto& block_infos = allocator_->convertIndexToBuffer(layer_ids[j], group_ids[i], block_ids[i]);
             if (block_infos.empty()) {
-                RTP_LLM_LOG_WARNING(
-                    "convertIndexToBuffer returned empty for layer_id [%d] block_id[%d]", layer_ids[j], block_ids[i]);
+                RTP_LLM_LOG_WARNING("convertIndexToBuffer returned empty for layer_id [%d] group_id [%d] block_id[%d]",
+                                    layer_ids[j],
+                                    group_ids[i],
+                                    block_ids[i]);
             }
             for (size_t idx = 0; idx < block_infos.size(); ++idx) {
                 CHECK_BLOCK_INFO_VALID(
                     block_infos[idx],
-                    "convertIndexToBuffer failed layer_id [%d] block_id[%d], block_info.addr or block_info.size_bytes is invalid",
-                    j,
-                    i);
+                    "convertIndexToBuffer failed layer_id [%d] group_id [%d] block_id[%d], block_info.addr or block_info.size_bytes is invalid",
+                    layer_ids[j],
+                    group_ids[i],
+                    block_ids[i]);
                 push_iov(iovs, block_infos[idx]);
             }
         }
diff --git a/rtp_llm/cpp/cache/connector/remote_connector/RemoteConnector.cc b/rtp_llm/cpp/cache/connector/remote_connector/RemoteConnector.cc
index 254b3db08e..4aa724449f 100644
--- a/rtp_llm/cpp/cache/connector/remote_connector/RemoteConnector.cc
+++ b/rtp_llm/cpp/cache/connector/remote_connector/RemoteConnector.cc
@@ -7,7 +7,7 @@
 #include "rtp_llm/cpp/utils/AssertUtils.h"
 #include "rtp_llm/cpp/utils/Logger.h"
 #include "rtp_llm/cpp/utils/TimeUtil.h"
-#include "rtp_llm/cpp/cache/KVCacheAllocator.h"
+#include "rtp_llm/cpp/cache/allocator/KVCacheAllocator.h"
 #include "rtp_llm/models_py/bindings/cuda/cuda_host_utils.h"
 #include "rtp_llm/cpp/metrics/RtpLLMMetrics.h"
 #include "rtp_llm/cpp/cache/connector/Meta.h"
@@ -181,18 +181,20 @@ RemoteConnector::RemoteConnector(const CacheConfig&                        cache
                                             register_buffer_size};
     init_params_ = std::make_shared<RemoteConnector::InitParams>(std::move(init_params));
     std::vector<int32_t> full_group_ids, linear_group_ids;
-    if (cache_config.linear_group_num == 0) {
-        full_group_ids.push_back(0);
+    for (int32_t group_id = 0; group_id < cache_config.groupNums(); group_id++) {
+        if (cache_config.typeForGroup(static_cast<size_t>(group_id)) == CacheGroupType::FULL) {
+            full_group_ids.push_back(group_id);
+        } else {
+            linear_group_ids.push_back(group_id);
+        }
+    }
+    if (linear_group_ids.empty()) {
+        if (full_group_ids.empty()) {
+            full_group_ids.push_back(0);
+        }
         group_policy_ =
             std::make_unique<remote_connector::FullLayerGroupPolicy>(allocator, full_group_ids, linear_group_ids);
     } else {
-        for (int32_t group_id = 0; static_cast<size_t>(group_id) < cache_config.group_types.size(); group_id++) {
-            if (cache_config.group_types[group_id] == CacheGroupType::FULL) {
-                full_group_ids.push_back(group_id);
-            } else {
-                linear_group_ids.push_back(group_id);
-            }
-        }
         group_policy_ = std::make_unique<remote_connector::FullLinearLayerGroupPolicy>(
             allocator, full_group_ids, linear_group_ids, std::max(1, cache_config.linear_step));
     }
diff --git a/rtp_llm/cpp/cache/connector/remote_connector/test/BUILD b/rtp_llm/cpp/cache/connector/remote_connector/test/BUILD
index 0318ea9af8..35db1b1fde 100644
--- a/rtp_llm/cpp/cache/connector/remote_connector/test/BUILD
+++ b/rtp_llm/cpp/cache/connector/remote_connector/test/BUILD
@@ -5,6 +5,18 @@ test_copts = [
     "-fno-access-control",
 ] + copts()
 
+cc_import(
+    name = "cuda13_torch_nvshmem",
+    shared_library = "@pip_gpu_cuda13_torch_torch//:site-packages/torch/lib/libtorch_nvshmem.so",
+)
+
+cuda13_torch_link_deps = select({
+    "@//:using_cuda13_x86": [
+        ":cuda13_torch_nvshmem",
+    ],
+    "//conditions:default": [],
+})
+
 test_deps = [
     "//rtp_llm/models_py/bindings/cuda/ops:cuda_impl",
     "//rtp_llm/models_py/bindings/core:exec_ops_test_lib",
@@ -13,7 +25,7 @@ test_deps = [
     "@com_google_googletest//:gtest_main",
     "@local_config_cuda//cuda:cuda_headers",
     "@local_config_cuda//cuda:cudart",
-] + torch_deps()
+] + torch_deps() + cuda13_torch_link_deps
 
 test_copts = [
     "-fno-access-control",
diff --git a/rtp_llm/cpp/cache/connector/remote_connector/test/GroupPolicyTest.cc b/rtp_llm/cpp/cache/connector/remote_connector/test/GroupPolicyTest.cc
index 64ba760189..f5e8aa54ea 100644
--- a/rtp_llm/cpp/cache/connector/remote_connector/test/GroupPolicyTest.cc
+++ b/rtp_llm/cpp/cache/connector/remote_connector/test/GroupPolicyTest.cc
@@ -2,7 +2,7 @@
 #include <gtest/gtest.h>
 
 #include "rtp_llm/cpp/utils/Logger.h"
-#include "rtp_llm/cpp/cache/KVCacheAllocator.h"
+#include "rtp_llm/cpp/cache/allocator/KVCacheAllocator.h"
 #include "rtp_llm/cpp/cache/connector/remote_connector/GroupPolicy.h"
 
 using namespace rtp_llm;
@@ -33,12 +33,12 @@ class FakeKVCacheAllocator: public KVCacheAllocator {
         KVCacheAllocator(config) {
         for (int32_t full_group_id : full_group_ids) {
             for (int i = 0; i < per_group_layer_num; i++) {
-                fake_layout_.layer_to_groups.push_back(full_group_id);
+                fake_layout_.layer_to_group_ids.push_back({full_group_id});
             }
         }
         for (int32_t other_group_id : other_group_ids) {
             for (int i = 0; i < per_group_layer_num; i++) {
-                fake_layout_.layer_to_groups.push_back(other_group_id);
+                fake_layout_.layer_to_group_ids.push_back({other_group_id});
             }
         }
     }
diff --git a/rtp_llm/cpp/cache/connector/remote_connector/test/RemoteConnectorInternalTest.cc b/rtp_llm/cpp/cache/connector/remote_connector/test/RemoteConnectorInternalTest.cc
index ac3565361e..bdebbeb8ef 100644
--- a/rtp_llm/cpp/cache/connector/remote_connector/test/RemoteConnectorInternalTest.cc
+++ b/rtp_llm/cpp/cache/connector/remote_connector/test/RemoteConnectorInternalTest.cc
@@ -1,8 +1,10 @@
 #include <gmock/gmock.h>
 #include <gtest/gtest.h>
 
+#include <numeric>
+
 #include "rtp_llm/cpp/cache/connector/remote_connector/RemoteConnector.h"
-#include "rtp_llm/cpp/cache/KVCacheAllocator.h"
+#include "rtp_llm/cpp/cache/allocator/KVCacheAllocator.h"
 #include "rtp_llm/cpp/utils/Logger.h"
 #include "rtp_llm/models_py/bindings/core/ExecOps.h"
 #include "autil/EnvUtil.h"
@@ -29,12 +31,12 @@ class FakeKVCacheAllocator: public KVCacheAllocator {
         KVCacheAllocator(config) {
         for (int32_t full_group_id : full_group_ids) {
             for (int i = 0; i < per_group_layer_num; i++) {
-                fake_layout_.layer_to_groups.push_back(full_group_id);
+                fake_layout_.layer_to_group_ids.push_back({full_group_id});
             }
         }
         for (int32_t other_group_id : other_group_ids) {
             for (int i = 0; i < per_group_layer_num; i++) {
-                fake_layout_.layer_to_groups.push_back(other_group_id);
+                fake_layout_.layer_to_group_ids.push_back({other_group_id});
             }
         }
     }
@@ -128,22 +130,23 @@ class RemoteConnectorInternalTest: public ::testing::Test {
     void SetUp() override {
         rtp_llm::initLogger();
         auto mha_spec                = std::make_shared<MHAKVCacheSpec>();
-        mha_spec->layer_num          = layer_num_;
         mha_spec->local_head_num_kv  = 8;
         mha_spec->size_per_head      = 128;
         mha_spec->seq_size_per_block = 8;
         mha_spec->dtype              = rtp_llm::DataType::TYPE_FP16;
         mha_spec->type               = KVCacheSpecType::MultiHeadAttention;
         cache_config_.block_num      = 8;
-        cache_config_.cache_specs.push_back(mha_spec);
-        byte_size_per_block_           = static_cast<size_t>(mha_spec->block_size_bytes() * mha_spec->layer_num);
+        byte_size_per_block_           = static_cast<size_t>(mha_spec->block_size_bytes() * layer_num_);
         cache_config_.block_size_bytes = byte_size_per_block_;
         cache_config_.dtype            = rtp_llm::DataType::TYPE_FP16;
-        cache_config_.group_types.push_back(CacheGroupType::FULL);
-        cache_config_.group_types.push_back(CacheGroupType::LINEAR);
-        cache_config_.group_types.push_back(CacheGroupType::LINEAR);
-        cache_config_.full_group_num   = 1;
-        cache_config_.linear_group_num = 2;
+        cache_config_.layer_num        = layer_num_;
+        cache_config_.layer_all_num    = layer_num_;
+        std::vector<int> layers(layer_num_);
+        std::iota(layers.begin(), layers.end(), 0);
+        cache_config_.fromGroupedSpecs({mha_spec, mha_spec, mha_spec},
+                                       {layers, layers, layers},
+                                       {CacheGroupType::FULL, CacheGroupType::LINEAR, CacheGroupType::LINEAR},
+                                       {"F0", "L1", "L2"});
     }
 
     void TearDown() override {}
diff --git a/rtp_llm/cpp/cache/connector/remote_connector/test/RemoteConnectorMockFullLinearTest.cc b/rtp_llm/cpp/cache/connector/remote_connector/test/RemoteConnectorMockFullLinearTest.cc
index 4f8d6d18cc..153c8d0a7f 100644
--- a/rtp_llm/cpp/cache/connector/remote_connector/test/RemoteConnectorMockFullLinearTest.cc
+++ b/rtp_llm/cpp/cache/connector/remote_connector/test/RemoteConnectorMockFullLinearTest.cc
@@ -1,7 +1,7 @@
-#include "rtp_llm/cpp/cache/KVCacheAllocator.h"
+#include "rtp_llm/cpp/cache/allocator/KVCacheAllocator.h"
 #include "rtp_llm/cpp/cache/connector/remote_connector/test/RemoteConnectorMockTestBase.h"
 #include "rtp_llm/cpp/cache/connector/Meta.h"
-#include "rtp_llm/cpp/cache/HybridTypeKVCacheAllocator.h"
+#include "rtp_llm/cpp/cache/allocator/HybridTypeKVCacheAllocator.h"
 #include "rtp_llm/cpp/utils/AssertUtils.h"
 #include "rtp_llm/cpp/config/StaticConfig.h"
 
@@ -94,16 +94,13 @@ class RemoteConnectorMockFullLinearTest: public RemoteConnectorMockTestBase {
     }
 
     void initHybridLayerCacheConfig(int layer_num = 4, int block_num = 10, int seq_size_per_block = 8) {
-        cache_config_.linear_group_num = other_group_ids_.size();
-        cache_config_.full_group_num   = full_group_ids_.size();
-        size_t all_group_num           = cache_config_.linear_group_num + cache_config_.full_group_num;
+        size_t all_group_num           = other_group_ids_.size() + full_group_ids_.size();
         cache_config_.layer_num        = all_group_num * layer_num;
         cache_config_.layer_all_num    = all_group_num * layer_num;
         cache_config_.group_layer_num  = layer_num;
         int unique_layer_id            = 0;
 
         auto full_spec                = std::make_shared<MHAKVCacheSpec>();
-        full_spec->layer_num          = layer_num;
         full_spec->local_head_num_kv  = 8;
         full_spec->size_per_head      = 128;
         full_spec->seq_size_per_block = seq_size_per_block;
@@ -113,7 +110,6 @@ class RemoteConnectorMockFullLinearTest: public RemoteConnectorMockTestBase {
         auto linear_spec                = std::make_shared<LinearKVCacheSpec>();
         linear_spec->type               = KVCacheSpecType::LinearAttention;
         linear_spec->dtype              = rtp_llm::DataType::TYPE_FP16;
-        linear_spec->layer_num          = layer_num;
         linear_spec->local_num_k_heads  = 1;
         linear_spec->local_num_v_heads  = 1;
         linear_spec->head_k_dim         = 1;
@@ -122,38 +118,37 @@ class RemoteConnectorMockFullLinearTest: public RemoteConnectorMockTestBase {
         linear_spec->local_head_num_kv  = 1;
         linear_spec->seq_size_per_block = seq_size_per_block;
 
-        for (int i = 0; i < cache_config_.full_group_num; i++) {
-            cache_config_.global_layer_ids.push_back({});
-            cache_config_.layer_ids.push_back({});
-            cache_config_.group_types.push_back(CacheGroupType::FULL);
-            cache_config_.cache_specs.push_back(full_spec);
-            cache_config_.full_groups.push_back({});
+        std::vector<KVCacheSpecPtr>    specs;
+        std::vector<std::vector<int>>  layers_by_group;
+        std::vector<CacheGroupType>    types;
+        std::vector<std::string>       tags;
+
+        for (size_t i = 0; i < full_group_ids_.size(); i++) {
+            specs.push_back(full_spec);
+            layers_by_group.emplace_back();
+            types.push_back(CacheGroupType::FULL);
+            tags.push_back("full_" + std::to_string(i));
             for (int j = 0; j < layer_num; j++) {
-                cache_config_.layer_to_group_id.push_back(full_group_ids_[i]);
-                cache_config_.global_layer_ids.back().push_back(unique_layer_id);
-                cache_config_.layer_ids.back().push_back(unique_layer_id);
+                layers_by_group.back().push_back(unique_layer_id);
                 unique_layer_id++;
             }
         }
 
-        for (int i = 0; i < cache_config_.linear_group_num; i++) {
-            cache_config_.global_layer_ids.push_back({});
-            cache_config_.layer_ids.push_back({});
-            cache_config_.group_types.push_back(CacheGroupType::LINEAR);
-            cache_config_.cache_specs.push_back(linear_spec);
-            cache_config_.linear_groups.push_back({});
+        for (size_t i = 0; i < other_group_ids_.size(); i++) {
+            specs.push_back(linear_spec);
+            layers_by_group.emplace_back();
+            types.push_back(CacheGroupType::LINEAR);
+            tags.push_back("linear_" + std::to_string(i));
             for (int j = 0; j < layer_num; j++) {
-                cache_config_.layer_to_group_id.push_back(other_group_ids_[i]);
-                cache_config_.global_layer_ids.back().push_back(unique_layer_id);
-                cache_config_.layer_ids.back().push_back(unique_layer_id);
+                layers_by_group.back().push_back(unique_layer_id);
                 unique_layer_id++;
             }
         }
 
-        cache_config_.layer_ids          = cache_config_.global_layer_ids;
         cache_config_.block_num          = block_num;
         cache_config_.seq_size_per_block = seq_size_per_block;
         cache_config_.dtype              = rtp_llm::DataType::TYPE_FP16;
+        cache_config_.fromGroupedSpecs(specs, layers_by_group, types, tags);
 
         const size_t full_kv_block_stride_bytes   = full_spec->block_size_bytes();
         const size_t linear_kv_block_stride_bytes = linear_spec->block_size_bytes();
@@ -1065,4 +1060,4 @@ TEST_F(RemoteConnectorMockFullLinearTest, test_threadpool_ec) {
 }
 
 }  // namespace test
-}  // namespace rtp_llm
\ No newline at end of file
+}  // namespace rtp_llm
diff --git a/rtp_llm/cpp/cache/connector/remote_connector/test/RemoteConnectorMockOnlyFullTest.cc b/rtp_llm/cpp/cache/connector/remote_connector/test/RemoteConnectorMockOnlyFullTest.cc
index 742b364d03..bbd7f11907 100644
--- a/rtp_llm/cpp/cache/connector/remote_connector/test/RemoteConnectorMockOnlyFullTest.cc
+++ b/rtp_llm/cpp/cache/connector/remote_connector/test/RemoteConnectorMockOnlyFullTest.cc
@@ -1,4 +1,4 @@
-#include "rtp_llm/cpp/cache/SingleTypeKVCacheAllocator.h"
+#include "rtp_llm/cpp/cache/allocator/SingleTypeKVCacheAllocator.h"
 #include "rtp_llm/cpp/cache/connector/Meta.h"
 #include "rtp_llm/cpp/cache/connector/remote_connector/test/RemoteConnectorMockTestBase.h"
 
@@ -98,15 +98,12 @@ class RemoteConnectorMockOnlyFullTest: public RemoteConnectorMockTestBase {
         cache_config_.seq_size_per_block = seq_size_per_block;
 
         auto mha_spec                = std::make_shared<MHAKVCacheSpec>();
-        mha_spec->layer_num          = layer_num;
         mha_spec->local_head_num_kv  = 8;
         mha_spec->size_per_head      = 128;
         mha_spec->seq_size_per_block = seq_size_per_block;
         mha_spec->dtype              = rtp_llm::DataType::TYPE_FP16;
         mha_spec->type               = KVCacheSpecType::MultiHeadAttention;
         cache_config_.dtype          = rtp_llm::DataType::TYPE_FP16;
-        cache_config_.cache_specs.push_back(mha_spec);
-        ;
         cache_config_.kv_block_stride_bytes = mha_spec->block_size_bytes();  // one-layer KV bytes for one logical block
         cache_config_.kv_scale_stride_bytes = 0;
         cache_config_.kv_block_size_bytes   = static_cast<size_t>(layer_num) * cache_config_.kv_block_stride_bytes;
@@ -116,8 +113,7 @@ class RemoteConnectorMockOnlyFullTest: public RemoteConnectorMockTestBase {
         for (int i = 0; i < layer_num; ++i) {
             layer_ids[i] = i;
         }
-        cache_config_.layer_ids.push_back(layer_ids);
-        cache_config_.global_layer_ids.push_back(layer_ids);
+        cache_config_.fromGroupedSpecs({mha_spec}, {layer_ids}, {CacheGroupType::FULL}, {"default"});
     }
 };
 
@@ -149,7 +145,7 @@ TEST_F(RemoteConnectorMockOnlyFullTest, test_async_match_and_async_read_with_gpu
     {
         // 没有其他connector
         UriStrVec          expected_uris        = genUris({1, 2, 3});
-        BlockBuffersExpect block_buffers_expect = {3, kFakeLayerNum, cache_config_.cache_specs[0]->block_size_bytes()};
+        BlockBuffersExpect block_buffers_expect = {3, kFakeLayerNum, cache_config_.specForGroup(0)->block_size_bytes()};
         std::vector<std::string> expect_block_ids({"1", "2", "3"});
         EXPECT_CALL(*transfer_client_,
                     LoadKvCaches(Eq(expected_uris),
@@ -175,7 +171,7 @@ TEST_F(RemoteConnectorMockOnlyFullTest, test_async_match_and_async_read_with_gpu
     {
         // 其他connector也命中了部分
         UriStrVec          expected_uris        = genUris({2, 3});
-        BlockBuffersExpect block_buffers_expect = {2, kFakeLayerNum, cache_config_.cache_specs[0]->block_size_bytes()};
+        BlockBuffersExpect block_buffers_expect = {2, kFakeLayerNum, cache_config_.specForGroup(0)->block_size_bytes()};
         std::vector<std::string> expect_block_ids({"2", "3"});
         EXPECT_CALL(*transfer_client_,
                     LoadKvCaches(Eq(expected_uris),
@@ -240,7 +236,7 @@ TEST_F(RemoteConnectorMockOnlyFullTest, test_async_match_and_async_read_with_gpu
     {
         // 没有其他connector
         UriStrVec          expected_uris        = genUris({2, 3});
-        BlockBuffersExpect block_buffers_expect = {2, kFakeLayerNum, cache_config_.cache_specs[0]->block_size_bytes()};
+        BlockBuffersExpect block_buffers_expect = {2, kFakeLayerNum, cache_config_.specForGroup(0)->block_size_bytes()};
         std::vector<std::string> expect_block_ids({"2", "3"});
         EXPECT_CALL(*transfer_client_,
                     LoadKvCaches(Eq(expected_uris),
@@ -263,7 +259,7 @@ TEST_F(RemoteConnectorMockOnlyFullTest, test_async_match_and_async_read_with_gpu
     {
         // 有其他connector
         UriStrVec          expected_uris        = genUris({3});
-        BlockBuffersExpect block_buffers_expect = {1, kFakeLayerNum, cache_config_.cache_specs[0]->block_size_bytes()};
+        BlockBuffersExpect block_buffers_expect = {1, kFakeLayerNum, cache_config_.specForGroup(0)->block_size_bytes()};
         std::vector<std::string> expect_block_ids({"3"});
         EXPECT_CALL(*transfer_client_,
                     LoadKvCaches(Eq(expected_uris),
@@ -324,7 +320,7 @@ TEST_F(RemoteConnectorMockOnlyFullTest, test_write_success_broadcast_success_act
     UriStrVec expected_uris = genUris({1, 2, 3});
     UriStrVec actual_uris   = genUris({1, 2, 3}, {}, "actual_");
 
-    BlockBuffersExpect block_buffers_expect = {3, kFakeLayerNum, cache_config_.cache_specs[0]->block_size_bytes()};
+    BlockBuffersExpect block_buffers_expect = {3, kFakeLayerNum, cache_config_.specForGroup(0)->block_size_bytes()};
     std::vector<std::string> expect_block_ids({"1", "2", "3"});
     EXPECT_CALL(*transfer_client_,
                 SaveKvCaches(Eq(expected_uris),
@@ -370,7 +366,7 @@ TEST_F(RemoteConnectorMockOnlyFullTest,
     UriStrVec expected_uris = genUris({2, 3});
     UriStrVec actual_uris   = genUris({2, 3}, {}, "actual_");
 
-    BlockBuffersExpect block_buffers_expect = {2, kFakeLayerNum, cache_config_.cache_specs[0]->block_size_bytes()};
+    BlockBuffersExpect block_buffers_expect = {2, kFakeLayerNum, cache_config_.specForGroup(0)->block_size_bytes()};
     std::vector<std::string> expect_block_ids({"2", "3"});
     EXPECT_CALL(*transfer_client_,
                 SaveKvCaches(Eq(expected_uris),
@@ -417,7 +413,7 @@ TEST_F(RemoteConnectorMockOnlyFullTest,
     UriStrVec expected_uris = genUris({2, 4});
     UriStrVec actual_uris   = genUris({2, 4}, {}, "actual_");
 
-    BlockBuffersExpect block_buffers_expect = {2, kFakeLayerNum, cache_config_.cache_specs[0]->block_size_bytes()};
+    BlockBuffersExpect block_buffers_expect = {2, kFakeLayerNum, cache_config_.specForGroup(0)->block_size_bytes()};
     std::vector<std::string> expect_block_ids({"2", "4"});
     EXPECT_CALL(*transfer_client_,
                 SaveKvCaches(Eq(expected_uris),
@@ -489,7 +485,7 @@ TEST_F(RemoteConnectorMockOnlyFullTest, test_write_success_broadcast_success_act
 
     UriStrVec expected_uris = genUris({1, 2, 3});
 
-    BlockBuffersExpect block_buffers_expect = {3, kFakeLayerNum, cache_config_.cache_specs[0]->block_size_bytes()};
+    BlockBuffersExpect block_buffers_expect = {3, kFakeLayerNum, cache_config_.specForGroup(0)->block_size_bytes()};
     std::vector<std::string> expect_block_ids({"1", "2", "3"});
     EXPECT_CALL(*transfer_client_,
                 SaveKvCaches(Eq(expected_uris),
@@ -511,4 +507,4 @@ TEST_F(RemoteConnectorMockOnlyFullTest, test_write_success_broadcast_success_act
 }
 
 }  // namespace test
-}  // namespace rtp_llm
\ No newline at end of file
+}  // namespace rtp_llm
diff --git a/rtp_llm/cpp/cache/connector/test/BUILD b/rtp_llm/cpp/cache/connector/test/BUILD
index b2aa5d8535..0bfdba38f9 100644
--- a/rtp_llm/cpp/cache/connector/test/BUILD
+++ b/rtp_llm/cpp/cache/connector/test/BUILD
@@ -7,6 +7,18 @@ test_copts = [
     "-fno-access-control",
 ] + copts()
 
+cc_import(
+    name = "cuda13_torch_nvshmem",
+    shared_library = "@pip_gpu_cuda13_torch_torch//:site-packages/torch/lib/libtorch_nvshmem.so",
+)
+
+cuda13_torch_link_deps = select({
+    "@//:using_cuda13_x86": [
+        ":cuda13_torch_nvshmem",
+    ],
+    "//conditions:default": [],
+})
+
 test_deps = [
     "//rtp_llm/models_py/bindings/cuda/ops:cuda_impl",
     "//rtp_llm/cpp/config:config_modules",
@@ -16,7 +28,7 @@ test_deps = [
     "@com_google_googletest//:gtest_main",
     "@local_config_cuda//cuda:cuda_headers",
     "@local_config_cuda//cuda:cudart",
-] + torch_deps()
+] + torch_deps() + cuda13_torch_link_deps
 
 cc_test(
     name = "coordinator_test",
diff --git a/rtp_llm/cpp/cache/connector/test/KVCacheConnectorCoordinatorTest.cc b/rtp_llm/cpp/cache/connector/test/KVCacheConnectorCoordinatorTest.cc
index 60094c47e2..962c976d43 100644
--- a/rtp_llm/cpp/cache/connector/test/KVCacheConnectorCoordinatorTest.cc
+++ b/rtp_llm/cpp/cache/connector/test/KVCacheConnectorCoordinatorTest.cc
@@ -1,8 +1,13 @@
 #include <gtest/gtest.h>
 #include <gmock/gmock.h>
 
+#include <algorithm>
+#include <numeric>
+
 #include "rtp_llm/cpp/cache/BlockPool.h"
 #include "rtp_llm/cpp/cache/BlockPoolConfigHelper.h"
+#include "rtp_llm/cpp/cache/spec/MHAKVCacheSpec.h"
+#include "rtp_llm/cpp/cache/spec/OpaqueKVCacheSpec.h"
 #include "rtp_llm/cpp/cache/connector/KVCacheConnectorCoordinator.h"
 #include "rtp_llm/cpp/cache/connector/memory/test/mock/MockKVCacheMemoryConnector.h"
 #include "rtp_llm/cpp/cache/connector/test/mock/MockAsyncContext.h"
@@ -14,12 +19,37 @@
 #include "rtp_llm/models_py/bindings/core/ExecOps.h"
 #include "rtp_llm/cpp/config/ModelConfig.h"
 #include "rtp_llm/cpp/config/EplbConfig.h"
+#include "rtp_llm/cpp/config/StaticConfig.h"
 
 namespace rtp_llm {
 namespace test {
 
 namespace {
 
+void initSingleGroupConfig(CacheConfig& config) {
+    auto spec                = std::make_shared<MHAKVCacheSpec>();
+    spec->type               = KVCacheSpecType::MultiHeadAttention;
+    spec->dtype              = config.dtype;
+    spec->local_head_num_kv  = 1;
+    spec->size_per_head      = 1;
+    spec->seq_size_per_block = static_cast<uint32_t>(std::max<size_t>(1, config.seq_size_per_block));
+    std::vector<int> layers(static_cast<size_t>(config.layer_num));
+    std::iota(layers.begin(), layers.end(), 0);
+    config.fromGroupedSpecs({spec}, {layers}, {CacheGroupType::FULL}, {"default"});
+}
+
+void initTwoGroupCpConfig(CacheConfig& config) {
+    auto full_spec                = std::make_shared<MHAKVCacheSpec>();
+    full_spec->type               = KVCacheSpecType::MultiHeadAttention;
+    full_spec->dtype              = config.dtype;
+    full_spec->local_head_num_kv  = 1;
+    full_spec->size_per_head      = 1;
+    full_spec->seq_size_per_block = static_cast<uint32_t>(std::max<size_t>(1, config.seq_size_per_block));
+
+    auto swa_spec = std::make_shared<FixedStateCacheSpec>("swa", 1, 1, config.dtype, full_spec->seq_size_per_block);
+    config.fromGroupedSpecs({full_spec, swa_spec}, {{0}, {1}}, {CacheGroupType::FULL, CacheGroupType::SWA}, {"full", "swa"});
+}
+
 class TestMeta final: public Meta {
 public:
     explicit TestMeta(bool enable_memory_cache, bool enable_remote_cache, std::string trace_id):
@@ -55,6 +85,8 @@ class TestMeta final: public Meta {
 class KVCacheConnectorCoordinatorTest: public ::testing::Test {
 protected:
     void SetUp() override {
+        old_core_dump_on_exception_                  = StaticConfig::user_ft_core_dump_on_exception;
+        StaticConfig::user_ft_core_dump_on_exception = false;
         rtp_llm::initLogger();
 
         cache_config_.layer_num        = 1;
@@ -62,7 +94,7 @@ class KVCacheConnectorCoordinatorTest: public ::testing::Test {
         cache_config_.block_num        = 10;
         cache_config_.block_size_bytes = 1024;
         cache_config_.dtype            = rtp_llm::TYPE_FP16;
-        cache_config_.layer_to_group_id.assign(static_cast<size_t>(cache_config_.layer_all_num), 0);
+        initSingleGroupConfig(cache_config_);
 
         kv_cache_config_.memory_cache_size_mb         = 100;
         kv_cache_config_.memory_cache_sync_timeout_ms = 1000;
@@ -73,7 +105,7 @@ class KVCacheConnectorCoordinatorTest: public ::testing::Test {
         // Those methods assume allocator_->block_pool_ is non-null. In UT we use a mock allocator, so set a
         // minimal BlockPool here to avoid crashes/hangs in tests that exercise coordinator paths.
         {
-            // NOTE: use the 4-arg overload to avoid requiring cache_config_.cache_specs in unit tests.
+            // NOTE: use the 4-arg overload because the mock allocator only needs physical block layout here.
             const size_t block_stride_bytes =
                 cache_config_.block_size_bytes / static_cast<size_t>(std::max(1u, cache_config_.layer_all_num));
             auto pool_config = BlockPoolConfigHelper::createConfig(
@@ -108,6 +140,7 @@ class KVCacheConnectorCoordinatorTest: public ::testing::Test {
     }
 
     void TearDown() override {
+        StaticConfig::user_ft_core_dump_on_exception = old_core_dump_on_exception_;
         if (coordinator_) {
             // Ensure all internal contexts/connectors are released before gmock leak checker runs at program exit.
             coordinator_->stop_.store(true);
@@ -195,6 +228,7 @@ class KVCacheConnectorCoordinatorTest: public ::testing::Test {
 
     std::shared_ptr<MockKVCacheAllocator>        allocator_;
     std::shared_ptr<KVCacheConnectorCoordinator> coordinator_;
+    bool                                         old_core_dump_on_exception_{false};
 };
 
 TEST_F(KVCacheConnectorCoordinatorTest, Init_ReturnFalse_WhenMemoryConfigInvalid) {
@@ -205,7 +239,7 @@ TEST_F(KVCacheConnectorCoordinatorTest, Init_ReturnFalse_WhenMemoryConfigInvalid
     cache_config.layer_all_num    = 1;
     cache_config.block_num        = 1;
     cache_config.block_size_bytes = 1;
-    cache_config.layer_to_group_id.assign(static_cast<size_t>(cache_config.layer_all_num), 0);
+    initSingleGroupConfig(cache_config);
 
     kv_cache_config.enable_memory_cache = true;
     kv_cache_config.reuse_cache = true;  // coordinator init only enables memory connector when reuse_cache is true
@@ -227,7 +261,7 @@ TEST_F(KVCacheConnectorCoordinatorTest, Init_ReturnTrue_WhenMemorySkipped_AndSto
     cache_config.layer_all_num    = 1;
     cache_config.block_num        = 1;
     cache_config.block_size_bytes = 1;
-    cache_config.layer_to_group_id.assign(static_cast<size_t>(cache_config.layer_all_num), 0);
+    initSingleGroupConfig(cache_config);
 
     kv_cache_config.enable_memory_cache = false;  // skip memory connector in init
 
@@ -248,7 +282,7 @@ TEST_F(KVCacheConnectorCoordinatorTest, Init_ReturnFalse_WhenMemoryEnabledButSiz
     cache_config.layer_all_num    = 1;
     cache_config.block_num        = 1;
     cache_config.block_size_bytes = 1;
-    cache_config.layer_to_group_id.assign(static_cast<size_t>(cache_config.layer_all_num), 0);
+    initSingleGroupConfig(cache_config);
 
     kv_cache_config.enable_memory_cache          = true;
     kv_cache_config.reuse_cache                  = true;
@@ -274,7 +308,7 @@ TEST_F(KVCacheConnectorCoordinatorTest, Init_ReturnTrue_WhenMemoryEnabled_HappyP
     // Keep block size reasonably large so block_num doesn't explode in createBlockPool().
     cache_config.block_size_bytes = 1024;
     cache_config.dtype            = rtp_llm::TYPE_FP16;
-    cache_config.layer_to_group_id.assign(static_cast<size_t>(cache_config.layer_all_num), 0);
+    initSingleGroupConfig(cache_config);
     // Memory connector requires per-layer block stride bytes.
     cache_config.layer_to_block_stride_bytes.assign(static_cast<size_t>(cache_config.layer_num),
                                                     cache_config.block_size_bytes);
@@ -314,7 +348,7 @@ TEST_F(KVCacheConnectorCoordinatorTest, AsyncRead_ReturnNull_WhenStop) {
     cache_config.layer_all_num    = 1;
     cache_config.block_num        = 1;
     cache_config.block_size_bytes = 1;
-    cache_config.layer_to_group_id.assign(static_cast<size_t>(cache_config.layer_all_num), 0);
+    initSingleGroupConfig(cache_config);
 
     auto allocator   = std::make_shared<testing::NiceMock<MockKVCacheAllocator>>(cache_config);
     auto coordinator = std::make_shared<KVCacheConnectorCoordinator>(cache_config,
@@ -345,7 +379,7 @@ TEST_F(KVCacheConnectorCoordinatorTest, AsyncRead_ReturnNull_WhenCacheKeysEmpty)
     coordinator_->allocator_  = allocator_;
 
     KVCacheResource resource;
-    resource.initGroups(1, cache_config_.layer_all_num, cache_config_.layer_to_group_id);
+    resource.initGroups(1, cache_config_.layer_all_num, cache_config_.layerGroupIdsSnapshot());
     // leave cacheKeys empty to hit the early return
     auto                  rw_ctx = std::make_shared<testing::NiceMock<MockKVCacheConnectorReadWriteContext>>();
     std::shared_ptr<Meta> meta =
@@ -380,7 +414,7 @@ TEST_F(KVCacheConnectorCoordinatorTest, AsyncRead_ReturnNull_WhenIncrKVCacheRefR
     }
 
     KVCacheResource resource;
-    resource.initGroups(1, cache_config_.layer_all_num, cache_config_.layer_to_group_id);
+    resource.initGroups(1, cache_config_.layer_all_num, cache_config_.layerGroupIdsSnapshot());
     resource.cacheKeys() = CacheKeysType{1, 2, 3};
 
     auto                  rw_ctx = std::make_shared<testing::NiceMock<MockKVCacheConnectorReadWriteContext>>();
@@ -410,7 +444,7 @@ TEST_F(KVCacheConnectorCoordinatorTest, AsyncRead_ReturnNull_WhenNoMatchContexts
     // and will be processed/cleaned up by the coordinator update loop if enabled.
     // Use a plain shared_ptr here to avoid custom-deleter side effects in this no-connector path.
     auto resource = std::make_shared<KVCacheResource>();
-    resource->initGroups(1, cache_config_.layer_all_num, cache_config_.layer_to_group_id);
+    resource->initGroups(1, cache_config_.layer_all_num, cache_config_.layerGroupIdsSnapshot());
     // Don't let gmock keep a ref to `resource` until program exit.
     // gmock actions are stored as const; use a shared holder to release the ref after first call.
     auto resource_holder = std::make_shared<std::shared_ptr<KVCacheResource>>(resource);
@@ -491,7 +525,7 @@ TEST_F(KVCacheConnectorCoordinatorTest, AsyncWrite_ReturnNull_WhenStop) {
     cache_config.layer_all_num    = 1;
     cache_config.block_num        = 1;
     cache_config.block_size_bytes = 1;
-    cache_config.layer_to_group_id.assign(static_cast<size_t>(cache_config.layer_all_num), 0);
+    initSingleGroupConfig(cache_config);
 
     auto allocator   = std::make_shared<testing::NiceMock<MockKVCacheAllocator>>(cache_config);
     auto coordinator = std::make_shared<KVCacheConnectorCoordinator>(cache_config,
@@ -522,7 +556,7 @@ TEST_F(KVCacheConnectorCoordinatorTest, AsyncWrite_ReturnNull_WhenCacheKeysEmpty
     coordinator_->allocator_  = allocator_;
 
     KVCacheResource resource;
-    resource.initGroups(1, cache_config_.layer_all_num, cache_config_.layer_to_group_id);
+    resource.initGroups(1, cache_config_.layer_all_num, cache_config_.layerGroupIdsSnapshot());
     // leave cacheKeys empty
     auto                  rw_ctx = std::make_shared<testing::NiceMock<MockKVCacheConnectorReadWriteContext>>();
     std::shared_ptr<Meta> meta =
@@ -537,6 +571,319 @@ TEST_F(KVCacheConnectorCoordinatorTest, AsyncWrite_ReturnNull_WhenCacheKeysEmpty
     EXPECT_EQ(ctx, nullptr);
 }
 
+TEST_F(KVCacheConnectorCoordinatorTest, AsyncWrite_CPShardedKeepsNonFullGroupsInLogicalCoordinates) {
+    CacheConfig cp_cache_config       = cache_config_;
+    cp_cache_config.layer_num         = 2;
+    cp_cache_config.layer_all_num     = 2;
+    initTwoGroupCpConfig(cp_cache_config);
+
+    ParallelismConfig parallelism_config;
+    parallelism_config.tp_size                            = 2;
+    parallelism_config.prefill_cp_config.kv_cache_sharded = true;
+
+    auto coordinator = std::make_shared<KVCacheConnectorCoordinator>(cp_cache_config,
+                                                                     kv_cache_config_,
+                                                                     runtime_config_,
+                                                                     parallelism_config,
+                                                                     SpeculativeExecutionConfig{},
+                                                                     allocator_);
+    coordinator->connectors_.clear();
+
+    KVCacheResource resource;
+    resource.initGroups(/*group_num=*/2,
+                        /*layer_num=*/static_cast<int>(cp_cache_config.layer_all_num),
+                        cp_cache_config.layerGroupIdsSnapshot(),
+                        cp_cache_config.kernelBlocksPerKvBlock(),
+                        cp_cache_config.groupTypesSnapshot());
+    resource.cacheKeys() = CacheKeysType{10, 11, 12, 13};
+    resource.setLastBlockAligned(false);
+    resource.mutableBlockIds(/*gid=*/0).assign(BlockIndicesType{100, 101});            // FULL: compact local blocks
+    resource.mutableBlockIds(/*gid=*/1).assign(BlockIndicesType{200, 201, 202, 203});  // SWA: full logical slots
+
+    EXPECT_CALL(*allocator_, incrKVCacheRef(testing::_, testing::_, testing::Eq(true)))
+        .WillOnce(
+            testing::Invoke([](const KVCacheResource& ref_resource, const CacheKeysType& ref_keys, bool is_connector) {
+                (void)is_connector;
+                EXPECT_THAT(ref_keys, testing::ElementsAre(11, 13));
+                EXPECT_THAT(ref_resource.cacheKeys(), testing::ElementsAre(11, 13));
+                EXPECT_FALSE(ref_resource.lastBlockAligned());
+                EXPECT_THAT(ref_resource.blocks(/*gid=*/0), testing::ElementsAre(100, 101));
+                EXPECT_THAT(ref_resource.blocks(/*gid=*/1), testing::ElementsAre(201, 203));
+                return std::make_shared<KVCacheResource>();
+            }));
+
+    auto rw_ctx = std::make_shared<testing::NiceMock<MockKVCacheConnectorReadWriteContext>>();
+    ON_CALL(*rw_ctx, kvCacheResource()).WillByDefault(testing::ReturnRef(resource));
+    std::shared_ptr<Meta> meta =
+        std::make_shared<TestMeta>(/*enable_memory_cache=*/true, /*enable_remote_cache=*/false, "");
+    ON_CALL(*rw_ctx, meta()).WillByDefault(testing::ReturnRef(meta));
+
+    auto async_ctx = coordinator->asyncWrite(rw_ctx);
+    ASSERT_NE(async_ctx, nullptr);
+
+    {
+        std::lock_guard<std::mutex> lock(coordinator->update_mutex_);
+        coordinator->fused_async_write_context_list_.clear();
+    }
+    async_ctx.reset();
+    coordinator.reset();
+}
+
+TEST_F(KVCacheConnectorCoordinatorTest, AsyncWrite_CPShardedSkipsRemapForCanonicalEvictedResource) {
+    CacheConfig cp_cache_config       = cache_config_;
+    cp_cache_config.layer_num         = 2;
+    cp_cache_config.layer_all_num     = 2;
+    initTwoGroupCpConfig(cp_cache_config);
+
+    ParallelismConfig parallelism_config;
+    parallelism_config.tp_size                            = 2;
+    parallelism_config.prefill_cp_config.kv_cache_sharded = true;
+
+    auto coordinator = std::make_shared<KVCacheConnectorCoordinator>(cp_cache_config,
+                                                                     kv_cache_config_,
+                                                                     runtime_config_,
+                                                                     parallelism_config,
+                                                                     SpeculativeExecutionConfig{},
+                                                                     allocator_);
+    coordinator->connectors_.clear();
+
+    KVCacheResource resource;
+    resource.initGroups(/*group_num=*/2,
+                        /*layer_num=*/static_cast<int>(cp_cache_config.layer_all_num),
+                        cp_cache_config.layerGroupIdsSnapshot(),
+                        cp_cache_config.kernelBlocksPerKvBlock(),
+                        cp_cache_config.groupTypesSnapshot());
+    resource.setCacheKeys(CacheKeysType{11, 13});
+    resource.setCacheKeysAreCpCanonical(true);
+    BlockDependency root_dep;
+    root_dep.ordinal = 0;
+    BlockDependency child_dep;
+    child_dep.has_parent = true;
+    child_dep.parent_key = 11;
+    child_dep.ordinal    = 1;
+    resource.setBlockDependencies(BlockDependenciesType{root_dep, child_dep});
+    resource.setLastBlockAligned(true);
+    resource.mutableBlockIds(/*gid=*/0).assign(BlockIndicesType{100, 101});
+    resource.mutableBlockIds(/*gid=*/1).assign(BlockIndicesType{201, 203});
+
+    EXPECT_CALL(*allocator_, incrKVCacheRef(testing::_, testing::_, testing::Eq(true)))
+        .WillOnce(
+            testing::Invoke([](const KVCacheResource& ref_resource, const CacheKeysType& ref_keys, bool is_connector) {
+                (void)is_connector;
+                EXPECT_THAT(ref_keys, testing::ElementsAre(11, 13));
+                EXPECT_THAT(ref_resource.cacheKeys(), testing::ElementsAre(11, 13));
+                EXPECT_TRUE(ref_resource.cacheKeysAreCpCanonical());
+                EXPECT_EQ(ref_resource.blockDependencies().size(), 2u);
+                if (ref_resource.blockDependencies().size() == 2u) {
+                    EXPECT_FALSE(ref_resource.blockDependencies()[0].has_parent);
+                    EXPECT_EQ(ref_resource.blockDependencies()[0].ordinal, 0u);
+                    EXPECT_TRUE(ref_resource.blockDependencies()[1].has_parent);
+                    EXPECT_EQ(ref_resource.blockDependencies()[1].parent_key, 11);
+                    EXPECT_EQ(ref_resource.blockDependencies()[1].ordinal, 1u);
+                }
+                EXPECT_THAT(ref_resource.blocks(/*gid=*/0), testing::ElementsAre(100, 101));
+                EXPECT_THAT(ref_resource.blocks(/*gid=*/1), testing::ElementsAre(201, 203));
+                return std::make_shared<KVCacheResource>();
+            }));
+
+    auto rw_ctx = std::make_shared<testing::NiceMock<MockKVCacheConnectorReadWriteContext>>();
+    ON_CALL(*rw_ctx, kvCacheResource()).WillByDefault(testing::ReturnRef(resource));
+    std::shared_ptr<Meta> meta =
+        std::make_shared<TestMeta>(/*enable_memory_cache=*/true, /*enable_remote_cache=*/false, "");
+    ON_CALL(*rw_ctx, meta()).WillByDefault(testing::ReturnRef(meta));
+
+    auto async_ctx = coordinator->asyncWrite(rw_ctx);
+    ASSERT_NE(async_ctx, nullptr);
+
+    {
+        std::lock_guard<std::mutex> lock(coordinator->update_mutex_);
+        coordinator->fused_async_write_context_list_.clear();
+    }
+    async_ctx.reset();
+    coordinator.reset();
+}
+
+TEST_F(KVCacheConnectorCoordinatorTest, AsyncWrite_CPShardedKeepsCompactFixedGroupsInCanonicalCoordinates) {
+    CacheConfig cp_cache_config                    = cache_config_;
+    cp_cache_config.layer_num                      = 2;
+    cp_cache_config.layer_all_num                  = 2;
+    cp_cache_config.seq_size_per_block             = 128;
+    initTwoGroupCpConfig(cp_cache_config);
+    cp_cache_config.group_seq_size_per_block = {128, 256};
+
+    ParallelismConfig parallelism_config;
+    parallelism_config.tp_size                            = 2;
+    parallelism_config.prefill_cp_config.kv_cache_sharded = true;
+
+    auto coordinator = std::make_shared<KVCacheConnectorCoordinator>(cp_cache_config,
+                                                                     kv_cache_config_,
+                                                                     runtime_config_,
+                                                                     parallelism_config,
+                                                                     SpeculativeExecutionConfig{},
+                                                                     allocator_);
+    coordinator->connectors_.clear();
+
+    KVCacheResource resource;
+    resource.initGroups(/*group_num=*/2,
+                        /*layer_num=*/static_cast<int>(cp_cache_config.layer_all_num),
+                        cp_cache_config.layerGroupIdsSnapshot(),
+                        cp_cache_config.kernelBlocksPerKvBlock(),
+                        cp_cache_config.groupTypesSnapshot());
+    resource.cacheKeys() = CacheKeysType{10, 11, 12, 13};
+    resource.setLastBlockAligned(false);
+    resource.mutableBlockIds(/*gid=*/0).assign(BlockIndicesType{100, 101});
+    resource.mutableBlockIds(/*gid=*/1).assign(BlockIndicesType{200, 201});
+
+    EXPECT_CALL(*allocator_, incrKVCacheRef(testing::_, testing::_, testing::Eq(true)))
+        .WillOnce(
+            testing::Invoke([](const KVCacheResource& ref_resource, const CacheKeysType& ref_keys, bool is_connector) {
+                (void)is_connector;
+                EXPECT_THAT(ref_keys, testing::ElementsAre(11, 13));
+                EXPECT_THAT(ref_resource.cacheKeys(), testing::ElementsAre(11, 13));
+                EXPECT_FALSE(ref_resource.lastBlockAligned());
+                EXPECT_THAT(ref_resource.blocks(/*gid=*/0), testing::ElementsAre(100, 101));
+                EXPECT_THAT(ref_resource.blocks(/*gid=*/1), testing::ElementsAre(200, 201));
+                return std::make_shared<KVCacheResource>();
+            }));
+
+    auto rw_ctx = std::make_shared<testing::NiceMock<MockKVCacheConnectorReadWriteContext>>();
+    ON_CALL(*rw_ctx, kvCacheResource()).WillByDefault(testing::ReturnRef(resource));
+    std::shared_ptr<Meta> meta =
+        std::make_shared<TestMeta>(/*enable_memory_cache=*/true, /*enable_remote_cache=*/false, "");
+    ON_CALL(*rw_ctx, meta()).WillByDefault(testing::ReturnRef(meta));
+
+    auto async_ctx = coordinator->asyncWrite(rw_ctx);
+    ASSERT_NE(async_ctx, nullptr);
+
+    {
+        std::lock_guard<std::mutex> lock(coordinator->update_mutex_);
+        coordinator->fused_async_write_context_list_.clear();
+    }
+    async_ctx.reset();
+    coordinator.reset();
+}
+
+TEST_F(KVCacheConnectorCoordinatorTest, AsyncWrite_DecodePrefillCpRemapsFullAndCompactFixedGroups) {
+    CacheConfig cp_cache_config                    = cache_config_;
+    cp_cache_config.layer_num                      = 2;
+    cp_cache_config.layer_all_num                  = 2;
+    cp_cache_config.seq_size_per_block             = 128;
+    initTwoGroupCpConfig(cp_cache_config);
+    cp_cache_config.group_seq_size_per_block       = {128, 256};
+
+    ParallelismConfig parallelism_config;
+    parallelism_config.role_type                          = RoleType::DECODE;
+    parallelism_config.tp_size                            = 1;
+    parallelism_config.prefill_cp_config.method           = CPRotateMethod::PREFILL_CP;
+    parallelism_config.prefill_cp_config.kv_cache_sharded = true;
+    parallelism_config.prefill_cp_config.prefill_cp_size  = 2;
+
+    auto coordinator = std::make_shared<KVCacheConnectorCoordinator>(cp_cache_config,
+                                                                     kv_cache_config_,
+                                                                     runtime_config_,
+                                                                     parallelism_config,
+                                                                     SpeculativeExecutionConfig{},
+                                                                     allocator_);
+    coordinator->connectors_.clear();
+
+    KVCacheResource resource;
+    resource.initGroups(/*group_num=*/2,
+                        /*layer_num=*/static_cast<int>(cp_cache_config.layer_all_num),
+                        cp_cache_config.layerGroupIdsSnapshot(),
+                        cp_cache_config.kernelBlocksPerKvBlock(),
+                        cp_cache_config.groupTypesSnapshot());
+    resource.cacheKeys() = CacheKeysType{10, 11, 12, 13, 14};
+    resource.setLastBlockAligned(false);
+    resource.mutableBlockIds(/*gid=*/0).assign(BlockIndicesType{100, 101, 102, 103, 104});
+    resource.mutableBlockIds(/*gid=*/1).assign(BlockIndicesType{200, 201, 202});
+
+    EXPECT_CALL(*allocator_, incrKVCacheRef(testing::_, testing::_, testing::Eq(true)))
+        .WillOnce(
+            testing::Invoke([](const KVCacheResource& ref_resource, const CacheKeysType& ref_keys, bool is_connector) {
+                (void)is_connector;
+                EXPECT_THAT(ref_keys, testing::ElementsAre(11, 13, 14));
+                EXPECT_THAT(ref_resource.cacheKeys(), testing::ElementsAre(11, 13, 14));
+                EXPECT_FALSE(ref_resource.lastBlockAligned());
+                EXPECT_THAT(ref_resource.blocks(/*gid=*/0), testing::ElementsAre(101, 103));
+                EXPECT_THAT(ref_resource.blocks(/*gid=*/1), testing::ElementsAre(200, 201));
+                return std::make_shared<KVCacheResource>();
+            }));
+
+    auto rw_ctx = std::make_shared<testing::NiceMock<MockKVCacheConnectorReadWriteContext>>();
+    ON_CALL(*rw_ctx, kvCacheResource()).WillByDefault(testing::ReturnRef(resource));
+    std::shared_ptr<Meta> meta =
+        std::make_shared<TestMeta>(/*enable_memory_cache=*/true, /*enable_remote_cache=*/false, "");
+    ON_CALL(*rw_ctx, meta()).WillByDefault(testing::ReturnRef(meta));
+
+    auto async_ctx = coordinator->asyncWrite(rw_ctx);
+    ASSERT_NE(async_ctx, nullptr);
+
+    {
+        std::lock_guard<std::mutex> lock(coordinator->update_mutex_);
+        coordinator->fused_async_write_context_list_.clear();
+    }
+    async_ctx.reset();
+    coordinator.reset();
+}
+
+TEST_F(KVCacheConnectorCoordinatorTest, AsyncWrite_CPShardedAppendsDummyTailWhenPartialIsNotLastRank) {
+    CacheConfig cp_cache_config       = cache_config_;
+    cp_cache_config.layer_num         = 2;
+    cp_cache_config.layer_all_num     = 2;
+    initTwoGroupCpConfig(cp_cache_config);
+
+    ParallelismConfig parallelism_config;
+    parallelism_config.tp_size                            = 2;
+    parallelism_config.prefill_cp_config.kv_cache_sharded = true;
+
+    auto coordinator = std::make_shared<KVCacheConnectorCoordinator>(cp_cache_config,
+                                                                     kv_cache_config_,
+                                                                     runtime_config_,
+                                                                     parallelism_config,
+                                                                     SpeculativeExecutionConfig{},
+                                                                     allocator_);
+    coordinator->connectors_.clear();
+
+    KVCacheResource resource;
+    resource.initGroups(/*group_num=*/2,
+                        /*layer_num=*/static_cast<int>(cp_cache_config.layer_all_num),
+                        cp_cache_config.layerGroupIdsSnapshot(),
+                        cp_cache_config.kernelBlocksPerKvBlock(),
+                        cp_cache_config.groupTypesSnapshot());
+    resource.cacheKeys() = CacheKeysType{10, 11, 12, 13, 14};
+    resource.setLastBlockAligned(false);
+    resource.mutableBlockIds(/*gid=*/0).assign(BlockIndicesType{100, 101, 102});
+    resource.mutableBlockIds(/*gid=*/1).assign(BlockIndicesType{200, 201, 202, 203, 204});
+
+    EXPECT_CALL(*allocator_, incrKVCacheRef(testing::_, testing::_, testing::Eq(true)))
+        .WillOnce(
+            testing::Invoke([](const KVCacheResource& ref_resource, const CacheKeysType& ref_keys, bool is_connector) {
+                (void)is_connector;
+                EXPECT_THAT(ref_keys, testing::ElementsAre(11, 13, 14));
+                EXPECT_THAT(ref_resource.cacheKeys(), testing::ElementsAre(11, 13, 14));
+                EXPECT_FALSE(ref_resource.lastBlockAligned());
+                EXPECT_THAT(ref_resource.blocks(/*gid=*/0), testing::ElementsAre(100, 101));
+                EXPECT_THAT(ref_resource.blocks(/*gid=*/1), testing::ElementsAre(201, 203));
+                return std::make_shared<KVCacheResource>();
+            }));
+
+    auto rw_ctx = std::make_shared<testing::NiceMock<MockKVCacheConnectorReadWriteContext>>();
+    ON_CALL(*rw_ctx, kvCacheResource()).WillByDefault(testing::ReturnRef(resource));
+    std::shared_ptr<Meta> meta =
+        std::make_shared<TestMeta>(/*enable_memory_cache=*/true, /*enable_remote_cache=*/false, "");
+    ON_CALL(*rw_ctx, meta()).WillByDefault(testing::ReturnRef(meta));
+
+    auto async_ctx = coordinator->asyncWrite(rw_ctx);
+    ASSERT_NE(async_ctx, nullptr);
+
+    {
+        std::lock_guard<std::mutex> lock(coordinator->update_mutex_);
+        coordinator->fused_async_write_context_list_.clear();
+    }
+    async_ctx.reset();
+    coordinator.reset();
+}
+
 TEST_F(KVCacheConnectorCoordinatorTest, AsyncWrite_ReturnNull_WhenIncrKVCacheRefReturnsNull) {
     auto mock_connector       = std::make_shared<MockKVCacheConnector>();
     coordinator_->connectors_ = {mock_connector};
@@ -544,7 +891,7 @@ TEST_F(KVCacheConnectorCoordinatorTest, AsyncWrite_ReturnNull_WhenIncrKVCacheRef
 
     // Build a connector context with non-empty cache keys.
     auto ctx_resource = std::make_shared<KVCacheResource>();
-    ctx_resource->initGroups(1, cache_config_.layer_all_num, cache_config_.layer_to_group_id);
+    ctx_resource->initGroups(1, cache_config_.layer_all_num, cache_config_.layerGroupIdsSnapshot());
     ctx_resource->cacheKeys()    = CacheKeysType{1, 2, 3};
     auto                  rw_ctx = std::make_shared<testing::NiceMock<MockKVCacheConnectorReadWriteContext>>();
     std::shared_ptr<Meta> meta =
@@ -569,7 +916,7 @@ TEST_F(KVCacheConnectorCoordinatorTest, AsyncWrite_ReturnFusedContext_WhenMemory
     coordinator_->allocator_  = allocator_;
 
     KVCacheResource resource;
-    resource.initGroups(1, cache_config_.layer_all_num, cache_config_.layer_to_group_id);
+    resource.initGroups(1, cache_config_.layer_all_num, cache_config_.layerGroupIdsSnapshot());
     resource.cacheKeys() = CacheKeysType{1, 2, 3};
 
     auto selected_resource        = makeResourceWithAutoDecr();
@@ -608,7 +955,7 @@ TEST_F(KVCacheConnectorCoordinatorTest, AsyncWrite_ReturnFusedContext_WhenConnec
     coordinator_->allocator_  = allocator_;
 
     KVCacheResource resource;
-    resource.initGroups(1, cache_config_.layer_all_num, cache_config_.layer_to_group_id);
+    resource.initGroups(1, cache_config_.layer_all_num, cache_config_.layerGroupIdsSnapshot());
     resource.cacheKeys() = CacheKeysType{1, 2, 3};
 
     auto selected_resource        = makeResourceWithAutoDecr();
@@ -648,7 +995,7 @@ TEST_F(KVCacheConnectorCoordinatorTest, AsyncWrite_ReturnFusedContext_WhenNoConn
     coordinator_->allocator_ = allocator_;
 
     KVCacheResource resource;
-    resource.initGroups(1, cache_config_.layer_all_num, cache_config_.layer_to_group_id);
+    resource.initGroups(1, cache_config_.layer_all_num, cache_config_.layerGroupIdsSnapshot());
     resource.cacheKeys() = CacheKeysType{1, 2, 3};
 
     auto selected_resource        = makeResourceWithAutoDecr();
@@ -876,4 +1223,4 @@ TEST_F(KVCacheConnectorCoordinatorTest, AsyncReadAfterMatch_Throws_WhenSizeMisma
 }
 
 }  // namespace test
-}  // namespace rtp_llm
\ No newline at end of file
+}  // namespace rtp_llm
diff --git a/rtp_llm/cpp/cache/FullKVCacheGroup.cc b/rtp_llm/cpp/cache/group/FullKVCacheGroup.cc
similarity index 54%
rename from rtp_llm/cpp/cache/FullKVCacheGroup.cc
rename to rtp_llm/cpp/cache/group/FullKVCacheGroup.cc
index 6e8791ac69..31ae7532a5 100644
--- a/rtp_llm/cpp/cache/FullKVCacheGroup.cc
+++ b/rtp_llm/cpp/cache/group/FullKVCacheGroup.cc
@@ -1,4 +1,4 @@
-#include "rtp_llm/cpp/cache/FullKVCacheGroup.h"
+#include "rtp_llm/cpp/cache/group/FullKVCacheGroup.h"
 #include "rtp_llm/cpp/utils/Logger.h"
 
 namespace rtp_llm {
@@ -9,13 +9,12 @@ int FullKVCacheGroup::needBlocksNum(int seq_len, int current_blocks, int reserve
 
 NeedBlocksInfo FullKVCacheGroup::getNeedBlocks(
     int common_seq_len, int seq_len, int reserve_step, int reuse_blocks_len, bool reuse_enabled) const {
-    (void)reuse_blocks_len;
-    (void)reuse_enabled;
     NeedBlocksInfo info;
-    const int      common_slots = needBlocksNum(common_seq_len, /*current_blocks=*/0);
-    const int      total_slots  = needBlocksNum(seq_len, /*current_blocks=*/0, reserve_step);
-    info.common_blocks          = std::max(common_slots, 0);
-    info.extra_blocks           = std::max(total_slots - common_slots, 0);
+    const int      common_slots        = needBlocksNum(common_seq_len, /*current_blocks=*/0);
+    const int      total_slots         = needBlocksNum(seq_len, /*current_blocks=*/0, reserve_step);
+    const int      reused_common_slots = reuse_enabled ? std::min(std::max(reuse_blocks_len, 0), common_slots) : 0;
+    info.common_blocks                 = std::max(common_slots - reused_common_slots, 0);
+    info.extra_blocks                  = std::max(total_slots - common_slots, 0);
     return info;
 }
 
@@ -42,20 +41,24 @@ bool FullKVCacheGroup::malloc(BlockIds& block_ids, int seq_len, bool enable_reus
     return true;
 }
 
-MatchResult FullKVCacheGroup::match(const CacheKeysType& cache_keys) {
+MatchResult FullKVCacheGroup::matchPrefix(const CacheKeysType& cache_keys) const {
     MatchResult final_result;
 
-    for (const auto& cache_key : cache_keys) {
-        auto result = block_cache_->match(cache_key, group_id_);
-        if (isNullBlockIdx(result.matched_index)) {
+    if (!shared_cache_) {
+        return final_result;
+    }
+
+    for (size_t i = 0; i < cache_keys.size(); ++i) {
+        const auto cache_key = cache_keys[i];
+        auto       block_idx = shared_cache_->matchGroup(cache_key, group_id_);
+        if (isNullBlockIdx(block_idx)) {
             break;
         }
         final_result.reuse_blocks++;
-        final_result.block_indices.push_back(result.matched_index);
+        final_result.block_indices.push_back(block_idx);
     }
 
     final_result.reuse_length = final_result.reuse_blocks * seqSizePerBlock();
-
     return final_result;
 }
 
@@ -73,34 +76,6 @@ void FullKVCacheGroup::reference(BlockIds& block_ids, const BlockIndicesType& ne
     block_pool_->requestReference(new_block_indices);
 }
 
-void FullKVCacheGroup::insertIntoCache(const CacheKeysType&    cache_keys,
-                                       const BlockIndicesType& block_indices,
-                                       bool                    is_resident) {
-    if (cache_keys.empty()) {
-        return;
-    }
-
-    if (cache_keys.size() != block_indices.size()) {
-        RTP_LLM_LOG_ERROR(
-            "Cache keys size (%zu) doesn't match block indices size (%zu)", cache_keys.size(), block_indices.size());
-        return;
-    }
-
-    const int last_index = cache_keys.size() - 1;
-    for (int i = last_index; i >= 0; --i) {
-        BlockCache::CacheItem item;
-        item.cache_key   = cache_keys[i];
-        item.group_id    = group_id_;
-        item.block_index = block_indices[i];
-        item.is_resident = is_resident;
-        if (block_cache_->put(item)) {
-            block_pool_->blockCacheReference(block_indices[i]);
-        }
-    }
-
-    RTP_LLM_LOG_DEBUG("Inserted %zu blocks into cache", block_indices.size());
-}
-
 void FullKVCacheGroup::removeSkippedBlocks(BlockIds& /*block_ids*/, bool /*enable_reuse_cache*/, int /*reserve_step*/) {
 }
 
diff --git a/rtp_llm/cpp/cache/FullKVCacheGroup.h b/rtp_llm/cpp/cache/group/FullKVCacheGroup.h
similarity index 52%
rename from rtp_llm/cpp/cache/FullKVCacheGroup.h
rename to rtp_llm/cpp/cache/group/FullKVCacheGroup.h
index 13bb862766..f7331159bf 100644
--- a/rtp_llm/cpp/cache/FullKVCacheGroup.h
+++ b/rtp_llm/cpp/cache/group/FullKVCacheGroup.h
@@ -2,23 +2,24 @@
 
 #include <memory>
 
-#include "rtp_llm/cpp/cache/KVCacheGroup.h"
+#include "rtp_llm/cpp/cache/group/KVCacheGroup.h"
 
 namespace rtp_llm {
 
 class FullKVCacheGroup: public KVCacheGroup {
 public:
     FullKVCacheGroup(const LayerIdsType&          layer_ids,
-                     std::shared_ptr<KVCacheSpec> kvcache_spec,
-                     BlockPoolPtr                 block_pool,
-                     int                          group_id):
-        KVCacheGroup(layer_ids, kvcache_spec, block_pool, group_id) {}
+	                     std::shared_ptr<KVCacheSpec> kvcache_spec,
+	                     BlockPoolPtr                 block_pool,
+	                     int                          group_id,
+	                     SharedBlockCache*            shared_cache = nullptr,
+	                     const kmonitor::MetricsReporterPtr& metrics_reporter = nullptr,
+	                     CacheGroupPolicy             policy       = CacheGroupPolicy{}):
+	        KVCacheGroup(layer_ids, kvcache_spec, block_pool, group_id, policy, shared_cache, metrics_reporter) {}
 
-    bool malloc(BlockIds& block_indices, int seq_len, bool enable_reuse_cache = false, int reserve_step = 0) override;
-    MatchResult match(const CacheKeysType& cache_keys) override;
+    bool malloc(BlockIds& block_ids, int seq_len, bool enable_reuse_cache = false, int reserve_step = 0) override;
+    MatchResult matchPrefix(const CacheKeysType& cache_keys) const override;
     void        free(const BlockIndicesType& block_indices) override;
-    void
-    insertIntoCache(const CacheKeysType& cache_keys, const BlockIndicesType& block_indices, bool is_resident) override;
     void removeSkippedBlocks(BlockIds& block_ids, bool enable_reuse_cache = false, int reserve_step = 0) override;
     int  needBlocksNum(int seq_len, int current_blocks = 0, int reserve_step = 0) const override;
     NeedBlocksInfo getNeedBlocks(int  common_seq_len,
diff --git a/rtp_llm/cpp/cache/KVCacheGroup.cc b/rtp_llm/cpp/cache/group/KVCacheGroup.cc
similarity index 56%
rename from rtp_llm/cpp/cache/KVCacheGroup.cc
rename to rtp_llm/cpp/cache/group/KVCacheGroup.cc
index 5b9343890f..db18411f00 100644
--- a/rtp_llm/cpp/cache/KVCacheGroup.cc
+++ b/rtp_llm/cpp/cache/group/KVCacheGroup.cc
@@ -1,4 +1,5 @@
-#include "rtp_llm/cpp/cache/KVCacheGroup.h"
+#include "rtp_llm/cpp/cache/group/KVCacheGroup.h"
+#include "rtp_llm/cpp/metrics/RtpLLMMetrics.h"
 #include "rtp_llm/cpp/utils/Logger.h"
 
 namespace rtp_llm {
@@ -41,28 +42,58 @@ bool KVCacheGroup::ensureFreeBlocks(int required_blocks) {
         return true;
     }
 
-    // blocks popped by block cache might be occupied by request
-    // it's necessary to checkout whether free blocks are enough
     while (true) {
         const auto free_blocks = block_pool_->freeBlocksNum();
         if (free_blocks >= static_cast<size_t>(required_blocks)) {
             break;
         }
 
-        const int need_evict     = required_blocks - static_cast<int>(free_blocks);
-        auto      evicted_blocks = block_cache_->pop(need_evict);
-        if (evicted_blocks.empty()) {
-            RTP_LLM_LOG_WARNING("ensure free blocks failed, free blocks : %d, need evict blocks : %d",
+        if (!shared_cache_) {
+            RTP_LLM_LOG_WARNING(
+                "ensure free blocks failed, no shared cache, free blocks: %zu, need: %d", free_blocks, required_blocks);
+            return false;
+        }
+
+        const size_t                  need_evict = static_cast<size_t>(required_blocks) - free_blocks;
+        SharedBlockCache::EvictResult evict_result;
+        size_t                        freed = shared_cache_->evictAndFreeForGroup(group_id_, need_evict, &evict_result);
+        if (metrics_reporter_) {
+            for (const auto& [cache_key, lifetime_ms] : evict_result.evicted_lifetime_ms) {
+                RtpLLMCacheEvictionMetricsCollector collector;
+                collector.lifetime_ms = lifetime_ms;
+                kmonitor::MetricsTags tags("scope", "gpu");
+                tags.AddTag("evict_policy",
+                            evict_result.evicted_independent_group.count(cache_key) ? "independent" : "chain");
+                tags.AddTag("backing", "device");
+                metrics_reporter_->report<RtpLLMCacheEvictionMetrics, RtpLLMCacheEvictionMetricsCollector>(&tags,
+                                                                                                           &collector);
+            }
+        }
+        if (freed == 0) {
+            RTP_LLM_LOG_WARNING("ensure free blocks failed, free blocks: %zu, need evict blocks: %zu",
                                 block_pool_->freeBlocksNum(),
                                 need_evict);
             return false;
         }
-        block_pool_->blockCacheFree(evicted_blocks);
     }
 
     return true;
 }
 
+MatchResult KVCacheGroup::match(const CacheKeysType& cache_keys) {
+    return matchPrefix(cache_keys);
+}
+
+MatchResult KVCacheGroup::matchPrefix(const CacheKeysType& /*cache_keys*/) const {
+    RTP_LLM_FAIL("KVCacheGroup gid=%d does not support prefix matching", group_id_);
+    return {};
+}
+
+MatchResult KVCacheGroup::matchSingleKey(CacheKeyType /*cache_key*/) const {
+    RTP_LLM_FAIL("KVCacheGroup gid=%d does not support single-key matching", group_id_);
+    return {};
+}
+
 size_t KVCacheGroup::freeBlocksNum() const {
     return block_pool_->freeBlocksNum();
 }
@@ -75,6 +106,26 @@ int KVCacheGroup::group_id() const {
     return group_id_;
 }
 
+const CacheGroupPolicy& KVCacheGroup::policy() const {
+    return policy_;
+}
+
+CacheReusePolicy KVCacheGroup::reusePolicy() const {
+    return policy_.reuse_policy;
+}
+
+CacheEvictPolicy KVCacheGroup::evictPolicy() const {
+    return policy_.evict_policy;
+}
+
+uint32_t KVCacheGroup::explicitBlockNum() const {
+    return policy_.explicit_block_num;
+}
+
+size_t KVCacheGroup::activeTailBlocks() const {
+    return policy_.active_tail_blocks > 0 ? static_cast<size_t>(policy_.active_tail_blocks) : 0;
+}
+
 std::unordered_map<int, torch::Tensor> KVCacheGroup::allLayerCacheBase() const {
     return global_layer_to_kv_tensors;
 }
@@ -109,4 +160,36 @@ void KVCacheGroup::reference(const BlockIndicesType& new_block_indices) {
     block_pool_->requestReference(new_block_indices);
 }
 
+bool KVCacheGroup::isCpShardable() const {
+    return policy_.is_cp_shardable;
+}
+
+bool KVCacheGroup::prefixReusable() const {
+    return policy_.prefix_reusable && policy_.reuse_policy == CacheReusePolicy::REUSABLE;
+}
+
+bool KVCacheGroup::hasSparseSlots() const {
+    return policy_.has_sparse_slots;
+}
+
+bool KVCacheGroup::hasKernelBlockSubdiv() const {
+    return policy_.has_kernel_block_subdiv;
+}
+
+bool KVCacheGroup::transferTailBlocks() const {
+    return activeTailBlocks() > 0;
+}
+
+bool KVCacheGroup::cpCompactTailBlocks() const {
+    return policy_.cp_compact_tail_blocks;
+}
+
+bool KVCacheGroup::isReservable() const {
+    return policy_.is_reservable;
+}
+
+bool KVCacheGroup::usesPinnedCpuBacking() const {
+    return policy_.uses_pinned_cpu_backing;
+}
+
 }  // namespace rtp_llm
diff --git a/rtp_llm/cpp/cache/KVCacheGroup.h b/rtp_llm/cpp/cache/group/KVCacheGroup.h
similarity index 61%
rename from rtp_llm/cpp/cache/KVCacheGroup.h
rename to rtp_llm/cpp/cache/group/KVCacheGroup.h
index 3274a40dc9..7004a523a3 100644
--- a/rtp_llm/cpp/cache/KVCacheGroup.h
+++ b/rtp_llm/cpp/cache/group/KVCacheGroup.h
@@ -7,12 +7,13 @@
 
 #include <torch/torch.h>
 
+#include "kmonitor/client/MetricsReporter.h"
 #include "rtp_llm/cpp/cache/KVCacheResource.h"
 #include "rtp_llm/cpp/cache/Types.h"
 #include "rtp_llm/cpp/cache/BufferTypes.h"
 #include "rtp_llm/cpp/cache/CacheConfig.h"
 #include "rtp_llm/cpp/cache/BlockPool.h"
-#include "rtp_llm/cpp/cache/BlockCache.h"
+#include "rtp_llm/cpp/cache/SharedBlockCache.h"
 
 namespace rtp_llm {
 
@@ -23,11 +24,19 @@ struct NeedBlocksInfo {
 
 class KVCacheGroup {
 public:
-    KVCacheGroup(const LayerIdsType& layer_ids, KVCacheSpecPtr kvcache_spec, BlockPoolPtr block_pool, int group_id):
+    KVCacheGroup(const LayerIdsType& layer_ids,
+                 KVCacheSpecPtr      kvcache_spec,
+                 BlockPoolPtr        block_pool,
+                 int                 group_id,
+                 CacheGroupPolicy    policy       = CacheGroupPolicy{},
+                 SharedBlockCache*   shared_cache = nullptr,
+                 const kmonitor::MetricsReporterPtr& metrics_reporter = nullptr):
         layer_ids_(layer_ids),
         kvcache_spec_(std::move(kvcache_spec)),
         block_pool_(block_pool),
-        block_cache_(block_pool_->blockCache()),
+        policy_(policy),
+        shared_cache_(shared_cache),
+        metrics_reporter_(metrics_reporter),
         group_id_(group_id),
         seq_size_per_block_(kvcache_spec_->seq_size_per_block) {}
 
@@ -37,10 +46,10 @@ class KVCacheGroup {
     // Allocate blocks for `seq_len` tokens; appends new IDs to `block_ids` via BlockIds::add().
     virtual bool malloc(BlockIds& block_ids, int seq_len, bool enable_reuse_cache = false, int reserve_step = 0) = 0;
     // TODO, match的时候热度不增加，最终匹配成功的时候再去增加热度。
-    virtual MatchResult match(const CacheKeysType& cache_keys)      = 0;
-    virtual void        free(const BlockIndicesType& block_indices) = 0;
-    virtual void
-    insertIntoCache(const CacheKeysType& cache_keys, const BlockIndicesType& block_indices, bool is_resident)    = 0;
+    virtual MatchResult match(const CacheKeysType& cache_keys);
+    virtual MatchResult matchPrefix(const CacheKeysType& cache_keys) const;
+    virtual MatchResult matchSingleKey(CacheKeyType cache_key) const;
+    virtual void        free(const BlockIndicesType& block_indices)                                              = 0;
     virtual void removeSkippedBlocks(BlockIds& block_ids, bool enable_reuse_cache = false, int reserve_step = 0) = 0;
     virtual int  needBlocksNum(int seq_len, int current_blocks, int reserve_step = 0) const                      = 0;
     virtual NeedBlocksInfo getNeedBlocks(
@@ -59,13 +68,29 @@ class KVCacheGroup {
     bool   ensureFreeBlocks(int need_blocks);
     int    seqSizePerBlock() const;
     int    group_id() const;
+    const CacheGroupPolicy& policy() const;
+    CacheReusePolicy        reusePolicy() const;
+    CacheEvictPolicy        evictPolicy() const;
+    uint32_t                explicitBlockNum() const;
+    size_t                  activeTailBlocks() const;
+
+    virtual bool isCpShardable() const;
+    virtual bool prefixReusable() const;
+    virtual bool hasSparseSlots() const;
+    virtual bool hasKernelBlockSubdiv() const;
+    virtual bool transferTailBlocks() const;
+    virtual bool cpCompactTailBlocks() const;
+    virtual bool isReservable() const;
+    virtual bool usesPinnedCpuBacking() const;
 
 protected:
-    LayerIdsType   layer_ids_;
-    KVCacheSpecPtr kvcache_spec_;
-    BlockPoolPtr   block_pool_;
-    BlockCachePtr  block_cache_;
-    int            group_id_ = 0;
+    LayerIdsType      layer_ids_;
+    KVCacheSpecPtr    kvcache_spec_;
+    BlockPoolPtr      block_pool_;
+    CacheGroupPolicy  policy_;
+    SharedBlockCache* shared_cache_ = nullptr;
+    kmonitor::MetricsReporterPtr metrics_reporter_ = nullptr;
+    int               group_id_     = 0;
 
     int                                    seq_size_per_block_;
     std::unordered_map<int, torch::Tensor> global_layer_to_kv_tensors;
diff --git a/rtp_llm/cpp/cache/group/LinearKVCacheGroup.cc b/rtp_llm/cpp/cache/group/LinearKVCacheGroup.cc
new file mode 100644
index 0000000000..68a8d33c34
--- /dev/null
+++ b/rtp_llm/cpp/cache/group/LinearKVCacheGroup.cc
@@ -0,0 +1,219 @@
+#include "rtp_llm/cpp/cache/group/LinearKVCacheGroup.h"
+
+#include <algorithm>
+#include <unordered_set>
+
+#include "rtp_llm/cpp/utils/Logger.h"
+
+namespace rtp_llm {
+
+void LinearKVCacheGroup::filterValidBlocks(const BlockIndicesType& in, BlockIndicesType& out) const {
+    out.clear();
+    out.reserve(in.size());
+    for (auto b : in) {
+        if (!isNullBlockIdx(b)) {
+            out.push_back(b);
+        }
+    }
+}
+
+int LinearKVCacheGroup::needBlocksNum(int seq_len, int current_blocks, int reserve_step) const {
+    int extra_blocks = reserve_step ? reserve_step - 1 : 0;
+    return std::max((seq_len + seq_size_per_block_ - 1) / seq_size_per_block_ + extra_blocks - current_blocks, 0);
+}
+
+bool LinearKVCacheGroup::shouldMaterializeBlock(int pos, int seq_len, int reserve_step, bool enable_reuse_cache) const {
+    if (pos < 0) {
+        return false;
+    }
+
+    const int  step        = std::max(1, linear_step_);
+    const int  seq_slots   = needBlocksNum(seq_len, 0, 0);
+    const int  total_slots = needBlocksNum(seq_len, 0, reserve_step);
+    const bool is_seq_tail = (seq_slots > 0) && (pos >= std::max(0, seq_slots - 2)) && (pos < seq_slots);
+    const bool is_reserve  = (reserve_step > 0) && (pos >= seq_slots) && (pos < total_slots);
+    const bool step_hit    = (((pos + 1) % step) == 0);
+    return is_reserve || (enable_reuse_cache ? (step_hit || is_seq_tail) : is_seq_tail);
+}
+
+NeedBlocksInfo LinearKVCacheGroup::getNeedBlocks(
+    int common_seq_len, int seq_len, int reserve_step, int reuse_blocks_len, bool reuse_enabled) const {
+    NeedBlocksInfo info;
+
+    // common_slots: blocks for common_seq_len (no reserve)
+    const int common_slots = needBlocksNum(common_seq_len, 0);
+    // total_slots includes reserve_step - 1 extra linear slots when reserve_step is non-zero.
+    const int total_slots = needBlocksNum(seq_len, 0, reserve_step);
+
+    auto common_required = [&](int pos) { return shouldMaterializeBlock(pos, common_seq_len, 0, reuse_enabled); };
+    auto final_required  = [&](int pos) { return shouldMaterializeBlock(pos, seq_len, reserve_step, reuse_enabled); };
+
+    for (int pos = 0; pos < common_slots; ++pos) {
+        if (common_required(pos)) {
+            info.common_blocks++;
+        }
+    }
+    for (int pos = 0; pos < total_slots; ++pos) {
+        if (final_required(pos) && !(pos < common_slots && common_required(pos))) {
+            info.extra_blocks++;
+        }
+    }
+
+    // Linear reuse materializes only one prefix block: the matched tail at
+    // reuse_blocks_len - 1. Do not count that block as newly allocated.
+    const int reused_tail_pos = (reuse_enabled && reuse_blocks_len > 0) ? reuse_blocks_len - 1 : -1;
+    if (reused_tail_pos >= 0) {
+        if (reused_tail_pos < common_slots && common_required(reused_tail_pos)) {
+            info.common_blocks--;
+        } else if (reused_tail_pos < total_slots && final_required(reused_tail_pos)) {
+            info.extra_blocks--;
+        }
+    }
+
+    info.common_blocks = std::max(info.common_blocks, 0);
+    info.extra_blocks  = std::max(info.extra_blocks, 0);
+    return info;
+}
+
+MatchResult LinearKVCacheGroup::matchSingleKey(CacheKeyType cache_key) const {
+    MatchResult result;
+    if (!shared_cache_) {
+        return result;
+    }
+    auto block_idx = shared_cache_->matchGroup(cache_key, group_id_);
+    if (!isNullBlockIdx(block_idx)) {
+        result.block_indices = {block_idx};
+    }
+    return result;
+}
+
+bool LinearKVCacheGroup::malloc(BlockIds& block_ids, int seq_len, bool enable_reuse_cache, int reserve_step) {
+    const int step               = std::max(1, linear_step_);
+    const int current_blocks_len = static_cast<int>(block_ids.blocksNum());
+    const int seq_slots          = needBlocksNum(seq_len, 0, 0);
+    const int total_slots        = needBlocksNum(seq_len, 0, reserve_step);
+    const int new_blocks_len     = std::max(total_slots - current_blocks_len, 0);
+
+    auto should_materialize = [&](int pos) {
+        // Materialize tail and tail-1: causal_conv1d_update may read
+        // (seq_len - 2) / SBP when seq_len crosses a block boundary.
+        // Leaving tail-1 NULL can hit IMA on long prompts.
+        const bool is_seq_tail = (seq_slots > 0) && (pos >= std::max(0, seq_slots - 2)) && (pos < seq_slots);
+        const bool is_reserve  = (reserve_step > 0) && (pos >= seq_slots) && (pos < total_slots);
+        const bool step_hit    = (((pos + 1) % step) == 0);
+        return is_reserve || (enable_reuse_cache ? (step_hit || is_seq_tail) : is_seq_tail);
+    };
+
+    std::vector<size_t> positions_to_backfill;
+    const auto&         existing_blocks = block_ids.blocks();
+    const int           existing_scan   = std::min(current_blocks_len, total_slots);
+    for (int i = 0; i < existing_scan; ++i) {
+        if (should_materialize(i) && isNullBlockIdx(existing_blocks[static_cast<size_t>(i)])) {
+            positions_to_backfill.push_back(static_cast<size_t>(i));
+        }
+    }
+
+    int need_alloc_blocks = 0;
+    need_alloc_blocks += static_cast<int>(positions_to_backfill.size());
+    for (int i = current_blocks_len; i < total_slots; i++) {
+        if (should_materialize(i)) {
+            need_alloc_blocks++;
+        }
+    }
+
+    if (need_alloc_blocks > 0) {
+        const auto free_blocks_num = freeBlocksNum();
+        if (free_blocks_num < static_cast<size_t>(need_alloc_blocks)) {
+            if (!ensureFreeBlocks(need_alloc_blocks)) {
+                RTP_LLM_LOG_WARNING("Insufficient free blocks for LinearKVCacheGroup: need %d, have %zu",
+                                    need_alloc_blocks,
+                                    free_blocks_num);
+                return false;
+            }
+        }
+    }
+
+    BlockIndicesType allocated_blocks;
+    if (need_alloc_blocks > 0) {
+        allocated_blocks = block_pool_->malloc(need_alloc_blocks);
+        if (allocated_blocks.size() != static_cast<size_t>(need_alloc_blocks)) {
+            if (!allocated_blocks.empty()) {
+                block_pool_->requestFree(allocated_blocks);
+            }
+            return false;
+        }
+    }
+
+    size_t allocated_idx = 0;
+    for (size_t pos : positions_to_backfill) {
+        block_ids.setAt(pos, allocated_blocks[allocated_idx++]);
+    }
+
+    BlockIndicesType new_ids;
+    new_ids.reserve(static_cast<size_t>(new_blocks_len));
+    for (int i = current_blocks_len; i < total_slots; i++) {
+        if (should_materialize(i)) {
+            new_ids.push_back(allocated_blocks[allocated_idx++]);
+        } else {
+            new_ids.push_back(NULL_BLOCK_IDX);
+        }
+    }
+    if (!new_ids.empty()) {
+        block_ids.add(new_ids);
+    }
+    RTP_LLM_CHECK_WITH_INFO(allocated_idx == allocated_blocks.size(),
+                            "linear kv allocation accounting mismatch, used=%zu allocated=%zu",
+                            allocated_idx,
+                            allocated_blocks.size());
+    return true;
+}
+
+void LinearKVCacheGroup::removeSkippedBlocks(BlockIds& block_ids, bool enable_reuse_cache, int reserve_step) {
+    const auto& block_indices = block_ids.blocks();  // const view for reading current state
+    if (block_indices.empty()) {
+        return;
+    }
+    const int step       = std::max(1, linear_step_);
+    const int block_size = static_cast<int>(block_indices.size());
+
+    BlockIndicesType    blocks_to_free;
+    std::vector<size_t> pos_to_remove;
+    // keep last 2 and every reserve_step
+    for (int i = block_size - 3 - reserve_step; i >= 0; i--) {
+        if (isNullBlockIdx(block_indices[i])) {
+            continue;
+        }
+        if (enable_reuse_cache && ((i + 1) % step) == 0) {
+            continue;
+        }
+        blocks_to_free.push_back(block_indices[i]);
+        pos_to_remove.push_back(static_cast<size_t>(i));
+    }
+    if (!blocks_to_free.empty()) {
+        block_pool_->requestFree(blocks_to_free);
+        block_ids.remove(pos_to_remove);  // null-out by position, updates kernel slots incrementally
+    }
+}
+
+void LinearKVCacheGroup::free(const BlockIndicesType& block_indices) {
+    if (block_indices.empty()) {
+        return;
+    }
+    BlockIndicesType valid;
+    filterValidBlocks(block_indices, valid);
+    if (valid.empty()) {
+        return;
+    }
+    block_pool_->requestFree(valid);
+}
+
+void LinearKVCacheGroup::reference(BlockIds& block_ids, const BlockIndicesType& new_block_indices) {
+    block_ids.add(new_block_indices);
+    BlockIndicesType valid;
+    filterValidBlocks(new_block_indices, valid);
+    if (!valid.empty()) {
+        block_pool_->requestReference(valid);
+    }
+}
+
+}  // namespace rtp_llm
diff --git a/rtp_llm/cpp/cache/LinearKVCacheGroup.h b/rtp_llm/cpp/cache/group/LinearKVCacheGroup.h
similarity index 75%
rename from rtp_llm/cpp/cache/LinearKVCacheGroup.h
rename to rtp_llm/cpp/cache/group/LinearKVCacheGroup.h
index f3daa1f197..0d8db388a1 100644
--- a/rtp_llm/cpp/cache/LinearKVCacheGroup.h
+++ b/rtp_llm/cpp/cache/group/LinearKVCacheGroup.h
@@ -4,7 +4,7 @@
 #include <vector>
 #include <cstdint>
 
-#include "rtp_llm/cpp/cache/KVCacheGroup.h"
+#include "rtp_llm/cpp/cache/group/KVCacheGroup.h"
 
 namespace rtp_llm {
 
@@ -14,15 +14,16 @@ class LinearKVCacheGroup: public KVCacheGroup {
                        std::shared_ptr<KVCacheSpec> kvcache_spec,
                        BlockPoolPtr                 block_pool,
                        int                          group_id,
-                       int                          linear_step = 0):
-        KVCacheGroup(layer_ids, kvcache_spec, block_pool, group_id), linear_step_(linear_step) {}
+                       int                          linear_step      = 0,
+                       SharedBlockCache*            shared_cache     = nullptr,
+                       const kmonitor::MetricsReporterPtr& metrics_reporter = nullptr,
+                       CacheGroupPolicy policy = defaultCacheGroupPolicy(CacheGroupType::LINEAR)):
+        KVCacheGroup(layer_ids, kvcache_spec, block_pool, group_id, policy, shared_cache, metrics_reporter),
+        linear_step_(linear_step) {}
 
-    MatchResult match(const CacheKeysType& cache_keys) override;
     // Match a single cache key (used by Hybrid allocator to do right-to-left joint matching).
-    MatchResult matchSingleKey(CacheKeyType cache_key) const;
+    MatchResult matchSingleKey(CacheKeyType cache_key) const override;
     bool malloc(BlockIds& block_ids, int seq_len, bool enable_reuse_cache = false, int reserve_step = 0) override;
-    void
-    insertIntoCache(const CacheKeysType& cache_keys, const BlockIndicesType& block_indices, bool is_resident) override;
 
     void removeSkippedBlocks(BlockIds& block_ids, bool enable_reuse_cache = false, int reserve_step = 0) override;
     void free(const BlockIndicesType& block_indices) override;
@@ -33,6 +34,7 @@ class LinearKVCacheGroup: public KVCacheGroup {
                                  int  reserve_step,
                                  int  reuse_blocks_len,
                                  bool reuse_enabled = false) const override;
+    bool           shouldMaterializeBlock(int pos, int seq_len, int reserve_step, bool enable_reuse_cache) const;
 
 private:
     void filterValidBlocks(const BlockIndicesType& in, BlockIndicesType& out) const;
diff --git a/rtp_llm/cpp/cache/group/SWAKVCacheGroup.cc b/rtp_llm/cpp/cache/group/SWAKVCacheGroup.cc
new file mode 100644
index 0000000000..7be84d60e9
--- /dev/null
+++ b/rtp_llm/cpp/cache/group/SWAKVCacheGroup.cc
@@ -0,0 +1,241 @@
+#include "rtp_llm/cpp/cache/group/SWAKVCacheGroup.h"
+
+#include <algorithm>
+#include <cstdlib>
+#include <string>
+
+#include "rtp_llm/cpp/utils/Logger.h"
+
+namespace rtp_llm {
+
+namespace {
+
+bool isActiveTailBlock(int block_idx, int seq_slots, int active_tail_blocks) {
+    if (seq_slots <= 0 || block_idx >= seq_slots) {
+        return false;
+    }
+    return block_idx >= std::max(seq_slots - active_tail_blocks, 0);
+}
+
+bool shouldAllocateBlock(
+    int block_idx, int seq_slots, int reserve_step, int step, bool enable_reuse_cache, int active_tail_blocks) {
+    const bool is_reserve = reserve_step > 0 && block_idx >= seq_slots;
+    const bool step_hit   = ((block_idx + 1) % step) == 0;
+    return is_reserve || isActiveTailBlock(block_idx, seq_slots, active_tail_blocks)
+           || (enable_reuse_cache && step_hit);
+}
+
+bool dsv4TrapInvalidKVAccessEnabled() {
+    const char* value = std::getenv("DSV4_TRAP_INVALID_KV_ACCESS");
+    if (value == nullptr) {
+        return false;
+    }
+    const std::string flag(value);
+    return !flag.empty() && flag != "0" && flag != "false" && flag != "FALSE" && flag != "off" && flag != "OFF";
+}
+
+}  // namespace
+
+bool SWAKVCacheGroup::shouldCheckSWATailBlockIds() const {
+    if (!dsv4TrapInvalidKVAccessEnabled()) {
+        return false;
+    }
+    return policy_.validate_tail_blocks;
+}
+
+bool SWAKVCacheGroup::effectiveReuseCacheForAllocation(bool enable_reuse_cache) const {
+    return enable_reuse_cache && policy_.reuse_policy == CacheReusePolicy::REUSABLE;
+}
+
+int SWAKVCacheGroup::activeTailBlockCount() const {
+    return std::max(1, policy_.active_tail_blocks);
+}
+
+void SWAKVCacheGroup::checkSWATailBlockIds(const BlockIds& block_ids, const char* caller) const {
+    if (!shouldCheckSWATailBlockIds()) {
+        return;
+    }
+
+    const auto& blocks = block_ids.blocks();
+    if (blocks.empty()) {
+        return;
+    }
+
+    const size_t block_num = blocks.size();
+    RTP_LLM_CHECK_WITH_INFO(!isNullBlockIdx(blocks[block_num - 1]),
+                            "%s invalid SWA block ids: tail block is NULL, block_num=%zu",
+                            caller,
+                            block_num);
+    if (activeTailBlockCount() >= 2 && block_num >= 2) {
+        RTP_LLM_CHECK_WITH_INFO(!isNullBlockIdx(blocks[block_num - 2]),
+                                "%s invalid SWA block ids: tail-1 block is NULL, block_num=%zu",
+                                caller,
+                                block_num);
+    }
+}
+
+void SWAKVCacheGroup::filterValidBlocks(const BlockIndicesType& in, BlockIndicesType& out) const {
+    out.clear();
+    out.reserve(in.size());
+    for (auto b : in) {
+        if (!isNullBlockIdx(b)) {
+            out.push_back(b);
+        }
+    }
+}
+
+int SWAKVCacheGroup::needBlocksNum(int seq_len, int current_blocks, int reserve_step) const {
+    return std::max((seq_len + reserve_step + seq_size_per_block_ - 1) / seq_size_per_block_ - current_blocks, 0);
+}
+
+NeedBlocksInfo SWAKVCacheGroup::getNeedBlocks(
+    int common_seq_len, int seq_len, int reserve_step, int reuse_blocks_len, bool reuse_enabled) const {
+    (void)common_seq_len;
+    const int  step                    = std::max(1, linear_step_);
+    const bool effective_reuse_enabled = effectiveReuseCacheForAllocation(reuse_enabled);
+    const int  active_tail_blocks      = activeTailBlockCount();
+
+    NeedBlocksInfo info;
+
+    const int seq_slots   = needBlocksNum(seq_len, 0);
+    const int total_slots = needBlocksNum(seq_len, 0, reserve_step);
+
+    info.common_blocks = 0;
+    for (int i = reuse_blocks_len; i < seq_slots; ++i) {
+        if (shouldAllocateBlock(i, seq_slots, /*reserve_step=*/0, step, effective_reuse_enabled, active_tail_blocks)) {
+            ++info.extra_blocks;
+        }
+    }
+    info.extra_blocks += std::max(total_slots - std::max(seq_slots, reuse_blocks_len), 0);
+
+    info.extra_blocks = std::max(info.extra_blocks, 0);
+    return info;
+}
+
+MatchResult SWAKVCacheGroup::matchSingleKey(CacheKeyType cache_key) const {
+    MatchResult result;
+    if (!shared_cache_) {
+        return result;
+    }
+    auto block_idx = shared_cache_->matchGroup(cache_key, group_id_);
+    if (!isNullBlockIdx(block_idx)) {
+        result.block_indices = {block_idx};
+    }
+    return result;
+}
+
+bool SWAKVCacheGroup::malloc(BlockIds& block_ids, int seq_len, bool enable_reuse_cache, int reserve_step) {
+    const int  step                    = std::max(1, linear_step_);
+    const bool effective_reuse_enabled = effectiveReuseCacheForAllocation(enable_reuse_cache);
+    const int  active_tail_blocks      = activeTailBlockCount();
+    const int  current_blocks_len      = static_cast<int>(block_ids.blocksNum());
+    const int  seq_slots               = needBlocksNum(seq_len, 0, 0);
+    const int  new_blocks_len          = needBlocksNum(seq_len, current_blocks_len, reserve_step);
+
+    if (new_blocks_len == 0) {
+        checkSWATailBlockIds(block_ids, "SWAKVCacheGroup::malloc");
+        return true;
+    }
+
+    int need_alloc_blocks = 0;
+    for (int i = current_blocks_len; i < current_blocks_len + new_blocks_len; i++) {
+        if (shouldAllocateBlock(i, seq_slots, reserve_step, step, effective_reuse_enabled, active_tail_blocks)) {
+            need_alloc_blocks++;
+        }
+    }
+
+    if (need_alloc_blocks > 0) {
+        const auto free_blocks_num = freeBlocksNum();
+        if (free_blocks_num < static_cast<size_t>(need_alloc_blocks)) {
+            if (!ensureFreeBlocks(need_alloc_blocks)) {
+                RTP_LLM_LOG_WARNING("Insufficient free blocks for SWAKVCacheGroup: need %d, have %zu",
+                                    need_alloc_blocks,
+                                    free_blocks_num);
+                return false;
+            }
+        }
+    }
+
+    BlockIndicesType allocated_blocks;
+    if (need_alloc_blocks > 0) {
+        allocated_blocks = block_pool_->malloc(need_alloc_blocks);
+        if (allocated_blocks.size() != static_cast<size_t>(need_alloc_blocks)) {
+            if (!allocated_blocks.empty()) {
+                block_pool_->requestFree(allocated_blocks);
+            }
+            return false;
+        }
+    }
+
+    BlockIndicesType new_ids;
+    new_ids.reserve(static_cast<size_t>(new_blocks_len));
+    size_t allocated_idx = 0;
+    for (int i = current_blocks_len; i < current_blocks_len + new_blocks_len; i++) {
+        const bool should_alloc =
+            shouldAllocateBlock(i, seq_slots, reserve_step, step, effective_reuse_enabled, active_tail_blocks);
+        if (should_alloc) {
+            new_ids.push_back(allocated_blocks[allocated_idx++]);
+        } else {
+            new_ids.push_back(NULL_BLOCK_IDX);
+        }
+    }
+    RTP_LLM_CHECK_WITH_INFO(allocated_idx == allocated_blocks.size(),
+                            "swa kv allocation accounting mismatch, used=%zu allocated=%zu",
+                            allocated_idx,
+                            allocated_blocks.size());
+    block_ids.add(new_ids);
+    checkSWATailBlockIds(block_ids, "SWAKVCacheGroup::malloc");
+    return true;
+}
+
+void SWAKVCacheGroup::removeSkippedBlocks(BlockIds& block_ids, bool enable_reuse_cache, int reserve_step) {
+    const auto& block_indices = block_ids.blocks();
+    if (block_indices.empty()) {
+        checkSWATailBlockIds(block_ids, "SWAKVCacheGroup::removeSkippedBlocks");
+        return;
+    }
+    const int  step                    = std::max(1, linear_step_);
+    const bool effective_reuse_enabled = effectiveReuseCacheForAllocation(enable_reuse_cache);
+    const int  active_tail_blocks      = activeTailBlockCount();
+    const int  block_size              = static_cast<int>(block_indices.size());
+
+    BlockIndicesType    blocks_to_free;
+    std::vector<size_t> pos_to_remove;
+    for (int i = block_size - active_tail_blocks - 1 - reserve_step; i >= 0; i--) {
+        if (isNullBlockIdx(block_indices[i])) {
+            break;
+        }
+        if (effective_reuse_enabled && ((i + 1) % step) == 0) {
+            continue;
+        }
+        blocks_to_free.push_back(block_indices[i]);
+        pos_to_remove.push_back(static_cast<size_t>(i));
+    }
+    if (!blocks_to_free.empty()) {
+        block_pool_->requestFree(blocks_to_free);
+        block_ids.remove(pos_to_remove);
+    }
+    checkSWATailBlockIds(block_ids, "SWAKVCacheGroup::removeSkippedBlocks");
+}
+
+void SWAKVCacheGroup::free(const BlockIndicesType& block_indices) {
+    if (block_indices.empty()) {
+        return;
+    }
+    BlockIndicesType valid;
+    filterValidBlocks(block_indices, valid);
+    if (!valid.empty()) {
+        block_pool_->requestFree(valid);
+    }
+}
+
+void SWAKVCacheGroup::reference(BlockIds& block_ids, const BlockIndicesType& new_block_indices) {
+    block_ids.add(new_block_indices);
+    BlockIndicesType valid;
+    filterValidBlocks(new_block_indices, valid);
+    if (!valid.empty()) {
+        block_pool_->requestReference(valid);
+    }
+}
+
+}  // namespace rtp_llm
diff --git a/rtp_llm/cpp/cache/group/SWAKVCacheGroup.h b/rtp_llm/cpp/cache/group/SWAKVCacheGroup.h
new file mode 100644
index 0000000000..202a85766b
--- /dev/null
+++ b/rtp_llm/cpp/cache/group/SWAKVCacheGroup.h
@@ -0,0 +1,46 @@
+#pragma once
+
+#include <memory>
+
+#include "rtp_llm/cpp/cache/group/KVCacheGroup.h"
+
+namespace rtp_llm {
+
+class SWAKVCacheGroup: public KVCacheGroup {
+public:
+    SWAKVCacheGroup(const LayerIdsType&                    layer_ids,
+                    std::shared_ptr<KVCacheSpec>           kvcache_spec,
+                    BlockPoolPtr                           block_pool,
+                    int                                    group_id,
+                    int                                    linear_step      = 0,
+                    SharedBlockCache*                      shared_cache     = nullptr,
+                    const kmonitor::MetricsReporterPtr&    metrics_reporter = nullptr,
+                    CacheGroupPolicy policy = defaultCacheGroupPolicy(CacheGroupType::SWA)):
+        KVCacheGroup(layer_ids, kvcache_spec, block_pool, group_id, policy, shared_cache, metrics_reporter),
+        linear_step_(linear_step) {}
+
+    MatchResult matchSingleKey(CacheKeyType cache_key) const override;
+    bool malloc(BlockIds& block_ids, int seq_len, bool enable_reuse_cache = false, int reserve_step = 0) override;
+    void removeSkippedBlocks(BlockIds& block_ids, bool enable_reuse_cache = false, int reserve_step = 0) override;
+    void free(const BlockIndicesType& block_indices) override;
+    void reference(BlockIds& block_ids, const BlockIndicesType& new_block_indices) override;
+    int  needBlocksNum(int seq_len, int current_blocks, int reserve_step = 0) const override;
+    NeedBlocksInfo getNeedBlocks(int  common_seq_len,
+                                 int  seq_len,
+                                 int  reserve_step,
+                                 int  reuse_blocks_len,
+                                 bool reuse_enabled = false) const override;
+
+private:
+    void filterValidBlocks(const BlockIndicesType& in, BlockIndicesType& out) const;
+    int  activeTailBlockCount() const;
+    bool effectiveReuseCacheForAllocation(bool enable_reuse_cache) const;
+    bool shouldCheckSWATailBlockIds() const;
+    void checkSWATailBlockIds(const BlockIds& block_ids, const char* caller) const;
+
+    int linear_step_ = 0;
+};
+
+using SWAKVCacheGroupPtr = std::shared_ptr<SWAKVCacheGroup>;
+
+}  // namespace rtp_llm
diff --git a/rtp_llm/cpp/cache/spec/CacheGroupType.h b/rtp_llm/cpp/cache/spec/CacheGroupType.h
new file mode 100644
index 0000000000..9bbe3ccdff
--- /dev/null
+++ b/rtp_llm/cpp/cache/spec/CacheGroupType.h
@@ -0,0 +1,81 @@
+#pragma once
+
+#include <cstdint>
+
+namespace rtp_llm {
+
+// Cache group type for hybrid KV-cache:
+// - LINEAR: linear attention group (PD cache-store transfer keeps the last block)
+// - FULL: full attention group (all blocks are needed for cache-store transfer)
+// - SWA: sliding-window attention group (PD cache-store transfer keeps the last two blocks)
+enum class CacheGroupType : int8_t {
+    LINEAR = 0,
+    FULL   = 1,
+    SWA    = 2,
+};
+
+enum class CacheReusePolicy : int8_t {
+    REUSABLE     = 0,
+    NON_REUSABLE = 1,
+};
+
+enum class CacheEvictPolicy : int8_t {
+    CHAIN       = 0,
+    INDEPENDENT = 1,
+    NONE        = 2,
+};
+
+struct CacheGroupPolicy {
+    CacheReusePolicy reuse_policy              = CacheReusePolicy::REUSABLE;
+    CacheEvictPolicy evict_policy              = CacheEvictPolicy::CHAIN;
+    int              active_tail_blocks        = 2;
+    bool             validate_tail_blocks      = true;
+    uint32_t         explicit_block_num        = 0;
+    bool             reserve_from_paged_budget = false;
+    bool             prefix_reusable           = true;
+    bool             uses_pinned_cpu_backing   = false;
+    bool             is_cp_shardable           = true;
+    bool             has_sparse_slots          = false;
+    bool             has_kernel_block_subdiv   = true;
+    bool             cp_compact_tail_blocks    = false;
+    bool             is_reservable             = true;
+    CacheGroupType   group_type                = CacheGroupType::FULL;
+};
+
+inline const char* cacheGroupTypeName(CacheGroupType group_type) {
+    switch (group_type) {
+        case CacheGroupType::LINEAR:
+            return "LINEAR";
+        case CacheGroupType::FULL:
+            return "FULL";
+        case CacheGroupType::SWA:
+            return "SWA";
+    }
+    return "UNKNOWN";
+}
+
+inline const char* cacheEvictPolicyName(CacheEvictPolicy evict_policy) {
+    switch (evict_policy) {
+        case CacheEvictPolicy::CHAIN:
+            return "chain";
+        case CacheEvictPolicy::INDEPENDENT:
+            return "independent";
+        case CacheEvictPolicy::NONE:
+            return "none";
+    }
+    return "unknown";
+}
+
+inline CacheGroupPolicy defaultCacheGroupPolicy(CacheGroupType group_type) {
+    CacheGroupPolicy policy;
+    policy.group_type         = group_type;
+    policy.active_tail_blocks = group_type == CacheGroupType::LINEAR ? 1 : (group_type == CacheGroupType::SWA ? 2 : 0);
+    policy.prefix_reusable = group_type == CacheGroupType::FULL;
+    policy.is_cp_shardable = group_type == CacheGroupType::FULL;
+    policy.has_sparse_slots = group_type != CacheGroupType::FULL;
+    policy.has_kernel_block_subdiv = group_type == CacheGroupType::FULL;
+    policy.cp_compact_tail_blocks = group_type == CacheGroupType::SWA;
+    return policy;
+}
+
+}  // namespace rtp_llm
diff --git a/rtp_llm/cpp/cache/KVCacheSpec.h b/rtp_llm/cpp/cache/spec/KVCacheSpec.h
similarity index 53%
rename from rtp_llm/cpp/cache/KVCacheSpec.h
rename to rtp_llm/cpp/cache/spec/KVCacheSpec.h
index c8dec3a39a..e7df5ec9b1 100644
--- a/rtp_llm/cpp/cache/KVCacheSpec.h
+++ b/rtp_llm/cpp/cache/spec/KVCacheSpec.h
@@ -3,10 +3,11 @@
 // This header includes all KVCacheSpec related classes
 // Split into separate files for better modularity
 
-#include "rtp_llm/cpp/cache/KVCacheSpecBase.h"
-#include "rtp_llm/cpp/cache/MHAKVCacheSpec.h"
-#include "rtp_llm/cpp/cache/MLAKVCacheSpec.h"
-#include "rtp_llm/cpp/cache/LinearKVCacheSpec.h"
+#include "rtp_llm/cpp/cache/spec/KVCacheSpecBase.h"
+#include "rtp_llm/cpp/cache/spec/MHAKVCacheSpec.h"
+#include "rtp_llm/cpp/cache/spec/MLAKVCacheSpec.h"
+#include "rtp_llm/cpp/cache/spec/LinearKVCacheSpec.h"
+#include "rtp_llm/cpp/cache/spec/OpaqueKVCacheSpec.h"
 
 namespace rtp_llm {
 // All KVCacheSpec classes are now available through individual headers
diff --git a/rtp_llm/cpp/cache/spec/KVCacheSpecBase.h b/rtp_llm/cpp/cache/spec/KVCacheSpecBase.h
new file mode 100644
index 0000000000..a547ee8dac
--- /dev/null
+++ b/rtp_llm/cpp/cache/spec/KVCacheSpecBase.h
@@ -0,0 +1,188 @@
+#pragma once
+
+#include <memory>
+#include <vector>
+#include <map>
+#include <sstream>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "rtp_llm/cpp/cache/BlockInfo.h"
+#include "rtp_llm/cpp/cache/spec/CacheGroupType.h"
+#include "rtp_llm/cpp/cache/Types.h"
+#include "rtp_llm/cpp/config/ConfigModules.h"
+#include "rtp_llm/cpp/utils/AssertUtils.h"
+#include "rtp_llm/models_py/bindings/core/Types.h"
+#include "rtp_llm/cpp/model_utils/AttentionConfig.h"
+
+namespace rtp_llm {
+
+// Physical signature used to determine whether two KVCacheSpec instances can
+// share the same KVCacheGroup and BlockPool. Two specs with identical tags AND
+// identical SpecPhysicalSignature are merged into a single group; different tags
+// always produce different groups regardless of physical equality.
+struct SpecPhysicalSignature {
+    size_t         block_size_bytes       = 0;
+    size_t         scale_block_size_bytes = 0;
+    CacheGroupType lifecycle_type         = CacheGroupType::FULL;
+    rtp_llm::DataType dtype               = rtp_llm::DataType::TYPE_INVALID;
+
+    bool operator==(const SpecPhysicalSignature& other) const {
+        return block_size_bytes       == other.block_size_bytes
+            && scale_block_size_bytes == other.scale_block_size_bytes
+            && lifecycle_type         == other.lifecycle_type
+            && dtype                  == other.dtype;
+    }
+    bool operator!=(const SpecPhysicalSignature& other) const {
+        return !(*this == other);
+    }
+};
+
+enum KVCacheSpecType {
+    MultiHeadAttention,        // MHAKVCacheSpec: standard multi-head attention KV cache
+    MultiHeadLatentAttention,  // MLAKVCacheSpec: MLA compressed latent KV cache
+    LinearAttention,           // LinearKVCacheSpec: linear / SSM attention state cache
+    OpaqueKV,                  // Byte-addressed opaque paged KV pool
+    OpaqueState,               // Fixed-allocation opaque state / SWA-like pool
+};
+
+enum class CPTransferPolicy {
+    NONE,
+    INTRA_BLOCK_SLICE,
+};
+
+inline const char* KVCacheSpecTypeToString(KVCacheSpecType t) {
+    switch (t) {
+        case KVCacheSpecType::MultiHeadAttention:
+            return "MultiHeadAttention";
+        case KVCacheSpecType::MultiHeadLatentAttention:
+            return "MultiHeadLatentAttention";
+        case KVCacheSpecType::LinearAttention:
+            return "LinearAttention";
+        case KVCacheSpecType::OpaqueKV:
+            return "OpaqueKV";
+        case KVCacheSpecType::OpaqueState:
+            return "OpaqueState";
+        default:
+            return "Unknown";
+    }
+}
+
+struct KVCacheSpec;
+using KVCacheSpecPtr = std::shared_ptr<KVCacheSpec>;
+using LayerKVCacheSpecs = std::vector<std::vector<KVCacheSpecPtr>>;
+
+struct KVCacheSpec {
+    std::string tag;
+    std::vector<int> layers;
+    uint32_t local_head_num_kv = 1;
+    uint32_t seq_size_per_block = 1;
+    bool     is_state_cache = false;
+    bool     skip_prefix_reuse = false;
+
+    // Lifecycle governs the allocation strategy for this cache group.
+    // Each concrete spec subclass sets this in its constructor; do NOT set it
+    // manually from outside the spec class hierarchy.
+    //   FULL    - standard paged allocation (MHA, MLA, OpaqueKV)
+    //   LINEAR  - fixed-capacity ring buffer (LinearAttention / SSM state)
+    //   SWA     - fixed-size tail-allocation pool (DSV4 state / SWA_KV)
+    CacheGroupType   lifecycle = CacheGroupType::FULL;
+
+    KVCacheSpecType   type = KVCacheSpecType::MultiHeadAttention;
+    rtp_llm::DataType dtype = rtp_llm::DataType::TYPE_INVALID;
+
+    // Derived from lifecycle; true when this spec uses SWA-style fixed allocation.
+    bool isFixedCache() const { return lifecycle == CacheGroupType::SWA; }
+
+    virtual size_t block_size() const   = 0;
+    virtual size_t k_block_size() const = 0;
+    virtual size_t v_block_size() const = 0;
+
+    virtual size_t block_size_bytes() const   = 0;
+    virtual size_t k_block_size_bytes() const = 0;
+    virtual size_t v_block_size_bytes() const = 0;
+
+    virtual size_t scale_block_size_bytes() const {
+        return 0;
+    }
+    virtual size_t k_scale_block_size_bytes() const {
+        return 0;
+    }
+    virtual size_t v_scale_block_size_bytes() const {
+        return 0;
+    }
+
+    virtual KVCacheSpecPtr clone() const = 0;
+
+    virtual CPTransferPolicy cpTransferPolicy() const {
+        return CPTransferPolicy::NONE;
+    }
+
+    bool supportsCpSlice() const {
+        return cpTransferPolicy() == CPTransferPolicy::INTRA_BLOCK_SLICE;
+    }
+
+    virtual std::vector<BlockInfo> cpSliceDestination(std::vector<BlockInfo> parts,
+                                                      size_t                 cp_size,
+                                                      size_t                 peer_idx) const {
+        (void)cp_size;
+        (void)peer_idx;
+        return parts;
+    }
+
+    virtual std::vector<BlockInfo> sliceBlockForPeer(std::vector<BlockInfo> parts,
+                                                     size_t                 cp_size,
+                                                     size_t                 peer_idx) const {
+        return cpSliceDestination(std::move(parts), cp_size, peer_idx);
+    }
+
+    std::string fingerprint() const {
+        std::ostringstream os;
+        os << "tag=" << tag << ";type=" << static_cast<int>(type) << ";dtype=" << static_cast<int>(dtype)
+           << ";local_head_num_kv=" << local_head_num_kv << ";seq_size_per_block=" << seq_size_per_block;
+        os << fingerprintExtra();
+        return os.str();
+    }
+
+    virtual std::string debugString(size_t indent = 0) const = 0;
+
+    // Returns the physical signature used for spec grouping.
+    // Two specs with the same (tag, physicalSignature()) are merged into one
+    // KVCacheGroup. lifecycle is a direct field — no switch needed.
+    // LinearKVCacheSpec overrides to encode its dual-dtype block layout.
+    virtual SpecPhysicalSignature physicalSignature() const {
+        return {block_size_bytes(), scale_block_size_bytes(), lifecycle, dtype};
+    }
+
+protected:
+    virtual std::string fingerprintExtra() const {
+        return "";
+    }
+
+    // Helper method to generate common parts of debug string
+    std::string commonDebugString(size_t indent = 0) const {
+        const std::string indent_str = std::string(indent, ' ');
+        const std::string indent1    = indent_str + "  ";
+
+        std::ostringstream os;
+        os << indent1 << "tag=" << tag << "\n";
+        os << indent1 << "type=" << KVCacheSpecTypeToString(type) << "(" << static_cast<int>(type) << ")\n";
+        os << indent1 << "dtype=" << static_cast<int>(dtype) << "\n";
+        os << indent1 << "layers.size=" << layers.size() << "\n";
+        os << indent1 << "local_head_num_kv=" << local_head_num_kv << "\n";
+        os << indent1 << "seq_size_per_block=" << seq_size_per_block << "\n";
+        os << indent1 << "is_state_cache=" << (is_state_cache ? "true" : "false") << "\n";
+        os << indent1 << "is_fixed_cache=" << (isFixedCache() ? "true" : "false") << "\n";
+        os << indent1 << "skip_prefix_reuse=" << (skip_prefix_reuse ? "true" : "false") << "\n";
+        os << indent1 << "block_size=" << block_size() << "\n";
+        os << indent1 << "k_block_size=" << k_block_size() << "\n";
+        os << indent1 << "v_block_size=" << v_block_size() << "\n";
+        os << indent1 << "block_size_bytes=" << block_size_bytes() << "\n";
+        os << indent1 << "k_block_size_bytes=" << k_block_size_bytes() << "\n";
+        os << indent1 << "v_block_size_bytes=" << v_block_size_bytes() << "\n";
+        return os.str();
+    }
+};
+
+}  // namespace rtp_llm
diff --git a/rtp_llm/cpp/cache/spec/KVCacheSpecDesc.h b/rtp_llm/cpp/cache/spec/KVCacheSpecDesc.h
new file mode 100644
index 0000000000..b2a957300e
--- /dev/null
+++ b/rtp_llm/cpp/cache/spec/KVCacheSpecDesc.h
@@ -0,0 +1,211 @@
+#pragma once
+
+#include <algorithm>
+#include <memory>
+#include <numeric>
+
+#include "rtp_llm/cpp/cache/spec/KVCacheSpecDescTypes.h"
+#include "rtp_llm/cpp/cache/spec/KVCacheSpec.h"
+#include "rtp_llm/cpp/config/ConfigModules.h"
+#include "rtp_llm/cpp/utils/AssertUtils.h"
+
+namespace rtp_llm {
+
+struct SpecBuildContext {
+    DataType dtype                   = DataType::TYPE_INVALID;
+    uint32_t seq_size_per_block      = 0;
+    uint32_t attn_tp_size            = 1;  // TP size for computing local head counts from global desc fields
+    uint32_t kernel_tokens_per_block = 0;
+    uint32_t gen_num_per_cycle       = 0;
+    uint32_t cp_size                 = 1;
+    bool     cp_prefill_sliced       = false;
+};
+
+class SpecBuilder {
+public:
+    static KVCacheSpecPtr build(const KVCacheSpecDesc& desc, const SpecBuildContext& ctx) {
+        RTP_LLM_CHECK_WITH_INFO(!desc.tag.empty(), "KVCacheSpecDesc tag must not be empty");
+        auto spec = buildTyped(desc, ctx);
+        spec->tag                = desc.tag;
+        spec->seq_size_per_block = effectiveSeqSizePerBlock(desc, ctx);
+        spec->dtype              = dataTypeOr(desc.dtype, dataTypeOr(ctx.dtype, desc.store_dtype));
+        return spec;
+    }
+
+    static CacheGroupType groupType(const KVCacheSpecDesc& desc) {
+        switch (desc.cache_type) {
+            case CacheType::LINEAR:
+                return CacheGroupType::LINEAR;
+            case CacheType::FIXED_STATE:
+                return CacheGroupType::SWA;
+            case CacheType::MHA:
+            case CacheType::MLA:
+            case CacheType::COMPRESSED_KV:
+                return CacheGroupType::FULL;
+        }
+        return CacheGroupType::FULL;
+    }
+
+private:
+    static uint32_t valueOr(uint32_t value, uint32_t fallback) {
+        return value == 0 ? fallback : value;
+    }
+
+    static DataType dataTypeOr(DataType value, DataType fallback) {
+        return value == DataType::TYPE_INVALID ? fallback : value;
+    }
+
+    static uint32_t alignUpToMultiple(uint32_t value, uint32_t multiple) {
+        RTP_LLM_CHECK_WITH_INFO(multiple > 0, "align multiple must be > 0");
+        return ((value + multiple - 1) / multiple) * multiple;
+    }
+
+    static uint32_t effectiveSeqSizePerBlock(const KVCacheSpecDesc& desc, const SpecBuildContext& ctx) {
+        const auto ctx_seq_size = valueOr(ctx.seq_size_per_block, 1);
+        if (desc.extra.use_fixed_region_cp_tokens && ctx.cp_size > 1) {
+            return ctx_seq_size * ctx.cp_size;
+        }
+        return valueOr(desc.seq_size_per_block, ctx_seq_size);
+    }
+
+    static uint32_t computeStateRingEntries(const KVCacheSpecDesc& desc, const SpecBuildContext& ctx) {
+        RTP_LLM_CHECK_WITH_INFO(desc.extra.state_ring_compression_ratio > 0,
+                                "state ring desc tag=%s requires positive state_ring_compression_ratio",
+                                desc.tag.c_str());
+        const uint32_t window =
+            (1 + desc.extra.state_ring_overlap) * desc.extra.state_ring_compression_ratio;
+        const uint32_t raw =
+            window + (desc.extra.state_ring_add_gen_num_per_cycle ? ctx.gen_num_per_cycle : 0);
+        return (raw + 1) & ~1U;
+    }
+
+    static uint32_t effectiveEntriesPerBlock(const KVCacheSpecDesc& desc, const SpecBuildContext& ctx) {
+        if (desc.extra.derive_entries_from_kernel_block) {
+            RTP_LLM_CHECK_WITH_INFO(desc.compression_ratio > 0,
+                                    "desc tag=%s derives entries from kernel block but has invalid compression_ratio=%u",
+                                    desc.tag.c_str(),
+                                    desc.compression_ratio);
+            RTP_LLM_CHECK_WITH_INFO(ctx.kernel_tokens_per_block > 0,
+                                    "desc tag=%s derives entries from kernel block but kernel_tokens_per_block is 0",
+                                    desc.tag.c_str());
+            RTP_LLM_CHECK_WITH_INFO(ctx.kernel_tokens_per_block % desc.compression_ratio == 0,
+                                    "desc tag=%s compression_ratio=%u must divide kernel block %u",
+                                    desc.tag.c_str(),
+                                    desc.compression_ratio,
+                                    ctx.kernel_tokens_per_block);
+            return ctx.kernel_tokens_per_block / desc.compression_ratio;
+        }
+
+        if (desc.extra.state_ring_compression_ratio > 0) {
+            uint32_t entries = computeStateRingEntries(desc, ctx);
+            if (ctx.cp_size > 1 && (desc.extra.cp_align_entries || desc.extra.cp_slice_entries)) {
+                entries = alignUpToMultiple(entries, ctx.cp_size);
+                if (desc.extra.cp_slice_entries && ctx.cp_prefill_sliced) {
+                    entries /= ctx.cp_size;
+                }
+            }
+            return entries;
+        }
+
+        return desc.entries_per_block;
+    }
+
+    static size_t effectiveFixedStateBlockOverride(const KVCacheSpecDesc& desc,
+                                                   uint32_t               entries_per_block,
+                                                   const SpecBuildContext& ctx) {
+        if (ctx.cp_size <= 1 || !ctx.cp_prefill_sliced || !desc.extra.cp_prefill_slice_block_bytes) {
+            return desc.block_size_bytes_override;
+        }
+        const size_t natural_bytes = static_cast<size_t>(entries_per_block) * desc.entry_elems * getTypeSize(desc.store_dtype);
+        const size_t align =
+            desc.block_size_bytes_alignment > 0 ?
+                std::lcm(desc.block_size_bytes_alignment, static_cast<size_t>(ctx.cp_size)) :
+                static_cast<size_t>(ctx.cp_size);
+        const size_t full_stride_bytes = ((natural_bytes + align - 1) / align) * align;
+        RTP_LLM_CHECK_WITH_INFO(full_stride_bytes % ctx.cp_size == 0,
+                                "CP prefill byte slicing tag=%s full stride %zu must be divisible by cp_size %u",
+                                desc.tag.c_str(),
+                                full_stride_bytes,
+                                ctx.cp_size);
+        return full_stride_bytes / ctx.cp_size;
+    }
+
+    // Dispatch to per-type factory methods.
+    // Each factory method owns all type-specific field assignments,
+    // including local_head_num_kv derived from global desc fields and runtime TP size.
+    static KVCacheSpecPtr buildTyped(const KVCacheSpecDesc& desc, const SpecBuildContext& ctx) {
+        switch (desc.cache_type) {
+            case CacheType::MHA:           return buildMHA(desc, ctx);
+            case CacheType::MLA:           return buildMLA(desc);
+            case CacheType::LINEAR:        return buildLinear(desc, ctx);
+            case CacheType::COMPRESSED_KV: return buildCompressedKV(desc, ctx);
+            case CacheType::FIXED_STATE:   return buildFixedState(desc, ctx);
+        }
+        RTP_LLM_CHECK_WITH_INFO(false, "unknown CacheType=%d", static_cast<int>(desc.cache_type));
+        return nullptr;
+    }
+
+    // MHA/GQA: local_head_num_kv = global_kv_heads / TP, with gcd fallback for non-divisible TP.
+    static KVCacheSpecPtr buildMHA(const KVCacheSpecDesc& desc, const SpecBuildContext& ctx) {
+        const uint32_t tp   = std::max(1u, ctx.attn_tp_size);
+        auto spec               = std::make_shared<MHAKVCacheSpec>();
+        spec->size_per_head     = desc.size_per_head;
+        const uint32_t kv       = valueOr(desc.num_kv_heads, 1);
+        spec->local_head_num_kv = (kv % tp == 0) ? kv / tp : kv / std::gcd(kv, tp);
+        return spec;
+    }
+
+    // MLA: local_head_num_kv is always 1 — heads are not split across TP.
+    static KVCacheSpecPtr buildMLA(const KVCacheSpecDesc& desc) {
+        auto spec               = std::make_shared<MLAKVCacheSpec>();
+        spec->kv_lora_rank      = desc.kv_lora_rank;
+        spec->rope_head_dim     = desc.rope_head_dim;
+        spec->local_head_num_kv = 1;
+        return spec;
+    }
+
+    // Linear Attention: all three local head fields derived from global counts / TP.
+    static KVCacheSpecPtr buildLinear(const KVCacheSpecDesc& desc, const SpecBuildContext& ctx) {
+        const uint32_t tp   = std::max(1u, ctx.attn_tp_size);
+        auto spec               = std::make_shared<LinearKVCacheSpec>();
+        spec->local_num_k_heads = desc.num_k_heads / tp;
+        spec->local_num_v_heads = desc.num_v_heads / tp;
+        spec->head_k_dim        = desc.head_k_dim;
+        spec->head_v_dim        = desc.head_v_dim;
+        spec->conv_kernel_dim   = desc.conv_kernel_dim;
+        spec->ssm_state_dtype   = dataTypeOr(desc.ssm_state_dtype, DataType::TYPE_BF16);
+        spec->conv_state_dtype  = dataTypeOr(desc.conv_state_dtype, DataType::TYPE_BF16);
+        const uint32_t v_heads  = valueOr(desc.num_v_heads, 1);
+        spec->local_head_num_kv = std::max(1u, (v_heads > 1u) ? v_heads / tp : v_heads);
+        return spec;
+    }
+
+    // COMPRESSED_KV / FIXED_STATE: no per-head TP split, local_head_num_kv = global value.
+    static KVCacheSpecPtr buildCompressedKV(const KVCacheSpecDesc& desc, const SpecBuildContext& ctx) {
+        auto spec                        = std::make_shared<CompressedKVCacheSpec>();
+        spec->entry_elems                = desc.entry_elems;
+        spec->entries_per_block          = effectiveEntriesPerBlock(desc, ctx);
+        spec->compression_ratio          = valueOr(desc.compression_ratio, 1);
+        spec->store_dtype                = desc.store_dtype;
+        spec->block_size_bytes_alignment = desc.block_size_bytes_alignment;
+        spec->local_head_num_kv          = valueOr(desc.num_kv_heads, 1);
+        return spec;
+    }
+
+    static KVCacheSpecPtr buildFixedState(const KVCacheSpecDesc& desc, const SpecBuildContext& ctx) {
+        const auto entries_per_block            = effectiveEntriesPerBlock(desc, ctx);
+        auto spec                              = std::make_shared<FixedStateCacheSpec>();
+        spec->state_dim                        = desc.entry_elems;
+        spec->entries_per_block                = entries_per_block;
+        spec->store_dtype                      = desc.store_dtype;
+        spec->block_size_bytes_override        = effectiveFixedStateBlockOverride(desc, entries_per_block, ctx);
+        spec->block_size_bytes_alignment       = desc.block_size_bytes_alignment;
+        spec->block_size_alignment_min_entries = desc.block_size_alignment_min_entries;
+        spec->is_state_cache                   = desc.is_state_cache;
+        spec->skip_prefix_reuse                = desc.skip_prefix_reuse;
+        spec->local_head_num_kv                = valueOr(desc.num_kv_heads, 1);
+        return spec;
+    }
+};
+
+}  // namespace rtp_llm
diff --git a/rtp_llm/cpp/cache/spec/KVCacheSpecDescTypes.h b/rtp_llm/cpp/cache/spec/KVCacheSpecDescTypes.h
new file mode 100644
index 0000000000..4bd554e3aa
--- /dev/null
+++ b/rtp_llm/cpp/cache/spec/KVCacheSpecDescTypes.h
@@ -0,0 +1,88 @@
+#pragma once
+
+#include <cstddef>
+#include <cstdint>
+#include <string>
+
+#include "rtp_llm/cpp/cache/spec/CacheGroupType.h"
+#include "rtp_llm/models_py/bindings/core/Types.h"
+
+namespace rtp_llm {
+
+enum class CacheType : int8_t {
+    MHA           = 0,
+    MLA           = 1,
+    LINEAR        = 2,
+    COMPRESSED_KV = 3,
+    FIXED_STATE   = 4,
+};
+
+struct KVCacheSpecDescExtra {
+    uint32_t explicit_block_num        = 0;
+    bool     reserve_from_paged_budget = false;
+
+    bool     derive_entries_from_kernel_block = false;
+    uint32_t state_ring_compression_ratio     = 0;
+    uint32_t state_ring_overlap               = 0;
+    bool     state_ring_add_gen_num_per_cycle = false;
+    bool     cp_align_entries                 = false;
+    bool     cp_slice_entries                 = false;
+    bool     cp_prefill_slice_block_bytes     = false;
+    bool     use_fixed_region_cp_tokens       = false;
+};
+
+struct KVCacheSpecDesc {
+    std::string tag;
+    CacheType   cache_type = CacheType::MHA;
+    uint32_t num_kv_heads      = 0;  // global model KV head count (MHA: kv_head_num; MLA: 1)
+    uint32_t seq_size_per_block = 0;
+    DataType dtype              = DataType::TYPE_INVALID;
+
+    uint32_t size_per_head = 0;
+    uint32_t kv_lora_rank  = 0;
+    uint32_t rope_head_dim = 0;
+
+    uint32_t num_k_heads       = 0;  // Linear Attention: global key head count
+    uint32_t num_v_heads       = 0;  // Linear Attention: global value head count
+    uint32_t head_k_dim        = 0;
+    uint32_t head_v_dim        = 0;
+    uint32_t conv_kernel_dim   = 0;
+    DataType ssm_state_dtype   = DataType::TYPE_INVALID;
+    DataType conv_state_dtype  = DataType::TYPE_INVALID;
+
+    uint32_t entry_elems                      = 0;
+    uint32_t entries_per_block                = 0;
+    DataType store_dtype                      = DataType::TYPE_INVALID;
+    uint32_t compression_ratio                = 1;
+    size_t   block_size_bytes_override        = 0;
+    size_t   block_size_bytes_alignment       = 0;
+    uint32_t block_size_alignment_min_entries = 0;
+    bool     is_state_cache                   = true;
+    bool     skip_prefix_reuse                = false;
+
+    bool             has_reuse_policy          = false;
+    CacheReusePolicy reuse_policy              = CacheReusePolicy::REUSABLE;
+    bool             has_evict_policy          = false;
+    CacheEvictPolicy evict_policy              = CacheEvictPolicy::CHAIN;
+    bool             has_active_tail_blocks    = false;
+    int              active_tail_blocks        = 0;
+    bool             has_validate_tail_blocks  = false;
+    bool             validate_tail_blocks      = true;
+    KVCacheSpecDescExtra extra;
+    bool             has_prefix_reusable       = false;
+    bool             prefix_reusable           = true;
+    bool             uses_pinned_cpu_backing   = false;
+    bool             has_is_cp_shardable       = false;
+    bool             is_cp_shardable           = true;
+    bool             has_sparse_slots          = false;
+    bool             sparse_slots              = false;
+    bool             has_kernel_block_subdiv   = false;
+    bool             kernel_block_subdiv       = true;
+    bool             has_cp_compact_tail_blocks = false;
+    bool             cp_compact_tail_blocks     = false;
+    bool             has_is_reservable          = false;
+    bool             is_reservable              = true;
+
+};
+
+}  // namespace rtp_llm
diff --git a/rtp_llm/cpp/cache/LinearKVCacheSpec.h b/rtp_llm/cpp/cache/spec/LinearKVCacheSpec.h
similarity index 81%
rename from rtp_llm/cpp/cache/LinearKVCacheSpec.h
rename to rtp_llm/cpp/cache/spec/LinearKVCacheSpec.h
index 39a80bdd80..d62bfced44 100644
--- a/rtp_llm/cpp/cache/LinearKVCacheSpec.h
+++ b/rtp_llm/cpp/cache/spec/LinearKVCacheSpec.h
@@ -4,7 +4,7 @@
 #include <sstream>
 #include <string>
 
-#include "rtp_llm/cpp/cache/KVCacheSpecBase.h"
+#include "rtp_llm/cpp/cache/spec/KVCacheSpecBase.h"
 #include "rtp_llm/cpp/config/ConfigModules.h"
 #include "rtp_llm/models_py/bindings/core/Types.h"
 #include "rtp_llm/cpp/model_utils/AttentionConfig.h"
@@ -24,7 +24,10 @@ struct LinearKVCacheSpec: public KVCacheSpec {
     DataType ssm_state_dtype   = DataType::TYPE_BF16;
     DataType conv_state_dtype  = DataType::TYPE_BF16;
 
-    LinearKVCacheSpec() = default;
+    LinearKVCacheSpec() {
+        type      = KVCacheSpecType::LinearAttention;
+        lifecycle = CacheGroupType::LINEAR;
+    }
 
     LinearKVCacheSpec(const AttentionConfigs&      attn_config,
                       const ParallelismConfig&     parallelism_config,
@@ -52,7 +55,7 @@ struct LinearKVCacheSpec: public KVCacheSpec {
                                 linear_config.linear_value_head_dim);
 
         type               = KVCacheSpecType::LinearAttention;
-        layer_num          = 1;  // Will be set by caller
+        lifecycle          = CacheGroupType::LINEAR;
         local_head_num_kv  = static_cast<uint32_t>(std::max(
             1,
             (linear_config.linear_num_value_heads > 1) ?
@@ -134,6 +137,35 @@ struct LinearKVCacheSpec: public KVCacheSpec {
         return {0, k_block_bytes, k_block_bytes, v_block_bytes};
     }
 
+    KVCacheSpecPtr clone() const override {
+        return std::make_shared<LinearKVCacheSpec>(*this);
+    }
+
+protected:
+    std::string fingerprintExtra() const override {
+        std::ostringstream os;
+        os << ";linear.local_num_k_heads=" << local_num_k_heads
+           << ";linear.local_num_v_heads=" << local_num_v_heads << ";linear.head_k_dim=" << head_k_dim
+           << ";linear.head_v_dim=" << head_v_dim << ";linear.conv_kernel_dim=" << conv_kernel_dim
+           << ";linear.ssm_state_dtype=" << static_cast<int>(ssm_state_dtype)
+           << ";linear.conv_state_dtype=" << static_cast<int>(conv_state_dtype);
+        return os.str();
+    }
+
+public:
+    // Override physicalSignature() to capture the dual-dtype layout.
+    // LinearKVCacheSpec uses ssm_state_dtype for the K (SSM) segment and
+    // conv_state_dtype for the V (conv) segment. Since block_size_bytes() already
+    // encodes their combined element count, we also expose k_block_size_bytes()
+    // (= SSM segment bytes) via scale_block_size_bytes to distinguish specs that
+    // share total block bytes but have a different K/V dtype split.
+    SpecPhysicalSignature physicalSignature() const override {
+        return {block_size_bytes(),
+                k_block_size_bytes(),  // K segment bytes as secondary discriminator
+                lifecycle,             // always LINEAR; use field for consistency with base class
+                ssm_state_dtype};      // primary dtype for the K (SSM) segment
+    }
+
     std::string debugString(size_t indent = 0) const override {
         const std::string indent_str = std::string(indent, ' ');
         const std::string indent1    = indent_str + "  ";
@@ -154,4 +186,4 @@ struct LinearKVCacheSpec: public KVCacheSpec {
     }
 };
 
-}  // namespace rtp_llm
\ No newline at end of file
+}  // namespace rtp_llm
diff --git a/rtp_llm/cpp/cache/MHAKVCacheSpec.h b/rtp_llm/cpp/cache/spec/MHAKVCacheSpec.h
similarity index 92%
rename from rtp_llm/cpp/cache/MHAKVCacheSpec.h
rename to rtp_llm/cpp/cache/spec/MHAKVCacheSpec.h
index 38dcc4fc8c..cf03f55b94 100644
--- a/rtp_llm/cpp/cache/MHAKVCacheSpec.h
+++ b/rtp_llm/cpp/cache/spec/MHAKVCacheSpec.h
@@ -1,10 +1,11 @@
 #pragma once
 
 #include <memory>
+#include <numeric>
 #include <sstream>
 #include <string>
 
-#include "rtp_llm/cpp/cache/KVCacheSpecBase.h"
+#include "rtp_llm/cpp/cache/spec/KVCacheSpecBase.h"
 #include "rtp_llm/cpp/config/ConfigModules.h"
 #include "rtp_llm/models_py/bindings/core/Types.h"
 #include "rtp_llm/cpp/model_utils/AttentionConfig.h"
@@ -14,12 +15,13 @@ namespace rtp_llm {
 struct MHAKVCacheSpec: public KVCacheSpec {
     uint32_t size_per_head;
 
-    MHAKVCacheSpec() = default;
-
-    MHAKVCacheSpec(const AttentionConfigs& attn_config, const ParallelismConfig& parallelism_config) {
-        type              = KVCacheSpecType::MultiHeadAttention;
-        layer_num         = 1;  // Will be set by caller
+    MHAKVCacheSpec() {
+        type      = KVCacheSpecType::MultiHeadAttention;
+        lifecycle = CacheGroupType::FULL;
+    }
 
+    MHAKVCacheSpec(const AttentionConfigs& attn_config, const ParallelismConfig& parallelism_config)
+        : MHAKVCacheSpec() {
         // TODO(xinfei.sxf): 这里的head_num_kv分配逻辑需要和ModelConfig::getAttentionConfigs里保持一致，目前这里还是单独计算的
         local_head_num_kv = static_cast<uint32_t>(
             (attn_config.kv_head_num % parallelism_config.get_attn_tp_size() == 0) ?
@@ -126,6 +128,18 @@ struct MHAKVCacheSpec: public KVCacheSpec {
         return {k_partition_off, k_partition_sz, v_partition_off, v_partition_sz};
     }
 
+    KVCacheSpecPtr clone() const override {
+        return std::make_shared<MHAKVCacheSpec>(*this);
+    }
+
+protected:
+    std::string fingerprintExtra() const override {
+        std::ostringstream os;
+        os << ";mha.size_per_head=" << size_per_head;
+        return os.str();
+    }
+
+public:
     std::string debugString(size_t indent = 0) const override {
         const std::string indent_str = std::string(indent, ' ');
         const std::string indent1    = indent_str + "  ";
@@ -142,4 +156,4 @@ struct MHAKVCacheSpec: public KVCacheSpec {
     }
 };
 
-}  // namespace rtp_llm
\ No newline at end of file
+}  // namespace rtp_llm
diff --git a/rtp_llm/cpp/cache/MLAKVCacheSpec.h b/rtp_llm/cpp/cache/spec/MLAKVCacheSpec.h
similarity index 87%
rename from rtp_llm/cpp/cache/MLAKVCacheSpec.h
rename to rtp_llm/cpp/cache/spec/MLAKVCacheSpec.h
index d89fa8309f..edba354858 100644
--- a/rtp_llm/cpp/cache/MLAKVCacheSpec.h
+++ b/rtp_llm/cpp/cache/spec/MLAKVCacheSpec.h
@@ -4,7 +4,7 @@
 #include <sstream>
 #include <string>
 
-#include "rtp_llm/cpp/cache/KVCacheSpecBase.h"
+#include "rtp_llm/cpp/cache/spec/KVCacheSpecBase.h"
 #include "rtp_llm/cpp/config/ConfigModules.h"
 #include "rtp_llm/models_py/bindings/core/Types.h"
 #include "rtp_llm/cpp/model_utils/AttentionConfig.h"
@@ -15,11 +15,13 @@ struct MLAKVCacheSpec: public KVCacheSpec {
     uint32_t kv_lora_rank;
     uint32_t rope_head_dim;
 
-    MLAKVCacheSpec() = default;
+    MLAKVCacheSpec() {
+        type      = KVCacheSpecType::MultiHeadLatentAttention;
+        lifecycle = CacheGroupType::FULL;
+    }
 
-    MLAKVCacheSpec(const AttentionConfigs& attn_config, const ParallelismConfig& parallelism_config) {
-        type               = KVCacheSpecType::MultiHeadLatentAttention;
-        layer_num          = 1;  // Will be set by caller
+    MLAKVCacheSpec(const AttentionConfigs& attn_config, const ParallelismConfig& parallelism_config)
+        : MLAKVCacheSpec() {
         local_head_num_kv  = 1;  // mla set local_head_num_kv to 1
         seq_size_per_block = static_cast<uint32_t>(attn_config.tokens_per_block);
         kv_lora_rank       = static_cast<uint32_t>(attn_config.kv_lora_rank);
@@ -81,6 +83,18 @@ struct MLAKVCacheSpec: public KVCacheSpec {
         return {0, k_block_bytes, k_block_bytes, v_block_bytes};
     }
 
+    KVCacheSpecPtr clone() const override {
+        return std::make_shared<MLAKVCacheSpec>(*this);
+    }
+
+protected:
+    std::string fingerprintExtra() const override {
+        std::ostringstream os;
+        os << ";mla.kv_lora_rank=" << kv_lora_rank << ";mla.rope_head_dim=" << rope_head_dim;
+        return os.str();
+    }
+
+public:
     std::string debugString(size_t indent = 0) const override {
         const std::string indent_str = std::string(indent, ' ');
         const std::string indent1    = indent_str + "  ";
@@ -93,4 +107,4 @@ struct MLAKVCacheSpec: public KVCacheSpec {
     }
 };
 
-}  // namespace rtp_llm
\ No newline at end of file
+}  // namespace rtp_llm
diff --git a/rtp_llm/cpp/cache/spec/OpaqueKVCacheSpec.h b/rtp_llm/cpp/cache/spec/OpaqueKVCacheSpec.h
new file mode 100644
index 0000000000..0a2c2f6a27
--- /dev/null
+++ b/rtp_llm/cpp/cache/spec/OpaqueKVCacheSpec.h
@@ -0,0 +1,286 @@
+#pragma once
+
+#include <utility>
+
+#include "rtp_llm/cpp/cache/spec/KVCacheSpecBase.h"
+#include "rtp_llm/cpp/utils/AssertUtils.h"
+
+namespace rtp_llm {
+
+struct OpaqueKVCacheSpec: public KVCacheSpec {
+    uint32_t entry_elems                       = 0;
+    uint32_t entries_per_block                 = 0;
+    DataType store_dtype                       = DataType::TYPE_INVALID;
+    size_t   block_size_bytes_override         = 0;
+    size_t   block_size_bytes_alignment        = 0;
+    uint32_t block_size_alignment_min_entries  = 0;
+
+    OpaqueKVCacheSpec() = default;
+
+    OpaqueKVCacheSpec(KVCacheSpecType spec_type,
+                      CacheGroupType  lifecycle_type,
+                      uint32_t        entry_elements,
+                      uint32_t        block_entries,
+                      DataType        storage_dtype,
+                      uint32_t        seq_size_per_blk,
+                      size_t          block_size_bytes_override_value = 0,
+                      size_t          block_size_alignment            = 0,
+                      uint32_t        block_alignment_min_entries     = 0) {
+        type                             = spec_type;
+        lifecycle                        = lifecycle_type;
+        entry_elems                      = entry_elements;
+        entries_per_block                = block_entries;
+        store_dtype                      = storage_dtype;
+        block_size_bytes_override        = block_size_bytes_override_value;
+        block_size_bytes_alignment       = block_size_alignment;
+        block_size_alignment_min_entries = block_alignment_min_entries;
+
+        local_head_num_kv  = 1;
+        seq_size_per_block = seq_size_per_blk;
+        dtype              = store_dtype;
+    }
+
+    size_t block_size() const override {
+        return static_cast<size_t>(entries_per_block) * entry_elems;
+    }
+
+    size_t k_block_size() const override {
+        return block_size() / 2;
+    }
+
+    size_t v_block_size() const override {
+        return block_size() / 2;
+    }
+
+    size_t natural_block_size_bytes() const {
+        return static_cast<size_t>(entries_per_block) * entry_elems * getTypeSize(store_dtype);
+    }
+
+    size_t block_size_bytes() const override {
+        if (block_size_bytes_override > 0) {
+            return block_size_bytes_override;
+        }
+        const size_t natural = natural_block_size_bytes();
+        if (block_size_bytes_alignment > 0 && entries_per_block >= block_size_alignment_min_entries) {
+            return ((natural + block_size_bytes_alignment - 1) / block_size_bytes_alignment)
+                   * block_size_bytes_alignment;
+        }
+        return natural;
+    }
+
+    size_t k_block_size_bytes() const override {
+        return block_size_bytes() / 2;
+    }
+
+    size_t v_block_size_bytes() const override {
+        return block_size_bytes() / 2;
+    }
+
+    KVCacheSpecPtr clone() const override {
+        return std::make_shared<OpaqueKVCacheSpec>(*this);
+    }
+
+protected:
+    std::string opaqueFingerprintExtra(const std::string& prefix) const {
+        std::ostringstream os;
+        os << ";" << prefix << ".entry_elems=" << entry_elems
+           << ";" << prefix << ".entries_per_block=" << entries_per_block
+           << ";" << prefix << ".store_dtype=" << static_cast<int>(store_dtype)
+           << ";" << prefix << ".block_size_bytes_override=" << block_size_bytes_override
+           << ";" << prefix << ".block_size_bytes_alignment=" << block_size_bytes_alignment
+           << ";" << prefix << ".block_size_alignment_min_entries=" << block_size_alignment_min_entries;
+        return os.str();
+    }
+
+    std::string fingerprintExtra() const override {
+        return opaqueFingerprintExtra("opaque");
+    }
+
+public:
+    std::string debugString(size_t indent = 0) const override {
+        std::ostringstream os;
+        os << std::string(indent, ' ') << "OpaqueKVCacheSpec{\n";
+        os << commonDebugString(indent);
+        os << std::string(indent + 2, ' ') << "entry_elems=" << entry_elems << "\n";
+        os << std::string(indent + 2, ' ') << "entries_per_block=" << entries_per_block << "\n";
+        os << std::string(indent + 2, ' ') << "block_size_bytes_override=" << block_size_bytes_override << "\n";
+        os << std::string(indent + 2, ' ') << "block_size_bytes_alignment=" << block_size_bytes_alignment << "\n";
+        os << std::string(indent + 2, ' ')
+           << "block_size_alignment_min_entries=" << block_size_alignment_min_entries << "\n";
+        os << std::string(indent, ' ') << "}\n";
+        return os.str();
+    }
+};
+
+struct CompressedKVCacheSpec: public OpaqueKVCacheSpec {
+    uint32_t compression_ratio = 1;
+
+    CompressedKVCacheSpec() {
+        type      = KVCacheSpecType::OpaqueKV;
+        lifecycle = CacheGroupType::FULL;
+    }
+
+    CompressedKVCacheSpec(std::string cache_tag,
+                          uint32_t    entry_elements,
+                          uint32_t    block_entries,
+                          DataType    storage_dtype,
+                          uint32_t    seq_size_per_blk,
+                          uint32_t    cache_compression_ratio = 1,
+                          size_t      block_size_alignment    = 0)
+        : CompressedKVCacheSpec() {
+        tag                        = std::move(cache_tag);
+        entry_elems                = entry_elements;
+        entries_per_block          = block_entries;
+        compression_ratio          = cache_compression_ratio;
+        store_dtype                = storage_dtype;
+        block_size_bytes_alignment = block_size_alignment;
+
+        local_head_num_kv  = 1;
+        seq_size_per_block = seq_size_per_blk;
+        dtype              = store_dtype;
+    }
+
+    KVCacheSpecPtr clone() const override {
+        return std::make_shared<CompressedKVCacheSpec>(*this);
+    }
+
+protected:
+    std::string fingerprintExtra() const override {
+        std::ostringstream os;
+        os << ";compressed_kv.compression_ratio=" << compression_ratio
+           << opaqueFingerprintExtra("compressed_kv");
+        return os.str();
+    }
+
+public:
+    std::string debugString(size_t indent = 0) const override {
+        std::ostringstream os;
+        os << std::string(indent, ' ') << "CompressedKVCacheSpec{\n";
+        os << commonDebugString(indent);
+        os << std::string(indent + 2, ' ') << "entry_elems=" << entry_elems << "\n";
+        os << std::string(indent + 2, ' ') << "entries_per_block=" << entries_per_block << "\n";
+        os << std::string(indent + 2, ' ') << "compression_ratio=" << compression_ratio << "\n";
+        os << std::string(indent + 2, ' ') << "block_size_bytes_alignment=" << block_size_bytes_alignment << "\n";
+        os << std::string(indent, ' ') << "}\n";
+        return os.str();
+    }
+};
+
+struct FixedStateCacheSpec: public OpaqueKVCacheSpec {
+    uint32_t& state_dim;
+
+    FixedStateCacheSpec(): state_dim(entry_elems) {
+        type      = KVCacheSpecType::OpaqueState;
+        lifecycle = CacheGroupType::SWA;
+    }
+
+    FixedStateCacheSpec(const FixedStateCacheSpec& other): OpaqueKVCacheSpec(other), state_dim(entry_elems) {}
+
+    FixedStateCacheSpec& operator=(const FixedStateCacheSpec& other) {
+        if (this != &other) {
+            OpaqueKVCacheSpec::operator=(other);
+        }
+        return *this;
+    }
+
+    FixedStateCacheSpec(std::string cache_tag,
+                        uint32_t    state_elements,
+                        uint32_t    block_entries,
+                        DataType    storage_dtype,
+                        uint32_t    seq_size_per_blk,
+                        size_t      block_size_bytes_override_value = 0,
+                        size_t      block_size_alignment            = 0,
+                        uint32_t    block_alignment_min_entries     = 0,
+                        bool        state_cache                     = true,
+                        bool        skip_reuse                      = false)
+        : FixedStateCacheSpec() {
+        tag                              = std::move(cache_tag);
+        state_dim                        = state_elements;
+        entries_per_block                = block_entries;
+        store_dtype                      = storage_dtype;
+        block_size_bytes_override        = block_size_bytes_override_value;
+        block_size_bytes_alignment       = block_size_alignment;
+        block_size_alignment_min_entries = block_alignment_min_entries;
+
+        local_head_num_kv  = 1;
+        seq_size_per_block = seq_size_per_blk;
+        dtype              = store_dtype;
+        is_state_cache     = state_cache;
+        skip_prefix_reuse  = skip_reuse;
+    }
+
+    std::vector<BlockInfo> sliceBlockForPeer(std::vector<BlockInfo> parts,
+                                             size_t                 cp_size,
+                                             size_t                 peer_idx) const override {
+        return cpSliceDestination(std::move(parts), cp_size, peer_idx);
+    }
+
+    CPTransferPolicy cpTransferPolicy() const override {
+        return CPTransferPolicy::INTRA_BLOCK_SLICE;
+    }
+
+    std::vector<BlockInfo> cpSliceDestination(std::vector<BlockInfo> parts,
+                                              size_t                 cp_size,
+                                              size_t                 peer_idx) const override {
+        if (cp_size <= 1) {
+            return parts;
+        }
+        RTP_LLM_CHECK_WITH_INFO(parts.size() == 1,
+                                "FixedStateCacheSpec CP byte slicing expects one block part, got %zu",
+                                parts.size());
+        auto& block = parts[0];
+        RTP_LLM_CHECK_WITH_INFO(block.addr != nullptr, "FixedStateCacheSpec CP byte slicing got null block addr");
+
+        size_t slice_bytes = 0;
+        if (block_size_bytes_override > 0 || block.size_bytes == block_size_bytes()) {
+            RTP_LLM_CHECK_WITH_INFO(block.size_bytes % cp_size == 0,
+                                    "FixedStateCacheSpec block bytes %zu not divisible by cp_size %zu",
+                                    block.size_bytes,
+                                    cp_size);
+            slice_bytes = block.size_bytes / cp_size;
+        } else {
+            RTP_LLM_CHECK_WITH_INFO(entries_per_block % cp_size == 0,
+                                    "FixedStateCacheSpec entries %u not divisible by cp_size %zu",
+                                    entries_per_block,
+                                    cp_size);
+            const size_t local_entries = entries_per_block / cp_size;
+            slice_bytes = local_entries * static_cast<size_t>(state_dim) * getTypeSize(store_dtype);
+        }
+
+        const size_t slice_offset = slice_bytes * peer_idx;
+        RTP_LLM_CHECK_WITH_INFO(slice_offset + slice_bytes <= block.size_bytes,
+                                "FixedStateCacheSpec CP slice [%zu, %zu) exceeds block bytes %zu",
+                                slice_offset,
+                                slice_offset + slice_bytes,
+                                block.size_bytes);
+        block.addr       = static_cast<void*>(static_cast<char*>(block.addr) + slice_offset);
+        block.size_bytes = slice_bytes;
+        return parts;
+    }
+
+    KVCacheSpecPtr clone() const override {
+        return std::make_shared<FixedStateCacheSpec>(*this);
+    }
+
+protected:
+    std::string fingerprintExtra() const override {
+        return opaqueFingerprintExtra("fixed_state");
+    }
+
+public:
+    std::string debugString(size_t indent = 0) const override {
+        std::ostringstream os;
+        os << std::string(indent, ' ') << "FixedStateCacheSpec{\n";
+        os << commonDebugString(indent);
+        os << std::string(indent + 2, ' ') << "state_dim=" << state_dim << "\n";
+        os << std::string(indent + 2, ' ') << "entries_per_block=" << entries_per_block << "\n";
+        os << std::string(indent + 2, ' ') << "block_size_bytes_override=" << block_size_bytes_override << "\n";
+        os << std::string(indent + 2, ' ') << "block_size_bytes_alignment=" << block_size_bytes_alignment << "\n";
+        os << std::string(indent + 2, ' ')
+           << "block_size_alignment_min_entries=" << block_size_alignment_min_entries << "\n";
+        os << std::string(indent, ' ') << "}\n";
+        return os.str();
+    }
+};
+
+}  // namespace rtp_llm
diff --git a/rtp_llm/cpp/cache/test/BUILD b/rtp_llm/cpp/cache/test/BUILD
index 5c4c954fb6..dcc71eb08a 100644
--- a/rtp_llm/cpp/cache/test/BUILD
+++ b/rtp_llm/cpp/cache/test/BUILD
@@ -7,6 +7,48 @@ test_copts = [
     "-fno-access-control",
 ] + copts()
 
+cc_import(
+    name = "cuda12_cudart",
+    shared_library = "@pip_gpu_cuda12_9_torch_nvidia_cuda_runtime_cu12//:site-packages/nvidia/cuda_runtime/lib/libcudart.so.12",
+)
+
+cc_import(
+    name = "cuda12_cublas",
+    shared_library = "@pip_gpu_cuda12_9_torch_nvidia_cublas_cu12//:site-packages/nvidia/cublas/lib/libcublas.so.12",
+)
+
+cc_import(
+    name = "cuda12_cublas_lt",
+    shared_library = "@pip_gpu_cuda12_9_torch_nvidia_cublas_cu12//:site-packages/nvidia/cublas/lib/libcublasLt.so.12",
+)
+
+cc_import(
+    name = "cuda12_cufft",
+    shared_library = "@pip_gpu_cuda12_9_torch_nvidia_cufft_cu12//:site-packages/nvidia/cufft/lib/libcufft.so.11",
+)
+
+cc_import(
+    name = "cuda12_cupti",
+    shared_library = "@pip_gpu_cuda12_9_torch_nvidia_cuda_cupti_cu12//:site-packages/nvidia/cuda_cupti/lib/libcupti.so.12",
+)
+
+cc_import(
+    name = "cuda13_torch_nvshmem",
+    shared_library = "@pip_gpu_cuda13_torch_torch//:site-packages/torch/lib/libtorch_nvshmem.so",
+)
+
+cuda12_torch_link_deps = select({
+    "@//:using_cuda13_x86": [
+        ":cuda12_cublas",
+        ":cuda12_cublas_lt",
+        ":cuda12_cudart",
+        ":cuda12_cufft",
+        ":cuda12_cupti",
+        ":cuda13_torch_nvshmem",
+    ],
+    "//conditions:default": [],
+})
+
 test_deps = [
     "//rtp_llm/models_py/bindings/cuda/ops:cuda_impl",
     "//rtp_llm/models_py/bindings/core:exec_ops_test_lib",
@@ -19,7 +61,7 @@ test_deps = [
     "@com_google_googletest//:gtest_main",
     "@local_config_cuda//cuda:cuda_headers",
     "@local_config_cuda//cuda:cudart",
-] + torch_deps()
+] + torch_deps() + cuda12_torch_link_deps
 
 block_cache_test_deps = [
     "//rtp_llm/cpp/testing:device_test_utils",
@@ -31,7 +73,7 @@ block_cache_test_deps = [
     "@com_google_googletest//:gtest_main",
     "@local_config_cuda//cuda:cuda_headers",
     "@local_config_cuda//cuda:cudart",
-] + torch_deps()
+] + torch_deps() + cuda12_torch_link_deps
 
 
 cc_library(
@@ -42,10 +84,41 @@ cc_library(
     copts = test_copts,
     deps = [
         "//rtp_llm/cpp/cache:cache_types",
+        "//rtp_llm/cpp/config:model_config",
     ],
     visibility = ["//visibility:public"],
 )
 
+cc_test(
+    name = "cp_slot_mapper_test",
+    srcs = [
+        "CPSlotMapperTest.cc",
+    ],
+    data = [],
+    copts = test_copts,
+    deps = [
+        "//rtp_llm/cpp/cache:cp_slot_mapper",
+        "@com_google_googletest//:gtest",
+        "@com_google_googletest//:gtest_main",
+    ],
+    env = {},
+)
+
+cc_test(
+    name = "kv_cache_resource_local_cache_keys_test",
+    srcs = [
+        "KVCacheResourceLocalCacheKeysTest.cc",
+    ],
+    data = [],
+    copts = test_copts,
+    deps = [
+        "//rtp_llm/cpp/cache:batch_kv_cache_resource",
+        "@com_google_googletest//:gtest",
+        "@com_google_googletest//:gtest_main",
+    ],
+    env = {},
+)
+
 cc_library(
     name = "block_pool_test_helper",
     hdrs = [
@@ -71,6 +144,18 @@ cc_test(
     exec_properties = {'gpu':'H20'},
 )
 
+cc_test(
+    name = "shared_block_cache_test",
+    srcs = [
+        "SharedBlockCacheTest.cc",
+    ],
+    data = [],
+    copts = test_copts,
+    deps = block_cache_test_deps,
+    env = {},
+    exec_properties = {'gpu':'H20'},
+)
+
 cc_test(
     name = "block_pool_test",
     srcs = [
@@ -142,6 +227,22 @@ cc_test(
     exec_properties = {'gpu':'H20'},
 )
 
+cc_test(
+    name = "kv_cache_manager_cp_test",
+    srcs = [
+        "KVCacheManagerCPSlotMapperTest.cc",
+    ],
+    data = [],
+    copts = test_copts,
+    deps = test_deps + [
+        ":block_pool_test_helper",
+        "//rtp_llm/cpp/cache/connector/test/mock:connector_mock_files_lib",
+        "//rtp_llm/cpp/cache/connector/memory:memory_connector",
+    ],
+    env = {},
+    exec_properties = {'gpu':'H20'},
+)
+
 cc_test(
     name = "linear_kv_cache_group_test",
     srcs = [
@@ -170,6 +271,80 @@ cc_test(
     exec_properties = {'gpu':'H20'},
 )
 
+cc_test(
+    name = "hybrid_kv_cache_allocator_cp_shard_test",
+    srcs = [
+        "HybridKVCacheAllocatorCPShardTest.cc",
+    ],
+    data = [],
+    copts = test_copts,
+    deps = test_deps + [
+        ":block_pool_test_helper",
+    ],
+    env = {},
+    exec_properties = {'gpu':'H20'},
+)
+
+cc_test(
+    name = "hybrid_pool_kv_cache_allocator_test",
+    srcs = [
+        "HybridPoolKVCacheAllocatorTest.cc",
+    ],
+    data = [],
+    copts = test_copts,
+    deps = test_deps + [
+        ":block_pool_test_helper",
+    ],
+    env = {},
+    exec_properties = {'gpu':'H20'},
+)
+
+cc_test(
+    name = "swa_kv_cache_group_test",
+    srcs = [
+        "SWAKVCacheGroupTest.cc",
+    ],
+    data = [],
+    copts = test_copts,
+    deps = test_deps + [
+        ":block_pool_test_helper",
+    ],
+    env = {},
+    exec_properties = {'gpu':'H20'},
+)
+
+cc_test(
+    name = "swa_kv_cache_group_malloc_range_test",
+    srcs = [
+        "SWAKVCacheGroupMallocRangeTest.cc",
+    ],
+    data = [],
+    copts = test_copts,
+    deps = [
+        "//rtp_llm/cpp/cache:kv_cache_group",
+        "@com_google_googletest//:gtest",
+        "@com_google_googletest//:gtest_main",
+    ] + torch_deps() + cuda12_torch_link_deps,
+    env = {
+        "RTP_LLM_PIN_HOST_BLOCK_POOL": "0",
+    },
+)
+
+cc_test(
+    name = "dsv4_cache_test",
+    srcs = [
+        "DSV4CacheTest.cc",
+    ],
+    data = [],
+    copts = test_copts,
+    deps = test_deps + [
+        ":block_pool_test_helper",
+        "//rtp_llm/cpp/cache:kv_cache_transfer_planner",
+    ],
+    env = {},
+    exec_properties = {'gpu':'H20'},
+)
+
 cc_test(
     name = "kv_cache_resource_test",
     srcs = [
@@ -180,4 +355,4 @@ cc_test(
     deps = test_deps,
     env = {},
     exec_properties = {'gpu':'H20'},
-)
\ No newline at end of file
+)
diff --git a/rtp_llm/cpp/cache/test/BlockPoolTest.cc b/rtp_llm/cpp/cache/test/BlockPoolTest.cc
index 46132b09af..0e03f69c77 100644
--- a/rtp_llm/cpp/cache/test/BlockPoolTest.cc
+++ b/rtp_llm/cpp/cache/test/BlockPoolTest.cc
@@ -8,10 +8,12 @@
 #include "rtp_llm/cpp/utils/Logger.h"
 #include "rtp_llm/cpp/cache/BlockPool.h"
 #include "rtp_llm/cpp/cache/CacheConfig.h"
-#include "rtp_llm/cpp/cache/CacheConfigCreator.h"
+#include "rtp_llm/cpp/cache/config_creator/CacheConfigCreator.h"
 #include "rtp_llm/cpp/cache/BlockPoolConfigHelper.h"
+#include "rtp_llm/cpp/config/StaticConfig.h"
 #include "rtp_llm/models_py/bindings/core/ExecOps.h"
 #include "rtp_llm/cpp/cache/test/BlockPoolTestHelper.h"
+#include "rtp_llm/cpp/cache/test/CacheConfigTestUtils.h"
 
 namespace rtp_llm {
 namespace test {
@@ -19,14 +21,18 @@ namespace test {
 class BlockPoolTest: public ::testing::Test {
 protected:
     void SetUp() override {
+        old_core_dump_on_exception_                  = StaticConfig::user_ft_core_dump_on_exception;
+        StaticConfig::user_ft_core_dump_on_exception = false;
         createDevice();
     }
 
     void TearDown() override {
+        StaticConfig::user_ft_core_dump_on_exception = old_core_dump_on_exception_;
         block_pool_.reset();
     }
 
     std::shared_ptr<BlockPool> block_pool_;
+    bool                       old_core_dump_on_exception_{false};
 };
 
 namespace {
@@ -46,7 +52,7 @@ static rtp_llm::ModelConfig makeTestModelConfig(uint32_t num_layers) {
     m.attn_config.kv_lora_rank     = 0;
     m.attn_config.rope_head_dim    = 0;
     m.attn_config.head_num         = 2;
-    // keep other fields default
+    setDefaultKvCacheSpec(m);
     return m;
 }
 
@@ -95,27 +101,25 @@ TEST_F(BlockPoolTest, ConstructorAndInit) {
 }
 
 TEST_F(BlockPoolTest, MTPConvertIndexGlobalIdMapping) {
-    // Use createSpConfig logic so that global_layer_ids is filled for main + sub-model layers.
+    // Use createSpConfig logic so that group layer ids are filled for main + sub-model layers.
     // main(2 layers) + mtp1(1 layer) + mtp2(1 layer)
     auto cache_cfg = makeMtpCacheConfigByCreateSpConfig(/*main_layers=*/2, /*mtp_module_num=*/2, /*block_num=*/4);
 
-    ASSERT_FALSE(cache_cfg.global_layer_ids.empty());
-    ASSERT_EQ(cache_cfg.global_layer_ids[0].size(), static_cast<size_t>(cache_cfg.layer_all_num));
+    ASSERT_GT(cache_cfg.groupNums(), 0);
+    ASSERT_EQ(cache_cfg.layerIdsForGroup(0).size(), static_cast<size_t>(cache_cfg.layer_all_num));
 
     ASSERT_EQ(cache_cfg.mtp_sub_configs.size(), 2u);
     ASSERT_NE(cache_cfg.mtp_sub_configs[0], nullptr);
     ASSERT_NE(cache_cfg.mtp_sub_configs[1], nullptr);
     ASSERT_EQ(cache_cfg.mtp_sub_configs[0]->groupNums(), 1);
     ASSERT_EQ(cache_cfg.mtp_sub_configs[1]->groupNums(), 1);
-    EXPECT_EQ(cache_cfg.mtp_sub_configs[0]->cache_specs[0]->block_size_bytes(),
-              cache_cfg.mtp_sub_configs[1]->cache_specs[0]->block_size_bytes());
-
-    ASSERT_FALSE(cache_cfg.mtp_sub_configs[0]->global_layer_ids.empty());
-    ASSERT_FALSE(cache_cfg.mtp_sub_configs[1]->global_layer_ids.empty());
-    ASSERT_EQ(cache_cfg.mtp_sub_configs[0]->global_layer_ids[0].size(), 1u);
-    ASSERT_EQ(cache_cfg.mtp_sub_configs[1]->global_layer_ids[0].size(), 1u);
-    EXPECT_EQ(cache_cfg.mtp_sub_configs[0]->global_layer_ids[0][0], 2);
-    EXPECT_EQ(cache_cfg.mtp_sub_configs[1]->global_layer_ids[0][0], 3);
+    EXPECT_EQ(cache_cfg.mtp_sub_configs[0]->specForGroup(0)->block_size_bytes(),
+              cache_cfg.mtp_sub_configs[1]->specForGroup(0)->block_size_bytes());
+
+    ASSERT_EQ(cache_cfg.mtp_sub_configs[0]->layerIdsForGroup(0).size(), 1u);
+    ASSERT_EQ(cache_cfg.mtp_sub_configs[1]->layerIdsForGroup(0).size(), 1u);
+    EXPECT_EQ(cache_cfg.mtp_sub_configs[0]->layerIdsForGroup(0)[0], 2);
+    EXPECT_EQ(cache_cfg.mtp_sub_configs[1]->layerIdsForGroup(0)[0], 3);
 
     auto pool_cfg = rtp_llm::BlockPoolConfigHelper::createConfig(cache_cfg);
     ASSERT_EQ(pool_cfg.memory_layouts.size(), 3u);
diff --git a/rtp_llm/cpp/cache/test/BlockPoolTestHelper.h b/rtp_llm/cpp/cache/test/BlockPoolTestHelper.h
index bd3f68e000..ff6438fb94 100644
--- a/rtp_llm/cpp/cache/test/BlockPoolTestHelper.h
+++ b/rtp_llm/cpp/cache/test/BlockPoolTestHelper.h
@@ -1,6 +1,7 @@
 #pragma once
 
 #include <cstdint>
+#include <numeric>
 #include "rtp_llm/cpp/cache/CacheConfig.h"
 #include "rtp_llm/cpp/cache/BlockPoolConfigHelper.h"
 #include "rtp_llm/cpp/utils/AssertUtils.h"
@@ -40,7 +41,6 @@ inline KVCacheSpecPtr createTestKvCacheSpec(uint32_t          layer_num,
         auto spec                = std::make_shared<MHAKVCacheSpec>();
         spec->type               = KVCacheSpecType::MultiHeadAttention;
         spec->dtype              = dtype;
-        spec->layer_num          = layer_num;
         spec->local_head_num_kv  = local_head_num_kv;
         spec->seq_size_per_block = seq_size_per_block;
         spec->size_per_head      = static_cast<uint32_t>(k_elems / denom);
@@ -50,7 +50,6 @@ inline KVCacheSpecPtr createTestKvCacheSpec(uint32_t          layer_num,
         auto spec                = std::make_shared<MLAKVCacheSpec>();
         spec->type               = KVCacheSpecType::MultiHeadLatentAttention;
         spec->dtype              = dtype;
-        spec->layer_num          = layer_num;
         spec->local_head_num_kv  = local_head_num_kv;
         spec->seq_size_per_block = seq_size_per_block;
         spec->kv_lora_rank       = static_cast<uint32_t>(k_elems / denom);
@@ -72,16 +71,19 @@ inline BlockPoolConfig createTestConfig(size_t            k_block_stride_bytes =
     auto spec = createTestKvCacheSpec(
         kLayerNum, dtype, local_head_num_kv, seq_size_per_block, k_block_stride_bytes, v_block_stride_bytes);
 
-    // Create CacheConfig with the spec
     rtp_llm::CacheConfig cache_config;
-    cache_config.cache_specs           = {spec};
     cache_config.layer_num             = kLayerNum;
+    cache_config.layer_all_num         = kLayerNum;
     cache_config.block_num             = kBlockNum;
     cache_config.dtype                 = dtype;
     cache_config.seq_size_per_block    = seq_size_per_block;
     cache_config.kv_block_stride_bytes = k_block_stride_bytes + v_block_stride_bytes;
     cache_config.kv_scale_stride_bytes = k_scale_stride_bytes + v_scale_stride_bytes;
 
+    std::vector<int> layer_ids(kLayerNum);
+    std::iota(layer_ids.begin(), layer_ids.end(), 0);
+    cache_config.fromGroupedSpecs({spec}, {layer_ids}, {CacheGroupType::FULL}, {"default"});
+
     return BlockPoolConfigHelper::createConfig(cache_config);
 }
 
diff --git a/rtp_llm/cpp/cache/test/CPSlotMapperTest.cc b/rtp_llm/cpp/cache/test/CPSlotMapperTest.cc
new file mode 100644
index 0000000000..d8d052a7ae
--- /dev/null
+++ b/rtp_llm/cpp/cache/test/CPSlotMapperTest.cc
@@ -0,0 +1,124 @@
+#include <gtest/gtest.h>
+#include <stdexcept>
+#include "rtp_llm/cpp/cache/CPSlotMapper.h"
+
+namespace rtp_llm {
+namespace test {
+
+class CPSlotMapperTest: public ::testing::Test {};
+
+TEST_F(CPSlotMapperTest, DefaultConstructorIsNotSharded) {
+    CPSlotMapper mapper;
+    EXPECT_FALSE(mapper.isSharded());  // cp_size=1 → not sharded
+    EXPECT_EQ(mapper.cpRank(), 0);
+    EXPECT_EQ(mapper.cpSize(), 1);
+    EXPECT_EQ(mapper.blockSize(), 1);
+    EXPECT_EQ(mapper.virtualBlockSize(), 1);
+}
+
+TEST_F(CPSlotMapperTest, SingleRankIsNotSharded) {
+    CPSlotMapper mapper(0, 1, 32);
+    EXPECT_FALSE(mapper.isSharded());  // cp_size=1 → not sharded
+}
+
+TEST_F(CPSlotMapperTest, MultiRankIsSharded) {
+    CPSlotMapper mapper(0, 2, 32);
+    EXPECT_TRUE(mapper.isSharded());           // cp_size=2 → sharded
+    EXPECT_EQ(mapper.virtualBlockSize(), 64);  // block_size * cp_size
+}
+
+TEST_F(CPSlotMapperTest, RejectsInvalidGeometry) {
+    EXPECT_THROW(CPSlotMapper(0, 0, 32), std::invalid_argument);
+    EXPECT_THROW(CPSlotMapper(0, 2, 0), std::invalid_argument);
+    EXPECT_THROW(CPSlotMapper(-1, 2, 32), std::invalid_argument);
+    EXPECT_THROW(CPSlotMapper(2, 2, 32), std::invalid_argument);
+}
+
+TEST_F(CPSlotMapperTest, LocalBlockCount) {
+    const int block_size = 4;
+
+    // cp_size=2: localBlockCount = ceil(total_blocks / cp_size), same for all ranks
+    CPSlotMapper rank0(0, 2, block_size);
+    CPSlotMapper rank1(1, 2, block_size);
+
+    // seq_len=0: 0 total blocks -> 0
+    EXPECT_EQ(rank0.localBlockCount(0), 0);
+    EXPECT_EQ(rank1.localBlockCount(0), 0);
+
+    // seq_len=4: 1 total block -> ceil(1/2)=1
+    EXPECT_EQ(rank0.localBlockCount(4), 1);
+    EXPECT_EQ(rank1.localBlockCount(4), 1);
+
+    // seq_len=8: 2 total blocks -> ceil(2/2)=1
+    EXPECT_EQ(rank0.localBlockCount(8), 1);
+    EXPECT_EQ(rank1.localBlockCount(8), 1);
+
+    // seq_len=12: 3 total blocks -> ceil(3/2)=2
+    EXPECT_EQ(rank0.localBlockCount(12), 2);
+    EXPECT_EQ(rank1.localBlockCount(12), 2);
+
+    // seq_len=16: 4 total blocks -> ceil(4/2)=2
+    EXPECT_EQ(rank0.localBlockCount(16), 2);
+    EXPECT_EQ(rank1.localBlockCount(16), 2);
+
+    // seq_len=5: 2 total blocks -> ceil(2/2)=1
+    EXPECT_EQ(rank0.localBlockCount(5), 1);
+    EXPECT_EQ(rank1.localBlockCount(5), 1);
+}
+
+TEST_F(CPSlotMapperTest, LocalBlockCountFourRanks) {
+    // seq_len=55, block_size=8, cp_size=4
+    // total_blocks = ceil(55/8) = 7, localBlockCount = ceil(7/4) = 2
+    // All ranks get 2 — rank3 has 1 unused trailing block
+    const int block_size = 8;
+    const int cp_size    = 4;
+
+    for (int r = 0; r < cp_size; ++r) {
+        CPSlotMapper mapper(r, cp_size, block_size);
+        EXPECT_EQ(mapper.localBlockCount(55), 2) << "rank=" << r;
+    }
+}
+
+TEST_F(CPSlotMapperTest, EffectiveSeqLenForAllocIsRankIndependent) {
+    const int    block_size = 4;
+    CPSlotMapper rank0(0, 2, block_size);
+    CPSlotMapper rank1(1, 2, block_size);
+
+    // effectiveSeqLenForAlloc = ceil(total_blocks / cp_size) * block_size
+    // This is rank-independent — always allocates max across all ranks.
+    EXPECT_EQ(rank0.effectiveSeqLenForAlloc(0), 0);
+    EXPECT_EQ(rank0.effectiveSeqLenForAlloc(4), 4);   // ceil(1/2)=1 block * 4
+    EXPECT_EQ(rank0.effectiveSeqLenForAlloc(8), 4);   // ceil(2/2)=1 block * 4
+    EXPECT_EQ(rank0.effectiveSeqLenForAlloc(12), 8);  // ceil(3/2)=2 blocks * 4
+    EXPECT_EQ(rank0.effectiveSeqLenForAlloc(16), 8);  // ceil(4/2)=2 blocks * 4
+
+    // Same results for rank1 — rank-independent
+    EXPECT_EQ(rank1.effectiveSeqLenForAlloc(0), 0);
+    EXPECT_EQ(rank1.effectiveSeqLenForAlloc(4), 4);
+    EXPECT_EQ(rank1.effectiveSeqLenForAlloc(8), 4);
+    EXPECT_EQ(rank1.effectiveSeqLenForAlloc(12), 8);
+    EXPECT_EQ(rank1.effectiveSeqLenForAlloc(16), 8);
+}
+
+TEST_F(CPSlotMapperTest, EffectiveSeqLenFourRanks) {
+    // seq_len=55, block_size=8, cp_size=4
+    // total_blocks=7, ceil(7/4)=2, effective=16
+    // All ranks get the same value
+    const int block_size = 8;
+    const int cp_size    = 4;
+
+    for (int r = 0; r < cp_size; ++r) {
+        CPSlotMapper mapper(r, cp_size, block_size);
+        EXPECT_EQ(mapper.effectiveSeqLenForAlloc(55), 16) << "rank=" << r;
+    }
+}
+
+TEST_F(CPSlotMapperTest, NonShardedPassthrough) {
+    CPSlotMapper mapper;  // cp_size=1, block_size=1
+
+    EXPECT_EQ(mapper.localBlockCount(10), 10);
+    EXPECT_EQ(mapper.effectiveSeqLenForAlloc(10), 10);
+}
+
+}  // namespace test
+}  // namespace rtp_llm
diff --git a/rtp_llm/cpp/cache/test/CacheConfigTestUtils.h b/rtp_llm/cpp/cache/test/CacheConfigTestUtils.h
index 9ecd30ca0c..5565b40769 100644
--- a/rtp_llm/cpp/cache/test/CacheConfigTestUtils.h
+++ b/rtp_llm/cpp/cache/test/CacheConfigTestUtils.h
@@ -4,12 +4,257 @@
 #include <cstddef>
 #include <cstdint>
 #include <memory>
+#include <numeric>
+#include <string>
 #include <vector>
 
 #include "rtp_llm/cpp/cache/CacheConfig.h"
+#include "rtp_llm/cpp/cache/spec/LinearKVCacheSpec.h"
+#include "rtp_llm/cpp/cache/spec/MHAKVCacheSpec.h"
+#include "rtp_llm/cpp/cache/spec/MLAKVCacheSpec.h"
+#include "rtp_llm/cpp/cache/spec/OpaqueKVCacheSpec.h"
+#include "rtp_llm/cpp/config/ModelConfig.h"
+#include "rtp_llm/cpp/utils/AssertUtils.h"
 
 namespace rtp_llm::test {
 
+inline constexpr uint32_t DSV4_FP8_KV_ENTRY_BYTES            = 584;
+inline constexpr uint32_t DSV4_FP8_INDEXER_ENTRY_BYTES       = 132;
+inline constexpr size_t   DSV4_FP8_MLA_BLOCK_ALIGNMENT_BYTES = 576;
+inline constexpr uint32_t DSV4_SWA_WINDOW_ENTRIES            = 128;
+
+inline size_t alignDsv4Fp8KvBlockBytes(size_t natural, size_t extra_multiple = 1) {
+    const size_t align = std::lcm(DSV4_FP8_MLA_BLOCK_ALIGNMENT_BYTES, std::max<size_t>(extra_multiple, 1));
+    return ((natural + align - 1) / align) * align;
+}
+
+inline KVCacheSpecPtr makeDsv4Spec(const std::string& tag,
+                                    const std::string& kind,
+                                    uint32_t           entry_elems,
+                                    DataType           dtype,
+                                    uint32_t           compression_ratio = 1) {
+    KVCacheSpecPtr spec;
+    if (kind == "compressed_kv") {
+        auto kv_spec               = std::make_shared<CompressedKVCacheSpec>();
+        kv_spec->entry_elems       = entry_elems;
+        kv_spec->compression_ratio = compression_ratio;
+        kv_spec->store_dtype       = dtype;
+        spec                       = kv_spec;
+    } else {
+        auto state_spec        = std::make_shared<FixedStateCacheSpec>();
+        state_spec->state_dim  = entry_elems;
+        state_spec->store_dtype = dtype;
+        spec                   = state_spec;
+    }
+    spec->tag                = tag;
+    spec->dtype              = dtype;
+    return spec;
+}
+
+inline KVCacheSpecDesc dsv4DescForSpec(const KVCacheSpecPtr& spec) {
+    RTP_LLM_CHECK_WITH_INFO(spec != nullptr, "dsv4DescForSpec got null spec");
+    KVCacheSpecDesc desc;
+    desc.tag             = spec->tag;
+    desc.dtype           = spec->dtype;
+    if (auto* compressed = dynamic_cast<CompressedKVCacheSpec*>(spec.get())) {
+        desc.cache_type                 = CacheType::COMPRESSED_KV;
+        desc.is_state_cache             = false;
+        desc.entry_elems                = compressed->entry_elems;
+        desc.compression_ratio          = compressed->compression_ratio;
+        desc.store_dtype                = compressed->store_dtype;
+        desc.block_size_bytes_alignment = compressed->block_size_bytes_alignment;
+        desc.extra.derive_entries_from_kernel_block = true;
+        if (desc.block_size_bytes_alignment == 0 && desc.entry_elems == DSV4_FP8_KV_ENTRY_BYTES) {
+            desc.block_size_bytes_alignment = DSV4_FP8_MLA_BLOCK_ALIGNMENT_BYTES;
+        }
+        return desc;
+    }
+
+    auto* fixed = dynamic_cast<FixedStateCacheSpec*>(spec.get());
+    RTP_LLM_CHECK_WITH_INFO(fixed != nullptr, "DSV4 test spec tag=%s must be opaque", spec->tag.c_str());
+    desc.cache_type                       = CacheType::FIXED_STATE;
+    desc.entry_elems                      = fixed->entry_elems;
+    desc.store_dtype                      = fixed->store_dtype;
+    desc.block_size_bytes_override        = fixed->block_size_bytes_override;
+    desc.block_size_bytes_alignment       = fixed->block_size_bytes_alignment;
+    desc.block_size_alignment_min_entries = fixed->block_size_alignment_min_entries;
+    if (desc.tag == "indexer_state" || desc.tag == "csa_state") {
+        desc.extra.state_ring_compression_ratio = 4;
+        desc.extra.state_ring_overlap           = 1;
+        desc.extra.cp_align_entries             = true;
+        desc.extra.cp_slice_entries             = true;
+    } else if (desc.tag == "hca_state") {
+        desc.extra.state_ring_compression_ratio = 128;
+        desc.extra.cp_align_entries             = true;
+        desc.extra.cp_slice_entries             = true;
+        desc.extra.explicit_block_num           = 256;
+        desc.skip_prefix_reuse                  = true;
+        desc.has_reuse_policy                   = true;
+        desc.reuse_policy                       = CacheReusePolicy::NON_REUSABLE;
+        desc.has_active_tail_blocks             = true;
+        desc.active_tail_blocks                 = 1;
+        desc.has_validate_tail_blocks           = true;
+        desc.validate_tail_blocks               = false;
+    } else if (desc.tag == "swa_kv") {
+        desc.extra.state_ring_compression_ratio = DSV4_SWA_WINDOW_ENTRIES;
+        desc.extra.cp_align_entries             = true;
+        desc.extra.cp_prefill_slice_block_bytes = true;
+        if (desc.block_size_bytes_alignment == 0 && desc.entry_elems == DSV4_FP8_KV_ENTRY_BYTES) {
+            desc.block_size_bytes_alignment = DSV4_FP8_MLA_BLOCK_ALIGNMENT_BYTES;
+        }
+    }
+    desc.extra.state_ring_add_gen_num_per_cycle = true;
+    desc.extra.use_fixed_region_cp_tokens       = true;
+    desc.block_size_alignment_min_entries =
+        desc.block_size_alignment_min_entries == 0 ? DSV4_SWA_WINDOW_ENTRIES : desc.block_size_alignment_min_entries;
+    desc.is_state_cache     = true;
+    desc.has_evict_policy   = true;
+    desc.evict_policy       = CacheEvictPolicy::INDEPENDENT;
+    return desc;
+}
+
+inline void setDefaultKvCacheSpec(ModelConfig& model_config) {
+    KVCacheSpecDesc desc;
+    desc.tag                = "default";
+    desc.seq_size_per_block = static_cast<uint32_t>(model_config.attn_config.tokens_per_block);
+    if (model_config.attn_config.use_mla && model_config.mla_ops_type != rtp_llm::MlaOpsType::MHA) {
+        desc.cache_type    = CacheType::MLA;
+        desc.kv_lora_rank  = static_cast<uint32_t>(model_config.attn_config.kv_lora_rank);
+        desc.rope_head_dim = static_cast<uint32_t>(model_config.attn_config.rope_head_dim);
+        desc.num_kv_heads  = 1;
+    } else {
+        desc.cache_type    = CacheType::MHA;
+        desc.size_per_head = static_cast<uint32_t>(model_config.attn_config.size_per_head);
+        desc.num_kv_heads  = static_cast<uint32_t>(model_config.attn_config.kv_head_num);
+    }
+    model_config.kv_cache_spec_descs.assign(static_cast<size_t>(model_config.num_layers), {desc});
+}
+
+inline void setHybridAttentionKvCacheSpecs(ModelConfig& model_config) {
+    std::vector<int> full_layers;
+    std::vector<int> swa_layers;
+    std::vector<int> linear_layers;
+    const auto&      types = model_config.hybrid_attention_config.hybrid_attention_types;
+    RTP_LLM_CHECK_WITH_INFO(types.size() == static_cast<size_t>(model_config.num_layers),
+                            "hybrid_attention_types size %zu != num_layers %ld",
+                            types.size(),
+                            model_config.num_layers);
+    for (int i = 0; i < static_cast<int>(model_config.num_layers); ++i) {
+        switch (types[static_cast<size_t>(i)]) {
+            case HybridAttentionType::LINEAR:
+                linear_layers.push_back(i);
+                break;
+            case HybridAttentionType::SLIDING_WINDOW:
+                swa_layers.push_back(i);
+                break;
+            case HybridAttentionType::NONE:
+            default:
+                full_layers.push_back(i);
+                break;
+        }
+    }
+
+    KVCacheSpecDesc full_desc;
+    full_desc.tag                = "full";
+    full_desc.cache_type         = CacheType::MHA;
+    full_desc.seq_size_per_block = static_cast<uint32_t>(model_config.attn_config.tokens_per_block);
+    full_desc.size_per_head      = static_cast<uint32_t>(model_config.attn_config.size_per_head);
+    full_desc.num_kv_heads       = static_cast<uint32_t>(model_config.attn_config.kv_head_num);
+
+    KVCacheSpecDesc swa_desc = full_desc;
+    swa_desc.tag             = "swa";
+    swa_desc.cache_type      = CacheType::FIXED_STATE;
+    swa_desc.entry_elems     = static_cast<uint32_t>(model_config.attn_config.size_per_head)
+                           * static_cast<uint32_t>(model_config.attn_config.kv_head_num) * 2;
+    swa_desc.entries_per_block = static_cast<uint32_t>(model_config.attn_config.sliding_window > 0 ?
+                                                       model_config.attn_config.sliding_window :
+                                                       model_config.attn_config.tokens_per_block);
+    swa_desc.store_dtype       = DataType::TYPE_FP16;
+
+    const auto& linear_config           = model_config.linear_attention_config;
+    KVCacheSpecDesc linear_desc;
+    linear_desc.tag                    = "linear";
+    linear_desc.cache_type             = CacheType::LINEAR;
+    linear_desc.seq_size_per_block     = static_cast<uint32_t>(model_config.attn_config.tokens_per_block);
+    linear_desc.num_k_heads            = static_cast<uint32_t>(linear_config.linear_num_key_heads);
+    linear_desc.num_v_heads            = static_cast<uint32_t>(linear_config.linear_num_value_heads);
+    linear_desc.head_k_dim             = static_cast<uint32_t>(linear_config.linear_key_head_dim);
+    linear_desc.head_v_dim             = static_cast<uint32_t>(linear_config.linear_value_head_dim);
+    linear_desc.conv_kernel_dim        = static_cast<uint32_t>(linear_config.linear_conv_kernel_dim);
+    linear_desc.ssm_state_dtype        = linear_config.ssm_state_dtype;
+    linear_desc.conv_state_dtype       = linear_config.conv_state_dtype;
+
+    model_config.kv_cache_spec_descs.assign(static_cast<size_t>(model_config.num_layers), {});
+    for (int layer_id : full_layers) {
+        model_config.kv_cache_spec_descs[static_cast<size_t>(layer_id)] = {full_desc};
+    }
+    for (int layer_id : swa_layers) {
+        model_config.kv_cache_spec_descs[static_cast<size_t>(layer_id)] = {swa_desc};
+    }
+    for (int layer_id : linear_layers) {
+        model_config.kv_cache_spec_descs[static_cast<size_t>(layer_id)] = {linear_desc};
+    }
+}
+
+inline void setDsv4KvCacheSpecs(ModelConfig& model_config) {
+    const int layer_num = static_cast<int>(model_config.num_layers);
+
+    const bool     fp8_kv              = model_config.attn_config.kv_cache_dtype == KvCacheDataType::FP8;
+    const uint32_t kv_entry_elems      = fp8_kv ? 584 : static_cast<uint32_t>(model_config.attn_config.size_per_head) * 2;
+    const uint32_t indexer_entry_elems = fp8_kv ? 132 : static_cast<uint32_t>(model_config.attn_config.indexer_head_dim) * 2;
+    const uint32_t head_dim            = static_cast<uint32_t>(model_config.attn_config.size_per_head);
+    const uint32_t indexer_head_dim    = static_cast<uint32_t>(model_config.attn_config.indexer_head_dim);
+
+    auto csa_kv = makeDsv4Spec("csa_kv", "compressed_kv", kv_entry_elems, DataType::TYPE_UINT8, 4);
+    auto hca_kv = makeDsv4Spec("hca_kv", "compressed_kv", kv_entry_elems, DataType::TYPE_UINT8, 128);
+    auto indexer_kv = makeDsv4Spec("indexer_kv", "compressed_kv", indexer_entry_elems, DataType::TYPE_UINT8, 4);
+    auto indexer_state = makeDsv4Spec("indexer_state", "fixed_state", 4 * indexer_head_dim, DataType::TYPE_FP32);
+    auto csa_state = makeDsv4Spec("csa_state", "fixed_state", 4 * head_dim, DataType::TYPE_FP32);
+    auto hca_state = makeDsv4Spec("hca_state", "fixed_state", 2 * head_dim, DataType::TYPE_FP32);
+    auto swa_kv = makeDsv4Spec("swa_kv", "sliding_window_kv", kv_entry_elems, DataType::TYPE_UINT8);
+
+    model_config.kv_cache_spec_descs.clear();
+    model_config.kv_cache_spec_descs.resize(layer_num);
+    for (int i = 0; i < layer_num; ++i) {
+        const int ratio = i < static_cast<int>(model_config.attn_config.layer_compress_ratios.size()) ?
+                              model_config.attn_config.layer_compress_ratios[static_cast<size_t>(i)] :
+                              0;
+        std::vector<KVCacheSpecPtr> specs;
+        if (ratio == 4) {
+            specs = {csa_kv, indexer_kv, indexer_state, csa_state, swa_kv};
+        } else if (ratio == 128) {
+            specs = {hca_kv, hca_state, swa_kv};
+        } else {
+            specs = {swa_kv};
+        }
+        auto& descs = model_config.kv_cache_spec_descs[static_cast<size_t>(i)];
+        descs.reserve(specs.size());
+        for (const auto& spec : specs) {
+            descs.push_back(dsv4DescForSpec(spec));
+        }
+    }
+}
+
+inline void refreshDsv4KvCacheSpecDescs(ModelConfig&             model_config,
+                                        const ParallelismConfig& parallelism_config,
+                                        const KVCacheConfig&     kv_cache_config,
+                                        int                      gen_num_per_cycle = 0) {
+    (void)parallelism_config;
+    (void)kv_cache_config;
+    (void)gen_num_per_cycle;
+    setDsv4KvCacheSpecs(model_config);
+}
+
+inline void setDsv4ExplicitPoolBlocks(ModelConfig& model_config, const std::string& tag, uint32_t block_num) {
+    for (auto& descs : model_config.kv_cache_spec_descs) {
+        for (auto& desc : descs) {
+            if (desc.tag == tag) {
+                desc.extra.explicit_block_num = block_num;
+            }
+        }
+    }
+}
+
 // A tiny helper for unit tests to construct a minimal MultiHeadAttention KV cache config.
 //
 // NOTE:
@@ -33,22 +278,16 @@ inline CacheConfig makeSimpleMhaCacheConfig(int               layer_num,
     spec->type               = KVCacheSpecType::MultiHeadAttention;
     spec->dtype              = dtype;
     spec->seq_size_per_block = static_cast<uint32_t>(tokens_per_block);
-    spec->layer_num          = static_cast<uint32_t>(layer_num);
     spec->local_head_num_kv  = local_head_num_kv;
     spec->size_per_head      = size_per_head;
-    config.cache_specs.push_back(spec);
-
     std::vector<int> layer_ids(layer_num);
     for (int i = 0; i < layer_num; ++i) {
         layer_ids[i] = i;
     }
-    config.layer_ids.push_back(layer_ids);
-    config.global_layer_ids.push_back(layer_ids);
-    config.layer_to_group_id.assign(layer_num, 0);
-    config.layer_attn_types.assign(layer_num, CacheGroupType::FULL);
+    config.fromGroupedSpecs({spec}, {layer_ids}, {CacheGroupType::FULL}, {"default"});
 
     config.kv_block_stride_bytes = spec->block_size_bytes();
-    config.kv_block_size_bytes   = static_cast<size_t>(spec->block_size_bytes() * spec->layer_num);
+    config.kv_block_size_bytes   = static_cast<size_t>(layer_num) * spec->block_size_bytes();
 
     if (dtype == rtp_llm::TYPE_INT8 || dtype == rtp_llm::TYPE_FP8_E4M3) {
         const size_t kv_scale_kv_stride       = static_cast<size_t>(spec->local_head_num_kv) * tokens_per_block;
@@ -100,14 +339,11 @@ inline CacheConfig makeSimpleHybridMhaCacheConfig(int               layer_num,
     }
 
     const int group_cnt     = layer_num / config.group_layer_num;
-    const int linear_groups = 1;
-    const int full_groups   = group_cnt - 1;
 
     // Specs.
     auto linear_spec                = std::make_shared<LinearKVCacheSpec>();
     linear_spec->type               = KVCacheSpecType::LinearAttention;
     linear_spec->dtype              = dtype;
-    linear_spec->layer_num          = static_cast<uint32_t>(config.group_layer_num);
     linear_spec->local_num_k_heads  = 1;
     linear_spec->local_num_v_heads  = 1;
     linear_spec->head_k_dim         = 1;
@@ -120,19 +356,17 @@ inline CacheConfig makeSimpleHybridMhaCacheConfig(int               layer_num,
     full_spec->type               = KVCacheSpecType::MultiHeadAttention;
     full_spec->dtype              = dtype;
     full_spec->seq_size_per_block = static_cast<uint32_t>(tokens_per_block);
-    full_spec->layer_num          = static_cast<uint32_t>(config.group_layer_num);
     full_spec->local_head_num_kv  = local_head_num_kv;
     full_spec->size_per_head      = size_per_head;
 
-    config.layer_ids.clear();
-    config.global_layer_ids.clear();
-    config.linear_groups.clear();
-    config.full_groups.clear();
-    config.cache_specs.clear();
-    config.group_types.clear();
-
-    config.layer_to_group_id.assign(static_cast<size_t>(layer_num), 0);
-    config.layer_attn_types.assign(static_cast<size_t>(layer_num), CacheGroupType::FULL);
+    std::vector<KVCacheSpecPtr>    specs;
+    std::vector<std::vector<int>>  layers_by_group;
+    std::vector<CacheGroupType>    types;
+    std::vector<std::string>       tags;
+    specs.reserve(static_cast<size_t>(group_cnt));
+    layers_by_group.reserve(static_cast<size_t>(group_cnt));
+    types.reserve(static_cast<size_t>(group_cnt));
+    tags.reserve(static_cast<size_t>(group_cnt));
 
     // Build groups: gid=0 linear, gid>=1 full.
     for (int gid = 0; gid < group_cnt; ++gid) {
@@ -141,26 +375,19 @@ inline CacheConfig makeSimpleHybridMhaCacheConfig(int               layer_num,
         for (int local = 0; local < config.group_layer_num; ++local) {
             const int layer_id = gid * config.group_layer_num + local;
             group_layers.push_back(layer_id);
-            config.layer_to_group_id[static_cast<size_t>(layer_id)] = gid;
-            config.layer_attn_types[static_cast<size_t>(layer_id)] =
-                (gid == 0) ? CacheGroupType::LINEAR : CacheGroupType::FULL;
         }
-        config.layer_ids.push_back(group_layers);
-        config.global_layer_ids.push_back(group_layers);
+        layers_by_group.push_back(group_layers);
 
         if (gid == 0) {
-            config.cache_specs.push_back(linear_spec);
-            config.group_types.push_back(CacheGroupType::LINEAR);
-            config.linear_groups.push_back(group_layers);
+            specs.push_back(linear_spec);
+            types.push_back(CacheGroupType::LINEAR);
         } else {
-            config.cache_specs.push_back(full_spec);
-            config.group_types.push_back(CacheGroupType::FULL);
-            config.full_groups.push_back(group_layers);
+            specs.push_back(full_spec);
+            types.push_back(CacheGroupType::FULL);
         }
+        tags.push_back("default");
     }
-
-    config.linear_group_num = linear_groups;
-    config.full_group_num   = full_groups;
+    config.fromGroupedSpecs(specs, layers_by_group, types, tags);
 
     // Physical sizes for hybrid memory layout: one group (group_layer_num) worth of layers.
     config.kv_block_stride_bytes = std::max(full_spec->block_size_bytes(), linear_spec->block_size_bytes());
diff --git a/rtp_llm/cpp/cache/test/DSV4CacheTest.cc b/rtp_llm/cpp/cache/test/DSV4CacheTest.cc
new file mode 100644
index 0000000000..e5ee99997a
--- /dev/null
+++ b/rtp_llm/cpp/cache/test/DSV4CacheTest.cc
@@ -0,0 +1,2596 @@
+#include <gtest/gtest.h>
+#include <algorithm>
+#include <optional>
+#include <string>
+#include <vector>
+
+#include "rtp_llm/cpp/cache/config_creator/CacheConfigCreator.h"
+#include "rtp_llm/cpp/cache/BlockPoolConfigHelper.h"
+#include "rtp_llm/cpp/cache/CPSlotMapper.h"
+#include "rtp_llm/cpp/cache/config_creator/HybridPoolConfigCreator.h"
+#include "rtp_llm/cpp/cache/allocator/HybridPoolKVCacheAllocator.h"
+#include "rtp_llm/cpp/cache/allocator/HybridTypeKVCacheAllocator.h"
+#include "rtp_llm/cpp/cache/KVCacheTransferPlanner.h"
+#include "rtp_llm/cpp/cache/spec/OpaqueKVCacheSpec.h"
+#include "rtp_llm/cpp/cache/BatchKVCacheResource.h"
+#include "rtp_llm/cpp/cache/SharedBlockCache.h"
+#include "rtp_llm/cpp/cache/test/BlockPoolTestHelper.h"
+#include "rtp_llm/cpp/cache/test/CacheConfigTestUtils.h"
+#include "rtp_llm/cpp/engine_base/stream/CompleteTokenIds.h"
+#include "rtp_llm/cpp/config/ConfigModules.h"
+#include "rtp_llm/cpp/config/ModelConfig.h"
+#include "rtp_llm/cpp/config/StaticConfig.h"
+#include "rtp_llm/cpp/utils/Logger.h"
+
+namespace rtp_llm {
+namespace test {
+
+namespace {
+
+constexpr int      kDsv4PoolNum           = 7;
+constexpr uint32_t kDsv4TokensPerBlock    = 128;
+constexpr uint32_t kDsv4KvEntryBytes      = 1024;
+constexpr uint32_t kDsv4IndexerEntryBytes = 256;
+constexpr uint32_t kDsv4Fp8KvEntryBytes   = 584;
+const std::vector<std::string> kDsv4FlashFirstSeenTags = {
+    "swa_kv", "csa_kv", "indexer_kv", "indexer_state", "csa_state", "hca_kv", "hca_state"};
+const std::vector<std::string> kDsv4ProFirstSeenTags = {
+    "hca_kv", "hca_state", "swa_kv", "csa_kv", "indexer_kv", "indexer_state", "csa_state"};
+
+static size_t gidForTag(const CacheConfig& config, const std::string& tag) {
+    return static_cast<size_t>(config.groupIdForTag(tag));
+}
+
+class DSV4CacheTestEnvironment: public ::testing::Environment {
+public:
+    void SetUp() override {
+        old_core_dump_on_exception_                  = StaticConfig::user_ft_core_dump_on_exception;
+        StaticConfig::user_ft_core_dump_on_exception = false;
+    }
+
+    void TearDown() override {
+        StaticConfig::user_ft_core_dump_on_exception = old_core_dump_on_exception_;
+    }
+
+private:
+    bool old_core_dump_on_exception_{false};
+};
+
+[[maybe_unused]] auto* const dsv4_cache_test_env =
+    ::testing::AddGlobalTestEnvironment(new DSV4CacheTestEnvironment());
+
+}  // namespace
+
+static KVCacheConfig makeDsv4KvCacheConfig() {
+    KVCacheConfig config;
+    config.seq_size_per_block = 128;
+    return config;
+}
+
+static void setGroupBlockNumsForTest(CacheConfig& config, const std::vector<uint32_t>& block_nums) {
+    std::vector<size_t> kv_strides;
+    std::vector<size_t> scale_strides;
+    kv_strides.reserve(static_cast<size_t>(config.groupNums()));
+    scale_strides.reserve(static_cast<size_t>(config.groupNums()));
+    for (size_t gid = 0; gid < static_cast<size_t>(config.groupNums()); ++gid) {
+        kv_strides.push_back(config.kvBlockStrideBytesForGroup(gid));
+        scale_strides.push_back(config.kvScaleStrideBytesForGroup(gid));
+    }
+    config.setGroupBlockLayout(block_nums, kv_strides, scale_strides);
+}
+
+static void initDsv4BatchGroups(BatchKVCacheResource& batch_res, const CacheConfig& config) {
+    batch_res.initGroups(config.groupNums(),
+                         static_cast<int>(config.layer_all_num),
+                         config.layerGroupIdsSnapshot(),
+                         config.kernelBlocksPerKvBlock(),
+                         config.groupTypesSnapshot());
+}
+
+static ModelConfig makeProModelConfig() {
+    ModelConfig mc;
+    mc.num_layers                   = 61;
+    mc.hidden_size                  = 7168;
+    mc.attn_config.head_num         = 128;
+    mc.attn_config.kv_head_num      = 1;
+    mc.attn_config.size_per_head    = 512;
+    mc.attn_config.rope_head_dim    = 64;
+    mc.attn_config.sliding_window   = 128;
+    mc.attn_config.indexer_head_dim = 128;
+    mc.attn_config.indexer_head_num = 64;
+    mc.attn_config.indexer_topk     = 1024;
+    mc.attn_config.o_groups         = 16;
+    mc.attn_config.o_lora_rank      = 1024;
+    std::vector<int> ratios;
+    ratios.push_back(128);
+    ratios.push_back(128);
+    for (int i = 2; i < 61; i++) {
+        ratios.push_back((i % 2 == 0) ? 4 : 128);
+    }
+    mc.attn_config.layer_compress_ratios = ratios;
+    mc.hybrid_attention_config.enable_hybrid_attention           = true;
+    mc.hybrid_attention_config.enable_independent_kv_cache_pools = true;
+    setDsv4KvCacheSpecs(mc);
+    return mc;
+}
+
+static ModelConfig makeFlashModelConfig() {
+    ModelConfig mc;
+    mc.num_layers                   = 43;
+    mc.hidden_size                  = 4096;
+    mc.attn_config.head_num         = 64;
+    mc.attn_config.kv_head_num      = 1;
+    mc.attn_config.size_per_head    = 512;
+    mc.attn_config.rope_head_dim    = 64;
+    mc.attn_config.sliding_window   = 128;
+    mc.attn_config.indexer_head_dim = 128;
+    mc.attn_config.indexer_head_num = 64;
+    mc.attn_config.indexer_topk     = 512;
+    mc.attn_config.o_groups         = 8;
+    mc.attn_config.o_lora_rank      = 1024;
+    std::vector<int> ratios         = {0, 0};
+    for (int i = 2; i < 43; i++) {
+        ratios.push_back((i % 2 == 0) ? 4 : 128);
+    }
+    mc.attn_config.layer_compress_ratios                         = ratios;
+    mc.hybrid_attention_config.enable_hybrid_attention           = true;
+    mc.hybrid_attention_config.enable_independent_kv_cache_pools = true;
+    setDsv4KvCacheSpecs(mc);
+    return mc;
+}
+
+static ModelConfig makeFlashMtpModelConfig() {
+    ModelConfig mc                       = makeFlashModelConfig();
+    mc.num_layers                        = 1;
+    mc.attn_config.layer_compress_ratios = {0};
+    setDsv4KvCacheSpecs(mc);
+    return mc;
+}
+
+static ModelConfig makeHybridAttentionModelConfig(bool independent_pool) {
+    ModelConfig mc;
+    mc.num_layers                                                = 4;
+    mc.hidden_size                                               = 128;
+    mc.attn_config.head_num                                      = 4;
+    mc.attn_config.kv_head_num                                   = 2;
+    mc.attn_config.size_per_head                                 = independent_pool ? 16 : 32;
+    mc.attn_config.tokens_per_block                              = 8;
+    mc.hybrid_attention_config.enable_hybrid_attention           = true;
+    mc.hybrid_attention_config.enable_independent_kv_cache_pools = independent_pool;
+    mc.hybrid_attention_config.hybrid_attention_types            = {
+        HybridAttentionType::LINEAR, HybridAttentionType::NONE, HybridAttentionType::LINEAR, HybridAttentionType::NONE};
+    mc.linear_attention_config.linear_conv_kernel_dim = 4;
+    mc.linear_attention_config.linear_key_head_dim    = 16;
+    mc.linear_attention_config.linear_value_head_dim  = 16;
+    mc.linear_attention_config.linear_num_key_heads   = 2;
+    mc.linear_attention_config.linear_num_value_heads = 2;
+    setHybridAttentionKvCacheSpecs(mc);
+    return mc;
+}
+
+// ============================================================
+// Layer classification
+// ============================================================
+
+TEST(HybridPoolConfigCreatorTest, ProLayerClassification) {
+    ParallelismConfig pc;
+    auto config = CacheConfigCreator::createBasicConfig(makeProModelConfig(), pc, makeDsv4KvCacheConfig(), false, 0);
+    EXPECT_EQ(config.layer_num, 61u);
+    EXPECT_EQ(config.groupTagsSnapshot(), kDsv4ProFirstSeenTags);
+    EXPECT_EQ(config.layerIdsForGroup(gidForTag(config, "csa_kv")).size(), 30u);
+    EXPECT_EQ(config.layerIdsForGroup(gidForTag(config, "hca_kv")).size(), 31u);
+    EXPECT_EQ(config.layerIdsForGroup(gidForTag(config, "swa_kv")).size(), 61u);
+}
+
+TEST(HybridPoolConfigCreatorTest, FlashLayerClassification) {
+    ParallelismConfig pc;
+    auto config = CacheConfigCreator::createBasicConfig(makeFlashModelConfig(), pc, makeDsv4KvCacheConfig(), false, 0);
+    EXPECT_EQ(config.layer_num, 43u);
+    EXPECT_EQ(config.groupTagsSnapshot(), kDsv4FlashFirstSeenTags);
+    EXPECT_EQ(config.layerIdsForGroup(gidForTag(config, "csa_kv")).size(), 21u);
+    EXPECT_EQ(config.layerIdsForGroup(gidForTag(config, "hca_kv")).size(), 20u);
+    EXPECT_EQ(config.layerIdsForGroup(gidForTag(config, "swa_kv")).size(), 43u);
+}
+
+TEST(HybridPoolConfigCreatorTest, MtpSwaOnlyLayerIsNotStripped) {
+    ParallelismConfig pc;
+    auto              config =
+        CacheConfigCreator::createBasicConfig(makeFlashMtpModelConfig(), pc, makeDsv4KvCacheConfig(), true, 0);
+
+    EXPECT_EQ(config.layer_num, 1u);
+    EXPECT_EQ(config.block_size_bytes, 1u);
+    ASSERT_EQ(static_cast<size_t>(config.groupNums()), 1u);
+    ASSERT_EQ(config.layerIdsForGroup(gidForTag(config, "swa_kv")), std::vector<int>({0}));
+    ASSERT_EQ(config.layerGroupIdsSnapshot().size(), 1u);
+    EXPECT_EQ(config.layerGroupIdsSnapshot()[0], std::vector<int>({0}));
+    EXPECT_EQ(config.tagForGroup(0), "swa_kv");
+    EXPECT_EQ(config.groupIdForLayerTag(0, "swa_kv"), 0);
+}
+
+TEST(HybridPoolConfigCreatorTest, Dsv4SpecOrderControlsFirstSeenGroupOrder) {
+    auto mc = makeFlashModelConfig();
+    for (auto& layer_descs : mc.kv_cache_spec_descs) {
+        std::reverse(layer_descs.begin(), layer_descs.end());
+    }
+
+    ParallelismConfig pc;
+    auto config = CacheConfigCreator::createBasicConfig(mc, pc, makeDsv4KvCacheConfig(), false, 0);
+
+    const std::vector<std::string> expected_tags = {
+        "swa_kv", "csa_state", "indexer_state", "indexer_kv", "csa_kv", "hca_state", "hca_kv"};
+    EXPECT_EQ(config.groupTagsSnapshot(), expected_tags);
+
+    ASSERT_EQ(static_cast<size_t>(config.groupNums()), expected_tags.size());
+    ASSERT_EQ(static_cast<size_t>(config.groupNums()), expected_tags.size());
+    for (size_t gid = 0; gid < expected_tags.size(); ++gid) {
+        ASSERT_NE(config.specForGroup(gid), nullptr);
+        EXPECT_EQ(config.specForGroup(gid)->tag, expected_tags[gid]) << "gid=" << gid;
+        EXPECT_EQ(config.specForGroup(gid)->layers, config.layerIdsForGroup(gid)) << "gid=" << gid;
+    }
+
+    EXPECT_EQ(config.groupIdForLayerTag(2, "csa_kv"), config.groupIdForTag("csa_kv"));
+    EXPECT_EQ(config.groupIdForLayerTag(3, "hca_kv"), config.groupIdForTag("hca_kv"));
+    EXPECT_EQ(config.groupIdForLayerTag(0, "swa_kv"), config.groupIdForTag("swa_kv"));
+}
+
+static GroupBase makeTestGroup(const KVCacheSpecPtr& spec, CacheGroupType type, std::vector<int> layer_ids) {
+    GroupBase group;
+    group.spec      = spec;
+    group.policy    = defaultCacheGroupPolicy(type);
+    group.layer_ids = std::move(layer_ids);
+    return group;
+}
+
+TEST(CacheConfigTest, SetTopologyInstallsTagAndGroupTopology) {
+    CacheConfig config;
+    config.layer_num     = 3;
+    config.layer_all_num = 3;
+
+    auto swa_spec = std::make_shared<FixedStateCacheSpec>();
+    swa_spec->tag = "swa";
+    swa_spec->state_dim = 1;
+    swa_spec->entries_per_block = 1;
+    swa_spec->store_dtype = DataType::TYPE_UINT8;
+    swa_spec->dtype = DataType::TYPE_UINT8;
+    auto csa_spec = std::make_shared<CompressedKVCacheSpec>();
+    csa_spec->tag = "csa";
+    csa_spec->entry_elems = 1;
+    csa_spec->entries_per_block = 1;
+    csa_spec->compression_ratio = 1;
+    csa_spec->store_dtype = DataType::TYPE_UINT8;
+    csa_spec->dtype = DataType::TYPE_UINT8;
+
+    std::vector<LayerBase> layers(3);
+    layers[0].group_ids = {0};
+    layers[0].tag_to_gid["swa"] = 0;
+    layers[1].group_ids = {0, 1};
+    layers[1].tag_to_gid["swa"] = 0;
+    layers[1].tag_to_gid["csa"] = 1;
+    layers[2].group_ids = {0};
+    layers[2].tag_to_gid["swa"] = 0;
+
+    config.setTopology({makeTestGroup(swa_spec, CacheGroupType::SWA, {0, 1, 2}),
+                        makeTestGroup(csa_spec, CacheGroupType::FULL, {1})},
+                       std::move(layers));
+
+    EXPECT_EQ(config.groupTagsSnapshot(), std::vector<std::string>({"swa", "csa"}));
+    EXPECT_EQ(config.groupIdForLayerTag(1, "swa"), 0);
+    EXPECT_EQ(config.groupIdForLayerTag(1, "csa"), 1);
+    EXPECT_THROW((void)config.groupIdFor(1), std::exception);
+    EXPECT_EQ(config.layerGroupIdsSnapshot()[1], std::vector<int>({0, 1}));
+}
+
+TEST(CacheConfigTest, SetTopologyRejectsMissingLayer) {
+    CacheConfig config;
+    config.layer_num     = 2;
+    config.layer_all_num = 2;
+
+    auto spec = std::make_shared<MHAKVCacheSpec>();
+    spec->tag = "default";
+    std::vector<LayerBase> layers(2);
+    layers[0].group_ids = {0};
+    layers[0].tag_to_gid["default"] = 0;
+    EXPECT_THROW(config.setTopology({makeTestGroup(spec, CacheGroupType::FULL, {0})}, std::move(layers)),
+                 std::exception);
+}
+
+TEST(CacheConfigTest, SetTopologyRejectsEmptyTag) {
+    CacheConfig config;
+    config.layer_num     = 1;
+    config.layer_all_num = 1;
+
+    auto spec = std::make_shared<MHAKVCacheSpec>();
+    std::vector<LayerBase> layers(1);
+    layers[0].group_ids = {0};
+    EXPECT_THROW(config.setTopology({makeTestGroup(spec, CacheGroupType::FULL, {0})}, std::move(layers)),
+                 std::exception);
+}
+
+TEST(CacheConfigTest, SetTopologyAllowsDifferentLayerTags) {
+    CacheConfig config;
+    config.layer_num     = 1;
+    config.layer_all_num = 1;
+
+    auto spec0 = std::make_shared<MHAKVCacheSpec>();
+    spec0->tag = "full";
+    auto spec1 = std::make_shared<MHAKVCacheSpec>();
+    spec1->tag = "linear";
+
+    std::vector<LayerBase> layers(1);
+    layers[0].group_ids = {0, 1};
+    layers[0].tag_to_gid["full"] = 0;
+    layers[0].tag_to_gid["linear"] = 1;
+    EXPECT_NO_THROW(config.setTopology({makeTestGroup(spec0, CacheGroupType::FULL, {0}),
+                                        makeTestGroup(spec1, CacheGroupType::LINEAR, {0})},
+                                       std::move(layers)));
+    EXPECT_EQ(config.layerGroupIdsSnapshot()[0].size(), 2u);
+}
+
+TEST(HybridPoolConfigCreatorTest, Dsv4ModelProvidedAlignmentPropagatesToCacheSpecs) {
+    auto mc = makeFlashModelConfig();
+    for (auto& layer_descs : mc.kv_cache_spec_descs) {
+        for (auto& desc : layer_descs) {
+            if (desc.tag == "csa_kv") {
+                desc.block_size_bytes_alignment = 1024;
+            } else if (desc.tag == "swa_kv") {
+                desc.block_size_bytes_alignment        = 2048;
+                desc.block_size_alignment_min_entries = 256;
+            }
+        }
+    }
+
+    ParallelismConfig pc;
+    auto config = CacheConfigCreator::createBasicConfig(mc, pc, makeDsv4KvCacheConfig(), false, 0);
+
+    auto* csa_kv = dynamic_cast<CompressedKVCacheSpec*>(config.specForGroup(gidForTag(config, "csa_kv")).get());
+    auto* swa_kv = dynamic_cast<FixedStateCacheSpec*>(config.specForGroup(gidForTag(config, "swa_kv")).get());
+    ASSERT_NE(csa_kv, nullptr);
+    ASSERT_NE(swa_kv, nullptr);
+    EXPECT_EQ(csa_kv->block_size_bytes_alignment, 1024u);
+    EXPECT_EQ(swa_kv->block_size_bytes_alignment, 2048u);
+    EXPECT_EQ(swa_kv->block_size_alignment_min_entries, 256u);
+}
+
+TEST(HybridPoolConfigCreatorTest, Dsv4TagRoutesAreConsistent) {
+    ParallelismConfig pc;
+    auto              config =
+        CacheConfigCreator::createBasicConfig(makeFlashModelConfig(), pc, makeDsv4KvCacheConfig(), false, 0);
+
+    auto expect_route = [&](int layer_id, const std::string& tag, int expected_gid) {
+        EXPECT_EQ(config.groupIdForLayerTag(layer_id, tag), expected_gid) << "layer=" << layer_id << " tag=" << tag;
+    };
+
+    // Flash DSV4 test config uses layers 2,4,... as CSA and 3,5,... as HCA; 0/1 are SWA-only.
+    expect_route(2, "csa_kv", config.groupIdForTag("csa_kv"));
+    expect_route(2, "indexer_kv", config.groupIdForTag("indexer_kv"));
+    expect_route(2, "indexer_state", config.groupIdForTag("indexer_state"));
+    expect_route(2, "csa_state", config.groupIdForTag("csa_state"));
+    expect_route(2, "swa_kv", config.groupIdForTag("swa_kv"));
+
+    expect_route(3, "hca_kv", config.groupIdForTag("hca_kv"));
+    expect_route(3, "hca_state", config.groupIdForTag("hca_state"));
+    expect_route(3, "swa_kv", config.groupIdForTag("swa_kv"));
+
+    expect_route(0, "swa_kv", config.groupIdForTag("swa_kv"));
+    EXPECT_THROW(config.groupIdForLayerTag(0, "csa_kv"), std::exception);
+    EXPECT_THROW(config.groupIdForLayerTag(0, "hca_kv"), std::exception);
+
+    auto mtp_config =
+        CacheConfigCreator::createBasicConfig(makeFlashMtpModelConfig(), pc, makeDsv4KvCacheConfig(), true, 0);
+    ASSERT_EQ(mtp_config.groupIdForLayerTag(0, "swa_kv"), 0);
+}
+
+TEST(HybridPoolConfigCreatorTest, Dsv4GroupPoliciesMatchLegacyBehavior) {
+    ParallelismConfig pc;
+    auto              config =
+        CacheConfigCreator::createBasicConfig(makeFlashModelConfig(), pc, makeDsv4KvCacheConfig(), false, 0);
+
+    ASSERT_EQ(config.groupPoliciesSnapshot().size(), static_cast<size_t>(config.groupNums()));
+    auto expect_policy = [&](const std::string& tag,
+                             CacheReusePolicy reuse_policy,
+                             CacheEvictPolicy evict_policy,
+                             int active_tail_blocks) {
+        const auto group_tags = config.groupTagsSnapshot();
+        auto       it         = std::find(group_tags.begin(), group_tags.end(), tag);
+        ASSERT_NE(it, group_tags.end()) << tag;
+        const auto gid = static_cast<size_t>(std::distance(group_tags.begin(), it));
+        EXPECT_EQ(config.policyForGroup(gid).reuse_policy, reuse_policy) << tag;
+        EXPECT_EQ(config.policyForGroup(gid).evict_policy, evict_policy) << tag;
+        EXPECT_EQ(config.policyForGroup(gid).active_tail_blocks, active_tail_blocks) << tag;
+    };
+
+    expect_policy("hca_state", CacheReusePolicy::NON_REUSABLE, CacheEvictPolicy::INDEPENDENT, 1);
+    expect_policy("swa_kv", CacheReusePolicy::REUSABLE, CacheEvictPolicy::INDEPENDENT, 2);
+    expect_policy("csa_state", CacheReusePolicy::REUSABLE, CacheEvictPolicy::INDEPENDENT, 2);
+    expect_policy("csa_kv", CacheReusePolicy::REUSABLE, CacheEvictPolicy::CHAIN, 0);
+    expect_policy("hca_kv", CacheReusePolicy::REUSABLE, CacheEvictPolicy::CHAIN, 0);
+    expect_policy("indexer_kv", CacheReusePolicy::REUSABLE, CacheEvictPolicy::CHAIN, 0);
+}
+
+TEST(HybridPoolConfigCreatorTest, Dsv4SpecsMissingFailsFastWithoutRatioFallback) {
+    auto mc = makeFlashModelConfig();
+    mc.kv_cache_spec_descs.clear();
+
+    ParallelismConfig pc;
+    EXPECT_THROW((void)CacheConfigCreator::createBasicConfig(mc, pc, makeDsv4KvCacheConfig(), false, 0),
+                 std::exception);
+}
+
+// ============================================================
+// Pool specs
+// ============================================================
+
+TEST(HybridPoolConfigCreatorTest, ProPoolSpecs) {
+    ParallelismConfig pc;
+    auto config = CacheConfigCreator::createBasicConfig(makeProModelConfig(), pc, makeDsv4KvCacheConfig(), false, 0);
+
+    EXPECT_EQ(config.layerIdsForGroup(gidForTag(config, "csa_kv")).size(), 30u);
+    EXPECT_EQ(config.specForGroup(gidForTag(config, "csa_kv"))->block_size_bytes(), 32u * kDsv4KvEntryBytes);
+    EXPECT_EQ(config.typeForGroup(gidForTag(config, "csa_kv")), CacheGroupType::FULL);
+
+    EXPECT_EQ(config.layerIdsForGroup(gidForTag(config, "hca_kv")).size(), 31u);
+    EXPECT_EQ(config.specForGroup(gidForTag(config, "hca_kv"))->block_size_bytes(), 1u * kDsv4KvEntryBytes);
+
+    EXPECT_EQ(config.layerIdsForGroup(gidForTag(config, "indexer_kv")).size(), 30u);
+    EXPECT_EQ(config.specForGroup(gidForTag(config, "indexer_kv"))->block_size_bytes(), 32u * kDsv4IndexerEntryBytes);
+
+    EXPECT_EQ(config.layerIdsForGroup(gidForTag(config, "indexer_state")).size(), 30u);
+    EXPECT_EQ(config.specForGroup(gidForTag(config, "indexer_state"))->block_size_bytes(), 8u * 512u * 4u);
+
+    EXPECT_EQ(config.layerIdsForGroup(gidForTag(config, "csa_state")).size(), 30u);
+    EXPECT_EQ(config.specForGroup(gidForTag(config, "csa_state"))->block_size_bytes(), 8u * 2048u * 4u);
+
+    EXPECT_EQ(config.layerIdsForGroup(gidForTag(config, "hca_state")).size(), 31u);
+    EXPECT_EQ(config.specForGroup(gidForTag(config, "hca_state"))->block_size_bytes(), 128u * 1024u * 4u);
+
+    EXPECT_EQ(config.layerIdsForGroup(gidForTag(config, "swa_kv")).size(), 61u);
+    EXPECT_EQ(config.specForGroup(gidForTag(config, "swa_kv"))->block_size_bytes(), kDsv4TokensPerBlock * kDsv4KvEntryBytes);
+}
+
+TEST(HybridPoolConfigCreatorTest, FlashPoolSpecs) {
+    ParallelismConfig pc;
+    auto config = CacheConfigCreator::createBasicConfig(makeFlashModelConfig(), pc, makeDsv4KvCacheConfig(), false, 0);
+    EXPECT_EQ(config.layerIdsForGroup(gidForTag(config, "csa_kv")).size(), 21u);
+    EXPECT_EQ(config.layerIdsForGroup(gidForTag(config, "hca_kv")).size(), 20u);
+    EXPECT_EQ(config.layerIdsForGroup(gidForTag(config, "swa_kv")).size(), 43u);
+}
+
+// ============================================================
+// Block size bytes
+// ============================================================
+
+TEST(HybridPoolConfigCreatorTest, BlockSizeBytes) {
+    ParallelismConfig pc;
+    auto config = CacheConfigCreator::createBasicConfig(makeProModelConfig(), pc, makeDsv4KvCacheConfig(), false, 0);
+    EXPECT_EQ(config.specForGroup(gidForTag(config, "csa_kv"))->block_size_bytes(), 32u * kDsv4KvEntryBytes);
+    EXPECT_EQ(config.specForGroup(gidForTag(config, "hca_kv"))->block_size_bytes(), 1u * kDsv4KvEntryBytes);
+    EXPECT_EQ(config.specForGroup(gidForTag(config, "indexer_kv"))->block_size_bytes(), 32u * kDsv4IndexerEntryBytes);
+    EXPECT_EQ(config.specForGroup(gidForTag(config, "indexer_state"))->block_size_bytes(), 8u * 512u * 4u);
+    EXPECT_EQ(config.specForGroup(gidForTag(config, "csa_state"))->block_size_bytes(), 8u * 2048u * 4u);
+    EXPECT_EQ(config.specForGroup(gidForTag(config, "hca_state"))->block_size_bytes(), 128u * 1024u * 4u);
+    EXPECT_EQ(config.specForGroup(gidForTag(config, "swa_kv"))->block_size_bytes(), kDsv4TokensPerBlock * kDsv4KvEntryBytes);
+}
+
+TEST(HybridPoolConfigCreatorTest, Fp8BlockSizeBytesUsePaddedPhysicalStride) {
+    ParallelismConfig pc;
+    auto              mc          = makeProModelConfig();
+    mc.attn_config.kv_cache_dtype = KvCacheDataType::FP8;
+    setDsv4KvCacheSpecs(mc);
+    auto config                   = CacheConfigCreator::createBasicConfig(mc, pc, makeDsv4KvCacheConfig(), false, 0);
+
+    ASSERT_EQ(static_cast<size_t>(config.groupNums()), 7u);
+    ASSERT_EQ(config.groupKvBlockStrideBytesSnapshot().size(), 7u);
+
+    EXPECT_EQ(config.specForGroup(gidForTag(config, "csa_kv"))->block_size_bytes(), 19008u);
+    EXPECT_EQ(config.specForGroup(gidForTag(config, "hca_kv"))->block_size_bytes(), 1152u);
+    EXPECT_EQ(config.specForGroup(gidForTag(config, "indexer_kv"))->block_size_bytes(), 32u * 132u);
+    EXPECT_EQ(config.specForGroup(gidForTag(config, "swa_kv"))->block_size_bytes(), 74880u);
+
+    EXPECT_EQ(config.kvBlockStrideBytesForGroup(gidForTag(config, "csa_kv")),
+              config.specForGroup(gidForTag(config, "csa_kv"))->block_size_bytes());
+    EXPECT_EQ(config.kvBlockStrideBytesForGroup(gidForTag(config, "hca_kv")),
+              config.specForGroup(gidForTag(config, "hca_kv"))->block_size_bytes());
+    EXPECT_EQ(config.kvBlockStrideBytesForGroup(gidForTag(config, "swa_kv")),
+              config.specForGroup(gidForTag(config, "swa_kv"))->block_size_bytes());
+}
+
+TEST(HybridPoolConfigCreatorTest, DecoupledPhysicalAndKernelBlockSizeUsesPerGroupBpk) {
+    ParallelismConfig pc;
+    auto              mc = makeProModelConfig();
+    KVCacheConfig     kv_cache_config;
+    kv_cache_config.seq_size_per_block        = 16384;
+    kv_cache_config.kernel_seq_size_per_block = 128;
+    auto config = CacheConfigCreator::createBasicConfig(mc, pc, kv_cache_config, false, 0);
+
+    ASSERT_EQ(static_cast<size_t>(config.groupNums()), 7u);
+    ASSERT_EQ(config.group_seq_size_per_block.size(), 7u);
+
+    EXPECT_EQ(config.seq_size_per_block, 16384u);
+    EXPECT_EQ(config.kernel_seq_size_per_block, 128u);
+    EXPECT_EQ(config.kernelBlocksPerKvBlock(), 128u);
+    for (size_t gid = 0; gid < config.group_seq_size_per_block.size(); ++gid) {
+        EXPECT_EQ(config.group_seq_size_per_block[gid], 16384u);
+    }
+
+    auto* csa_kv = dynamic_cast<CompressedKVCacheSpec*>(config.specForGroup(gidForTag(config, "csa_kv")).get());
+    auto* hca_kv = dynamic_cast<CompressedKVCacheSpec*>(config.specForGroup(gidForTag(config, "hca_kv")).get());
+    auto* idx_kv = dynamic_cast<CompressedKVCacheSpec*>(config.specForGroup(gidForTag(config, "indexer_kv")).get());
+    auto* swa_kv = dynamic_cast<FixedStateCacheSpec*>(config.specForGroup(gidForTag(config, "swa_kv")).get());
+    ASSERT_NE(csa_kv, nullptr);
+    ASSERT_NE(hca_kv, nullptr);
+    ASSERT_NE(idx_kv, nullptr);
+    ASSERT_NE(swa_kv, nullptr);
+    EXPECT_EQ(csa_kv->compression_ratio, 4u);
+    EXPECT_EQ(hca_kv->compression_ratio, 128u);
+    EXPECT_EQ(idx_kv->compression_ratio, 4u);
+    EXPECT_EQ(csa_kv->entries_per_block, 32u);
+    EXPECT_EQ(hca_kv->entries_per_block, 1u);
+    EXPECT_EQ(idx_kv->entries_per_block, 32u);
+    EXPECT_EQ(swa_kv->entries_per_block, 128u);
+
+    EXPECT_EQ(config.kvBlockStrideBytesForGroup(gidForTag(config, "csa_kv")),
+              config.specForGroup(gidForTag(config, "csa_kv"))->block_size_bytes() * 128u);
+    EXPECT_EQ(config.kvBlockStrideBytesForGroup(gidForTag(config, "hca_kv")),
+              config.specForGroup(gidForTag(config, "hca_kv"))->block_size_bytes() * 128u);
+    EXPECT_EQ(config.kvBlockStrideBytesForGroup(gidForTag(config, "indexer_kv")),
+              config.specForGroup(gidForTag(config, "indexer_kv"))->block_size_bytes() * 128u);
+    EXPECT_EQ(config.kvBlockStrideBytesForGroup(gidForTag(config, "swa_kv")),
+              config.specForGroup(gidForTag(config, "swa_kv"))->block_size_bytes());
+
+    auto full_pool = BlockPoolConfigHelper::createConfigForGroup(config, gidForTag(config, "csa_kv"));
+    auto swa_pool  = BlockPoolConfigHelper::createConfigForGroup(config, gidForTag(config, "swa_kv"));
+    ASSERT_EQ(full_pool.memory_layouts.size(), 1u);
+    ASSERT_EQ(swa_pool.memory_layouts.size(), 1u);
+    EXPECT_EQ(full_pool.memory_layouts[0].kernel_blocks_per_kv_block, 128u);
+    EXPECT_EQ(swa_pool.memory_layouts[0].kernel_blocks_per_kv_block, 1u);
+}
+
+TEST(HybridPoolConfigCreatorTest, PrefillCpShardedSlicesFixedAndSwaPhysicalBlocks) {
+    ParallelismConfig pc;
+    pc.role_type                          = RoleType::PREFILL;
+    pc.tp_size                            = 4;
+    pc.prefill_cp_config.kv_cache_sharded = true;
+
+    auto mc                       = makeProModelConfig();
+    mc.attn_config.kv_cache_dtype = KvCacheDataType::FP8;
+    setDsv4KvCacheSpecs(mc);
+    auto config                   = CacheConfigCreator::createBasicConfig(mc, pc, makeDsv4KvCacheConfig(), false, 0);
+
+    ASSERT_EQ(static_cast<size_t>(config.groupNums()), 7u);
+    ASSERT_EQ(config.groupKvBlockStrideBytesSnapshot().size(), 7u);
+
+    EXPECT_EQ(config.specForGroup(gidForTag(config, "csa_kv"))->block_size_bytes(), 19008u);
+    EXPECT_EQ(config.specForGroup(gidForTag(config, "hca_kv"))->block_size_bytes(), 1152u);
+    EXPECT_EQ(config.specForGroup(gidForTag(config, "indexer_kv"))->block_size_bytes(), 32u * 132u);
+    EXPECT_EQ(config.specForGroup(gidForTag(config, "indexer_state"))->block_size_bytes(), 2u * 512u * 4u);
+    EXPECT_EQ(config.specForGroup(gidForTag(config, "csa_state"))->block_size_bytes(), 2u * 2048u * 4u);
+    EXPECT_EQ(config.specForGroup(gidForTag(config, "hca_state"))->block_size_bytes(), 32u * 1024u * 4u);
+
+    // SWA_KV keeps full logical ring entries for byte-sliced CP layout, but
+    // each prefill rank stores only one aligned byte slice of the full block.
+    EXPECT_EQ(config.specForGroup(gidForTag(config, "swa_kv"))->block_size_bytes(), 18720u);
+    for (const auto& tag : {"indexer_state", "csa_state", "hca_state", "swa_kv"}) {
+        const auto gid = gidForTag(config, tag);
+        EXPECT_EQ(config.kvBlockStrideBytesForGroup(gid), config.specForGroup(gid)->block_size_bytes());
+        EXPECT_EQ(config.group_seq_size_per_block[gid], kDsv4TokensPerBlock * 4u) << "tag=" << tag;
+    }
+
+    pc.role_type       = RoleType::DECODE;
+    auto decode_config = CacheConfigCreator::createBasicConfig(mc, pc, makeDsv4KvCacheConfig(), false, 0);
+    EXPECT_EQ(decode_config.specForGroup(gidForTag(decode_config, "indexer_state"))->block_size_bytes(), 8u * 512u * 4u);
+    EXPECT_EQ(decode_config.specForGroup(gidForTag(decode_config, "csa_state"))->block_size_bytes(), 8u * 2048u * 4u);
+    EXPECT_EQ(decode_config.specForGroup(gidForTag(decode_config, "hca_state"))->block_size_bytes(), 128u * 1024u * 4u);
+    EXPECT_EQ(decode_config.specForGroup(gidForTag(decode_config, "swa_kv"))->block_size_bytes(), 74880u);
+}
+
+TEST(KVCacheTransferPlannerTest, CpCompactSwaUsesCanonicalTailRows) {
+    auto plan = buildCacheStoreBlockPlan(/*total_logical_blocks=*/8,
+                                         /*reuse_block_size=*/0,
+                                         /*use_hybrid=*/true,
+                                         CacheGroupType::SWA,
+                                         /*cp_rank=*/0,
+                                         /*cp_size=*/4);
+    ASSERT_EQ(plan.size(), 2u);
+    EXPECT_EQ(plan[0].key_index, 3);
+    EXPECT_EQ(plan[0].offset_index, 0);
+    EXPECT_EQ(plan[1].key_index, 7);
+    EXPECT_EQ(plan[1].offset_index, 1);
+}
+
+TEST(KVCacheTransferPlannerTest, CpCompactSwaKeepsPartialTailRows) {
+    {
+        auto plan = buildCacheStoreBlockPlan(/*total_logical_blocks=*/1,
+                                             /*reuse_block_size=*/0,
+                                             /*use_hybrid=*/true,
+                                             CacheGroupType::SWA,
+                                             /*cp_rank=*/0,
+                                             /*cp_size=*/2);
+        ASSERT_EQ(plan.size(), 1u);
+        EXPECT_EQ(plan[0].key_index, 0);
+        EXPECT_EQ(plan[0].offset_index, 0);
+    }
+    {
+        auto plan = buildCacheStoreBlockPlan(/*total_logical_blocks=*/11,
+                                             /*reuse_block_size=*/0,
+                                             /*use_hybrid=*/true,
+                                             CacheGroupType::SWA,
+                                             /*cp_rank=*/0,
+                                             /*cp_size=*/2);
+        ASSERT_EQ(plan.size(), 2u);
+        EXPECT_EQ(plan[0].key_index, 9);
+        EXPECT_EQ(plan[0].offset_index, 4);
+        EXPECT_EQ(plan[1].key_index, 10);
+        EXPECT_EQ(plan[1].offset_index, 5);
+    }
+}
+
+// ============================================================
+// CacheConfig output
+// ============================================================
+
+TEST(HybridPoolConfigCreatorTest, CreateCacheConfig) {
+    auto              mc = makeProModelConfig();
+    ParallelismConfig pc;
+    auto              config = CacheConfigCreator::createBasicConfig(mc, pc, makeDsv4KvCacheConfig(), false, 0);
+
+    // 7 groups -> groupNums() > 1 -> HybridTypeKVCacheAllocator path
+    EXPECT_EQ(config.groupNums(), 7);
+    EXPECT_EQ(static_cast<size_t>(config.groupNums()), 7u);
+    EXPECT_EQ(static_cast<size_t>(config.groupNums()), 7u);
+    EXPECT_EQ(static_cast<size_t>(config.groupNums()), 7u);
+    EXPECT_EQ(config.layer_num, 61u);
+    EXPECT_TRUE(config.is_sparse);
+    EXPECT_FALSE(config.use_mla);
+}
+
+TEST(HybridPoolConfigCreatorTest, FlashCacheConfig) {
+    auto              mc = makeFlashModelConfig();
+    ParallelismConfig pc;
+    auto              config = CacheConfigCreator::createBasicConfig(mc, pc, makeDsv4KvCacheConfig(), false, 0);
+
+    EXPECT_EQ(config.groupNums(), 7);
+    EXPECT_EQ(config.layer_num, 43u);
+    EXPECT_EQ(config.layerIdsForGroup(gidForTag(config, "swa_kv")).size(), 43u);
+    EXPECT_EQ(config.layerIdsForGroup(gidForTag(config, "csa_kv")).size(), 21u);
+}
+
+TEST(HybridPoolConfigCreatorTest, HybridAttentionIndependentPoolUsesHybridPoolConfig) {
+    ParallelismConfig pc;
+    auto              config =
+        CacheConfigCreator::createBasicConfig(makeHybridAttentionModelConfig(true), pc, KVCacheConfig{}, false, 0);
+
+    EXPECT_TRUE(config.use_independent_block_pools);
+    ASSERT_EQ(config.groupNums(), 2);
+    const auto group_types = config.groupTypesSnapshot();
+    EXPECT_EQ(std::count(group_types.begin(), group_types.end(), CacheGroupType::FULL), 1);
+    EXPECT_EQ(std::count(group_types.begin(), group_types.end(), CacheGroupType::LINEAR), 1);
+    ASSERT_EQ(static_cast<size_t>(config.groupNums()), 2u);
+    EXPECT_LT(config.specForGroup(gidForTag(config, "full"))->block_size_bytes(),
+              config.specForGroup(gidForTag(config, "linear"))->block_size_bytes());
+    EXPECT_EQ(config.groupBlockNumsSnapshot().size(), 2u);
+    EXPECT_EQ(config.groupTagsSnapshot(), std::vector<std::string>({"linear", "full"}));
+}
+
+TEST(HybridPoolConfigCreatorTest, HybridAttentionIndependentPoolSplitsFullAndSwaSpecs) {
+    auto mc = makeHybridAttentionModelConfig(true);
+    mc.hybrid_attention_config.hybrid_attention_types = {HybridAttentionType::NONE,
+                                                        HybridAttentionType::SLIDING_WINDOW,
+                                                        HybridAttentionType::LINEAR,
+                                                        HybridAttentionType::SLIDING_WINDOW};
+    setHybridAttentionKvCacheSpecs(mc);
+
+    ParallelismConfig pc;
+    auto              config = CacheConfigCreator::createBasicConfig(mc, pc, KVCacheConfig{}, false, 0);
+
+    ASSERT_EQ(config.groupNums(), 3);
+    EXPECT_EQ(config.groupTypesSnapshot(),
+              std::vector<CacheGroupType>({CacheGroupType::FULL, CacheGroupType::SWA, CacheGroupType::LINEAR}));
+    EXPECT_EQ(config.groupTagsSnapshot(), std::vector<std::string>({"full", "swa", "linear"}));
+    ASSERT_EQ(static_cast<size_t>(config.groupNums()), 3u);
+    EXPECT_NE(config.specForGroup(0).get(), config.specForGroup(1).get());
+    EXPECT_EQ(config.layerIdsForGroup(0), std::vector<int>({0}));
+    EXPECT_EQ(config.layerIdsForGroup(1), std::vector<int>({1, 3}));
+    EXPECT_EQ(config.layerIdsForGroup(2), std::vector<int>({2}));
+    EXPECT_EQ(config.layerIdsForGroup(0).size(), 1u);
+    EXPECT_EQ(config.layerIdsForGroup(1).size(), 2u);
+    EXPECT_EQ(config.layerIdsForGroup(2).size(), 1u);
+    EXPECT_EQ(config.groupIdForLayerTag(1, "swa"), 1);
+    EXPECT_EQ(config.groupIdForLayerTag(2, "linear"), 2);
+}
+
+TEST(HybridPoolConfigCreatorTest, HybridAttentionWithoutIndependentPoolKeepsSharedHybridConfig) {
+    ParallelismConfig pc;
+    auto              config =
+        CacheConfigCreator::createBasicConfig(makeHybridAttentionModelConfig(false), pc, KVCacheConfig{}, false, 0);
+
+    EXPECT_FALSE(config.use_independent_block_pools);
+    ASSERT_EQ(config.groupNums(), 2);
+    EXPECT_TRUE(config.groupBlockNumsSnapshot().empty());
+}
+
+TEST(HybridConfigCreatorTest, HybridAttentionTypesMustCoverAllLayers) {
+    auto mc = makeHybridAttentionModelConfig(false);
+    mc.hybrid_attention_config.hybrid_attention_types.pop_back();
+
+    ParallelismConfig pc;
+    EXPECT_THROW((void)CacheConfigCreator::createBasicConfig(mc, pc, KVCacheConfig{}, false, 0),
+                 std::exception);
+}
+
+// ============================================================
+// Generic opaque cache specs
+// ============================================================
+
+TEST(GenericOpaqueCacheSpecTest, KVSpecFromPoolSpec) {
+    CompressedKVCacheSpec spec("csa_kv",
+                               kDsv4Fp8KvEntryBytes,
+                               64,
+                               DataType::TYPE_UINT8,
+                               kDsv4TokensPerBlock,
+                               1,
+                               DSV4_FP8_MLA_BLOCK_ALIGNMENT_BYTES);
+
+    EXPECT_EQ(spec.block_size(), 64u * kDsv4Fp8KvEntryBytes);
+    EXPECT_EQ(spec.natural_block_size_bytes(), 64u * kDsv4Fp8KvEntryBytes * 1u);  // uint8 = 1 byte
+    EXPECT_EQ(spec.block_size_bytes(), 37440u);
+    EXPECT_EQ(spec.tag, "csa_kv");
+    EXPECT_EQ(spec.entry_elems, kDsv4Fp8KvEntryBytes);
+    EXPECT_EQ(spec.entries_per_block, 64u);
+
+    CompressedKVCacheSpec hca_spec("hca_kv",
+                                    kDsv4Fp8KvEntryBytes,
+                                    2,
+                                    DataType::TYPE_UINT8,
+                                    kDsv4TokensPerBlock,
+                                    1,
+                                    DSV4_FP8_MLA_BLOCK_ALIGNMENT_BYTES);
+    EXPECT_EQ(hca_spec.natural_block_size_bytes(), 2u * kDsv4Fp8KvEntryBytes);
+    EXPECT_EQ(hca_spec.block_size_bytes(), 1728u);
+}
+
+TEST(GenericOpaqueCacheSpecTest, CompressedKVSpecReportsGenericKindsAndLayout) {
+    CompressedKVCacheSpec spec("compressed",
+                               kDsv4Fp8KvEntryBytes,
+                               64,
+                               DataType::TYPE_UINT8,
+                               kDsv4TokensPerBlock,
+                               4,
+                               DSV4_FP8_MLA_BLOCK_ALIGNMENT_BYTES);
+
+    EXPECT_EQ(spec.type, KVCacheSpecType::OpaqueKV);
+    EXPECT_EQ(spec.lifecycle, CacheGroupType::FULL);
+    EXPECT_EQ(spec.block_size(), 64u * kDsv4Fp8KvEntryBytes);
+    EXPECT_EQ(spec.natural_block_size_bytes(), 64u * kDsv4Fp8KvEntryBytes);
+    EXPECT_EQ(spec.block_size_bytes(), 37440u);
+    EXPECT_EQ(spec.compression_ratio, 4u);
+    EXPECT_EQ(spec.cpTransferPolicy(), CPTransferPolicy::NONE);
+    EXPECT_FALSE(spec.supportsCpSlice());
+}
+
+TEST(GenericOpaqueCacheSpecTest, FixedStateSpecReportsGenericKindsAndSlicesByEntries) {
+    FixedStateCacheSpec spec("tail_state", 32, 8, DataType::TYPE_FP32, kDsv4TokensPerBlock);
+    char                storage[8 * 32 * 4] = {};
+    BlockInfo           block;
+    block.addr       = storage;
+    block.size_bytes = sizeof(storage);
+
+    auto sliced = spec.sliceBlockForPeer({block}, 4, 2);
+    ASSERT_EQ(sliced.size(), 1u);
+    EXPECT_EQ(spec.type, KVCacheSpecType::OpaqueState);
+    EXPECT_EQ(spec.lifecycle, CacheGroupType::SWA);
+    EXPECT_EQ(spec.cpTransferPolicy(), CPTransferPolicy::INTRA_BLOCK_SLICE);
+    EXPECT_TRUE(spec.supportsCpSlice());
+    EXPECT_EQ(sliced[0].addr, storage + 2 * 2 * 32 * 4);
+    EXPECT_EQ(sliced[0].size_bytes, 2u * 32u * 4u);
+}
+
+TEST(GenericOpaqueCacheSpecTest, FixedStateSpecSlicesOverrideByBytes) {
+    FixedStateCacheSpec spec("tail_bytes",
+                             kDsv4Fp8KvEntryBytes,
+                             kDsv4TokensPerBlock,
+                             DataType::TYPE_UINT8,
+                             kDsv4TokensPerBlock,
+                             74880);
+    char                storage[74880] = {};
+    BlockInfo           block;
+    block.addr       = storage;
+    block.size_bytes = sizeof(storage);
+
+    auto sliced = spec.sliceBlockForPeer({block}, 4, 3);
+    ASSERT_EQ(sliced.size(), 1u);
+    EXPECT_EQ(sliced[0].addr, storage + 3 * (sizeof(storage) / 4));
+    EXPECT_EQ(sliced[0].size_bytes, sizeof(storage) / 4);
+
+    auto cp_sliced = spec.cpSliceDestination({block}, 4, 3);
+    ASSERT_EQ(cp_sliced.size(), 1u);
+    EXPECT_EQ(cp_sliced[0].addr, sliced[0].addr);
+    EXPECT_EQ(cp_sliced[0].size_bytes, sliced[0].size_bytes);
+}
+
+TEST(GenericOpaqueCacheSpecTest, FixedStateSpecSlicesAlignedBlockByPhysicalBytes) {
+    FixedStateCacheSpec spec("aligned_tail",
+                             kDsv4Fp8KvEntryBytes,
+                             132,
+                             DataType::TYPE_UINT8,
+                             kDsv4TokensPerBlock,
+                             0,
+                             DSV4_FP8_MLA_BLOCK_ALIGNMENT_BYTES,
+                             DSV4_SWA_WINDOW_ENTRIES);
+    ASSERT_EQ(spec.natural_block_size_bytes(), 77088u);
+    ASSERT_EQ(spec.block_size_bytes(), 77184u);
+    char      storage[77184] = {};
+    BlockInfo block;
+    block.addr       = storage;
+    block.size_bytes = sizeof(storage);
+
+    auto sliced = spec.sliceBlockForPeer({block}, 2, 1);
+    ASSERT_EQ(sliced.size(), 1u);
+    EXPECT_EQ(sliced[0].addr, storage + 38592);
+    EXPECT_EQ(sliced[0].size_bytes, 38592u);
+}
+
+TEST(GenericOpaqueCacheSpecTest, SWAFp8StateSpecUsesPaddedPhysicalBlockSize) {
+    FixedStateCacheSpec spec("swa_kv",
+                             kDsv4Fp8KvEntryBytes,
+                             kDsv4TokensPerBlock,
+                             DataType::TYPE_UINT8,
+                             kDsv4TokensPerBlock,
+                             0,
+                             DSV4_FP8_MLA_BLOCK_ALIGNMENT_BYTES,
+                             DSV4_SWA_WINDOW_ENTRIES);
+
+    EXPECT_EQ(spec.block_size(), kDsv4TokensPerBlock * kDsv4Fp8KvEntryBytes);
+    EXPECT_EQ(spec.natural_block_size_bytes(), kDsv4TokensPerBlock * kDsv4Fp8KvEntryBytes);
+    EXPECT_EQ(spec.block_size_bytes(), 74880u);
+    EXPECT_EQ(spec.tag, "swa_kv");
+}
+
+TEST(GenericOpaqueCacheSpecTest, StateSpecFloat32) {
+    FixedStateCacheSpec spec("csa_state", 2048, 8, DataType::TYPE_FP32, kDsv4TokensPerBlock);
+
+    EXPECT_EQ(spec.block_size(), 8u * 2048u);
+    EXPECT_EQ(spec.block_size_bytes(), 8u * 2048u * 4u);  // float32 = 4 bytes
+    EXPECT_EQ(spec.tag, "csa_state");
+    EXPECT_EQ(spec.state_dim, 2048u);
+}
+
+TEST(GenericOpaqueCacheSpecTest, IndexerKVSpec) {
+    CompressedKVCacheSpec spec("indexer_kv", 132, 64, DataType::TYPE_UINT8, kDsv4TokensPerBlock);
+
+    EXPECT_EQ(spec.block_size(), 64u * 132u);
+    EXPECT_EQ(spec.block_size_bytes(), 64u * 132u);
+    EXPECT_EQ(spec.tag, "indexer_kv");
+}
+
+TEST(GenericOpaqueCacheSpecTest, HCAStateSpec) {
+    FixedStateCacheSpec spec("hca_state", 1024, 128, DataType::TYPE_FP32, kDsv4TokensPerBlock);
+
+    EXPECT_EQ(spec.block_size_bytes(), 128u * 1024u * 4u);
+    EXPECT_EQ(spec.tag, "hca_state");
+}
+
+// ============================================================
+// Pool 0/1/2 shared properties: same tokens_per_block, same num_blocks
+// ============================================================
+
+TEST(HybridPoolConfigCreatorTest, PagedPoolsShareTokensPerBlock) {
+    // Pro config
+    {
+        ParallelismConfig pc;
+        auto              config =
+            CacheConfigCreator::createBasicConfig(makeProModelConfig(), pc, makeDsv4KvCacheConfig(), false, 0);
+        for (const auto& tag : {"csa_kv", "hca_kv", "indexer_kv", "swa_kv"}) {
+            EXPECT_EQ(config.group_seq_size_per_block[gidForTag(config, tag)], kDsv4TokensPerBlock) << tag;
+        }
+    }
+    // Flash config
+    {
+        ParallelismConfig pc;
+        auto              config =
+            CacheConfigCreator::createBasicConfig(makeFlashModelConfig(), pc, makeDsv4KvCacheConfig(), false, 0);
+        for (const auto& tag : {"csa_kv", "hca_kv", "indexer_kv"}) {
+            EXPECT_EQ(config.group_seq_size_per_block[gidForTag(config, tag)], kDsv4TokensPerBlock) << tag;
+        }
+    }
+}
+
+TEST(HybridPoolConfigCreatorTest, AllPagedPoolsShareBlockNum) {
+    auto              mc = makeProModelConfig();
+    ParallelismConfig pc;
+    auto              config = CacheConfigCreator::createBasicConfig(mc, pc, makeDsv4KvCacheConfig(), false, 0);
+    config.block_num         = 100;
+
+    // Paged groups derive their block count from the global block_num; explicit
+    // independent groups may override it with per-group fixed block counts.
+    EXPECT_EQ(config.groupNums(), 7);
+    for (int i = 0; i < 7; i++) {
+        EXPECT_GT(config.specForGroup(i)->block_size_bytes(), 0u) << "pool " << i;
+    }
+}
+
+TEST(HybridPoolConfigCreatorTest, DSV4StateSwaPoolsFollowGlobalBlocks) {
+    auto              mc = makeProModelConfig();
+    ParallelismConfig pc;
+    RuntimeConfig     runtime_config;
+    KVCacheConfig     kv_cache_config;
+    kv_cache_config.seq_size_per_block                          = 128;
+    kv_cache_config.test_block_num                              = 100;
+    setDsv4ExplicitPoolBlocks(mc, "hca_state", 0);
+    runtime_config.max_generate_batch_size                      = 5;
+    runtime_config.fifo_scheduler_config.max_context_batch_size = 3;
+
+    auto config = CacheConfigCreator::createConfig(mc, pc, runtime_config, kv_cache_config);
+
+    ASSERT_EQ(config.groupBlockNumsSnapshot().size(), static_cast<size_t>(kDsv4PoolNum));
+    for (int gid = 0; gid < kDsv4PoolNum; ++gid) {
+        EXPECT_EQ(config.blockNumForGroup(gid), 100u) << "gid=" << gid;
+    }
+    EXPECT_EQ(config.explicitly_sized_pool_reserve_bytes, 0u);
+}
+
+TEST(HybridPoolConfigCreatorTest, DSV4HcaStatePoolBlocksOverridesOnlyHcaState) {
+    auto              mc = makeProModelConfig();
+    ParallelismConfig pc;
+    RuntimeConfig     runtime_config;
+    KVCacheConfig     kv_cache_config;
+    kv_cache_config.seq_size_per_block                          = 128;
+    kv_cache_config.test_block_num                              = 100;
+    setDsv4ExplicitPoolBlocks(mc, "hca_state", 350);
+    runtime_config.max_generate_batch_size                      = 5;
+    runtime_config.fifo_scheduler_config.max_context_batch_size = 3;
+
+    auto config = CacheConfigCreator::createConfig(mc, pc, runtime_config, kv_cache_config);
+
+    ASSERT_EQ(config.groupBlockNumsSnapshot().size(), static_cast<size_t>(kDsv4PoolNum));
+    const auto hca_state_gid = gidForTag(config, "hca_state");
+    for (size_t gid = 0; gid < static_cast<size_t>(config.groupNums()); ++gid) {
+        const uint32_t expected = gid == hca_state_gid ? 350u : 100u;
+        EXPECT_EQ(config.blockNumForGroup(gid), expected) << "gid=" << gid;
+    }
+
+    const size_t expected_reserve = 350u * config.blockSizeBytesForGroup(hca_state_gid);
+    EXPECT_EQ(config.explicitly_sized_pool_reserve_bytes, expected_reserve);
+    ASSERT_EQ(config.groupPoliciesSnapshot().size(), static_cast<size_t>(kDsv4PoolNum));
+    EXPECT_EQ(config.policyForGroup(hca_state_gid).explicit_block_num, 350u);
+    for (size_t gid = 0; gid < config.groupPoliciesSnapshot().size(); ++gid) {
+        if (gid != hca_state_gid) {
+            EXPECT_EQ(config.policyForGroup(gid).explicit_block_num, 0u) << "gid=" << gid;
+        }
+    }
+}
+
+TEST(CacheConfigTest, DSV4KernelSeqSizeAllowsDecoupledPhysicalBlocks) {
+    auto              mc = makeProModelConfig();
+    ParallelismConfig pc;
+    RuntimeConfig     runtime_config;
+    runtime_config.max_generate_batch_size                      = 2;
+    runtime_config.fifo_scheduler_config.max_context_batch_size = 1;
+
+    auto create_config = [&](int seq_size_per_block, int kernel_seq_size_per_block) {
+        KVCacheConfig kv_cache_config;
+        kv_cache_config.seq_size_per_block        = seq_size_per_block;
+        kv_cache_config.kernel_seq_size_per_block = kernel_seq_size_per_block;
+        kv_cache_config.test_block_num            = 100;
+        return CacheConfigCreator::createConfig(mc, pc, runtime_config, kv_cache_config);
+    };
+
+    auto old_valid = create_config(128, 128);
+    EXPECT_EQ(old_valid.seq_size_per_block, 128u);
+    EXPECT_EQ(old_valid.kernel_seq_size_per_block, 128u);
+    EXPECT_EQ(old_valid.kernelBlocksPerKvBlock(), 1u);
+
+    auto decoupled = create_config(16384, 128);
+    EXPECT_EQ(decoupled.seq_size_per_block, 16384u);
+    EXPECT_EQ(decoupled.kernel_seq_size_per_block, 128u);
+    EXPECT_EQ(decoupled.kernelBlocksPerKvBlock(), 128u);
+}
+
+TEST(CacheConfigTest, DSV4KernelSeqSizeRejectsInvalidPhysicalKernelShape) {
+    auto              mc = makeProModelConfig();
+    ParallelismConfig pc;
+    RuntimeConfig     runtime_config;
+    runtime_config.max_generate_batch_size                      = 2;
+    runtime_config.fifo_scheduler_config.max_context_batch_size = 1;
+
+    auto create_config = [&](int seq_size_per_block, int kernel_seq_size_per_block) {
+        KVCacheConfig kv_cache_config;
+        kv_cache_config.seq_size_per_block        = seq_size_per_block;
+        kv_cache_config.kernel_seq_size_per_block = kernel_seq_size_per_block;
+        kv_cache_config.test_block_num            = 100;
+        return CacheConfigCreator::createConfig(mc, pc, runtime_config, kv_cache_config);
+    };
+
+    EXPECT_THROW((void)create_config(16384, 64), std::exception);
+    EXPECT_THROW((void)create_config(16384, 384), std::exception);
+}
+
+TEST(HybridPoolConfigCreatorTest, DSV4HcaStatePoolBlocksIndependentOfMaxConcurrency) {
+    for (uint32_t max_concurrency : {1u, 2u, 8u}) {
+        auto              mc = makeProModelConfig();
+        ParallelismConfig pc;
+        RuntimeConfig     runtime_config;
+        KVCacheConfig     kv_cache_config;
+        kv_cache_config.seq_size_per_block                          = 128;
+        kv_cache_config.test_block_num                              = 100;
+        setDsv4ExplicitPoolBlocks(mc, "hca_state", 256);
+        runtime_config.max_generate_batch_size                      = max_concurrency;
+        runtime_config.fifo_scheduler_config.max_context_batch_size = 1;
+
+        auto config = CacheConfigCreator::createConfig(mc, pc, runtime_config, kv_cache_config);
+
+        ASSERT_EQ(config.groupBlockNumsSnapshot().size(), static_cast<size_t>(kDsv4PoolNum));
+        const auto hca_state_gid = gidForTag(config, "hca_state");
+        for (int gid = 0; gid < kDsv4PoolNum; ++gid) {
+            const uint32_t expected = static_cast<size_t>(gid) == hca_state_gid ? 256u : 100u;
+            EXPECT_EQ(config.blockNumForGroup(gid), expected)
+                << "gid=" << gid << " max_concurrency=" << max_concurrency;
+        }
+    }
+}
+
+TEST(HybridPoolConfigCreatorTest, DSV4HcaStatePoolBlocksCanBeOverriddenByConfig) {
+    auto              mc = makeProModelConfig();
+    ParallelismConfig pc;
+    RuntimeConfig     runtime_config;
+    KVCacheConfig     kv_cache_config;
+    kv_cache_config.seq_size_per_block                          = 128;
+    kv_cache_config.test_block_num                              = 100;
+    setDsv4ExplicitPoolBlocks(mc, "hca_state", 6);
+    runtime_config.max_generate_batch_size                      = 2;
+    runtime_config.fifo_scheduler_config.max_context_batch_size = 1;
+
+    auto config = CacheConfigCreator::createConfig(mc, pc, runtime_config, kv_cache_config);
+
+    ASSERT_EQ(config.groupBlockNumsSnapshot().size(), static_cast<size_t>(kDsv4PoolNum));
+    const auto hca_state_gid = gidForTag(config, "hca_state");
+    for (int gid = 0; gid < kDsv4PoolNum; ++gid) {
+        const uint32_t expected = static_cast<size_t>(gid) == hca_state_gid ? 6u : 100u;
+        EXPECT_EQ(config.blockNumForGroup(gid), expected) << "gid=" << gid;
+    }
+}
+
+TEST(CacheConfigTest, ModelSpecCloneKeepsExistingConfigStable) {
+    ModelConfig model_config;
+    model_config.num_layers                   = 2;
+    model_config.attn_config.kv_head_num      = 4;
+    model_config.attn_config.size_per_head    = 16;
+    model_config.attn_config.tokens_per_block = 8;
+    setDefaultKvCacheSpec(model_config);
+
+    ParallelismConfig pc_tp1;
+    pc_tp1.tp_size = 1;
+    auto config_tp1 = CacheConfigCreator::createBasicConfig(model_config, pc_tp1, KVCacheConfig{}, false, 0);
+    ASSERT_EQ(static_cast<size_t>(config_tp1.groupNums()), 1u);
+    EXPECT_EQ(config_tp1.specForGroup(0)->local_head_num_kv, 4u);
+
+    ParallelismConfig pc_tp2;
+    pc_tp2.tp_size = 2;
+    auto config_tp2 = CacheConfigCreator::createBasicConfig(model_config, pc_tp2, KVCacheConfig{}, false, 0);
+    ASSERT_EQ(static_cast<size_t>(config_tp2.groupNums()), 1u);
+    EXPECT_EQ(config_tp2.specForGroup(0)->local_head_num_kv, 2u);
+
+    EXPECT_EQ(config_tp1.specForGroup(0)->local_head_num_kv, 4u);
+    EXPECT_NE(config_tp1.specForGroup(0).get(), config_tp2.specForGroup(0).get());
+}
+
+TEST(CacheConfigTest, SpecBuilderDerivesHybridPoolRuntimeFieldsFromContext) {
+    SpecBuildContext ctx;
+    ctx.dtype                   = DataType::TYPE_BF16;
+    ctx.seq_size_per_block      = 128;
+    ctx.attn_tp_size            = 1;
+    ctx.kernel_tokens_per_block = 128;
+    ctx.gen_num_per_cycle       = 3;
+    ctx.cp_size                 = 2;
+    ctx.cp_prefill_sliced       = true;
+
+    KVCacheSpecDesc compressed_desc;
+    compressed_desc.tag                                      = "compressed";
+    compressed_desc.cache_type                               = CacheType::COMPRESSED_KV;
+    compressed_desc.entry_elems                              = 16;
+    compressed_desc.compression_ratio                        = 4;
+    compressed_desc.store_dtype                              = DataType::TYPE_UINT8;
+    compressed_desc.extra.derive_entries_from_kernel_block   = true;
+    compressed_desc.extra.use_fixed_region_cp_tokens         = true;
+
+    auto compressed = std::dynamic_pointer_cast<CompressedKVCacheSpec>(SpecBuilder::build(compressed_desc, ctx));
+    ASSERT_NE(compressed, nullptr);
+    EXPECT_EQ(compressed->entries_per_block, 32u);
+    EXPECT_EQ(compressed->seq_size_per_block, 256u);
+    EXPECT_EQ(compressed->dtype, DataType::TYPE_BF16);
+
+    KVCacheSpecDesc state_desc;
+    state_desc.tag                                          = "state";
+    state_desc.cache_type                                   = CacheType::FIXED_STATE;
+    state_desc.entry_elems                                  = 32;
+    state_desc.store_dtype                                  = DataType::TYPE_FP32;
+    state_desc.block_size_bytes_alignment                   = 64;
+    state_desc.extra.state_ring_compression_ratio           = 4;
+    state_desc.extra.state_ring_overlap                     = 1;
+    state_desc.extra.state_ring_add_gen_num_per_cycle       = true;
+    state_desc.extra.cp_align_entries                       = true;
+    state_desc.extra.cp_slice_entries                       = true;
+    state_desc.extra.cp_prefill_slice_block_bytes           = true;
+    state_desc.extra.use_fixed_region_cp_tokens             = true;
+
+    auto prefill_state = std::dynamic_pointer_cast<FixedStateCacheSpec>(SpecBuilder::build(state_desc, ctx));
+    ASSERT_NE(prefill_state, nullptr);
+    EXPECT_EQ(prefill_state->entries_per_block, 6u);
+    EXPECT_EQ(prefill_state->block_size_bytes_override, 384u);
+    EXPECT_EQ(prefill_state->seq_size_per_block, 256u);
+
+    ctx.cp_prefill_sliced = false;
+    auto decode_state = std::dynamic_pointer_cast<FixedStateCacheSpec>(SpecBuilder::build(state_desc, ctx));
+    ASSERT_NE(decode_state, nullptr);
+    EXPECT_EQ(decode_state->entries_per_block, 12u);
+    EXPECT_EQ(decode_state->block_size_bytes_override, 0u);
+    EXPECT_EQ(decode_state->seq_size_per_block, 256u);
+}
+
+TEST(CacheConfigTest, FinalizeBlockNumsIsNoopForSingleAndSharedHybridConfig) {
+    RuntimeConfig runtime_config;
+    runtime_config.max_generate_batch_size                      = 8;
+    runtime_config.fifo_scheduler_config.max_context_batch_size = 4;
+
+    ParallelismConfig pc;
+    ModelConfig       single_model_config;
+    single_model_config.num_layers                   = 1;
+    single_model_config.attn_config.kv_head_num      = 1;
+    single_model_config.attn_config.size_per_head    = 1;
+    single_model_config.attn_config.tokens_per_block = 1;
+    setDefaultKvCacheSpec(single_model_config);
+    auto single_config = CacheConfigCreator::createBasicConfig(single_model_config, pc, KVCacheConfig{}, false, 0);
+    single_config.finalizeBlockNums(123, runtime_config);
+    EXPECT_TRUE(single_config.groupBlockNumsSnapshot().empty());
+    EXPECT_EQ(single_config.explicitly_sized_pool_reserve_bytes, 0u);
+
+    auto hybrid_config =
+        CacheConfigCreator::createBasicConfig(makeHybridAttentionModelConfig(false), pc, KVCacheConfig{}, false, 0);
+    hybrid_config.finalizeBlockNums(123, runtime_config);
+    EXPECT_FALSE(hybrid_config.use_independent_block_pools);
+    EXPECT_TRUE(hybrid_config.groupBlockNumsSnapshot().empty());
+    EXPECT_EQ(hybrid_config.explicitly_sized_pool_reserve_bytes, 0u);
+}
+
+TEST(CacheConfigTest, FinalizeBlockNumsAppliesToIndependentPools) {
+    RuntimeConfig runtime_config;
+    runtime_config.max_generate_batch_size                      = 5;
+    runtime_config.fifo_scheduler_config.max_context_batch_size = 3;
+
+    ParallelismConfig pc;
+    auto config = CacheConfigCreator::createBasicConfig(makeProModelConfig(), pc, makeDsv4KvCacheConfig(), false, 0);
+    config.finalizeBlockNums(100, runtime_config);
+
+    ASSERT_EQ(config.groupBlockNumsSnapshot().size(), static_cast<size_t>(kDsv4PoolNum));
+    const auto hca_state_gid = gidForTag(config, "hca_state");
+    for (int gid = 0; gid < kDsv4PoolNum; ++gid) {
+        const uint32_t expected = static_cast<size_t>(gid) == hca_state_gid ? 256u : 100u;
+        EXPECT_EQ(config.blockNumForGroup(gid), expected) << "gid=" << gid;
+    }
+    EXPECT_EQ(config.explicitly_sized_pool_reserve_bytes, 256u * config.blockSizeBytesForGroup(hca_state_gid));
+}
+
+TEST(CacheConfigTest, HcaStateReserveDeductedFromPagedBudget) {
+    auto              mc = makeProModelConfig();
+    ParallelismConfig pc;
+    RuntimeConfig     runtime_config;
+    runtime_config.max_generate_batch_size                      = 4;
+    runtime_config.fifo_scheduler_config.max_context_batch_size = 2;
+
+    const uint32_t small_hca_state_pool = 32;
+    const uint32_t large_hca_state_pool = 256;
+
+    KVCacheConfig kv_cache_config_with;
+    kv_cache_config_with.seq_size_per_block         = 128;
+    kv_cache_config_with.kv_cache_mem_mb            = 65536;
+    setDsv4ExplicitPoolBlocks(mc, "hca_state", small_hca_state_pool);
+    auto config_with = CacheConfigCreator::createConfig(mc, pc, runtime_config, kv_cache_config_with);
+
+    KVCacheConfig kv_cache_config_without;
+    kv_cache_config_without.seq_size_per_block         = 128;
+    kv_cache_config_without.kv_cache_mem_mb            = 65536;
+    setDsv4ExplicitPoolBlocks(mc, "hca_state", large_hca_state_pool);
+    auto config_without = CacheConfigCreator::createConfig(mc, pc, runtime_config, kv_cache_config_without);
+
+    // More HCA_STATE blocks reserve more HBM and leave fewer blocks for the global pools.
+    EXPECT_GT(config_with.block_num, config_without.block_num);
+    EXPECT_EQ(config_with.blockNumForGroup(gidForTag(config_with, "hca_kv")),
+              static_cast<uint32_t>(config_with.block_num));
+    EXPECT_EQ(config_without.blockNumForGroup(gidForTag(config_without, "hca_kv")),
+              static_cast<uint32_t>(config_without.block_num));
+    EXPECT_EQ(config_with.blockNumForGroup(gidForTag(config_with, "hca_state")), small_hca_state_pool);
+    EXPECT_EQ(config_without.blockNumForGroup(gidForTag(config_without, "hca_state")), large_hca_state_pool);
+    const size_t expected_reserve =
+        static_cast<size_t>(small_hca_state_pool) * config_with.blockSizeBytesForGroup(gidForTag(config_with, "hca_state"));
+    EXPECT_EQ(config_with.explicitly_sized_pool_reserve_bytes, expected_reserve);
+}
+
+TEST(CacheConfigTest, DSV4ExplicitHcaStatePoolBlocksIgnoreLinearStep) {
+    RuntimeConfig runtime_config;
+    runtime_config.max_generate_batch_size                      = 4;
+    runtime_config.fifo_scheduler_config.max_context_batch_size = 2;
+
+    ParallelismConfig pc;
+    KVCacheConfig     kv_cache_config = makeDsv4KvCacheConfig();
+    auto config = CacheConfigCreator::createBasicConfig(makeProModelConfig(), pc, kv_cache_config, false, 0);
+    config.linear_step = 4;
+    config.finalizeBlockNums(100, runtime_config);
+
+    // FULL groups: unaffected by step, get global_block_num
+    const auto hca_state_gid = gidForTag(config, "hca_state");
+    for (size_t gid = 0; gid < static_cast<size_t>(config.groupNums()); ++gid) {
+        const uint32_t expected = gid == hca_state_gid ? 256u : 100u;
+        EXPECT_EQ(config.blockNumForGroup(gid), expected) << "gid=" << gid;
+    }
+    const size_t expected_reserve = 256u * config.blockSizeBytesForGroup(hca_state_gid);
+    EXPECT_EQ(config.explicitly_sized_pool_reserve_bytes, expected_reserve);
+}
+
+TEST(CacheConfigTest, DSV4StateSwaPoolsWithoutExplicitBlocksUseGlobalBlocks) {
+    RuntimeConfig runtime_config;
+    runtime_config.max_generate_batch_size                      = 4;
+    runtime_config.fifo_scheduler_config.max_context_batch_size = 2;
+
+    ParallelismConfig pc;
+    KVCacheConfig     kv_cache_config;
+    kv_cache_config.seq_size_per_block = 128;
+    kv_cache_config.test_block_num     = 100;
+    kv_cache_config.linear_step        = 4;
+    auto mc = makeProModelConfig();
+    setDsv4ExplicitPoolBlocks(mc, "hca_state", 0);
+
+    auto config = CacheConfigCreator::createConfig(mc, pc, runtime_config, kv_cache_config);
+
+    ASSERT_EQ(config.groupBlockNumsSnapshot().size(), static_cast<size_t>(kDsv4PoolNum));
+    for (int gid = 0; gid < kDsv4PoolNum; ++gid) {
+        EXPECT_EQ(config.blockNumForGroup(gid), 100u) << "gid=" << gid;
+    }
+    EXPECT_EQ(config.explicitly_sized_pool_reserve_bytes, 0u);
+}
+
+TEST(CacheConfigTest, DSV4MtpKeepsProposeLayerInSwaPool) {
+    auto score_model_config                         = makeFlashModelConfig();
+    auto propose_model_config                       = makeFlashMtpModelConfig();
+    score_model_config.attn_config.kv_cache_dtype   = KvCacheDataType::FP8;
+    propose_model_config.attn_config.kv_cache_dtype = KvCacheDataType::FP8;
+
+    ParallelismConfig parallelism_config;
+    RuntimeConfig     runtime_config;
+    runtime_config.max_generate_batch_size                      = 2;
+    runtime_config.fifo_scheduler_config.max_context_batch_size = 1;
+
+    KVCacheConfig kv_cache_config;
+    kv_cache_config.seq_size_per_block        = 16384;
+    kv_cache_config.kernel_seq_size_per_block = 128;
+    kv_cache_config.test_block_num            = 100;
+
+    SpeculativeExecutionConfig sp_config;
+    sp_config.type              = SP_TYPE_MTP;
+    sp_config.gen_num_per_cycle = 2;
+
+    auto config = CacheConfigCreator::createSpConfig(score_model_config,
+                                                     propose_model_config,
+                                                     parallelism_config,
+                                                     runtime_config,
+                                                     kv_cache_config,
+                                                     sp_config,
+                                                     std::nullopt,
+                                                     true,
+                                                     false);
+
+    ASSERT_EQ(config.layer_num, 43u);
+    ASSERT_EQ(config.layer_all_num, 45u);
+    ASSERT_EQ(config.mtp_sub_configs.size(), 2u);
+    ASSERT_NE(config.mtp_sub_configs[0], nullptr);
+    ASSERT_NE(config.mtp_sub_configs[1], nullptr);
+
+    const auto swa_gid = gidForTag(config, "swa_kv");
+    EXPECT_EQ(config.layerGroupIdsSnapshot()[43], std::vector<int>({static_cast<int>(swa_gid)}));
+    EXPECT_EQ(config.layerGroupIdsSnapshot()[44], std::vector<int>({static_cast<int>(swa_gid)}));
+    EXPECT_EQ(config.groupIdForLayerTag(43, "swa_kv"), static_cast<int>(swa_gid));
+    EXPECT_EQ(config.groupIdForLayerTag(44, "swa_kv"), static_cast<int>(swa_gid));
+
+    EXPECT_EQ(config.layerIdsForGroup(swa_gid).size(), 45u);
+
+    // MTP sub-configs preserve the target/global group namespace.  Current
+    // MTP execution passes block tables by gid without a draft-local remap, so
+    // unused target groups stay as empty placeholders and the real SWA layer
+    // keeps the same gid as the target config.
+    EXPECT_EQ(config.mtp_sub_configs[0]->groupTagsSnapshot(), config.groupTagsSnapshot());
+    EXPECT_EQ(config.mtp_sub_configs[1]->groupTagsSnapshot(), config.groupTagsSnapshot());
+    EXPECT_EQ(config.mtp_sub_configs[0]->groupIdForLayerTag(0, "swa_kv"), static_cast<int>(swa_gid));
+    EXPECT_EQ(config.mtp_sub_configs[1]->groupIdForLayerTag(0, "swa_kv"), static_cast<int>(swa_gid));
+    EXPECT_EQ(config.mtp_sub_configs[0]->layerIdsForGroup(swa_gid), std::vector<int>({43}));
+    EXPECT_EQ(config.mtp_sub_configs[1]->layerIdsForGroup(swa_gid), std::vector<int>({44}));
+    for (size_t gid = 0; gid < static_cast<size_t>(config.groupNums()); ++gid) {
+        if (gid == swa_gid) {
+            continue;
+        }
+        EXPECT_TRUE(config.mtp_sub_configs[0]->layerIdsForGroup(gid).empty()) << config.tagForGroup(gid);
+        EXPECT_TRUE(config.mtp_sub_configs[1]->layerIdsForGroup(gid).empty()) << config.tagForGroup(gid);
+    }
+    EXPECT_EQ(config.seq_size_per_block, 16384u);
+    EXPECT_EQ(config.kernel_seq_size_per_block, 128u);
+    EXPECT_EQ(config.kernelBlocksPerKvBlock(), 128u);
+    EXPECT_EQ(config.mtp_sub_configs[0]->seq_size_per_block, 16384u);
+    EXPECT_EQ(config.mtp_sub_configs[0]->kernel_seq_size_per_block, 128u);
+
+    EXPECT_EQ(config.explicitly_sized_pool_reserve_bytes,
+              256u * config.blockSizeBytesForGroup(gidForTag(config, "hca_state")));
+}
+
+TEST(HybridPoolConfigCreatorTest, MtpGenNum2RingEntriesMatch) {
+    // gen_num_per_cycle=2 -> CSA/INDEXER R=10, HCA R=130, SWA R=130.
+    // Formula: R = ceil_even((1 + overlap) * ratio + gen_num_per_cycle).
+    // SWA_KV is sized like the HCA state ring (window 128, overlap 0).
+    auto              mc = makeFlashModelConfig();
+    ParallelismConfig pc;
+    auto              config =
+        CacheConfigCreator::createBasicConfig(mc, pc, makeDsv4KvCacheConfig(), false, /*gen_num_per_cycle=*/2);
+
+    ASSERT_EQ(static_cast<size_t>(config.groupNums()), 7u);
+    // Pool 3: INDEXER_STATE (ratio=4, overlap=1) → R=10
+    auto* indexer_state = dynamic_cast<FixedStateCacheSpec*>(config.specForGroup(gidForTag(config, "indexer_state")).get());
+    ASSERT_NE(indexer_state, nullptr);
+    EXPECT_EQ(indexer_state->entries_per_block, 10u);
+    // Pool 4: CSA_STATE (ratio=4, overlap=1) → R=10
+    auto* csa_state = dynamic_cast<FixedStateCacheSpec*>(config.specForGroup(gidForTag(config, "csa_state")).get());
+    ASSERT_NE(csa_state, nullptr);
+    EXPECT_EQ(csa_state->entries_per_block, 10u);
+    // Pool 5: HCA_STATE (ratio=128, overlap=0) → R=130
+    auto* hca_state = dynamic_cast<FixedStateCacheSpec*>(config.specForGroup(gidForTag(config, "hca_state")).get());
+    ASSERT_NE(hca_state, nullptr);
+    EXPECT_EQ(hca_state->entries_per_block, 130u);
+    // Pool 6: SWA_KV (window=128, overlap=0) → R=130, same as HCA_STATE
+    auto* swa_kv = dynamic_cast<FixedStateCacheSpec*>(config.specForGroup(gidForTag(config, "swa_kv")).get());
+    ASSERT_NE(swa_kv, nullptr);
+    EXPECT_EQ(swa_kv->tag, "swa_kv");
+    EXPECT_EQ(swa_kv->entries_per_block, 130u);
+}
+
+TEST(HybridPoolConfigCreatorTest, PrefillCp8MtpGenNum2PadsStateRingBeforeSlicing) {
+    auto              mc = makeFlashModelConfig();
+    ParallelismConfig pc;
+    pc.role_type                          = RoleType::PREFILL;
+    pc.tp_size                            = 8;
+    pc.prefill_cp_config.kv_cache_sharded = true;
+
+    auto config = CacheConfigCreator::createBasicConfig(mc, pc, makeDsv4KvCacheConfig(), false, 2);
+
+    ASSERT_EQ(static_cast<size_t>(config.groupNums()), 7u);
+    auto* indexer_state = dynamic_cast<FixedStateCacheSpec*>(config.specForGroup(gidForTag(config, "indexer_state")).get());
+    auto* csa_state     = dynamic_cast<FixedStateCacheSpec*>(config.specForGroup(gidForTag(config, "csa_state")).get());
+    auto* hca_state     = dynamic_cast<FixedStateCacheSpec*>(config.specForGroup(gidForTag(config, "hca_state")).get());
+    auto* swa_kv        = dynamic_cast<FixedStateCacheSpec*>(config.specForGroup(gidForTag(config, "swa_kv")).get());
+    ASSERT_NE(indexer_state, nullptr);
+    ASSERT_NE(csa_state, nullptr);
+    ASSERT_NE(hca_state, nullptr);
+    ASSERT_NE(swa_kv, nullptr);
+
+    // gen_num_per_cycle=2 gives raw INDEXER/CSA R=10, HCA/SWA R=130.
+    // Fixed state pools are CP-sliced by entries; SWA_KV keeps full logical
+    // entries and slices its packed bytes instead.
+    EXPECT_EQ(indexer_state->entries_per_block, 2u);
+    EXPECT_EQ(csa_state->entries_per_block, 2u);
+    EXPECT_EQ(hca_state->entries_per_block, 17u);
+    EXPECT_EQ(swa_kv->entries_per_block, 136u);
+}
+
+TEST(HybridPoolConfigCreatorTest, DecodePrefillCp8MtpGenNum2ExpandsFixedAndSwaSlices) {
+    constexpr uint32_t cp_size = 8;
+    auto               mc      = makeFlashModelConfig();
+
+    ParallelismConfig prefill_pc;
+    prefill_pc.role_type                          = RoleType::PREFILL;
+    prefill_pc.tp_size                            = cp_size;
+    prefill_pc.prefill_cp_config.kv_cache_sharded = true;
+
+    ParallelismConfig decode_pc;
+    decode_pc.role_type                          = RoleType::DECODE;
+    decode_pc.tp_size                            = 1;
+    decode_pc.dp_size                            = cp_size;
+    decode_pc.world_size                         = cp_size;
+    decode_pc.prefill_cp_config.method           = CPRotateMethod::PREFILL_CP;
+    decode_pc.prefill_cp_config.kv_cache_sharded = true;
+    decode_pc.prefill_cp_config.prefill_cp_size  = cp_size;
+
+    auto prefill_config = CacheConfigCreator::createBasicConfig(mc, prefill_pc, makeDsv4KvCacheConfig(), false, 2);
+    auto decode_config  = CacheConfigCreator::createBasicConfig(mc, decode_pc, makeDsv4KvCacheConfig(), false, 2);
+
+    ASSERT_EQ(static_cast<size_t>(prefill_config.groupNums()), 7u);
+    ASSERT_EQ(static_cast<size_t>(decode_config.groupNums()), 7u);
+
+    for (const auto& tag : {"indexer_state", "csa_state", "hca_state"}) {
+        const auto prefill_gid = gidForTag(prefill_config, tag);
+        const auto decode_gid  = gidForTag(decode_config, tag);
+        auto* prefill_spec = dynamic_cast<FixedStateCacheSpec*>(prefill_config.specForGroup(prefill_gid).get());
+        auto* decode_spec  = dynamic_cast<FixedStateCacheSpec*>(decode_config.specForGroup(decode_gid).get());
+        ASSERT_NE(prefill_spec, nullptr) << tag;
+        ASSERT_NE(decode_spec, nullptr) << tag;
+        EXPECT_EQ(decode_spec->tag, prefill_spec->tag) << tag;
+        const auto expected_entries = prefill_spec->entries_per_block * cp_size;
+        EXPECT_EQ(decode_spec->entries_per_block, expected_entries) << tag;
+    }
+    auto* prefill_swa = dynamic_cast<FixedStateCacheSpec*>(
+        prefill_config.specForGroup(gidForTag(prefill_config, "swa_kv")).get());
+    auto* decode_swa = dynamic_cast<FixedStateCacheSpec*>(
+        decode_config.specForGroup(gidForTag(decode_config, "swa_kv")).get());
+    ASSERT_NE(prefill_swa, nullptr);
+    ASSERT_NE(decode_swa, nullptr);
+    EXPECT_EQ(prefill_swa->entries_per_block, 136u);
+    EXPECT_EQ(decode_swa->entries_per_block, prefill_swa->entries_per_block);
+
+    auto* indexer_state = dynamic_cast<FixedStateCacheSpec*>(decode_config.specForGroup(gidForTag(decode_config, "indexer_state")).get());
+    auto* csa_state     = dynamic_cast<FixedStateCacheSpec*>(decode_config.specForGroup(gidForTag(decode_config, "csa_state")).get());
+    auto* hca_state     = dynamic_cast<FixedStateCacheSpec*>(decode_config.specForGroup(gidForTag(decode_config, "hca_state")).get());
+    auto* swa_kv        = dynamic_cast<FixedStateCacheSpec*>(decode_config.specForGroup(gidForTag(decode_config, "swa_kv")).get());
+    ASSERT_NE(indexer_state, nullptr);
+    ASSERT_NE(csa_state, nullptr);
+    ASSERT_NE(hca_state, nullptr);
+    ASSERT_NE(swa_kv, nullptr);
+
+    EXPECT_EQ(indexer_state->entries_per_block, 16u);
+    EXPECT_EQ(csa_state->entries_per_block, 16u);
+    EXPECT_EQ(hca_state->entries_per_block, 136u);
+    EXPECT_EQ(swa_kv->entries_per_block, 136u);
+    for (const auto& tag : {"indexer_state", "csa_state", "hca_state", "swa_kv"}) {
+        const auto prefill_gid = gidForTag(prefill_config, tag);
+        const auto decode_gid  = gidForTag(decode_config, tag);
+        EXPECT_EQ(prefill_config.group_seq_size_per_block[prefill_gid], kDsv4TokensPerBlock * cp_size) << tag;
+        EXPECT_EQ(decode_config.group_seq_size_per_block[decode_gid], kDsv4TokensPerBlock * cp_size) << tag;
+    }
+}
+
+TEST(HybridPoolConfigCreatorTest, DecodeExplicitPrefillCpSizeHandlesDp16) {
+    constexpr uint32_t cp_size = 8;
+    auto               mc      = makeFlashModelConfig();
+
+    ParallelismConfig prefill_pc;
+    prefill_pc.role_type                          = RoleType::PREFILL;
+    prefill_pc.tp_size                            = cp_size;
+    prefill_pc.prefill_cp_config.kv_cache_sharded = true;
+
+    ParallelismConfig decode_pc;
+    decode_pc.role_type                          = RoleType::DECODE;
+    decode_pc.tp_size                            = 1;
+    decode_pc.dp_size                            = 16;
+    decode_pc.world_size                         = 16;
+    decode_pc.prefill_cp_config.method           = CPRotateMethod::PREFILL_CP;
+    decode_pc.prefill_cp_config.kv_cache_sharded = true;
+    decode_pc.prefill_cp_config.prefill_cp_size  = cp_size;
+
+    auto prefill_config = CacheConfigCreator::createBasicConfig(mc, prefill_pc, makeDsv4KvCacheConfig(), false, 2);
+    auto decode_config  = CacheConfigCreator::createBasicConfig(mc, decode_pc, makeDsv4KvCacheConfig(), false, 2);
+
+    for (const auto& tag : {"indexer_state", "csa_state", "hca_state"}) {
+        const auto prefill_gid = gidForTag(prefill_config, tag);
+        const auto decode_gid  = gidForTag(decode_config, tag);
+        auto* prefill_spec = dynamic_cast<FixedStateCacheSpec*>(prefill_config.specForGroup(prefill_gid).get());
+        auto* decode_spec  = dynamic_cast<FixedStateCacheSpec*>(decode_config.specForGroup(decode_gid).get());
+        ASSERT_NE(prefill_spec, nullptr) << tag;
+        ASSERT_NE(decode_spec, nullptr) << tag;
+        const auto expected_entries = prefill_spec->entries_per_block * cp_size;
+        EXPECT_EQ(decode_spec->entries_per_block, expected_entries) << tag;
+        EXPECT_EQ(prefill_config.group_seq_size_per_block[prefill_gid], kDsv4TokensPerBlock * cp_size) << tag;
+        EXPECT_EQ(decode_config.group_seq_size_per_block[decode_gid], kDsv4TokensPerBlock * cp_size) << tag;
+    }
+    auto* prefill_swa = dynamic_cast<FixedStateCacheSpec*>(
+        prefill_config.specForGroup(gidForTag(prefill_config, "swa_kv")).get());
+    auto* decode_swa = dynamic_cast<FixedStateCacheSpec*>(
+        decode_config.specForGroup(gidForTag(decode_config, "swa_kv")).get());
+    ASSERT_NE(prefill_swa, nullptr);
+    ASSERT_NE(decode_swa, nullptr);
+    EXPECT_EQ(prefill_swa->entries_per_block, 136u);
+    EXPECT_EQ(decode_swa->entries_per_block, prefill_swa->entries_per_block);
+    EXPECT_EQ(prefill_config.group_seq_size_per_block[gidForTag(prefill_config, "swa_kv")],
+              kDsv4TokensPerBlock * cp_size);
+    EXPECT_EQ(decode_config.group_seq_size_per_block[gidForTag(decode_config, "swa_kv")],
+              kDsv4TokensPerBlock * cp_size);
+}
+
+TEST(CacheConfigTest, DSV4NonMtpSpConfigDoesNotInflateRing) {
+    // SP_TYPE_NONE with default gen_num_per_cycle=1 must NOT inflate state ring.
+    // Non-MTP DSV4 ring: R = ceil_even((1+overlap)*ratio + 0) = 8 for CSA.
+    auto              mc = makeFlashModelConfig();
+    ParallelismConfig pc;
+    RuntimeConfig     rc;
+    rc.max_generate_batch_size                      = 2;
+    rc.fifo_scheduler_config.max_context_batch_size = 1;
+    KVCacheConfig kvc;
+    kvc.seq_size_per_block        = 128;
+    kvc.kernel_seq_size_per_block = 128;
+    kvc.test_block_num            = 50;
+    SpeculativeExecutionConfig sp_none;  // type=SP_TYPE_NONE, gen_num_per_cycle=1
+    auto config = CacheConfigCreator::createConfig(mc, pc, rc, kvc, std::nullopt, std::make_optional(sp_none));
+    ASSERT_EQ(static_cast<size_t>(config.groupNums()), 7u);
+    // CSA_STATE (pool 4): ratio=4, overlap=1, gen_num=0 → R=8
+    auto* csa = dynamic_cast<FixedStateCacheSpec*>(config.specForGroup(gidForTag(config, "csa_state")).get());
+    ASSERT_NE(csa, nullptr);
+    EXPECT_EQ(csa->entries_per_block, 8u) << "SP_TYPE_NONE should not inflate ring";
+}
+
+TEST(HybridPoolConfigCreatorTest, BlockIdConsistencyAcrossGroups) {
+    // DSV4 has multiple semantic cache tags per logical layer. The config must expose
+    // every tag's group id for the layer so model/runtime code can request the
+    // correct group by tag.
+    auto              mc = makeProModelConfig();
+    ParallelismConfig pc;
+    auto              config = CacheConfigCreator::createBasicConfig(mc, pc, makeDsv4KvCacheConfig(), false, 0);
+
+    // Verify every layer exposes its complete group ids directly.
+    const auto layer_group_ids = config.layerGroupIdsSnapshot();
+    EXPECT_EQ(layer_group_ids.size(), 61u);
+    for (size_t i = 0; i < layer_group_ids.size(); i++) {
+        EXPECT_FALSE(layer_group_ids[i].empty()) << "layer " << i;
+    }
+
+    // Verify group layer ids: each group has the correct layer list.
+    EXPECT_EQ(config.layerIdsForGroup(gidForTag(config, "csa_kv")),
+              config.layerIdsForGroup(gidForTag(config, "indexer_kv")));
+    EXPECT_EQ(config.layerIdsForGroup(gidForTag(config, "csa_kv")),
+              config.layerIdsForGroup(gidForTag(config, "indexer_state")));
+    EXPECT_EQ(config.layerIdsForGroup(gidForTag(config, "csa_kv")),
+              config.layerIdsForGroup(gidForTag(config, "csa_state")));
+    EXPECT_EQ(config.layerIdsForGroup(gidForTag(config, "hca_kv")),
+              config.layerIdsForGroup(gidForTag(config, "hca_state")));
+}
+
+// ============================================================
+// Helper: build a DSV4 CacheConfig with block_num set for allocator tests
+// ============================================================
+
+static CacheConfig makeDSV4AllocatorConfig(bool use_flash = false) {
+    auto              mc = use_flash ? makeFlashModelConfig() : makeProModelConfig();
+    ParallelismConfig pc;
+    auto              config = CacheConfigCreator::createBasicConfig(mc, pc, makeDsv4KvCacheConfig(), false, 0);
+    // Set enough blocks for tests (7 groups × N blocks each)
+    config.block_num = 200;
+    return config;
+}
+
+static CacheConfig makeDSV4CpAllocatorConfig(uint32_t cp_size) {
+    auto              mc = makeProModelConfig();
+    ParallelismConfig pc;
+    pc.role_type                          = RoleType::PREFILL;
+    pc.tp_size                            = cp_size;
+    pc.prefill_cp_config.kv_cache_sharded = true;
+    auto config      = CacheConfigCreator::createBasicConfig(mc, pc, makeDsv4KvCacheConfig(), false, 0);
+    config.block_num = 200;
+    setGroupBlockNumsForTest(config, std::vector<uint32_t>(static_cast<size_t>(config.groupNums()), config.block_num));
+    return config;
+}
+
+// ============================================================
+// HybridTypeKVCacheAllocator integration tests with DSV4 7-group config
+// ============================================================
+
+class DSV4AllocatorTest: public ::testing::Test {
+protected:
+    void SetUp() override {
+        rtp_llm::initLogger();
+        createDevice();
+    }
+};
+
+TEST_F(DSV4AllocatorTest, InitAndBasicProperties) {
+    auto config    = makeDSV4AllocatorConfig();
+    auto allocator = std::make_shared<HybridTypeKVCacheAllocator>(config, AllocationType::DEVICE);
+    ASSERT_TRUE(allocator->init());
+
+    // 7 groups → HybridTypeKVCacheAllocator path
+    EXPECT_EQ(config.groupNums(), 7);
+    EXPECT_EQ(allocator->seqSizePerBlock(), static_cast<int>(config.seq_size_per_block));
+    EXPECT_EQ(allocator->totalBlocksNum(), config.block_num - 1);
+    EXPECT_EQ(allocator->freeBlocksNum(), config.block_num - 1);
+}
+
+TEST_F(DSV4AllocatorTest, CpPageRrFixedAndSwaAllocateOneBlockPerVirtualBlock) {
+    constexpr uint32_t cp_size   = 4;
+    auto               config    = makeDSV4CpAllocatorConfig(cp_size);
+    auto               allocator = std::make_shared<HybridTypeKVCacheAllocator>(config, AllocationType::DEVICE);
+    ASSERT_TRUE(allocator->init());
+
+    const int spb     = allocator->seqSizePerBlock();
+    const int seq_len = static_cast<int>(cp_size) * spb;
+    allocator->setCPSlotMapper(std::make_shared<CPSlotMapper>(0, static_cast<int>(cp_size), spb));
+
+    auto batch_res = std::make_shared<BatchKVCacheResource>();
+    batch_res->resetBatchSize(1);
+    initDsv4BatchGroups(*batch_res, config);
+    batch_res->setBatchCacheKeys(0, CacheKeysType{100, 101, 102, 103});
+
+    auto cti            = std::make_shared<CompleteTokenIds>(1, 1, seq_len + spb, spb);
+    auto gi             = std::make_shared<GenerateInput>();
+    gi->input_ids       = torch::arange(seq_len, torch::kInt32);
+    gi->generate_config = std::make_shared<GenerateConfig>();
+    cti->init(gi);
+
+    MallocInfo info{batch_res, cti};
+    info.enable_device_cache = false;
+    info.reuse_cache         = false;
+
+    auto result = allocator->malloc(info);
+    ASSERT_TRUE(result.success);
+    for (int gid = 0; gid < 7; ++gid) {
+        EXPECT_EQ(batch_res->blocksNum(0, gid), 1u) << "gid=" << gid;
+    }
+
+    FreeInfo free_info{batch_res};
+    allocator->free(free_info);
+}
+
+TEST_F(DSV4AllocatorTest, FlashInitAndBasicProperties) {
+    auto config    = makeDSV4AllocatorConfig(/*use_flash=*/true);
+    auto allocator = std::make_shared<HybridTypeKVCacheAllocator>(config, AllocationType::DEVICE);
+    ASSERT_TRUE(allocator->init());
+
+    EXPECT_EQ(config.groupNums(), 7);
+    EXPECT_EQ(config.layer_num, 43u);
+    EXPECT_EQ(allocator->totalBlocksNum(), config.block_num - 1);
+}
+
+TEST_F(DSV4AllocatorTest, AddressLookupAllGroups) {
+    auto config    = makeDSV4AllocatorConfig();
+    auto allocator = std::make_shared<HybridTypeKVCacheAllocator>(config, AllocationType::DEVICE);
+    ASSERT_TRUE(allocator->init());
+
+    // Verify address lookup works for a layer in each group
+    // Group 0 (CSA KV): csa_layer_ids[0]
+    // Group 1 (HCA KV): hca_layer_ids[0]
+    // Group 6 (SWA KV): all_layer_ids[0]
+    for (int gid = 0; gid < 7; gid++) {
+        ASSERT_FALSE(config.layerIdsForGroup(gid).empty()) << "group " << gid << " has no layers";
+        int  layer_id = config.layerIdsForGroup(gid)[0];
+        auto addr     = allocator->convertIndexToAddr(layer_id, gid, /*block_id=*/1);
+        EXPECT_NE(addr.kv_addr, nullptr) << "null kv_addr for group " << gid << " layer " << layer_id;
+    }
+}
+
+TEST_F(DSV4AllocatorTest, BlockPoolCreatedWithCorrectTensors) {
+    auto config    = makeDSV4AllocatorConfig();
+    auto allocator = std::make_shared<HybridTypeKVCacheAllocator>(config, AllocationType::DEVICE);
+    ASSERT_TRUE(allocator->init());
+
+    auto block_pool = allocator->getBlockPool();
+    ASSERT_NE(block_pool, nullptr);
+
+    // allLayerCacheBase should return tensors for all 61 layers
+    auto layout = allocator->allLayerCacheBase();
+    EXPECT_EQ(layout.layers_to_kv_buffer_ptrs.size(), static_cast<size_t>(config.layer_num));
+    for (size_t i = 0; i < layout.layers_to_kv_buffer_ptrs.size(); ++i) {
+        EXPECT_TRUE(layout.layers_to_kv_buffer_ptrs[i].defined()) << "undefined kv buffer for layer " << i;
+    }
+}
+
+TEST_F(DSV4AllocatorTest, ConvertIndexToBufferAllGroups) {
+    auto config    = makeDSV4AllocatorConfig();
+    auto allocator = std::make_shared<HybridTypeKVCacheAllocator>(config, AllocationType::DEVICE);
+    ASSERT_TRUE(allocator->init());
+
+    // convertIndexToBuffer should work for layers in each of the 7 groups
+    for (int gid = 0; gid < 7; gid++) {
+        int  layer_id = config.layerIdsForGroup(gid)[0];
+        auto buf      = allocator->convertIndexToBuffer(layer_id, gid, /*block_id=*/1);
+        ASSERT_FALSE(buf.empty()) << "empty buffer for group " << gid;
+        EXPECT_NE(buf[0].addr, nullptr) << "null addr for group " << gid;
+    }
+}
+
+TEST_F(DSV4AllocatorTest, MallocAndFreeBlocks) {
+    auto config    = makeDSV4AllocatorConfig();
+    auto allocator = std::make_shared<HybridTypeKVCacheAllocator>(config, AllocationType::DEVICE);
+    ASSERT_TRUE(allocator->init());
+
+    auto block_pool = allocator->getBlockPool();
+    ASSERT_NE(block_pool, nullptr);
+
+    size_t free_before = allocator->freeBlocksNum();
+    ASSERT_GT(free_before, 3u);
+
+    // Direct block pool malloc/free
+    auto blocks = block_pool->malloc(3);
+    ASSERT_EQ(blocks.size(), 3u);
+    EXPECT_EQ(allocator->freeBlocksNum(), free_before - 3);
+
+    block_pool->requestFree(blocks);
+    EXPECT_EQ(allocator->freeBlocksNum(), free_before);
+}
+
+TEST_F(DSV4AllocatorTest, SevenGroupLayerMapping) {
+    auto config = makeDSV4AllocatorConfig();
+
+    EXPECT_EQ(config.layerIdsForGroup(gidForTag(config, "csa_kv")).size(), 30u);
+    EXPECT_EQ(config.layerIdsForGroup(gidForTag(config, "hca_kv")).size(), 31u);
+    EXPECT_EQ(config.layerIdsForGroup(gidForTag(config, "indexer_kv")).size(), 30u);
+    EXPECT_EQ(config.layerIdsForGroup(gidForTag(config, "indexer_state")).size(), 30u);
+    EXPECT_EQ(config.layerIdsForGroup(gidForTag(config, "csa_state")).size(), 30u);
+    EXPECT_EQ(config.layerIdsForGroup(gidForTag(config, "hca_state")).size(), 31u);
+    EXPECT_EQ(config.layerIdsForGroup(gidForTag(config, "swa_kv")).size(), 61u);
+
+    EXPECT_EQ(config.typeForGroup(gidForTag(config, "csa_kv")), CacheGroupType::FULL);
+    EXPECT_EQ(config.typeForGroup(gidForTag(config, "hca_kv")), CacheGroupType::FULL);
+    EXPECT_EQ(config.typeForGroup(gidForTag(config, "indexer_kv")), CacheGroupType::FULL);
+    EXPECT_EQ(config.typeForGroup(gidForTag(config, "indexer_state")), CacheGroupType::SWA);
+    EXPECT_EQ(config.typeForGroup(gidForTag(config, "csa_state")), CacheGroupType::SWA);
+    EXPECT_EQ(config.typeForGroup(gidForTag(config, "hca_state")), CacheGroupType::SWA);
+    EXPECT_EQ(config.typeForGroup(gidForTag(config, "swa_kv")), CacheGroupType::SWA);
+}
+
+TEST_F(DSV4AllocatorTest, SpecBlockSizesMatchPoolSpecs) {
+    auto config = makeDSV4AllocatorConfig();
+
+    ASSERT_EQ(static_cast<size_t>(config.groupNums()), 7u);
+    EXPECT_EQ(config.specForGroup(gidForTag(config, "csa_kv"))->block_size_bytes(), 32u * kDsv4KvEntryBytes);
+    EXPECT_EQ(config.specForGroup(gidForTag(config, "hca_kv"))->block_size_bytes(), 1u * kDsv4KvEntryBytes);
+    EXPECT_EQ(config.specForGroup(gidForTag(config, "indexer_kv"))->block_size_bytes(), 32u * kDsv4IndexerEntryBytes);
+    EXPECT_EQ(config.specForGroup(gidForTag(config, "indexer_state"))->block_size_bytes(), 8u * 512u * 4u);
+    EXPECT_EQ(config.specForGroup(gidForTag(config, "csa_state"))->block_size_bytes(), 8u * 2048u * 4u);
+    EXPECT_EQ(config.specForGroup(gidForTag(config, "hca_state"))->block_size_bytes(), 128u * 1024u * 4u);
+    EXPECT_EQ(config.specForGroup(gidForTag(config, "swa_kv"))->block_size_bytes(), kDsv4TokensPerBlock * kDsv4KvEntryBytes);
+}
+
+TEST_F(DSV4AllocatorTest, KVBlockStrideIsMaxAcrossGroups) {
+    auto config = makeDSV4AllocatorConfig();
+
+    // kv_block_stride_bytes should be the max block_size_bytes across all 7 pools
+    size_t expected_max = 0;
+    for (int i = 0; i < kDsv4PoolNum; i++) {
+        expected_max = std::max(expected_max, config.specForGroup(i)->block_size_bytes());
+    }
+    EXPECT_EQ(config.kv_block_stride_bytes, expected_max);
+    // HCA_STATE has the largest per-block bytes (128 entries * 1024 * 4)
+    EXPECT_EQ(expected_max, config.specForGroup(gidForTag(config, "hca_state"))->block_size_bytes());
+}
+
+TEST_F(DSV4AllocatorTest, HCAStateIsExcludedFromReuseCachePolicy) {
+    auto config = makeDSV4AllocatorConfig();
+    ASSERT_EQ(static_cast<size_t>(config.groupNums()), 7u);
+    ASSERT_EQ(config.groupPoliciesSnapshot().size(), static_cast<size_t>(config.groupNums()));
+
+    for (size_t gid = 0; gid < static_cast<size_t>(config.groupNums()); ++gid) {
+        if (config.tagForGroup(gid) == "hca_state") {
+            EXPECT_EQ(config.policyForGroup(gid).reuse_policy, CacheReusePolicy::NON_REUSABLE)
+                << "HCA_STATE should skip reuse cache";
+        } else {
+            EXPECT_EQ(config.policyForGroup(gid).reuse_policy, CacheReusePolicy::REUSABLE) << "group " << gid;
+        }
+    }
+}
+
+// ============================================================
+// Flash config: allocator integration
+// ============================================================
+
+TEST_F(DSV4AllocatorTest, FlashGroupTypes) {
+    auto config = makeDSV4AllocatorConfig(/*use_flash=*/true);
+
+    // Flash: 21 CSA + 20 HCA + 2 SWA-only = 43 layers
+    EXPECT_EQ(config.layerIdsForGroup(gidForTag(config, "csa_kv")).size(), 21u);
+    EXPECT_EQ(config.layerIdsForGroup(gidForTag(config, "hca_kv")).size(), 20u);
+    EXPECT_EQ(config.layerIdsForGroup(gidForTag(config, "swa_kv")).size(), 43u);
+
+    EXPECT_EQ(config.typeForGroup(gidForTag(config, "csa_kv")), CacheGroupType::FULL);
+    EXPECT_EQ(config.typeForGroup(gidForTag(config, "hca_kv")), CacheGroupType::FULL);
+    EXPECT_EQ(config.typeForGroup(gidForTag(config, "indexer_kv")), CacheGroupType::FULL);
+    EXPECT_EQ(config.typeForGroup(gidForTag(config, "indexer_state")), CacheGroupType::SWA);
+    EXPECT_EQ(config.typeForGroup(gidForTag(config, "csa_state")), CacheGroupType::SWA);
+    EXPECT_EQ(config.typeForGroup(gidForTag(config, "hca_state")), CacheGroupType::SWA);
+    EXPECT_EQ(config.typeForGroup(gidForTag(config, "swa_kv")), CacheGroupType::SWA);
+}
+
+TEST_F(DSV4AllocatorTest, FlashAddressLookupAllGroups) {
+    auto config    = makeDSV4AllocatorConfig(/*use_flash=*/true);
+    auto allocator = std::make_shared<HybridTypeKVCacheAllocator>(config, AllocationType::DEVICE);
+    ASSERT_TRUE(allocator->init());
+
+    for (int gid = 0; gid < 7; gid++) {
+        ASSERT_FALSE(config.layerIdsForGroup(gid).empty()) << "Flash group " << gid << " has no layers";
+        int  layer_id = config.layerIdsForGroup(gid)[0];
+        auto addr     = allocator->convertIndexToAddr(layer_id, gid, /*block_id=*/1);
+        EXPECT_NE(addr.kv_addr, nullptr) << "Flash null kv_addr for group " << gid;
+    }
+}
+
+TEST_F(DSV4AllocatorTest, FlashBlockPoolTensors) {
+    auto config    = makeDSV4AllocatorConfig(/*use_flash=*/true);
+    auto allocator = std::make_shared<HybridTypeKVCacheAllocator>(config, AllocationType::DEVICE);
+    ASSERT_TRUE(allocator->init());
+
+    auto layout = allocator->allLayerCacheBase();
+    EXPECT_EQ(layout.layers_to_kv_buffer_ptrs.size(), 43u);
+    for (size_t i = 0; i < layout.layers_to_kv_buffer_ptrs.size(); ++i) {
+        EXPECT_TRUE(layout.layers_to_kv_buffer_ptrs[i].defined()) << "Flash undefined kv buffer for layer " << i;
+    }
+}
+
+TEST_F(DSV4AllocatorTest, FlashLayerMapping) {
+    auto config = makeDSV4AllocatorConfig(/*use_flash=*/true);
+
+    EXPECT_EQ(config.layerIdsForGroup(gidForTag(config, "csa_kv")).size(), 21u);
+    EXPECT_EQ(config.layerIdsForGroup(gidForTag(config, "hca_kv")).size(), 20u);
+    EXPECT_EQ(config.layerIdsForGroup(gidForTag(config, "indexer_kv")).size(), 21u);
+    EXPECT_EQ(config.layerIdsForGroup(gidForTag(config, "indexer_state")).size(), 21u);
+    EXPECT_EQ(config.layerIdsForGroup(gidForTag(config, "csa_state")).size(), 21u);
+    EXPECT_EQ(config.layerIdsForGroup(gidForTag(config, "hca_state")).size(), 20u);
+    EXPECT_EQ(config.layerIdsForGroup(gidForTag(config, "swa_kv")).size(), 43u);
+}
+
+TEST_F(DSV4AllocatorTest, FlashSpecBlockSizes) {
+    auto config = makeDSV4AllocatorConfig(/*use_flash=*/true);
+
+    ASSERT_EQ(static_cast<size_t>(config.groupNums()), 7u);
+    EXPECT_EQ(config.specForGroup(gidForTag(config, "csa_kv"))->block_size_bytes(), 32u * kDsv4KvEntryBytes);
+    EXPECT_EQ(config.specForGroup(gidForTag(config, "hca_kv"))->block_size_bytes(), 1u * kDsv4KvEntryBytes);
+    EXPECT_EQ(config.specForGroup(gidForTag(config, "indexer_kv"))->block_size_bytes(), 32u * kDsv4IndexerEntryBytes);
+    EXPECT_EQ(config.specForGroup(gidForTag(config, "swa_kv"))->block_size_bytes(), kDsv4TokensPerBlock * kDsv4KvEntryBytes);
+}
+
+TEST_F(DSV4AllocatorTest, FlashMallocAndFree) {
+    auto config    = makeDSV4AllocatorConfig(/*use_flash=*/true);
+    auto allocator = std::make_shared<HybridTypeKVCacheAllocator>(config, AllocationType::DEVICE);
+    ASSERT_TRUE(allocator->init());
+
+    auto   block_pool  = allocator->getBlockPool();
+    size_t free_before = allocator->freeBlocksNum();
+    ASSERT_GT(free_before, 5u);
+
+    auto blocks = block_pool->malloc(5);
+    ASSERT_EQ(blocks.size(), 5u);
+    EXPECT_EQ(allocator->freeBlocksNum(), free_before - 5);
+
+    block_pool->requestFree(blocks);
+    EXPECT_EQ(allocator->freeBlocksNum(), free_before);
+}
+
+// ============================================================
+// Prefix cache: insertIntoCache skips HCA_STATE but keeps other groups reusable.
+// ============================================================
+
+TEST_F(DSV4AllocatorTest, InsertIntoCacheAllGroups) {
+    auto config       = makeDSV4AllocatorConfig();
+    auto allocator    = std::make_shared<HybridTypeKVCacheAllocator>(config, AllocationType::DEVICE);
+    auto shared_cache = std::make_shared<SharedBlockCache>();
+    allocator->setSharedBlockCache(shared_cache);
+    ASSERT_TRUE(allocator->init());
+
+    auto block_pool = allocator->getBlockPool();
+
+    // Manually set up a BatchKVCacheResource with blocks for all 7 groups
+    auto batch_res = std::make_shared<BatchKVCacheResource>();
+    batch_res->resetBatchSize(1);
+    initDsv4BatchGroups(*batch_res, config);
+
+    CacheKeysType keys = {200, 201, 202, 203};
+    batch_res->setBatchCacheKeys(0, keys);
+
+    // Allocate 3 blocks per group (simulating 3 full blocks)
+    for (int gid = 0; gid < 7; gid++) {
+        auto blocks = block_pool->malloc(3);
+        ASSERT_EQ(blocks.size(), 3u);
+        batch_res->mutableBlockIds(0, gid).assign(BlockIndicesType(blocks.begin(), blocks.end()));
+    }
+
+    // Create CompleteTokenIds: 3 full blocks * seq_size_per_block tokens + partial
+    int  seq_size_per_block         = allocator->seqSizePerBlock();
+    auto complete_token_ids         = std::make_shared<CompleteTokenIds>(1, 1, 4096, seq_size_per_block);
+    auto generate_input             = std::make_shared<GenerateInput>();
+    int  total_tokens               = 3 * seq_size_per_block + 1;  // 3 full blocks + 1 partial
+    generate_input->input_ids       = torch::arange(total_tokens, torch::kInt32);
+    generate_input->generate_config = std::make_shared<GenerateConfig>();
+    complete_token_ids->init(generate_input);
+
+    InsertInfo insert_info{batch_res, complete_token_ids, /*is_resident=*/false};
+    allocator->insertIntoCache(insert_info);
+
+    // HCA_STATE is runtime scratch state and must not be persisted as reusable prefix cache.
+    for (int gid = 0; gid < 7; gid++) {
+        if (config.tagForGroup(gid) == "hca_state") {
+            EXPECT_TRUE(isNullBlockIdx(shared_cache->matchGroup(200, gid))) << "HCA_STATE should skip key 200";
+            EXPECT_TRUE(isNullBlockIdx(shared_cache->matchGroup(201, gid))) << "HCA_STATE should skip tail key 201";
+            EXPECT_TRUE(isNullBlockIdx(shared_cache->matchGroup(202, gid))) << "HCA_STATE should skip tail key 202";
+            continue;
+        }
+        EXPECT_FALSE(isNullBlockIdx(shared_cache->matchGroup(200, gid))) << config.tagForGroup(gid);
+        if (config.typeForGroup(gid) != CacheGroupType::FULL) {
+            EXPECT_FALSE(isNullBlockIdx(shared_cache->matchGroup(201, gid))) << config.tagForGroup(gid);
+            EXPECT_FALSE(isNullBlockIdx(shared_cache->matchGroup(202, gid))) << config.tagForGroup(gid);
+        }
+    }
+
+    // Free all blocks
+    for (int gid = 0; gid < 7; gid++) {
+        const auto& blocks = batch_res->blocks(0, gid);
+        block_pool->requestFree(blocks);
+    }
+}
+
+// ============================================================
+// Prefix cache: Flash config insertIntoCache skips HCA_STATE.
+// ============================================================
+
+TEST_F(DSV4AllocatorTest, FlashInsertIntoCacheAllGroups) {
+    auto config       = makeDSV4AllocatorConfig(/*use_flash=*/true);
+    auto allocator    = std::make_shared<HybridTypeKVCacheAllocator>(config, AllocationType::DEVICE);
+    auto shared_cache = std::make_shared<SharedBlockCache>();
+    allocator->setSharedBlockCache(shared_cache);
+    ASSERT_TRUE(allocator->init());
+
+    auto block_pool = allocator->getBlockPool();
+
+    auto batch_res = std::make_shared<BatchKVCacheResource>();
+    batch_res->resetBatchSize(1);
+    initDsv4BatchGroups(*batch_res, config);
+
+    CacheKeysType keys = {300, 301, 302, 303};
+    batch_res->setBatchCacheKeys(0, keys);
+
+    for (int gid = 0; gid < 7; gid++) {
+        auto blocks = block_pool->malloc(3);
+        ASSERT_EQ(blocks.size(), 3u);
+        batch_res->mutableBlockIds(0, gid).assign(BlockIndicesType(blocks.begin(), blocks.end()));
+    }
+
+    int  seq_size_per_block         = allocator->seqSizePerBlock();
+    auto complete_token_ids         = std::make_shared<CompleteTokenIds>(1, 1, 4096, seq_size_per_block);
+    auto generate_input             = std::make_shared<GenerateInput>();
+    int  total_tokens               = 3 * seq_size_per_block + 1;
+    generate_input->input_ids       = torch::arange(total_tokens, torch::kInt32);
+    generate_input->generate_config = std::make_shared<GenerateConfig>();
+    complete_token_ids->init(generate_input);
+
+    InsertInfo insert_info{batch_res, complete_token_ids, /*is_resident=*/false};
+    allocator->insertIntoCache(insert_info);
+
+    for (int gid = 0; gid < 7; gid++) {
+        if (config.tagForGroup(gid) == "hca_state") {
+            EXPECT_TRUE(isNullBlockIdx(shared_cache->matchGroup(300, gid))) << "Flash HCA_STATE should skip key 300";
+            EXPECT_TRUE(isNullBlockIdx(shared_cache->matchGroup(301, gid))) << "Flash HCA_STATE should skip tail key 301";
+            EXPECT_TRUE(isNullBlockIdx(shared_cache->matchGroup(302, gid))) << "Flash HCA_STATE should skip tail key 302";
+            continue;
+        }
+        EXPECT_FALSE(isNullBlockIdx(shared_cache->matchGroup(300, gid))) << config.tagForGroup(gid);
+        if (config.typeForGroup(gid) != CacheGroupType::FULL) {
+            EXPECT_FALSE(isNullBlockIdx(shared_cache->matchGroup(301, gid))) << config.tagForGroup(gid);
+            EXPECT_FALSE(isNullBlockIdx(shared_cache->matchGroup(302, gid))) << config.tagForGroup(gid);
+        }
+    }
+
+    for (int gid = 0; gid < 7; gid++) {
+        block_pool->requestFree(batch_res->blocks(0, gid));
+    }
+}
+
+// ============================================================
+// Prefix cache: paged FULL groups reuse; reusable SWA/state groups require a matched latest tail block.
+// ============================================================
+
+TEST_F(DSV4AllocatorTest, PrefixCacheReusePagedGroupsOnly) {
+    auto config       = makeDSV4AllocatorConfig();
+    auto allocator    = std::make_shared<HybridTypeKVCacheAllocator>(config, AllocationType::DEVICE);
+    auto shared_cache = std::make_shared<SharedBlockCache>();
+    allocator->setSharedBlockCache(shared_cache);
+    ASSERT_TRUE(allocator->init());
+
+    auto block_pool = allocator->getBlockPool();
+
+    // Pre-populate cache for ALL 7 groups with keys {100,101,102}
+    constexpr int                          group_num   = 7;
+    CacheKeysType                          cached_keys = {100, 101, 102};
+    std::vector<std::vector<BlockIdxType>> cached_blocks(group_num);
+    for (int gid = 0; gid < group_num; gid++) {
+        auto blocks = block_pool->malloc(static_cast<int>(cached_keys.size()));
+        ASSERT_EQ(blocks.size(), cached_keys.size());
+        for (size_t i = 0; i < cached_keys.size(); ++i) {
+            std::vector<BlockIdxType> group_slots(group_num, NULL_BLOCK_IDX);
+            group_slots[gid] = blocks[i];
+            shared_cache->put(cached_keys[i], group_slots, true);
+        }
+        cached_blocks[gid] = blocks;
+        block_pool->requestFree(blocks);
+    }
+
+    // Now do a malloc with reuse enabled — keys {100,101,102,103}
+    auto batch_res = std::make_shared<BatchKVCacheResource>();
+    batch_res->resetBatchSize(1);
+    initDsv4BatchGroups(*batch_res, config);
+    batch_res->setBatchCacheKeys(0, CacheKeysType{100, 101, 102, 103});
+
+    int  seq_size_per_block         = allocator->seqSizePerBlock();
+    int  seq_len                    = 3 * seq_size_per_block + 1;  // 3 full + partial
+    auto complete_token_ids         = std::make_shared<CompleteTokenIds>(1, 1, 4096, seq_size_per_block);
+    auto generate_input             = std::make_shared<GenerateInput>();
+    generate_input->input_ids       = torch::arange(seq_len, torch::kInt32);
+    generate_input->generate_config = std::make_shared<GenerateConfig>();
+    complete_token_ids->init(generate_input);
+
+    MallocInfo info{batch_res, complete_token_ids};
+    info.enable_device_cache = true;
+    info.reuse_cache         = true;
+    auto result              = allocator->malloc(info);
+    ASSERT_TRUE(result.success);
+
+    EXPECT_GT(result.reuse_len, 0) << "Prefix cache reuse should work with paged DSV4 groups";
+
+    for (int gid = 0; gid < group_num; gid++) {
+        const auto& out_blocks = batch_res->blocks(0, gid);
+        ASSERT_GE(out_blocks.size(), 3u) << config.tagForGroup(gid);
+        if (config.typeForGroup(gid) == CacheGroupType::FULL) {
+            EXPECT_EQ(out_blocks[0], cached_blocks[gid][0]) << config.tagForGroup(gid);
+            EXPECT_EQ(out_blocks[1], cached_blocks[gid][1]) << config.tagForGroup(gid);
+            continue;
+        }
+        EXPECT_TRUE(isNullBlockIdx(out_blocks[1])) << config.tagForGroup(gid);
+        if (config.tagForGroup(gid) == "hca_state") {
+            EXPECT_TRUE(isNullBlockIdx(out_blocks[2])) << "HCA_STATE should not reuse a cached tail block";
+            continue;
+        }
+        EXPECT_EQ(out_blocks[2], cached_blocks[gid][2]) << config.tagForGroup(gid);
+    }
+
+    // Clean up
+    FreeInfo free_info{batch_res};
+    allocator->free(free_info);
+}
+
+TEST_F(DSV4AllocatorTest, PrefixCacheReuseRequiresSWATailHit) {
+    auto config       = makeDSV4AllocatorConfig();
+    auto allocator    = std::make_shared<HybridTypeKVCacheAllocator>(config, AllocationType::DEVICE);
+    auto shared_cache = std::make_shared<SharedBlockCache>();
+    allocator->setSharedBlockCache(shared_cache);
+    ASSERT_TRUE(allocator->init());
+
+    auto block_pool = allocator->getBlockPool();
+
+    constexpr int                          group_num   = 7;
+    CacheKeysType                          cached_keys = {100, 101, 102};
+    std::vector<std::vector<BlockIdxType>> cached_blocks(3);
+    for (int gid = 0; gid < 3; gid++) {
+        auto blocks = block_pool->malloc(static_cast<int>(cached_keys.size()));
+        ASSERT_EQ(blocks.size(), cached_keys.size());
+        for (size_t i = 0; i < cached_keys.size(); ++i) {
+            std::vector<BlockIdxType> group_slots(group_num, NULL_BLOCK_IDX);
+            group_slots[gid] = blocks[i];
+            shared_cache->put(cached_keys[i], group_slots, true);
+        }
+        cached_blocks[gid] = blocks;
+        block_pool->requestFree(blocks);
+    }
+
+    auto batch_res = std::make_shared<BatchKVCacheResource>();
+    batch_res->resetBatchSize(1);
+    initDsv4BatchGroups(*batch_res, config);
+    batch_res->setBatchCacheKeys(0, CacheKeysType{100, 101, 102, 103});
+
+    int  seq_size_per_block         = allocator->seqSizePerBlock();
+    int  seq_len                    = 3 * seq_size_per_block + 1;
+    auto complete_token_ids         = std::make_shared<CompleteTokenIds>(1, 1, 4096, seq_size_per_block);
+    auto generate_input             = std::make_shared<GenerateInput>();
+    generate_input->input_ids       = torch::arange(seq_len, torch::kInt32);
+    generate_input->generate_config = std::make_shared<GenerateConfig>();
+    complete_token_ids->init(generate_input);
+
+    MallocInfo info{batch_res, complete_token_ids};
+    info.enable_device_cache = true;
+    info.reuse_cache         = true;
+    auto result              = allocator->malloc(info);
+    ASSERT_TRUE(result.success);
+
+    EXPECT_EQ(result.reuse_len, 0) << "SWA tail miss should veto paged prefix reuse";
+
+    FreeInfo free_info{batch_res};
+    allocator->free(free_info);
+}
+
+TEST_F(DSV4AllocatorTest, PrefixCacheReuseDoesNotRequireHCAStateHit) {
+    auto config       = makeDSV4AllocatorConfig();
+    auto allocator    = std::make_shared<HybridTypeKVCacheAllocator>(config, AllocationType::DEVICE);
+    auto shared_cache = std::make_shared<SharedBlockCache>();
+    allocator->setSharedBlockCache(shared_cache);
+    ASSERT_TRUE(allocator->init());
+
+    auto block_pool = allocator->getBlockPool();
+
+    constexpr int                          group_num   = 7;
+    CacheKeysType                          cached_keys = {1100, 1101, 1102};
+    std::vector<std::vector<BlockIdxType>> cached_blocks(group_num);
+    for (int gid = 0; gid < group_num; gid++) {
+        if (config.tagForGroup(gid) == "hca_state") {
+            continue;
+        }
+        auto blocks = block_pool->malloc(static_cast<int>(cached_keys.size()));
+        ASSERT_EQ(blocks.size(), cached_keys.size());
+        for (size_t i = 0; i < cached_keys.size(); ++i) {
+            if (config.typeForGroup(gid) != CacheGroupType::FULL && i + 1 < cached_keys.size()) {
+                continue;
+            }
+            std::vector<BlockIdxType> group_slots(group_num, NULL_BLOCK_IDX);
+            group_slots[gid] = blocks[i];
+            shared_cache->put(cached_keys[i], group_slots, true);
+        }
+        cached_blocks[gid] = blocks;
+        block_pool->requestFree(blocks);
+    }
+
+    auto batch_res = std::make_shared<BatchKVCacheResource>();
+    batch_res->resetBatchSize(1);
+    initDsv4BatchGroups(*batch_res, config);
+    batch_res->setBatchCacheKeys(0, CacheKeysType{1100, 1101, 1102, 1103});
+
+    const int spb       = allocator->seqSizePerBlock();
+    auto      cti       = std::make_shared<CompleteTokenIds>(1, 1, 4096, spb);
+    auto      gi        = std::make_shared<GenerateInput>();
+    gi->input_ids       = torch::arange(3 * spb + 1, torch::kInt32);
+    gi->generate_config = std::make_shared<GenerateConfig>();
+    cti->init(gi);
+
+    MallocInfo info{batch_res, cti};
+    info.enable_device_cache = true;
+    info.reuse_cache         = true;
+    auto result              = allocator->malloc(info);
+    ASSERT_TRUE(result.success);
+
+    EXPECT_GT(result.reuse_len, 0) << "HCA_STATE miss should not veto DSV4 prefix reuse";
+    const auto hca_state_gid = gidForTag(config, "hca_state");
+    const auto swa_gid       = gidForTag(config, "swa_kv");
+    EXPECT_TRUE(isNullBlockIdx(batch_res->blocks(0, hca_state_gid).at(2))) << "HCA_STATE should remain non-reused";
+    EXPECT_EQ(batch_res->blocks(0, swa_gid).at(2), cached_blocks[swa_gid][2]) << "SWA_KV tail should still gate reuse";
+
+    FreeInfo free_info{batch_res};
+    allocator->free(free_info);
+}
+
+TEST_F(DSV4AllocatorTest, PrefixCacheReuseAcceptsSingleLatestSWATailHit) {
+    auto config       = makeDSV4AllocatorConfig();
+    auto allocator    = std::make_shared<HybridTypeKVCacheAllocator>(config, AllocationType::DEVICE);
+    auto shared_cache = std::make_shared<SharedBlockCache>();
+    allocator->setSharedBlockCache(shared_cache);
+    ASSERT_TRUE(allocator->init());
+
+    auto block_pool = allocator->getBlockPool();
+
+    constexpr int group_num   = 7;
+    CacheKeysType cached_keys = {100, 101, 102};
+    for (int gid = 0; gid < group_num; gid++) {
+        auto blocks = block_pool->malloc(static_cast<int>(cached_keys.size()));
+        ASSERT_EQ(blocks.size(), cached_keys.size());
+        for (size_t i = 0; i < cached_keys.size(); ++i) {
+            if (config.typeForGroup(gid) != CacheGroupType::FULL && i + 1 < cached_keys.size()) {
+                continue;
+            }
+            std::vector<BlockIdxType> group_slots(group_num, NULL_BLOCK_IDX);
+            group_slots[gid] = blocks[i];
+            shared_cache->put(cached_keys[i], group_slots, true);
+        }
+        block_pool->requestFree(blocks);
+    }
+
+    auto batch_res = std::make_shared<BatchKVCacheResource>();
+    batch_res->resetBatchSize(1);
+    initDsv4BatchGroups(*batch_res, config);
+    batch_res->setBatchCacheKeys(0, CacheKeysType{100, 101, 102, 103});
+
+    const int spb       = allocator->seqSizePerBlock();
+    auto      cti       = std::make_shared<CompleteTokenIds>(1, 1, 4096, spb);
+    auto      gi        = std::make_shared<GenerateInput>();
+    gi->input_ids       = torch::arange(3 * spb + 1, torch::kInt32);
+    gi->generate_config = std::make_shared<GenerateConfig>();
+    cti->init(gi);
+
+    MallocInfo info{batch_res, cti};
+    info.enable_device_cache = true;
+    info.reuse_cache         = true;
+    auto result              = allocator->malloc(info);
+    ASSERT_TRUE(result.success);
+
+    EXPECT_GT(result.reuse_len, 0) << "latest SWA tail hit should allow paged prefix reuse";
+
+    FreeInfo free_info{batch_res};
+    allocator->free(free_info);
+}
+
+TEST_F(DSV4AllocatorTest, FlashPrefixCacheReusePagedGroupsOnly) {
+    auto config       = makeDSV4AllocatorConfig(/*use_flash=*/true);
+    auto allocator    = std::make_shared<HybridTypeKVCacheAllocator>(config, AllocationType::DEVICE);
+    auto shared_cache = std::make_shared<SharedBlockCache>();
+    allocator->setSharedBlockCache(shared_cache);
+    ASSERT_TRUE(allocator->init());
+
+    auto block_pool = allocator->getBlockPool();
+
+    constexpr int                          group_num   = 7;
+    CacheKeysType                          cached_keys = {500, 501, 502};
+    std::vector<std::vector<BlockIdxType>> cached_blocks(group_num);
+    for (int gid = 0; gid < group_num; gid++) {
+        auto blocks = block_pool->malloc(static_cast<int>(cached_keys.size()));
+        ASSERT_EQ(blocks.size(), cached_keys.size());
+        for (size_t i = 0; i < cached_keys.size(); ++i) {
+            std::vector<BlockIdxType> group_slots(group_num, NULL_BLOCK_IDX);
+            group_slots[gid] = blocks[i];
+            shared_cache->put(cached_keys[i], group_slots, true);
+        }
+        cached_blocks[gid] = blocks;
+        block_pool->requestFree(blocks);
+    }
+
+    auto batch_res = std::make_shared<BatchKVCacheResource>();
+    batch_res->resetBatchSize(1);
+    initDsv4BatchGroups(*batch_res, config);
+    batch_res->setBatchCacheKeys(0, CacheKeysType{500, 501, 502, 503});
+
+    int  seq_size_per_block         = allocator->seqSizePerBlock();
+    int  seq_len                    = 3 * seq_size_per_block + 1;
+    auto complete_token_ids         = std::make_shared<CompleteTokenIds>(1, 1, 4096, seq_size_per_block);
+    auto generate_input             = std::make_shared<GenerateInput>();
+    generate_input->input_ids       = torch::arange(seq_len, torch::kInt32);
+    generate_input->generate_config = std::make_shared<GenerateConfig>();
+    complete_token_ids->init(generate_input);
+
+    MallocInfo info{batch_res, complete_token_ids};
+    info.enable_device_cache = true;
+    info.reuse_cache         = true;
+    auto result              = allocator->malloc(info);
+    ASSERT_TRUE(result.success);
+
+    EXPECT_GT(result.reuse_len, 0) << "Flash prefix cache reuse should work for paged groups";
+
+    for (int gid = 0; gid < group_num; gid++) {
+        const auto& out_blocks = batch_res->blocks(0, gid);
+        ASSERT_GE(out_blocks.size(), 3u) << config.tagForGroup(gid);
+        if (config.typeForGroup(gid) == CacheGroupType::FULL) {
+            EXPECT_EQ(out_blocks[0], cached_blocks[gid][0]) << config.tagForGroup(gid);
+            continue;
+        }
+        EXPECT_TRUE(isNullBlockIdx(out_blocks[1])) << config.tagForGroup(gid);
+        if (config.tagForGroup(gid) == "hca_state") {
+            EXPECT_TRUE(isNullBlockIdx(out_blocks[2])) << "Flash HCA_STATE should not reuse a cached tail block";
+            continue;
+        }
+        EXPECT_EQ(out_blocks[2], cached_blocks[gid][2]) << config.tagForGroup(gid);
+    }
+
+    FreeInfo free_info{batch_res};
+    allocator->free(free_info);
+}
+
+TEST_F(DSV4AllocatorTest, HybridPoolReserveBlocksAreDistributedAcrossGroups) {
+    auto config      = makeDSV4AllocatorConfig(/*use_flash=*/true);
+    config.block_num = 200;
+    auto allocator   = std::make_shared<HybridPoolKVCacheAllocator>(
+        config, AllocationType::DEVICE, nullptr, /*reserve_block_ratio=*/10);
+    ASSERT_TRUE(allocator->init());
+
+    auto batch_res = std::make_shared<BatchKVCacheResource>();
+    batch_res->resetBatchSize(1);
+    initDsv4BatchGroups(*batch_res, config);
+    batch_res->setBatchCacheKeys(0, CacheKeysType{600, 601});
+
+    const int spb       = allocator->seqSizePerBlock();
+    auto      cti       = std::make_shared<CompleteTokenIds>(1, 1, 4096, spb);
+    auto      gi        = std::make_shared<GenerateInput>();
+    gi->input_ids       = torch::arange(spb, torch::kInt32);
+    gi->generate_config = std::make_shared<GenerateConfig>();
+    cti->init(gi);
+
+    MallocInfo info{batch_res, cti};
+    info.enable_device_cache = false;
+    info.reuse_cache         = false;
+    info.verbose             = true;
+    auto result              = allocator->malloc(info);
+    ASSERT_TRUE(result.success);
+
+    FreeInfo free_info{batch_res};
+    allocator->free(free_info);
+}
+
+TEST_F(DSV4AllocatorTest, HybridPoolReserveBlocksDoNotReduceExplicitHcaStateCapacity) {
+    auto              mc = makeFlashModelConfig();
+    ParallelismConfig pc;
+    auto              kv_config = makeDsv4KvCacheConfig();
+    setDsv4ExplicitPoolBlocks(mc, "hca_state", 11);
+    auto              config    = CacheConfigCreator::createBasicConfig(mc, pc, kv_config, false, 0);
+    config.block_num            = 40;
+    std::vector<uint32_t> block_nums(static_cast<size_t>(config.groupNums()), config.block_num);
+    for (size_t gid = 0; gid < static_cast<size_t>(config.groupNums()); ++gid) {
+        if (config.tagForGroup(gid) == "hca_state") {
+            block_nums[gid] = 11;
+        }
+    }
+    setGroupBlockNumsForTest(config, block_nums);
+
+    auto allocator = std::make_shared<HybridPoolKVCacheAllocator>(
+        config, AllocationType::DEVICE, nullptr, /*reserve_block_ratio=*/50);
+    ASSERT_TRUE(allocator->init());
+
+    auto batch_res = std::make_shared<BatchKVCacheResource>();
+    batch_res->resetBatchSize(1);
+    initDsv4BatchGroups(*batch_res, config);
+
+    const int spb       = allocator->seqSizePerBlock();
+    const int seq_len   = 10 * spb;
+    auto      cti       = std::make_shared<CompleteTokenIds>(1, 1, seq_len + spb, spb);
+    auto      gi        = std::make_shared<GenerateInput>();
+    gi->input_ids       = torch::arange(seq_len, torch::kInt32);
+    gi->generate_config = std::make_shared<GenerateConfig>();
+    cti->init(gi);
+
+    MallocInfo info{batch_res, cti};
+    info.enable_device_cache = false;
+    info.reuse_cache         = false;
+    info.verbose             = true;
+    auto result              = allocator->malloc(info);
+    ASSERT_TRUE(result.success);
+
+    FreeInfo free_info{batch_res};
+    allocator->free(free_info);
+}
+
+// ============================================================
+// SWA (group 6) prefix cache: verify SWA blocks participate in reuse
+// ============================================================
+
+TEST_F(DSV4AllocatorTest, SWAGroupParticipatesInPrefixCacheReuse) {
+    auto config       = makeDSV4AllocatorConfig();
+    config.block_num  = 100;
+    auto allocator    = std::make_shared<HybridTypeKVCacheAllocator>(config, AllocationType::DEVICE);
+    auto shared_cache = std::make_shared<SharedBlockCache>();
+    allocator->setSharedBlockCache(shared_cache);
+    ASSERT_TRUE(allocator->init());
+
+    auto block_pool = allocator->getBlockPool();
+
+    constexpr int group_num = 7;
+
+    // Only populate SWA group (6) and one paged group (0) to verify SWA participates
+    CacheKeysType             cached_keys = {700, 701};
+    std::vector<BlockIdxType> swa_blocks, csa_blocks;
+
+    // Group 0 (CSA KV)
+    {
+        auto blocks = block_pool->malloc(2);
+        for (size_t i = 0; i < 2; ++i) {
+            std::vector<BlockIdxType> group_slots(group_num, NULL_BLOCK_IDX);
+            group_slots[0] = blocks[i];
+            shared_cache->put(cached_keys[i], group_slots, true);
+        }
+        csa_blocks = blocks;
+        block_pool->requestFree(blocks);
+    }
+    // Group 6 (SWA KV)
+    {
+        auto blocks = block_pool->malloc(2);
+        for (size_t i = 0; i < 2; ++i) {
+            std::vector<BlockIdxType> group_slots(group_num, NULL_BLOCK_IDX);
+            group_slots[6] = blocks[i];
+            shared_cache->put(cached_keys[i], group_slots, true);
+        }
+        swa_blocks = blocks;
+        block_pool->requestFree(blocks);
+    }
+
+    // Verify both groups have cache entries
+    EXPECT_FALSE(isNullBlockIdx(shared_cache->matchGroup(700, 0)));
+    EXPECT_FALSE(isNullBlockIdx(shared_cache->matchGroup(700, 6)));
+    EXPECT_FALSE(isNullBlockIdx(shared_cache->matchGroup(701, 0)));
+    EXPECT_FALSE(isNullBlockIdx(shared_cache->matchGroup(701, 6)));
+
+    // Groups 1,2,3,4,5 not populated — they will limit reuse to 0
+    // But this verifies SWA group 6 IS in the reuse path
+    EXPECT_TRUE(isNullBlockIdx(shared_cache->matchGroup(700, 3)));
+    EXPECT_TRUE(isNullBlockIdx(shared_cache->matchGroup(700, 4)));
+    EXPECT_TRUE(isNullBlockIdx(shared_cache->matchGroup(700, 5)));
+}
+
+// ============================================================
+// SWA prefix cache: cache entries exist and the matched tail window gates reuse.
+// ============================================================
+
+TEST_F(DSV4AllocatorTest, SWAPrefixCacheRestoresTailReuse) {
+    auto config       = makeDSV4AllocatorConfig();
+    auto allocator    = std::make_shared<HybridTypeKVCacheAllocator>(config, AllocationType::DEVICE);
+    auto shared_cache = std::make_shared<SharedBlockCache>();
+    allocator->setSharedBlockCache(shared_cache);
+    ASSERT_TRUE(allocator->init());
+
+    auto block_pool = allocator->getBlockPool();
+
+    // Populate ALL 7 groups with same keys
+    constexpr int                          group_num   = 7;
+    CacheKeysType                          cached_keys = {800, 801};
+    std::vector<std::vector<BlockIdxType>> cached_blocks(group_num);
+    for (int gid = 0; gid < group_num; gid++) {
+        auto blocks = block_pool->malloc(2);
+        for (size_t i = 0; i < 2; ++i) {
+            std::vector<BlockIdxType> group_slots(group_num, NULL_BLOCK_IDX);
+            group_slots[gid] = blocks[i];
+            shared_cache->put(cached_keys[i], group_slots, true);
+        }
+        cached_blocks[gid] = blocks;
+        block_pool->requestFree(blocks);
+    }
+
+    // Malloc with reuse — keys {800, 801, 802}
+    auto batch_res = std::make_shared<BatchKVCacheResource>();
+    batch_res->resetBatchSize(1);
+    initDsv4BatchGroups(*batch_res, config);
+    batch_res->setBatchCacheKeys(0, CacheKeysType{800, 801, 802});
+
+    int  spb            = allocator->seqSizePerBlock();
+    int  seq_len        = 2 * spb + 1;
+    auto cti            = std::make_shared<CompleteTokenIds>(1, 1, 4096, spb);
+    auto gi             = std::make_shared<GenerateInput>();
+    gi->input_ids       = torch::arange(seq_len, torch::kInt32);
+    gi->generate_config = std::make_shared<GenerateConfig>();
+    cti->init(gi);
+
+    MallocInfo info{batch_res, cti};
+    info.enable_device_cache = true;
+    info.reuse_cache         = true;
+    auto result              = allocator->malloc(info);
+    ASSERT_TRUE(result.success);
+    EXPECT_GT(result.reuse_len, 0);
+
+    const auto& swa_out = batch_res->blocks(0, 6);
+    ASSERT_GE(swa_out.size(), 2u);
+    EXPECT_TRUE(isNullBlockIdx(swa_out[0])) << "SWA previous matched tail is evicted after new tail allocation";
+    EXPECT_EQ(swa_out[1], cached_blocks[6][1]) << "SWA last matched tail block should remain";
+
+    FreeInfo free_info{batch_res};
+    allocator->free(free_info);
+}
+
+// ============================================================
+// incrMalloc: decode grows sequence after initial prefill
+// ============================================================
+
+TEST_F(DSV4AllocatorTest, IncrMallocDecodeGrowsBlocks) {
+    auto config    = makeDSV4AllocatorConfig();
+    auto allocator = std::make_shared<HybridTypeKVCacheAllocator>(config, AllocationType::DEVICE);
+    ASSERT_TRUE(allocator->init());
+
+    int spb = allocator->seqSizePerBlock();
+
+    // Initial malloc: 1 block worth of tokens
+    auto batch_res = std::make_shared<BatchKVCacheResource>();
+    batch_res->resetBatchSize(1);
+    initDsv4BatchGroups(*batch_res, config);
+    batch_res->setBatchCacheKeys(0, CacheKeysType{900, 901});
+
+    auto cti            = std::make_shared<CompleteTokenIds>(1, 1, 4096, spb);
+    auto gi             = std::make_shared<GenerateInput>();
+    gi->input_ids       = torch::arange(spb, torch::kInt32);
+    gi->generate_config = std::make_shared<GenerateConfig>();
+    cti->init(gi);
+
+    MallocInfo init_info{batch_res, cti};
+    init_info.enable_device_cache = false;
+    auto init_result              = allocator->malloc(init_info);
+    ASSERT_TRUE(init_result.success);
+
+    // All 7 groups should have 1 block each
+    for (int gid = 0; gid < 7; gid++) {
+        EXPECT_EQ(batch_res->blocksNum(0, gid), 1u) << "group " << gid << " should have 1 block after init";
+    }
+
+    size_t free_after_init = allocator->freeBlocksNum();
+
+    // incrMalloc: grow to 2 blocks
+    cti->setSeqLength(2 * spb);
+    MallocInfo incr_info{batch_res, cti};
+    incr_info.enable_device_cache = false;
+    auto incr_result              = allocator->malloc(incr_info);
+    ASSERT_TRUE(incr_result.success);
+
+    // All 7 groups should now have 2 blocks each
+    for (int gid = 0; gid < 7; gid++) {
+        EXPECT_EQ(batch_res->blocksNum(0, gid), 2u) << "group " << gid << " should have 2 blocks after incr";
+    }
+
+    // HCA_STATE is not reusable: decode may materialize a new tail, but the
+    // skipped old tail is released, so only the other six groups consume a net
+    // additional block.
+    EXPECT_EQ(allocator->freeBlocksNum(), free_after_init - 6);
+
+    FreeInfo free_info{batch_res};
+    allocator->free(free_info);
+}
+
+// ============================================================
+// Free and reallocate: blocks return to pool
+// ============================================================
+
+TEST_F(DSV4AllocatorTest, FreeReturnsBlocksToPool) {
+    auto config    = makeDSV4AllocatorConfig();
+    auto allocator = std::make_shared<HybridTypeKVCacheAllocator>(config, AllocationType::DEVICE);
+    ASSERT_TRUE(allocator->init());
+
+    size_t free_before = allocator->freeBlocksNum();
+    int    spb         = allocator->seqSizePerBlock();
+
+    // Allocate
+    auto batch_res = std::make_shared<BatchKVCacheResource>();
+    batch_res->resetBatchSize(1);
+    initDsv4BatchGroups(*batch_res, config);
+    batch_res->setBatchCacheKeys(0, CacheKeysType{1000, 1001});
+
+    auto cti            = std::make_shared<CompleteTokenIds>(1, 1, 4096, spb);
+    auto gi             = std::make_shared<GenerateInput>();
+    gi->input_ids       = torch::arange(spb, torch::kInt32);
+    gi->generate_config = std::make_shared<GenerateConfig>();
+    cti->init(gi);
+
+    MallocInfo info{batch_res, cti};
+    info.enable_device_cache = false;
+    auto result              = allocator->malloc(info);
+    ASSERT_TRUE(result.success);
+
+    size_t free_after_alloc = allocator->freeBlocksNum();
+    EXPECT_LT(free_after_alloc, free_before);
+
+    // Free
+    FreeInfo free_info{batch_res};
+    allocator->free(free_info);
+
+    // All blocks should be returned
+    EXPECT_EQ(allocator->freeBlocksNum(), free_before);
+
+    // Can allocate again
+    auto batch_res2 = std::make_shared<BatchKVCacheResource>();
+    batch_res2->resetBatchSize(1);
+    initDsv4BatchGroups(*batch_res2, config);
+    batch_res2->setBatchCacheKeys(0, CacheKeysType{1100, 1101});
+
+    MallocInfo info2{batch_res2, cti};
+    info2.enable_device_cache = false;
+    auto result2              = allocator->malloc(info2);
+    ASSERT_TRUE(result2.success);
+
+    FreeInfo free_info2{batch_res2};
+    allocator->free(free_info2);
+    EXPECT_EQ(allocator->freeBlocksNum(), free_before);
+}
+
+// ============================================================
+// Flash: incrMalloc decode path
+// ============================================================
+
+TEST_F(DSV4AllocatorTest, FlashIncrMallocDecode) {
+    auto config    = makeDSV4AllocatorConfig(/*use_flash=*/true);
+    auto allocator = std::make_shared<HybridTypeKVCacheAllocator>(config, AllocationType::DEVICE);
+    ASSERT_TRUE(allocator->init());
+
+    int spb = allocator->seqSizePerBlock();
+
+    auto batch_res = std::make_shared<BatchKVCacheResource>();
+    batch_res->resetBatchSize(1);
+    initDsv4BatchGroups(*batch_res, config);
+    batch_res->setBatchCacheKeys(0, CacheKeysType{1200, 1201});
+
+    auto cti            = std::make_shared<CompleteTokenIds>(1, 1, 4096, spb);
+    auto gi             = std::make_shared<GenerateInput>();
+    gi->input_ids       = torch::arange(spb, torch::kInt32);
+    gi->generate_config = std::make_shared<GenerateConfig>();
+    cti->init(gi);
+
+    MallocInfo init_info{batch_res, cti};
+    init_info.enable_device_cache = false;
+    ASSERT_TRUE(allocator->malloc(init_info).success);
+
+    for (int gid = 0; gid < 7; gid++) {
+        EXPECT_EQ(batch_res->blocksNum(0, gid), 1u) << "Flash group " << gid;
+    }
+
+    // Grow to 3 blocks
+    cti->setSeqLength(3 * spb);
+    MallocInfo incr_info{batch_res, cti};
+    incr_info.enable_device_cache = false;
+    ASSERT_TRUE(allocator->malloc(incr_info).success);
+
+    for (int gid = 0; gid < 7; gid++) {
+        EXPECT_EQ(batch_res->blocksNum(0, gid), 3u) << "Flash group " << gid << " after incr";
+    }
+
+    FreeInfo free_info{batch_res};
+    allocator->free(free_info);
+}
+
+}  // namespace test
+}  // namespace rtp_llm
diff --git a/rtp_llm/cpp/cache/test/FullKVCacheGroupTest.cc b/rtp_llm/cpp/cache/test/FullKVCacheGroupTest.cc
index e1855bfdb9..ad317209d0 100644
--- a/rtp_llm/cpp/cache/test/FullKVCacheGroupTest.cc
+++ b/rtp_llm/cpp/cache/test/FullKVCacheGroupTest.cc
@@ -4,7 +4,8 @@
 #include <thread>
 #include <atomic>
 #include <algorithm>
-#include "rtp_llm/cpp/cache/FullKVCacheGroup.h"
+#include "rtp_llm/cpp/cache/group/FullKVCacheGroup.h"
+#include "rtp_llm/cpp/cache/SharedBlockCache.h"
 #include "rtp_llm/cpp/cache/test/BlockPoolTestHelper.h"
 
 namespace rtp_llm {
@@ -75,22 +76,21 @@ TEST_F(FullKVCacheGroupTest, MatchTest) {
 
     auto block_pool = createBlockPool();
     block_pool->init();
-    auto block_cache = block_pool->blockCache();
 
-    BlockCache::CacheItem item    = {101, 0, 1, false};
-    auto                  result1 = block_cache->put(item);
-    EXPECT_TRUE(result1);
-
-    BlockCache::CacheItem item2   = {102, 0, 2, false};
-    auto                  result2 = block_cache->put(item2);
-    EXPECT_TRUE(result2);
+    auto                      shared_cache = std::make_shared<SharedBlockCache>();
+    std::vector<BlockPoolPtr> group_pools  = {block_pool};
+    shared_cache->init(1, group_pools);
 
     auto spec                = std::make_shared<MHAKVCacheSpec>();
     spec->seq_size_per_block = 4;
 
-    FullKVCacheGroup group1({}, spec, block_pool, 0);
+    FullKVCacheGroup group1({}, spec, block_pool, 0, shared_cache.get());
+
+    // Put items into shared cache: cache_key -> group_slots (group 0 = block_idx)
+    shared_cache->put(101, {1}, false);
+    shared_cache->put(102, {2}, false);
 
-    // zero math
+    // zero match
     CacheKeysType cache_keys    = {103, 104, 105, 106};
     auto          match_result1 = group1.match(cache_keys);
     ASSERT_EQ(match_result1.reuse_blocks, 0);
@@ -107,13 +107,8 @@ TEST_F(FullKVCacheGroupTest, MatchTest) {
     ASSERT_EQ(match_result2.block_indices, expected_result);
 
     // all match
-    BlockCache::CacheItem item3   = {103, 0, 3, false};
-    auto                  result3 = block_cache->put(item3);
-    EXPECT_TRUE(result3);
-
-    BlockCache::CacheItem item4   = {104, 0, 4, false};
-    auto                  result4 = block_cache->put(item4);
-    EXPECT_TRUE(result4);
+    shared_cache->put(103, {3}, false);
+    shared_cache->put(104, {4}, false);
 
     cache_keys         = {101, 102, 103, 104};
     auto match_result3 = group1.match(cache_keys);
@@ -154,85 +149,6 @@ TEST_F(FullKVCacheGroupTest, MallocFreeTest) {
     ASSERT_FALSE(group1.malloc(block_ids2, 180));
 }
 
-TEST_F(FullKVCacheGroupTest, InsertIntoCacheTest) {
-    auto block_pool = createBlockPool();
-    block_pool->init();
-    ASSERT_EQ(block_pool->freeBlocksNum(), 9);
-    ASSERT_EQ(block_pool->availableBlocksNum(), 9);
-
-    auto spec                = std::make_shared<MHAKVCacheSpec>();
-    spec->seq_size_per_block = 2;
-
-    FullKVCacheGroup group1({}, spec, block_pool, 0);
-
-    CacheKeysType cache_keys = {103, 104, 105, 106};
-    BlockIds      block_ids(/*kernel_blocks_per_kv_block=*/1);
-
-    group1.malloc(block_ids, 8);
-    ASSERT_EQ(block_pool->freeBlocksNum(), 5);
-    ASSERT_EQ(block_ids.blocks().size(), 4);
-    BlockIndicesType expected_result = {1, 2, 3, 4};
-    ASSERT_EQ(block_ids.blocks(), expected_result);
-
-    group1.insertIntoCache(cache_keys, block_ids.blocks(), false);
-
-    CacheKeysType cache_keys1   = {107, 108};
-    auto          match_result1 = group1.match(cache_keys1);
-    ASSERT_EQ(match_result1.reuse_length, 0);
-
-    CacheKeysType cache_keys2   = {103, 104, 107};
-    auto          match_result2 = group1.match(cache_keys2);
-    ASSERT_EQ(match_result2.reuse_length, 2 * 2);
-    BlockIndicesType expected_result2 = {1, 2};
-    ASSERT_EQ(match_result2.block_indices, expected_result2);
-
-    CacheKeysType cache_keys3   = {103, 104, 105, 106};
-    auto          match_result3 = group1.match(cache_keys3);
-    ASSERT_EQ(match_result3.reuse_length, 4 * 2);
-    BlockIndicesType expected_result3 = {1, 2, 3, 4};
-    ASSERT_EQ(match_result3.block_indices, expected_result3);
-}
-
-TEST_F(FullKVCacheGroupTest, EnsureFreeBlocksTest) {
-    auto block_pool = createBlockPool();
-    block_pool->init();
-    auto block_cache  = block_pool->blockCache();
-    auto total_blocks = block_pool->freeBlocksNum();
-
-    auto spec                = std::make_shared<MHAKVCacheSpec>();
-    spec->seq_size_per_block = 2;
-
-    FullKVCacheGroup group1({}, spec, block_pool, 0);
-    ASSERT_EQ(true, group1.ensureFreeBlocks(5));
-    ASSERT_EQ(block_pool->freeBlocksNum(), total_blocks);
-    ASSERT_EQ(block_pool->availableBlocksNum(), total_blocks);
-
-    ASSERT_EQ(false, group1.ensureFreeBlocks(10));
-
-    CacheKeysType cache_keys = {101, 102, 103, 104};
-    BlockIds      block_ids(/*kernel_blocks_per_kv_block=*/1);
-
-    ASSERT_TRUE(group1.malloc(block_ids, 8));
-    ASSERT_EQ(block_ids.blocks().size(), 4);
-    ASSERT_EQ(block_pool->freeBlocksNum(), total_blocks - 4);
-    ASSERT_EQ(block_pool->availableBlocksNum(), total_blocks - 4);
-
-    group1.insertIntoCache(cache_keys, block_ids.blocks(), false);
-    ASSERT_EQ(block_cache->size(), 4);
-    ASSERT_EQ(block_pool->freeBlocksNum(), total_blocks - 4);
-    ASSERT_EQ(block_pool->availableBlocksNum(), total_blocks - 4);
-
-    group1.free(block_ids.blocks());
-    ASSERT_EQ(block_cache->size(), 4);
-    ASSERT_EQ(block_pool->freeBlocksNum(), total_blocks - 4);
-    ASSERT_EQ(block_pool->availableBlocksNum(), total_blocks);
-
-    ASSERT_EQ(true, group1.ensureFreeBlocks(total_blocks - 2));
-    ASSERT_EQ(block_cache->size(), 2);
-    ASSERT_EQ(block_pool->freeBlocksNum(), total_blocks - 2);
-    ASSERT_EQ(block_pool->availableBlocksNum(), total_blocks);
-}
-
 }  // namespace test
 }  // namespace rtp_llm
 
diff --git a/rtp_llm/cpp/cache/test/HybridKVCacheAllocatorCPShardTest.cc b/rtp_llm/cpp/cache/test/HybridKVCacheAllocatorCPShardTest.cc
new file mode 100644
index 0000000000..1f7d412312
--- /dev/null
+++ b/rtp_llm/cpp/cache/test/HybridKVCacheAllocatorCPShardTest.cc
@@ -0,0 +1,295 @@
+// CP-shard (Stage 5, Plan A) UTs for HybridKVCacheAllocator.
+//
+// These exercise the cp_slot_mapper plumbing in initMallocForCommonLen,
+// incrMalloc, insertIntoCache, and getNeedBlocks. The shape of the tests
+// piggybacks on the helpers in HybridTypeKVCacheAllocatorTest.cc but
+// keeps the configuration self-contained so the two files build cleanly
+// alongside each other.
+
+#include <gtest/gtest.h>
+
+#include <memory>
+#include <vector>
+
+#include "rtp_llm/cpp/cache/BatchKVCacheResource.h"
+#include "rtp_llm/cpp/cache/CPSlotMapper.h"
+#include "rtp_llm/cpp/cache/allocator/HybridTypeKVCacheAllocator.h"
+#include "rtp_llm/cpp/cache/SharedBlockCache.h"
+#include "rtp_llm/cpp/cache/test/BlockPoolTestHelper.h"
+#include "rtp_llm/cpp/engine_base/stream/CompleteTokenIds.h"
+#include "rtp_llm/cpp/utils/Logger.h"
+
+namespace rtp_llm {
+namespace test {
+
+namespace {
+
+// Two-group hybrid: gid=0 linear (won't be exercised here), gid=1 full (the CP-shard target).
+CacheConfig makeCPHybridConfig() {
+    CacheConfig config;
+    config.dtype                     = rtp_llm::DataType::TYPE_FP16;
+    config.layer_num                 = 4;
+    config.layer_all_num             = 4;
+    config.block_num                 = 32;  // headroom for cp_size=2 expansion
+    config.seq_size_per_block        = 4;
+    config.kernel_seq_size_per_block = 2;
+    config.linear_step               = 2;
+    config.group_layer_num           = 2;
+
+    auto linear_spec                = std::make_shared<LinearKVCacheSpec>();
+    linear_spec->type               = KVCacheSpecType::LinearAttention;
+    linear_spec->dtype              = config.dtype;
+    linear_spec->local_num_k_heads  = 1;
+    linear_spec->local_num_v_heads  = 1;
+    linear_spec->head_k_dim         = 1;
+    linear_spec->head_v_dim         = 1;
+    linear_spec->conv_kernel_dim    = 2;
+    linear_spec->local_head_num_kv  = 1;
+    linear_spec->seq_size_per_block = static_cast<uint32_t>(config.seq_size_per_block);
+
+    auto full_spec                = std::make_shared<MHAKVCacheSpec>();
+    full_spec->type               = KVCacheSpecType::MultiHeadAttention;
+    full_spec->dtype              = config.dtype;
+    full_spec->local_head_num_kv  = 1;
+    full_spec->size_per_head      = 1;
+    full_spec->seq_size_per_block = static_cast<uint32_t>(config.seq_size_per_block);
+
+    config.fromGroupedSpecs({linear_spec, full_spec},
+                            {{0, 1}, {2, 3}},
+                            {CacheGroupType::LINEAR, CacheGroupType::FULL},
+                            {"linear", "full"});
+
+    config.kv_block_stride_bytes = std::max(full_spec->block_size_bytes(), linear_spec->block_size_bytes());
+    config.kv_block_size_bytes   = static_cast<size_t>(config.group_layer_num) * config.kv_block_stride_bytes;
+    config.kv_scale_stride_bytes = 0;
+    config.kv_scale_size_bytes   = 0;
+    config.block_size_bytes      = config.kv_block_size_bytes + config.kv_scale_size_bytes;
+
+    return config;
+}
+
+CompleteTokenIdsPtr makeTokens(int batch_size, int seq_length, int seq_size_per_block) {
+    auto  tokens = std::make_shared<CompleteTokenIds>(batch_size, batch_size, seq_length + 64, seq_size_per_block);
+    auto  ids    = torch::empty({(int64_t)seq_length}, torch::kInt32);
+    auto* p      = ids.data_ptr<int32_t>();
+    for (int i = 0; i < seq_length; ++i) {
+        p[i] = i + 1;
+    }
+    auto gen             = std::make_shared<GenerateInput>();
+    gen->input_ids       = ids;
+    gen->generate_config = std::make_shared<GenerateConfig>();
+    tokens->init(gen);
+    return tokens;
+}
+
+BatchKVCacheResourcePtr makeBatchRes(int batch_size, const CacheConfig& config, CacheKeysType keys) {
+    auto res = std::make_shared<BatchKVCacheResource>();
+    res->resetBatchSize(batch_size);
+    res->initGroups(config.groupNums(),
+                    static_cast<int>(config.layer_all_num),
+                    config.layerGroupIdsSnapshot());
+    for (int b = 0; b < batch_size; ++b) {
+        res->setBatchCacheKeys(b, keys);
+    }
+    return res;
+}
+
+// Cache (key, group-slot) pairs into SharedBlockCache and drop request refs so blocks are reusable.
+std::vector<BlockIdxType> seedCache(
+    BlockPoolPtr block_pool, SharedBlockCachePtr shared_cache, int group_num, int group_id, const CacheKeysType& keys) {
+    auto blocks = block_pool->malloc(static_cast<int>(keys.size()));
+    EXPECT_EQ(blocks.size(), keys.size());
+    for (size_t i = 0; i < keys.size(); ++i) {
+        std::vector<BlockIdxType> group_slots(static_cast<size_t>(group_num), NULL_BLOCK_IDX);
+        group_slots[static_cast<size_t>(group_id)] = blocks[i];
+        shared_cache->put(keys[i], group_slots, true);
+    }
+    block_pool->requestFree(blocks);
+    return blocks;
+}
+
+}  // namespace
+
+class HybridKVCacheAllocatorCPShardTest: public ::testing::Test {
+protected:
+    void SetUp() override {
+        rtp_llm::initLogger();
+        createDevice();
+    }
+};
+
+// 1) When cp_slot_mapper is null/passthrough, behavior is identical to the non-CP baseline:
+//    a request occupying 4 logical blocks allocates 4 blocks in the full group.
+TEST_F(HybridKVCacheAllocatorCPShardTest, NullMapperIsPassthrough) {
+    auto config    = makeCPHybridConfig();
+    auto allocator = std::make_shared<HybridTypeKVCacheAllocator>(config, AllocationType::DEVICE);
+    allocator->setSharedBlockCache(std::make_shared<SharedBlockCache>());
+    ASSERT_TRUE(allocator->init());
+
+    const int gid_full  = 1;
+    auto      batch_res = makeBatchRes(/*batch_size=*/1, config, CacheKeysType{100, 101, 102, 103});
+    // seq_len=16 => 4 slots @ block_size=4
+    auto       tokens = makeTokens(/*batch=*/1, /*seq_len=*/16, /*sspb=*/4);
+    MallocInfo info{batch_res, tokens};
+    info.enable_device_cache = false;
+    info.reuse_cache         = false;
+    // cp_slot_mapper intentionally left null.
+    auto result = allocator->malloc(info);
+    ASSERT_TRUE(result.success);
+    EXPECT_EQ(batch_res->blocksNum(0, gid_full), 4);
+}
+
+// 2) With cp_slot_mapper(cp_rank=0, cp_size=2, block_size=4): a 4-block request allocates ceil(4/2)=2
+//    physical blocks on this rank for the full group.
+TEST_F(HybridKVCacheAllocatorCPShardTest, ShardedAllocHalvesFullGroup) {
+    auto config    = makeCPHybridConfig();
+    auto allocator = std::make_shared<HybridTypeKVCacheAllocator>(config, AllocationType::DEVICE);
+    allocator->setSharedBlockCache(std::make_shared<SharedBlockCache>());
+    ASSERT_TRUE(allocator->init());
+
+    const int gid_full  = 1;
+    auto      batch_res = makeBatchRes(1, config, CacheKeysType{100, 101, 102, 103});
+    auto tokens = makeTokens(1, 16, 4);  // 4 logical blocks worth
+
+    MallocInfo info{batch_res, tokens};
+    info.enable_device_cache = false;
+    info.reuse_cache         = false;
+    allocator->setCPSlotMapper(std::make_shared<CPSlotMapper>(/*cp_rank=*/0, /*cp_size=*/2, /*block_size=*/4));
+    auto result              = allocator->malloc(info);
+    ASSERT_TRUE(result.success);
+    EXPECT_EQ(batch_res->blocksNum(0, gid_full), 2)
+        << "cp_size=2 should halve allocation to ceil(4/2)=2 physical blocks per rank";
+}
+
+// 3) Reuse path: cache the last-rank canonical key and confirm a second malloc hits it,
+//    returning reuse_len in units of virtualBlockSize (= block_size * cp_size).
+TEST_F(HybridKVCacheAllocatorCPShardTest, ReuseHitOnLastRankCanonicalKey) {
+    auto config    = makeCPHybridConfig();
+    auto allocator = std::make_shared<HybridTypeKVCacheAllocator>(config, AllocationType::DEVICE);
+    allocator->setSharedBlockCache(std::make_shared<SharedBlockCache>());
+    ASSERT_TRUE(allocator->init());
+
+    auto block_pool   = allocator->getBlockPool();
+    auto shared_cache = allocator->sharedBlockCache();
+    ASSERT_NE(block_pool, nullptr);
+    ASSERT_NE(shared_cache, nullptr);
+
+    const int gid_linear = 0;
+    const int gid_full   = 1;
+    const int group_num  = 2;
+    // Full keys for 4 blocks: {100,101,102,103}.
+    // localCacheKeys(cp_rank=cp_size-1=1, cp_size=2) selects indices {1,3} => {101, 103}.
+    // initMallocForCommonLen drops the last for matching => match_keys = {101}.
+    // Joint match requires the linear group's tail to also resolve, so seed both groups with key 101.
+    seedCache(block_pool, shared_cache, group_num, gid_full, CacheKeysType{101});
+    seedCache(block_pool, shared_cache, group_num, gid_linear, CacheKeysType{101});
+
+    auto batch_res = makeBatchRes(1, config, CacheKeysType{100, 101, 102, 103});
+    auto tokens = makeTokens(1, 16, 4);
+
+    MallocInfo info{batch_res, tokens};
+    info.enable_device_cache = true;
+    info.reuse_cache         = true;
+    allocator->setCPSlotMapper(std::make_shared<CPSlotMapper>(/*cp_rank=*/0, /*cp_size=*/2, /*block_size=*/4));
+    auto result              = allocator->malloc(info);
+    ASSERT_TRUE(result.success);
+
+    // Expect 1 reuse virtual-block * virtualBlockSize(=8 tokens).
+    EXPECT_EQ(result.reuse_len, 8);
+    // Per-rank physical blocks for full group still = ceil(4/2) = 2.
+    EXPECT_EQ(batch_res->blocksNum(0, gid_full), 2);
+}
+
+// 4) When reuse is disabled, cp_slot_mapper still translates seq_len for malloc and skips the match.
+TEST_F(HybridKVCacheAllocatorCPShardTest, ShardedAllocSkipsReuseWhenDisabled) {
+    auto config    = makeCPHybridConfig();
+    auto allocator = std::make_shared<HybridTypeKVCacheAllocator>(config, AllocationType::DEVICE);
+    allocator->setSharedBlockCache(std::make_shared<SharedBlockCache>());
+    ASSERT_TRUE(allocator->init());
+
+    auto block_pool   = allocator->getBlockPool();
+    auto shared_cache = allocator->sharedBlockCache();
+
+    const int gid_full = 1;
+    seedCache(block_pool, shared_cache, /*group_num=*/2, gid_full, CacheKeysType{101});
+
+    auto batch_res = makeBatchRes(1, config, CacheKeysType{100, 101, 102, 103});
+    auto tokens = makeTokens(1, 16, 4);
+
+    MallocInfo info{batch_res, tokens};
+    info.enable_device_cache = false;
+    info.reuse_cache         = false;
+    allocator->setCPSlotMapper(std::make_shared<CPSlotMapper>(0, 2, 4));
+    auto result              = allocator->malloc(info);
+    ASSERT_TRUE(result.success);
+    EXPECT_EQ(result.reuse_len, 0);
+    EXPECT_EQ(batch_res->blocksNum(0, gid_full), 2);
+}
+
+// 5) insertIntoCache uses last-rank canonical keys and virtualBlockSize when sharded:
+//    a 12-token request (full_blocks_num = floor(12/8)=1 virtual block) inserts only key {103}
+//    (= last-rank canonical key at index cp_size-1=1 of the first virtual block window).
+TEST_F(HybridKVCacheAllocatorCPShardTest, InsertIntoCacheUsesCanonicalKeysAndVirtualBlockSize) {
+    auto config    = makeCPHybridConfig();
+    auto allocator = std::make_shared<HybridTypeKVCacheAllocator>(config, AllocationType::DEVICE);
+    allocator->setSharedBlockCache(std::make_shared<SharedBlockCache>());
+    ASSERT_TRUE(allocator->init());
+
+    auto shared_cache = allocator->sharedBlockCache();
+    ASSERT_NE(shared_cache, nullptr);
+
+    const int gid_full  = 1;
+    auto      batch_res = makeBatchRes(1, config, CacheKeysType{100, 101, 102, 103});
+
+    // seq_len=16 => allocator computes 4 logical blocks; cp_size=2 keeps 2 per rank.
+    auto       tokens = makeTokens(1, 16, 4);
+    MallocInfo malloc_info{batch_res, tokens};
+    malloc_info.enable_device_cache = false;
+    malloc_info.reuse_cache         = false;
+    allocator->setCPSlotMapper(std::make_shared<CPSlotMapper>(0, 2, 4));
+    ASSERT_TRUE(allocator->malloc(malloc_info).success);
+    ASSERT_EQ(batch_res->blocksNum(0, gid_full), 2);
+
+    // CompleteTokenIds reflects token-len 16, so token_len-1 = 15. virtualBlockSize=8 =>
+    // full_blocks_num = floor(15/8) = 1. n = min(local_keys.size()=2, 1) = 1.
+    // local_keys = {101, 103}; first key is 101.
+    InsertInfo insert_info{batch_res, tokens, /*is_resident=*/false};
+    allocator->insertIntoCache(insert_info);
+
+    EXPECT_FALSE(isNullBlockIdx(shared_cache->matchGroup(101, gid_full)));
+    EXPECT_TRUE(isNullBlockIdx(shared_cache->matchGroup(100, gid_full)));
+    EXPECT_TRUE(isNullBlockIdx(shared_cache->matchGroup(102, gid_full)));
+    EXPECT_TRUE(isNullBlockIdx(shared_cache->matchGroup(103, gid_full)));
+}
+
+// 6) Two-malloc smoke: cp_size=4 sharding, request occupies 8 logical blocks ⇒ 2 per rank.
+TEST_F(HybridKVCacheAllocatorCPShardTest, ShardedAllocCpSize4) {
+    auto config    = makeCPHybridConfig();
+    auto allocator = std::make_shared<HybridTypeKVCacheAllocator>(config, AllocationType::DEVICE);
+    allocator->setSharedBlockCache(std::make_shared<SharedBlockCache>());
+    ASSERT_TRUE(allocator->init());
+
+    const int     gid_full = 1;
+    CacheKeysType keys;
+    for (int i = 0; i < 8; ++i) {
+        keys.push_back(200 + i);
+    }
+    auto batch_res = makeBatchRes(1, config, keys);
+    auto tokens    = makeTokens(1, /*seq_len=*/32, 4);  // 8 logical blocks
+
+    MallocInfo info{batch_res, tokens};
+    info.enable_device_cache = false;
+    info.reuse_cache         = false;
+    allocator->setCPSlotMapper(std::make_shared<CPSlotMapper>(/*cp_rank=*/2, /*cp_size=*/4, /*block_size=*/4));
+    auto result              = allocator->malloc(info);
+    ASSERT_TRUE(result.success);
+    EXPECT_EQ(batch_res->blocksNum(0, gid_full), 2);  // ceil(8/4)=2
+}
+
+}  // namespace test
+}  // namespace rtp_llm
+
+int main(int argc, char** argv) {
+    ::testing::InitGoogleTest(&argc, argv);
+    return RUN_ALL_TESTS();
+}
diff --git a/rtp_llm/cpp/cache/test/HybridPoolKVCacheAllocatorTest.cc b/rtp_llm/cpp/cache/test/HybridPoolKVCacheAllocatorTest.cc
new file mode 100644
index 0000000000..2d8831b890
--- /dev/null
+++ b/rtp_llm/cpp/cache/test/HybridPoolKVCacheAllocatorTest.cc
@@ -0,0 +1,1367 @@
+#include <gtest/gtest.h>
+
+#include <algorithm>
+#include <limits>
+#include <memory>
+#include <stdexcept>
+#include <unordered_set>
+#include <vector>
+
+#include "rtp_llm/cpp/utils/AssertUtils.h"
+
+#include "rtp_llm/cpp/cache/BatchKVCacheResource.h"
+#include "rtp_llm/cpp/cache/SharedBlockCache.h"
+#include "rtp_llm/cpp/cache/BlockPool.h"
+#include "rtp_llm/cpp/cache/CacheConfig.h"
+#include "rtp_llm/cpp/cache/spec/CacheGroupType.h"
+#include "rtp_llm/cpp/cache/CPSlotMapper.h"
+#include "rtp_llm/cpp/cache/config_creator/CacheConfigCreator.h"
+#include "rtp_llm/cpp/cache/config_creator/HybridPoolConfigCreator.h"
+#include "rtp_llm/cpp/cache/allocator/HybridPoolKVCacheAllocator.h"
+#include "rtp_llm/cpp/cache/spec/LinearKVCacheSpec.h"
+#include "rtp_llm/cpp/cache/spec/MHAKVCacheSpec.h"
+#include "rtp_llm/cpp/cache/test/BlockPoolTestHelper.h"
+#include "rtp_llm/cpp/cache/test/CacheConfigTestUtils.h"
+#include "rtp_llm/cpp/config/ModelConfig.h"
+#include "rtp_llm/cpp/disaggregate/cache_store/CacheStore.h"
+#include "rtp_llm/cpp/disaggregate/cache_store/MemoryUtil.h"
+#include "rtp_llm/cpp/engine_base/stream/CompleteTokenIds.h"
+#include "rtp_llm/cpp/utils/Logger.h"
+
+namespace rtp_llm {
+namespace test {
+
+// ---------------------------------------------------------------------------
+// Helpers
+// ---------------------------------------------------------------------------
+
+// Build a tiny multi-pool config with two groups: gid=0 LINEAR(layers 0,1)
+// and gid=1 FULL(layers 2,3). Each group has its own per-group block budget,
+// so HybridPoolKVCacheAllocator creates two independent BlockPools.
+static CacheConfig makeTinyMultiPoolHybridConfig(uint32_t       linear_block_num = 6,
+                                                 uint32_t       full_block_num   = 8,
+                                                 CacheGroupType second_type      = CacheGroupType::FULL) {
+    CacheConfig config;
+    config.dtype                     = rtp_llm::DataType::TYPE_FP16;
+    config.layer_num                 = 4;
+    config.layer_all_num             = 4;
+    config.block_num                 = std::max(linear_block_num, full_block_num);
+    config.seq_size_per_block        = 4;
+    config.kernel_seq_size_per_block = 4;
+    config.linear_step               = 2;
+    config.group_layer_num           = 2;
+
+    auto linear_spec                = std::make_shared<LinearKVCacheSpec>();
+    linear_spec->type               = KVCacheSpecType::LinearAttention;
+    linear_spec->dtype              = config.dtype;
+    linear_spec->local_num_k_heads  = 1;
+    linear_spec->local_num_v_heads  = 1;
+    linear_spec->head_k_dim         = 1;
+    linear_spec->head_v_dim         = 1;
+    linear_spec->conv_kernel_dim    = 2;
+    linear_spec->local_head_num_kv  = 1;
+    linear_spec->seq_size_per_block = static_cast<uint32_t>(config.seq_size_per_block);
+
+    auto full_spec                = std::make_shared<MHAKVCacheSpec>();
+    full_spec->type               = KVCacheSpecType::MultiHeadAttention;
+    full_spec->dtype              = config.dtype;
+    full_spec->local_head_num_kv  = 1;
+    full_spec->size_per_head      = 1;
+    full_spec->seq_size_per_block = static_cast<uint32_t>(config.seq_size_per_block);
+
+    config.use_independent_block_pools = true;
+    config.fromGroupedSpecs({linear_spec, full_spec},
+                            {{0, 1}, {2, 3}},
+                            {CacheGroupType::LINEAR, second_type},
+                            {"linear", second_type == CacheGroupType::SWA ? "swa" : "full"});
+
+    // Same tokens per block for both groups.
+    config.group_seq_size_per_block = {config.seq_size_per_block, config.seq_size_per_block};
+
+    config.kv_block_stride_bytes = std::max(full_spec->block_size_bytes(), linear_spec->block_size_bytes());
+    config.kv_block_size_bytes   = static_cast<size_t>(config.group_layer_num) * config.kv_block_stride_bytes;
+    config.kv_scale_stride_bytes = 0;
+    config.kv_scale_size_bytes   = 0;
+    config.block_size_bytes      = config.kv_block_size_bytes + config.kv_scale_size_bytes;
+    config.layer_to_block_stride_bytes.assign(static_cast<size_t>(config.layer_all_num),
+                                              static_cast<int>(config.kv_block_stride_bytes));
+    const auto linear_stride = linear_spec->block_size_bytes();
+    const auto full_stride   = full_spec->block_size_bytes();
+    config.setGroupBlockLayout({linear_block_num, full_block_num}, {linear_stride, full_stride}, {0, 0});
+    return config;
+}
+
+static CacheConfig makeTinySwaMultiPoolHybridConfig(uint32_t linear_block_num = 6, uint32_t swa_block_num = 8) {
+    return makeTinyMultiPoolHybridConfig(linear_block_num, swa_block_num, CacheGroupType::SWA);
+}
+
+static ModelConfig makeTinyDSV4ModelConfig() {
+    ModelConfig mc;
+    mc.num_layers                        = 5;
+    mc.hidden_size                       = 32;
+    mc.attn_config.head_num              = 4;
+    mc.attn_config.kv_head_num           = 1;
+    mc.attn_config.size_per_head         = 8;
+    mc.attn_config.rope_head_dim         = 4;
+    mc.attn_config.sliding_window        = 128;
+    mc.attn_config.indexer_head_dim      = 8;
+    mc.attn_config.indexer_head_num      = 2;
+    mc.attn_config.indexer_topk          = 16;
+    mc.attn_config.o_groups              = 2;
+    mc.attn_config.o_lora_rank                                   = 16;
+    mc.attn_config.layer_compress_ratios                         = {4, 128, 4, 128, 0};
+    mc.hybrid_attention_config.enable_hybrid_attention           = true;
+    mc.hybrid_attention_config.enable_independent_kv_cache_pools = true;
+    setDsv4KvCacheSpecs(mc);
+    return mc;
+}
+
+static ModelConfig makeProModelConfig() {
+    ModelConfig mc;
+    mc.num_layers                   = 61;
+    mc.hidden_size                  = 7168;
+    mc.attn_config.head_num         = 128;
+    mc.attn_config.kv_head_num      = 1;
+    mc.attn_config.size_per_head    = 512;
+    mc.attn_config.rope_head_dim    = 64;
+    mc.attn_config.sliding_window   = 128;
+    mc.attn_config.indexer_head_dim = 128;
+    mc.attn_config.indexer_head_num = 64;
+    mc.attn_config.indexer_topk     = 1024;
+    mc.attn_config.o_groups         = 16;
+    mc.attn_config.o_lora_rank      = 1024;
+    std::vector<int> ratios;
+    ratios.push_back(128);
+    ratios.push_back(128);
+    for (int i = 2; i < 61; i++) {
+        ratios.push_back((i % 2 == 0) ? 4 : 128);
+    }
+    ratios.push_back(0);
+    mc.attn_config.layer_compress_ratios = ratios;
+    setDsv4KvCacheSpecs(mc);
+    return mc;
+}
+
+// Build a DSV4 7-pool CacheConfig (uses use_independent_block_pools=true).
+static CacheConfig makeDSV4HybridPoolConfig(uint32_t block_num = 200) {
+    auto              mc = makeProModelConfig();
+    mc.hybrid_attention_config.enable_hybrid_attention           = true;
+    mc.hybrid_attention_config.enable_independent_kv_cache_pools = true;
+    ParallelismConfig pc;
+    KVCacheConfig     kv_cache_config;
+    kv_cache_config.seq_size_per_block        = 128;
+    kv_cache_config.kernel_seq_size_per_block = 128;
+    auto config                               = CacheConfigCreator::createBasicConfig(mc, pc, kv_cache_config, false, 0);
+    config.block_num                   = block_num;
+    return config;
+}
+
+static void setExplicitBlocksForGroup(CacheConfig& config, size_t group_id, uint32_t block_num) {
+    ASSERT_LT(group_id, static_cast<size_t>(config.groupNums()));
+    std::vector<CacheGroupPolicy> policies;
+    policies.reserve(static_cast<size_t>(config.groupNums()));
+    for (size_t gid = 0; gid < static_cast<size_t>(config.groupNums()); ++gid) {
+        policies.push_back(config.policyForGroup(gid));
+    }
+    policies[group_id].explicit_block_num = block_num;
+    config.setGroupPolicies(policies);
+}
+
+static size_t firstExplicitIndependentGroup(const CacheConfig& config) {
+    for (size_t gid = 0; gid < static_cast<size_t>(config.groupNums()); ++gid) {
+        const auto policy = config.policyForGroup(gid);
+        if (policy.evict_policy == CacheEvictPolicy::INDEPENDENT && policy.explicit_block_num > 0) {
+            return gid;
+        }
+    }
+    ADD_FAILURE() << "missing explicit independent cache group";
+    return 0;
+}
+
+static CompleteTokenIdsPtr makeCompleteTokenIds(int batch_size, int seq_length, int seq_size_per_block) {
+    auto  cti        = std::make_shared<CompleteTokenIds>(batch_size, batch_size, seq_length + 64, seq_size_per_block);
+    auto  input_ids  = torch::empty({(int64_t)seq_length}, torch::kInt32);
+    auto* token_data = input_ids.data_ptr<int32_t>();
+    for (int i = 0; i < seq_length; ++i) {
+        token_data[i] = i + 1;
+    }
+    auto gi             = std::make_shared<GenerateInput>();
+    gi->input_ids       = input_ids;
+    gi->generate_config = std::make_shared<GenerateConfig>();
+    cti->init(gi);
+    return cti;
+}
+
+static BatchKVCacheResourcePtr makeBatchResource(int batch_size, const CacheConfig& config) {
+    auto res = std::make_shared<BatchKVCacheResource>();
+    res->resetBatchSize(batch_size);
+    res->initGroups(config.groupNums(),
+                    static_cast<int>(config.layer_all_num),
+                    config.layerGroupIdsSnapshot(),
+                    config.kernelBlocksPerKvBlock(),
+                    config.groupTypesSnapshot());
+    return res;
+}
+
+static std::vector<uint32_t> groupBlockNumsSnapshot(const CacheConfig& config) {
+    std::vector<uint32_t> block_nums;
+    block_nums.reserve(static_cast<size_t>(config.groupNums()));
+    for (size_t gid = 0; gid < static_cast<size_t>(config.groupNums()); ++gid) {
+        block_nums.push_back(config.blockNumForGroup(gid));
+    }
+    return block_nums;
+}
+
+static void setGroupBlockNums(CacheConfig& config, const std::vector<uint32_t>& block_nums) {
+    std::vector<size_t> kv_strides;
+    std::vector<size_t> scale_strides;
+    kv_strides.reserve(static_cast<size_t>(config.groupNums()));
+    scale_strides.reserve(static_cast<size_t>(config.groupNums()));
+    for (size_t gid = 0; gid < static_cast<size_t>(config.groupNums()); ++gid) {
+        kv_strides.push_back(config.kvBlockStrideBytesForGroup(gid));
+        scale_strides.push_back(config.kvScaleStrideBytesForGroup(gid));
+    }
+    config.setGroupBlockLayout(block_nums, kv_strides, scale_strides);
+}
+
+static size_t validBlockCount(const BlockIndicesType& blocks) {
+    return static_cast<size_t>(
+        std::count_if(blocks.begin(), blocks.end(), [](BlockIdxType block) { return !isNullBlockIdx(block); }));
+}
+
+// Create HybridPoolKVCacheAllocator with SharedBlockCache injected (required before init()).
+static HybridPoolKVCacheAllocatorPtr makeAllocator(const CacheConfig& config, RoleType role_type = RoleType::PDFUSION) {
+    auto allocator =
+        std::make_shared<HybridPoolKVCacheAllocator>(config, AllocationType::DEVICE, nullptr, 0, role_type);
+    auto shared_cache = std::make_shared<SharedBlockCache>();
+    allocator->setSharedBlockCache(shared_cache);
+    return allocator;
+}
+
+class RecordingMemoryUtil: public MemoryUtil {
+public:
+    bool regUserMr(void*, uint64_t, bool gpu, uint64_t) override {
+        reg_gpu_flags.push_back(gpu);
+        return true;
+    }
+
+    bool deregUserMr(void*, bool gpu) override {
+        dereg_gpu_flags.push_back(gpu);
+        return true;
+    }
+
+    bool isMemoryMr(void*, uint64_t, bool, bool) override {
+        return false;
+    }
+
+    bool findMemoryMr(void*, void*, uint64_t, bool, bool) override {
+        return false;
+    }
+
+    bool isRdmaMode() override {
+        return true;
+    }
+
+    std::vector<bool> reg_gpu_flags;
+    std::vector<bool> dereg_gpu_flags;
+};
+
+class RecordingCacheStore: public CacheStore {
+public:
+    explicit RecordingCacheStore(std::shared_ptr<MemoryUtil> memory_util): memory_util_(std::move(memory_util)) {}
+
+    void store(const std::shared_ptr<RequestBlockBuffer>&, CacheStoreStoreDoneCallback callback) override {
+        if (callback) {
+            callback(false, CacheStoreErrorCode::InvalidParams);
+        }
+    }
+
+    void load(const std::shared_ptr<RequestBlockBuffer>&,
+              CacheStoreLoadDoneCallback callback,
+              const std::string&,
+              uint32_t,
+              uint32_t,
+              uint32_t,
+              int,
+              int) override {
+        if (callback) {
+            callback(false, CacheStoreErrorCode::InvalidParams);
+        }
+    }
+
+    std::shared_ptr<LoadContext> loadBuffers(const std::vector<std::shared_ptr<RequestBlockBuffer>>&,
+                                             const std::string&,
+                                             uint32_t,
+                                             uint32_t,
+                                             int64_t,
+                                             LoadContext::CheckCancelFunc,
+                                             int,
+                                             int) override {
+        return nullptr;
+    }
+
+    std::shared_ptr<StoreContext> storeBuffers(const std::vector<std::shared_ptr<RequestBlockBuffer>>&,
+                                               int64_t) override {
+        return nullptr;
+    }
+
+    std::shared_ptr<RemoteStoreTask>
+    submitRemoteStoreTask(const std::shared_ptr<RemoteStoreRequest>&,
+                          const std::shared_ptr<CacheStoreRemoteStoreMetricsCollector>&,
+                          RemoteStoreTask::CheckCancelFunc) override {
+        return nullptr;
+    }
+
+    void releaseRemoteStoreTask(const std::shared_ptr<RemoteStoreTask>&) override {}
+
+    bool regUserBuffers(const std::vector<std::shared_ptr<BlockBuffer>>&) override {
+        return true;
+    }
+
+    std::shared_ptr<BlockBuffer> findUserBuffer(const std::string&) override {
+        return nullptr;
+    }
+
+    const std::shared_ptr<MemoryUtil>& getMemoryUtil() const override {
+        return memory_util_;
+    }
+
+    void debugInfo() override {}
+
+private:
+    std::shared_ptr<MemoryUtil> memory_util_;
+};
+
+// Insert a non-resident cache item into the shared block cache for a specific group.
+// Returns the BlockIdx allocated for the item (kept blockCache-referenced + request-released).
+static BlockIdxType
+seedNonResidentCacheItem(const HybridPoolKVCacheAllocatorPtr& allocator, int gid, CacheKeyType key) {
+    auto pool   = allocator->groupBlockPools()[static_cast<size_t>(gid)];
+    auto blocks = pool->malloc(1);
+    EXPECT_EQ(blocks.size(), 1u);
+    auto                      shared_cache = allocator->sharedBlockCache();
+    std::vector<BlockIdxType> group_slots(allocator->groupBlockPools().size(), NULL_BLOCK_IDX);
+    group_slots[static_cast<size_t>(gid)] = blocks[0];
+    shared_cache->put(key, group_slots, false);
+    // SharedBlockCache::put() internally calls pool->blockCacheReference()
+    pool->requestFree(blocks);
+    return blocks[0];
+}
+
+struct PoolCounters {
+    size_t free_blocks;
+    size_t available_blocks;
+    size_t request_refs;
+    size_t block_cache_refs;
+    size_t connector_refs;
+};
+
+static std::vector<PoolCounters> snapshotPoolCounters(const HybridPoolKVCacheAllocatorPtr& allocator) {
+    std::vector<PoolCounters> counters;
+    counters.reserve(allocator->groupBlockPools().size());
+    for (const auto& pool : allocator->groupBlockPools()) {
+        counters.push_back({pool->freeBlocksNum(),
+                            pool->availableBlocksNum(),
+                            pool->requestRefBlocksNum(),
+                            pool->blockCacheRefBlocksNum(),
+                            pool->connectorRefBlocksNum()});
+    }
+    return counters;
+}
+
+static void expectPoolCountersEq(const HybridPoolKVCacheAllocatorPtr& allocator,
+                                 const std::vector<PoolCounters>&     expected) {
+    ASSERT_EQ(allocator->groupBlockPools().size(), expected.size());
+    for (size_t gid = 0; gid < expected.size(); ++gid) {
+        const auto& pool = allocator->groupBlockPools()[gid];
+        EXPECT_EQ(pool->freeBlocksNum(), expected[gid].free_blocks) << "gid=" << gid;
+        EXPECT_EQ(pool->availableBlocksNum(), expected[gid].available_blocks) << "gid=" << gid;
+        EXPECT_EQ(pool->requestRefBlocksNum(), expected[gid].request_refs) << "gid=" << gid;
+        EXPECT_EQ(pool->blockCacheRefBlocksNum(), expected[gid].block_cache_refs) << "gid=" << gid;
+        EXPECT_EQ(pool->connectorRefBlocksNum(), expected[gid].connector_refs) << "gid=" << gid;
+    }
+}
+
+class HybridPoolKVCacheAllocatorTest: public ::testing::Test {
+protected:
+    void SetUp() override {
+        rtp_llm::initLogger();
+        createDevice();
+    }
+};
+
+// ---------------------------------------------------------------------------
+// Init / per-group pool creation
+// ---------------------------------------------------------------------------
+
+TEST_F(HybridPoolKVCacheAllocatorTest, InitCreatesIndependentBlockPoolPerGroup) {
+    auto config    = makeTinyMultiPoolHybridConfig(/*linear_block_num=*/6, /*full_block_num=*/8);
+    auto allocator = makeAllocator(config);
+    ASSERT_TRUE(allocator->init());
+
+    ASSERT_EQ(allocator->groupBlockPools().size(), 2u);
+    EXPECT_NE(allocator->groupBlockPools()[0], allocator->groupBlockPools()[1]);
+
+    // Per-pool totalBlocksNum = group_block_nums[gid] - 1 (block 0 reserved).
+    EXPECT_EQ(allocator->groupBlockPools()[0]->totalBlocksNum(), 6u - 1u);
+    EXPECT_EQ(allocator->groupBlockPools()[1]->totalBlocksNum(), 8u - 1u);
+}
+
+TEST_F(HybridPoolKVCacheAllocatorTest, SwaDefaultRegionGroupPoolUsesGpuBacking) {
+    auto config    = makeTinySwaMultiPoolHybridConfig(/*linear_block_num=*/6, /*swa_block_num=*/8);
+    auto allocator = makeAllocator(config);
+    ASSERT_TRUE(allocator->init());
+
+    ASSERT_EQ(allocator->groupBlockPools().size(), 2u);
+    EXPECT_EQ(allocator->groupBlockPools()[0]->where(), MemoryType::MEMORY_GPU);
+    EXPECT_EQ(allocator->groupBlockPools()[1]->where(), MemoryType::MEMORY_GPU);
+}
+
+TEST_F(HybridPoolKVCacheAllocatorTest, GetBlockPoolReturnsNullptrInHybridPoolMode) {
+    // HybridPoolKVCacheAllocator owns one BlockPool per group and does not
+    // expose a single canonical block_pool_; getBlockPool() must return nullptr.
+    auto config    = makeTinyMultiPoolHybridConfig();
+    auto allocator = makeAllocator(config);
+    ASSERT_TRUE(allocator->init());
+    EXPECT_EQ(allocator->getBlockPool(), nullptr);
+}
+
+// ---------------------------------------------------------------------------
+// Aggregated counters
+// ---------------------------------------------------------------------------
+
+TEST_F(HybridPoolKVCacheAllocatorTest, TotalAndFreeBlocksAggregateAcrossGroups) {
+    auto config    = makeTinyMultiPoolHybridConfig(/*linear_block_num=*/6, /*full_block_num=*/8);
+    auto allocator = makeAllocator(config);
+    ASSERT_TRUE(allocator->init());
+
+    const size_t expected_total = (6u - 1u) + (8u - 1u);
+    EXPECT_EQ(allocator->totalBlocksNum(), expected_total);
+    EXPECT_EQ(allocator->freeBlocksNum(), expected_total);
+    EXPECT_EQ(allocator->availableBlocksNum(), expected_total);
+    EXPECT_EQ(allocator->notInUseBlocksNum(), expected_total);
+    EXPECT_EQ(allocator->requestRefBlocksNum(), 0u);
+    EXPECT_EQ(allocator->connectorRefBlocksNum(), 0u);
+    EXPECT_EQ(allocator->blockCacheRefBlocksNum(), 0u);
+}
+
+TEST_F(HybridPoolKVCacheAllocatorTest, TokenAggregatorsUseDifferentCapacityScopes) {
+    auto config = makeTinyMultiPoolHybridConfig(/*linear_block_num=*/6, /*full_block_num=*/8);
+    // Group 0 (LINEAR): seq_size_per_block=2 -> 5 blocks * 2 = 10
+    // Group 1 (FULL):   seq_size_per_block=4 -> 7 blocks * 4 = 28
+    config.group_seq_size_per_block = {2, 4};
+    auto allocator                  = makeAllocator(config);
+    ASSERT_TRUE(allocator->init());
+
+    EXPECT_EQ(allocator->maxAvailableTokensNum(), 28u);
+    EXPECT_EQ(allocator->availableTokensNum(), 28u);
+    EXPECT_EQ(allocator->totalTokensNum(), 28u);
+}
+
+TEST_F(HybridPoolKVCacheAllocatorTest, TokenAggregatorsUseCPVirtualBlockSizeForFullGroups) {
+    auto config                     = makeTinyMultiPoolHybridConfig(/*linear_block_num=*/6, /*full_block_num=*/8);
+    config.group_seq_size_per_block = {100, 4};
+    auto allocator                  = makeAllocator(config);
+    ASSERT_TRUE(allocator->init());
+
+    EXPECT_EQ(allocator->maxAvailableTokensNum(), 7u * 4u);
+    EXPECT_EQ(allocator->availableTokensNum(), 7u * 4u);
+
+    allocator->setCPSlotMapper(std::make_shared<CPSlotMapper>(/*cp_rank=*/0, /*cp_size=*/2, /*block_size=*/4));
+
+    EXPECT_EQ(allocator->maxAvailableTokensNum(), 7u * 8u);
+    EXPECT_EQ(allocator->availableTokensNum(), 7u * 8u);
+}
+
+TEST_F(HybridPoolKVCacheAllocatorTest, TokenAggregatorsFallBackToGlobalSeqSize) {
+    auto config = makeTinyMultiPoolHybridConfig(/*linear_block_num=*/6, /*full_block_num=*/6);
+    config.group_seq_size_per_block.clear();  // fall back to config.seq_size_per_block
+    config.seq_size_per_block = 4;
+    auto allocator            = makeAllocator(config);
+    ASSERT_TRUE(allocator->init());
+
+    EXPECT_EQ(allocator->maxAvailableTokensNum(), 5u * 4u);
+    EXPECT_EQ(allocator->availableTokensNum(), 5u * 4u);
+}
+
+TEST_F(HybridPoolKVCacheAllocatorTest, RequestAndConnectorRefAggregateAcrossGroups) {
+    auto config    = makeTinyMultiPoolHybridConfig(/*linear_block_num=*/6, /*full_block_num=*/8);
+    auto allocator = makeAllocator(config);
+    ASSERT_TRUE(allocator->init());
+
+    auto pool0 = allocator->groupBlockPools()[0];
+    auto pool1 = allocator->groupBlockPools()[1];
+
+    const size_t free_total_before = allocator->freeBlocksNum();
+    auto         g0_blocks         = pool0->malloc(2);
+    auto         g1_blocks         = pool1->malloc(3);
+    ASSERT_EQ(g0_blocks.size(), 2u);
+    ASSERT_EQ(g1_blocks.size(), 3u);
+
+    EXPECT_EQ(allocator->requestRefBlocksNum(), 5u);
+    EXPECT_EQ(allocator->freeBlocksNum(), free_total_before - 5u);
+    EXPECT_EQ(allocator->availableBlocksNum(), free_total_before - 5u);
+
+    // Mark some blocks as connector-referenced (simulating cache transfer).
+    pool0->connectorReference(g0_blocks[0]);
+    pool1->connectorReference(g1_blocks[0]);
+    EXPECT_EQ(allocator->connectorRefBlocksNum(), 2u);
+
+    pool0->requestFree(g0_blocks);
+    pool1->requestFree(g1_blocks);
+    EXPECT_EQ(allocator->requestRefBlocksNum(), 0u);
+
+    // Connector still holds 2 blocks → freeBlocksNum (set of returnable
+    // ids) drops by 2; notInUseBlocksNum counts blocks not held by *request*
+    // or *block cache* refs, so connector-held blocks still count as "not
+    // in use" → equals the full pool total.
+    EXPECT_EQ(allocator->freeBlocksNum(), free_total_before - 2u);
+    EXPECT_EQ(allocator->notInUseBlocksNum(), free_total_before);
+
+    pool0->connectorFree(g0_blocks[0]);
+    pool1->connectorFree(g1_blocks[0]);
+    EXPECT_EQ(allocator->connectorRefBlocksNum(), 0u);
+    EXPECT_EQ(allocator->freeBlocksNum(), free_total_before);
+    EXPECT_EQ(allocator->notInUseBlocksNum(), free_total_before);
+}
+
+TEST_F(HybridPoolKVCacheAllocatorTest, BlockCacheRefAggregatesAcrossGroups) {
+    auto config    = makeTinyMultiPoolHybridConfig();
+    auto allocator = makeAllocator(config);
+    ASSERT_TRUE(allocator->init());
+
+    seedNonResidentCacheItem(allocator, /*gid=*/0, /*key=*/100);
+    seedNonResidentCacheItem(allocator, /*gid=*/1, /*key=*/200);
+    seedNonResidentCacheItem(allocator, /*gid=*/1, /*key=*/201);
+
+    EXPECT_EQ(allocator->blockCacheRefBlocksNum(), 3u);
+}
+
+// ---------------------------------------------------------------------------
+// Address / buffer lookups
+// ---------------------------------------------------------------------------
+
+TEST_F(HybridPoolKVCacheAllocatorTest, ConvertIndexToAddrAndBufferDefault) {
+    auto config    = makeTinyMultiPoolHybridConfig();
+    auto allocator = makeAllocator(config);
+    ASSERT_TRUE(allocator->init());
+
+    // Layer in linear group.
+    {
+        auto addr = allocator->convertIndexToAddr(/*layer_id=*/0, /*block_id=*/1);
+        EXPECT_NE(addr.kv_addr, nullptr);
+        auto bufs = allocator->convertIndexToBuffer(/*layer_id=*/0, /*block_id=*/1);
+        ASSERT_FALSE(bufs.empty());
+        EXPECT_NE(bufs[0].addr, nullptr);
+    }
+    // Layer in full group.
+    {
+        auto addr = allocator->convertIndexToAddr(/*layer_id=*/3, /*block_id=*/1);
+        EXPECT_NE(addr.kv_addr, nullptr);
+        auto bufs = allocator->convertIndexToBuffer(/*layer_id=*/3, /*block_id=*/1);
+        ASSERT_FALSE(bufs.empty());
+        EXPECT_NE(bufs[0].addr, nullptr);
+    }
+}
+
+TEST_F(HybridPoolKVCacheAllocatorTest, ConvertIndexToBufferPartitionDefault) {
+    auto config    = makeTinyMultiPoolHybridConfig();
+    auto allocator = makeAllocator(config);
+    ASSERT_TRUE(allocator->init());
+
+    auto bufs = allocator->convertIndexToBuffer(
+        /*layer_id=*/3, /*block_id=*/1, /*partition_count=*/1, /*partition_id=*/0);
+    ASSERT_FALSE(bufs.empty());
+    EXPECT_NE(bufs[0].addr, nullptr);
+}
+
+TEST_F(HybridPoolKVCacheAllocatorTest, ConvertIndexToAddrAndBufferByGroup) {
+    auto config    = makeTinyMultiPoolHybridConfig();
+    auto allocator = makeAllocator(config);
+    ASSERT_TRUE(allocator->init());
+
+    auto addr_default   = allocator->convertIndexToAddr(/*layer_id=*/0, /*group_id=*/0, /*block_id=*/1);
+    auto addr_via_layer = allocator->convertIndexToAddr(/*layer_id=*/0, /*block_id=*/1);
+    EXPECT_EQ(addr_default.kv_addr, addr_via_layer.kv_addr);
+
+    auto bufs_default = allocator->convertIndexToBuffer(/*layer_id=*/0, /*group_id=*/0, /*block_id=*/1);
+    ASSERT_FALSE(bufs_default.empty());
+    EXPECT_NE(bufs_default[0].addr, nullptr);
+
+    auto bufs_partitioned = allocator->convertIndexToBuffer(
+        /*layer_id=*/0, /*group_id=*/0, /*block_id=*/1, /*partition_count=*/1, /*partition_id=*/0);
+    ASSERT_FALSE(bufs_partitioned.empty());
+    EXPECT_NE(bufs_partitioned[0].addr, nullptr);
+}
+
+TEST_F(HybridPoolKVCacheAllocatorTest, AllLayerCacheBaseExposesPerLayerAndPerGroupTensors) {
+    auto config    = makeTinyMultiPoolHybridConfig();
+    auto allocator = makeAllocator(config);
+    ASSERT_TRUE(allocator->init());
+
+    auto layout = allocator->allLayerCacheBase();
+    ASSERT_EQ(layout.layers_to_kv_buffer_ptrs.size(), static_cast<size_t>(config.layer_all_num));
+    for (size_t i = 0; i < layout.layers_to_kv_buffer_ptrs.size(); ++i) {
+        EXPECT_TRUE(layout.layers_to_kv_buffer_ptrs[i].defined()) << "layer " << i << " missing kv buffer";
+    }
+    EXPECT_EQ(layout.layer_to_group_ids, config.layerGroupIdsSnapshot());
+    EXPECT_EQ(layout.group_types, config.groupTypesSnapshot());
+
+    ASSERT_EQ(layout.layers_to_kv_buffer_ptrs_by_group.size(), static_cast<size_t>(config.layer_all_num));
+    for (size_t i = 0; i < layout.layers_to_kv_buffer_ptrs_by_group.size(); ++i) {
+        EXPECT_EQ(layout.layers_to_kv_buffer_ptrs_by_group[i].size(), static_cast<size_t>(config.groupNums()));
+    }
+
+    for (size_t i = 0; i < static_cast<size_t>(config.layer_all_num); ++i) {
+        ASSERT_FALSE(layout.layer_to_group_ids[i].empty());
+        const auto gid        = static_cast<size_t>(layout.layer_to_group_ids[i].front());
+        const auto& by_default = layout.layers_to_kv_buffer_ptrs_by_group[i][gid];
+        EXPECT_TRUE(by_default.defined()) << "layer " << i << " primary group tensor undefined";
+        EXPECT_EQ(by_default.data_ptr(), layout.layers_to_kv_buffer_ptrs[i].data_ptr());
+    }
+}
+
+// ---------------------------------------------------------------------------
+// regUserMr / getMrCostTimeMs
+// ---------------------------------------------------------------------------
+
+TEST_F(HybridPoolKVCacheAllocatorTest, RegUserMrWithoutCacheStoreIsNoOpAndZeroCost) {
+    auto config    = makeTinyMultiPoolHybridConfig();
+    auto allocator = makeAllocator(config);
+    ASSERT_TRUE(allocator->init());
+
+    // No CacheStore is plumbed in: regUserMr should be a benign no-op for every
+    // group pool, and the aggregated MR cost remains zero.
+    EXPECT_NO_THROW(allocator->regUserMr(/*model_id=*/0, /*cache_store=*/nullptr));
+    EXPECT_EQ(allocator->getMrCostTimeMs(), 0);
+}
+
+// ---------------------------------------------------------------------------
+// popBlocksFromCache / blockCacheFree
+// ---------------------------------------------------------------------------
+
+TEST_F(HybridPoolKVCacheAllocatorTest, PopBlocksFromCacheReturnsEvictedBatchAcrossGroups) {
+    auto config    = makeTinyMultiPoolHybridConfig(/*linear_block_num=*/6, /*full_block_num=*/8);
+    auto allocator = makeAllocator(config);
+    ASSERT_TRUE(allocator->init());
+
+    // Seed identical key on both groups, plus a unique key on the full group.
+    auto g0_block_for_100 = seedNonResidentCacheItem(allocator, /*gid=*/0, /*key=*/100);
+    auto g1_block_for_100 = seedNonResidentCacheItem(allocator, /*gid=*/1, /*key=*/100);
+    auto g1_block_for_200 = seedNonResidentCacheItem(allocator, /*gid=*/1, /*key=*/200);
+    EXPECT_EQ(allocator->blockCacheRefBlocksNum(), 3u);
+
+    auto evicted = allocator->popBlocksFromCache(/*min_blocks_to_free=*/3);
+    ASSERT_NE(evicted, nullptr);
+    EXPECT_EQ(evicted->batchSize(), 1);
+    EXPECT_EQ(evicted->groupNums(), 2);
+    EXPECT_TRUE(evicted->cacheResource(0).cacheKeysAreCpCanonical());
+    const auto& keys = evicted->cacheKeys(0);
+    EXPECT_EQ(keys.size(), 2u);  // 100 (shared) + 200 (g1 only)
+
+    std::unordered_set<CacheKeyType> key_set(keys.begin(), keys.end());
+    EXPECT_TRUE(key_set.count(100));
+    EXPECT_TRUE(key_set.count(200));
+
+    // Per-group block ids: each group's blocks should be set only at the slot
+    // matching the key it owned, and NULL elsewhere.
+    const auto& g0_blocks = evicted->blocks(/*batch_id=*/0, /*gid=*/0);
+    const auto& g1_blocks = evicted->blocks(/*batch_id=*/0, /*gid=*/1);
+    ASSERT_EQ(g0_blocks.size(), 2u);
+    ASSERT_EQ(g1_blocks.size(), 2u);
+
+    auto idx_of = [&](CacheKeyType k) -> size_t {
+        for (size_t i = 0; i < keys.size(); ++i) {
+            if (keys[i] == k) {
+                return i;
+            }
+        }
+        return keys.size();
+    };
+    const size_t pos_100 = idx_of(100);
+    const size_t pos_200 = idx_of(200);
+    ASSERT_LT(pos_100, keys.size());
+    ASSERT_LT(pos_200, keys.size());
+
+    EXPECT_EQ(g0_blocks[pos_100], g0_block_for_100);
+    EXPECT_TRUE(isNullBlockIdx(g0_blocks[pos_200]));
+    EXPECT_EQ(g1_blocks[pos_100], g1_block_for_100);
+    EXPECT_EQ(g1_blocks[pos_200], g1_block_for_200);
+}
+
+TEST_F(HybridPoolKVCacheAllocatorTest, PopBlocksFromCacheZeroFreeReturnsNull) {
+    auto config    = makeTinyMultiPoolHybridConfig();
+    auto allocator = makeAllocator(config);
+    ASSERT_TRUE(allocator->init());
+    EXPECT_EQ(allocator->popBlocksFromCache(0), nullptr);
+}
+
+TEST_F(HybridPoolKVCacheAllocatorTest, PopBlocksFromCacheEmptyCachesReturnsNull) {
+    auto config    = makeTinyMultiPoolHybridConfig();
+    auto allocator = makeAllocator(config);
+    ASSERT_TRUE(allocator->init());
+    EXPECT_EQ(allocator->popBlocksFromCache(/*min_blocks_to_free=*/4), nullptr);
+}
+
+TEST_F(HybridPoolKVCacheAllocatorTest, BlockCacheFreeReleasesEvictedBatchAcrossGroups) {
+    auto config    = makeTinyMultiPoolHybridConfig(/*linear_block_num=*/6, /*full_block_num=*/6);
+    auto allocator = makeAllocator(config);
+    ASSERT_TRUE(allocator->init());
+
+    seedNonResidentCacheItem(allocator, /*gid=*/0, /*key=*/100);
+    seedNonResidentCacheItem(allocator, /*gid=*/1, /*key=*/200);
+    EXPECT_EQ(allocator->blockCacheRefBlocksNum(), 2u);
+
+    const size_t free_before = allocator->freeBlocksNum();
+    auto         evicted     = allocator->popBlocksFromCache(/*min_blocks_to_free=*/2);
+    ASSERT_NE(evicted, nullptr);
+    // Eviction releases the LRU entries from BlockCache; the underlying blocks
+    // are still referenced by blockCacheRef. Releasing those refs is what
+    // blockCacheFree() does.
+    allocator->blockCacheFree(evicted);
+    EXPECT_EQ(allocator->blockCacheRefBlocksNum(), 0u);
+    EXPECT_EQ(allocator->freeBlocksNum(), free_before + 2u);
+}
+
+TEST_F(HybridPoolKVCacheAllocatorTest, BlockCacheFreeNullPtrIsNoOp) {
+    auto config    = makeTinyMultiPoolHybridConfig();
+    auto allocator = makeAllocator(config);
+    ASSERT_TRUE(allocator->init());
+    EXPECT_NO_THROW(allocator->blockCacheFree(nullptr));
+}
+
+TEST_F(HybridPoolKVCacheAllocatorTest, BlockCacheFreeIgnoresDuplicateAndNullBlockIds) {
+    auto config    = makeTinyMultiPoolHybridConfig();
+    auto allocator = makeAllocator(config);
+    ASSERT_TRUE(allocator->init());
+
+    auto seeded = seedNonResidentCacheItem(allocator, /*gid=*/1, /*key=*/300);
+    EXPECT_EQ(allocator->blockCacheRefBlocksNum(), 1u);
+
+    auto batch = std::make_shared<BatchKVCacheResource>();
+    batch->resetBatchSize(1);
+    batch->initGroups(config.groupNums(), static_cast<int>(config.layer_all_num), config.layerGroupIdsSnapshot());
+    // Same block listed twice in the same group should only be released once;
+    // NULL_BLOCK_IDX entries should be skipped.
+    batch->mutableBlockIds(0, /*gid=*/1).assign(BlockIndicesType{seeded, seeded, NULL_BLOCK_IDX});
+    EXPECT_NO_THROW(allocator->blockCacheFree(batch));
+    EXPECT_EQ(allocator->blockCacheRefBlocksNum(), 0u);
+}
+
+// ---------------------------------------------------------------------------
+// hasAvailableBlocksForReserve via reserve_block_num
+// ---------------------------------------------------------------------------
+
+TEST_F(HybridPoolKVCacheAllocatorTest, ReserveBlocksAreDistributedAcrossGroupsForInitMalloc) {
+    // Group 0 (linear) gets 6 blocks (5 free), group 1 (full) gets 4 blocks (3 free).
+    // total_available = 8. Set reserve = 4.
+    // Expected per-group reserve: floor(4 * 5/8) = 2 for gid=0, floor(4 * 3/8) = 1 for gid=1.
+    auto config    = makeTinyMultiPoolHybridConfig(/*linear_block_num=*/6, /*full_block_num=*/4);
+    auto allocator = makeAllocator(config);
+    ASSERT_TRUE(allocator->init());
+
+    allocator->setReserveBlockNum(4);
+
+    // seq_len=4 -> 1 block per group.
+    auto batch_res = makeBatchResource(/*batch_size=*/1, config);
+    batch_res->setBatchCacheKeys(0, CacheKeysType{100});
+    auto       token_ids = makeCompleteTokenIds(/*batch_size=*/1, /*seq_length=*/4, /*seq_size_per_block=*/4);
+    MallocInfo malloc_info{batch_res, token_ids};
+    malloc_info.enable_device_cache = false;
+    malloc_info.reuse_cache         = false;
+    auto result                     = allocator->malloc(malloc_info);
+    EXPECT_TRUE(result.success);
+}
+
+TEST_F(HybridPoolKVCacheAllocatorTest, ReserveBlocksRejectsWhenGroupCannotMeetItsShare) {
+    // Force a group whose available_blocks < need + group_reserve_blocks.
+    auto config    = makeTinyMultiPoolHybridConfig(/*linear_block_num=*/6, /*full_block_num=*/4);
+    auto allocator = makeAllocator(config);
+    ASSERT_TRUE(allocator->init());
+
+    // A reserve large enough to hide most blocks should reject init malloc.
+    allocator->setReserveBlockNum(allocator->availableBlocksNum());
+
+    auto batch_res = makeBatchResource(/*batch_size=*/1, config);
+    batch_res->setBatchCacheKeys(0, CacheKeysType{100});
+    auto       token_ids = makeCompleteTokenIds(/*batch_size=*/1, /*seq_length=*/4, /*seq_size_per_block=*/4);
+    MallocInfo malloc_info{batch_res, token_ids};
+    malloc_info.enable_device_cache = false;
+    malloc_info.reuse_cache         = false;
+    malloc_info.verbose             = false;
+    auto result                     = allocator->malloc(malloc_info);
+    EXPECT_FALSE(result.success);
+}
+
+TEST_F(HybridPoolKVCacheAllocatorTest, PoolMetricsSnapshotsReportReserveBlocks) {
+    auto config    = makeTinyMultiPoolHybridConfig(/*linear_block_num=*/6, /*full_block_num=*/8);
+    auto allocator = makeAllocator(config);
+    ASSERT_TRUE(allocator->init());
+
+    constexpr size_t reserve_blocks = 6;
+    allocator->setReserveBlockNum(reserve_blocks);
+
+    const auto snapshots = allocator->poolMetricsSnapshots();
+    ASSERT_EQ(snapshots.size(), 2u);
+    EXPECT_EQ("linear", snapshots[0].pool_name);
+    EXPECT_EQ("full", snapshots[1].pool_name);
+
+    const size_t total_reservable_available_blocks =
+        snapshots[0].available_blocks + snapshots[1].available_blocks;
+    ASSERT_GT(total_reservable_available_blocks, 0u);
+    EXPECT_EQ(reserve_blocks * snapshots[0].available_blocks / total_reservable_available_blocks,
+              snapshots[0].reserve_blocks);
+    EXPECT_EQ(reserve_blocks * snapshots[1].available_blocks / total_reservable_available_blocks,
+              snapshots[1].reserve_blocks);
+}
+
+TEST_F(HybridPoolKVCacheAllocatorTest, ReserveBlocksUseCPShardedFullGroupNeed) {
+    auto config    = makeTinyMultiPoolHybridConfig(/*linear_block_num=*/20, /*full_block_num=*/6);
+    auto allocator = makeAllocator(config);
+    ASSERT_TRUE(allocator->init());
+
+    allocator->setReserveBlockNum(1);
+
+    auto batch_res = makeBatchResource(/*batch_size=*/1, config);
+    batch_res->setBatchCacheKeys(0, CacheKeysType{100, 101, 102, 103, 104, 105, 106, 107});
+    auto token_ids = makeCompleteTokenIds(/*batch_size=*/1, /*seq_length=*/32, /*seq_size_per_block=*/4);
+    allocator->setCPSlotMapper(std::make_shared<CPSlotMapper>(/*cp_rank=*/0, /*cp_size=*/2, /*block_size=*/4));
+
+    MallocInfo malloc_info{batch_res, token_ids};
+    malloc_info.enable_device_cache = false;
+    malloc_info.reuse_cache         = false;
+
+    auto result = allocator->malloc(malloc_info);
+    ASSERT_TRUE(result.success);
+    EXPECT_EQ(validBlockCount(batch_res->blocks(0, /*gid=*/1)), 4u);
+
+    FreeInfo free_info{batch_res, token_ids};
+    allocator->free(free_info);
+}
+
+TEST_F(HybridPoolKVCacheAllocatorTest, ReserveCheckIsBypassedWhenMallocInfoLacksContext) {
+    // hasAvailableBlocksForReserve returns true when info has no resource/tokens.
+    auto config    = makeTinyMultiPoolHybridConfig();
+    auto allocator = makeAllocator(config);
+    ASSERT_TRUE(allocator->init());
+
+    MallocInfo info{};
+    EXPECT_TRUE(allocator->hasAvailableBlocksForReserve(info, /*reserve_blocks=*/9999));
+}
+
+TEST_F(HybridPoolKVCacheAllocatorTest, InitMallocRollbackFreesPartiallyAllocatedGroupBlocks) {
+    // gid=0 has enough room for the LINEAR tail block; gid=1 cannot satisfy
+    // the 3 FULL blocks needed for seq_len=9. initMallocForCommonLen should
+    // roll gid=0 back after gid=1 fails.
+    auto config    = makeTinyMultiPoolHybridConfig(/*linear_block_num=*/3, /*full_block_num=*/3);
+    auto allocator = makeAllocator(config);
+    ASSERT_TRUE(allocator->init());
+
+    const auto counters_before = snapshotPoolCounters(allocator);
+
+    auto batch_res = makeBatchResource(/*batch_size=*/1, config);
+    batch_res->setBatchCacheKeys(0, CacheKeysType{100, 101, 102});
+    auto       token_ids = makeCompleteTokenIds(/*batch_size=*/1, /*seq_length=*/9, /*seq_size_per_block=*/4);
+    MallocInfo malloc_info{batch_res, token_ids};
+    malloc_info.enable_device_cache = false;
+    malloc_info.reuse_cache         = false;
+    malloc_info.verbose             = false;
+
+    auto result = allocator->malloc(malloc_info);
+    EXPECT_FALSE(result.success);
+
+    EXPECT_EQ(batch_res->curBlocksNum(), 0u);
+    EXPECT_EQ(batch_res->blocksNum(0, /*gid=*/0), 0u);
+    EXPECT_EQ(batch_res->blocksNum(0, /*gid=*/1), 0u);
+    EXPECT_EQ(allocator->requestRefBlocksNum(), 0u);
+    expectPoolCountersEq(allocator, counters_before);
+}
+
+TEST_F(HybridPoolKVCacheAllocatorTest, InitMallocRollbackReleasesDeviceReuseReferencesOnReserveReject) {
+    auto config    = makeTinyMultiPoolHybridConfig(/*linear_block_num=*/4, /*full_block_num=*/4);
+    auto allocator = makeAllocator(config);
+    ASSERT_TRUE(allocator->init());
+
+    const auto linear_cached = seedNonResidentCacheItem(allocator, /*gid=*/0, /*key=*/100);
+    const auto full_cached   = seedNonResidentCacheItem(allocator, /*gid=*/1, /*key=*/100);
+    ASSERT_FALSE(isNullBlockIdx(linear_cached));
+    ASSERT_FALSE(isNullBlockIdx(full_cached));
+    ASSERT_EQ(allocator->requestRefBlocksNum(), 0u);
+    ASSERT_EQ(allocator->blockCacheRefBlocksNum(), 2u);
+
+    const size_t available_before = allocator->availableBlocksNum();
+    const auto   counters_before  = snapshotPoolCounters(allocator);
+    allocator->setReserveBlockNum(std::max<size_t>(1, available_before * 8));
+
+    auto batch_res = makeBatchResource(/*batch_size=*/1, config);
+    batch_res->setBatchCacheKeys(0, CacheKeysType{100, 101, 102});
+    auto       token_ids = makeCompleteTokenIds(/*batch_size=*/1, /*seq_length=*/8, /*seq_size_per_block=*/4);
+    MallocInfo malloc_info{batch_res, token_ids};
+    malloc_info.enable_device_cache = true;
+    malloc_info.reuse_cache         = true;
+    malloc_info.verbose             = false;
+
+    auto result = allocator->malloc(malloc_info);
+    EXPECT_FALSE(result.success);
+
+    EXPECT_EQ(batch_res->curBlocksNum(), 0u);
+    EXPECT_EQ(batch_res->blocksNum(0, /*gid=*/0), 0u);
+    EXPECT_EQ(batch_res->blocksNum(0, /*gid=*/1), 0u);
+    EXPECT_EQ(allocator->requestRefBlocksNum(), 0u);
+    EXPECT_EQ(allocator->blockCacheRefBlocksNum(), 2u);
+    expectPoolCountersEq(allocator, counters_before);
+}
+
+TEST_F(HybridPoolKVCacheAllocatorTest, IncrMallocRollbackFreesPartiallyAllocatedGroupBlocks) {
+    auto config    = makeTinyMultiPoolHybridConfig(/*linear_block_num=*/4, /*full_block_num=*/2);
+    auto allocator = makeAllocator(config);
+    ASSERT_TRUE(allocator->init());
+
+    auto batch_res = makeBatchResource(/*batch_size=*/1, config);
+    batch_res->setBatchCacheKeys(0, CacheKeysType{100, 101, 102});
+
+    auto       token_ids = makeCompleteTokenIds(/*batch_size=*/1, /*seq_length=*/4, /*seq_size_per_block=*/4);
+    MallocInfo init_info{batch_res, token_ids};
+    init_info.enable_device_cache = false;
+    init_info.reuse_cache         = false;
+    ASSERT_TRUE(allocator->malloc(init_info).success);
+
+    ASSERT_EQ(batch_res->blocksNum(0, /*gid=*/0), 1u);
+    ASSERT_EQ(batch_res->blocksNum(0, /*gid=*/1), 1u);
+    const auto linear_block_before = batch_res->blocks(0, /*gid=*/0)[0];
+    const auto full_block_before   = batch_res->blocks(0, /*gid=*/1)[0];
+    const auto counters_before     = snapshotPoolCounters(allocator);
+
+    // gid=0 can append one real LINEAR tail block. gid=1 has no remaining
+    // free blocks and no cache to evict, so FULL allocation fails.
+    token_ids->setSeqLength(9);
+    MallocInfo incr_info{batch_res, token_ids};
+    incr_info.enable_device_cache = false;
+    incr_info.reuse_cache         = false;
+    auto incr_result              = allocator->malloc(incr_info);
+    EXPECT_FALSE(incr_result.success);
+
+    ASSERT_EQ(batch_res->blocksNum(0, /*gid=*/0), 1u);
+    ASSERT_EQ(batch_res->blocksNum(0, /*gid=*/1), 1u);
+    EXPECT_EQ(batch_res->blocks(0, /*gid=*/0)[0], linear_block_before);
+    EXPECT_EQ(batch_res->blocks(0, /*gid=*/1)[0], full_block_before);
+    expectPoolCountersEq(allocator, counters_before);
+}
+
+// ---------------------------------------------------------------------------
+// Full malloc / free cycle
+// ---------------------------------------------------------------------------
+
+TEST_F(HybridPoolKVCacheAllocatorTest, MallocAndFreeCycleAcrossPerGroupPools) {
+    auto config    = makeTinyMultiPoolHybridConfig(/*linear_block_num=*/8, /*full_block_num=*/8);
+    auto allocator = makeAllocator(config);
+    ASSERT_TRUE(allocator->init());
+
+    const size_t free_before = allocator->freeBlocksNum();
+
+    auto batch_res = makeBatchResource(/*batch_size=*/1, config);
+    batch_res->setBatchCacheKeys(0, CacheKeysType{100, 101, 102});
+    auto       token_ids = makeCompleteTokenIds(/*batch_size=*/1, /*seq_length=*/12, /*seq_size_per_block=*/4);
+    MallocInfo malloc_info{batch_res, token_ids};
+    malloc_info.enable_device_cache = false;
+    malloc_info.reuse_cache         = false;
+    auto result                     = allocator->malloc(malloc_info);
+    ASSERT_TRUE(result.success);
+    EXPECT_LT(allocator->freeBlocksNum(), free_before);
+
+    FreeInfo free_info{batch_res, token_ids};
+    allocator->free(free_info);
+    EXPECT_EQ(allocator->freeBlocksNum(), free_before);
+}
+
+// ---------------------------------------------------------------------------
+// DSV4 7-group HybridPool: covers per-tag addressing and SWA tail
+// ---------------------------------------------------------------------------
+
+TEST_F(HybridPoolKVCacheAllocatorTest, DSV4InitAndAggregatedCounters) {
+    auto config    = makeDSV4HybridPoolConfig(/*block_num=*/200);
+    auto allocator = makeAllocator(config);
+    ASSERT_TRUE(allocator->init());
+
+    EXPECT_EQ(config.groupNums(), 7);
+    ASSERT_EQ(allocator->groupBlockPools().size(), 7u);
+
+    // Sum of per-pool totals must equal aggregated totalBlocksNum.
+    size_t expected_total = 0;
+    for (const auto& pool : allocator->groupBlockPools()) {
+        expected_total += pool->totalBlocksNum();
+    }
+    EXPECT_EQ(allocator->totalBlocksNum(), expected_total);
+    EXPECT_EQ(allocator->freeBlocksNum(), expected_total);
+    EXPECT_EQ(allocator->availableBlocksNum(), expected_total);
+}
+
+TEST_F(HybridPoolKVCacheAllocatorTest, DSV4FixedTagPoolsUseGpuBacking) {
+    auto config = makeDSV4HybridPoolConfig(/*block_num=*/200);
+    auto allocator = makeAllocator(config);
+    ASSERT_TRUE(allocator->init());
+
+    ASSERT_EQ(allocator->groupBlockPools().size(), 7u);
+    for (size_t gid = 0; gid < allocator->groupBlockPools().size(); ++gid) {
+        EXPECT_EQ(allocator->groupBlockPools()[gid]->where(), MemoryType::MEMORY_GPU)
+            << "gid=" << gid << " tag=" << config.tagForGroup(gid);
+    }
+}
+
+TEST_F(HybridPoolKVCacheAllocatorTest, DSV4HCAStateReuseEnabledAllocatesTailOnly) {
+    auto config        = makeDSV4HybridPoolConfig(/*block_num=*/200);
+    config.linear_step = 4;
+    auto allocator     = makeAllocator(config);
+    ASSERT_TRUE(allocator->init());
+
+    const int hca_state_gid = config.groupIdForTag("hca_state");
+    ASSERT_GT(config.groupNums(), hca_state_gid);
+    ASSERT_GT(allocator->groupBlockPools().size(), static_cast<size_t>(hca_state_gid));
+
+    const size_t hca_free_before = allocator->groupBlockPools()[hca_state_gid]->freeBlocksNum();
+
+    auto batch_res = makeBatchResource(/*batch_size=*/1, config);
+    batch_res->setBatchCacheKeys(0, CacheKeysType{100, 101, 102, 103, 104, 105, 106, 107, 108, 109});
+    auto token_ids = makeCompleteTokenIds(
+        /*batch_size=*/1, /*seq_length=*/10 * static_cast<int>(config.seq_size_per_block), config.seq_size_per_block);
+
+    MallocInfo malloc_info{batch_res, token_ids};
+    malloc_info.enable_device_cache = false;
+    malloc_info.reuse_cache         = true;
+    auto result                     = allocator->malloc(malloc_info);
+    ASSERT_TRUE(result.success);
+
+    const auto& hca_blocks = batch_res->blocks(0, hca_state_gid);
+    ASSERT_EQ(hca_blocks.size(), 10u);
+    EXPECT_EQ(validBlockCount(hca_blocks), 1u);
+    EXPECT_TRUE(isNullBlockIdx(hca_blocks[8]));
+    EXPECT_FALSE(isNullBlockIdx(hca_blocks[9]));
+    EXPECT_EQ(hca_free_before - allocator->groupBlockPools()[hca_state_gid]->freeBlocksNum(), 1u);
+}
+
+TEST_F(HybridPoolKVCacheAllocatorTest, TokenAggregatorsIgnoreSmallHCAStatePool) {
+    auto config = makeDSV4HybridPoolConfig(/*block_num=*/50);
+
+    const int hca_state_gid = config.groupIdForTag("hca_state");
+    ASSERT_GT(config.groupNums(), hca_state_gid);
+    auto block_nums = groupBlockNumsSnapshot(config);
+    block_nums[hca_state_gid] = 2;
+    setGroupBlockNums(config, block_nums);
+
+    auto allocator = makeAllocator(config);
+    ASSERT_TRUE(allocator->init());
+    ASSERT_GT(allocator->groupBlockPools().size(), static_cast<size_t>(hca_state_gid));
+
+    const auto hca_state_tokens =
+        allocator->groupBlockPools()[hca_state_gid]->totalBlocksNum() * config.group_seq_size_per_block[hca_state_gid];
+    EXPECT_LT(hca_state_tokens, allocator->totalTokensNum());
+    EXPECT_EQ(allocator->availableTokensNum(), allocator->maxAvailableTokensNum());
+    EXPECT_EQ(allocator->totalTokensNum(), allocator->maxAvailableTokensNum());
+}
+
+TEST_F(HybridPoolKVCacheAllocatorTest, DSV4ConfigUsesOnlyPagedGroupsForBlockSize) {
+    auto              mc = makeTinyDSV4ModelConfig();
+    ParallelismConfig pc;
+    KVCacheConfig     kv_cache_config;
+    kv_cache_config.seq_size_per_block        = 128;
+    kv_cache_config.kernel_seq_size_per_block = 128;
+    auto config                               = CacheConfigCreator::createBasicConfig(mc, pc, kv_cache_config, false, 0);
+
+    ASSERT_EQ(config.groupNums(), 7);
+    ASSERT_EQ(config.groupNums(), 7);
+
+    size_t expected_non_full_bytes = 0;
+    size_t expected_full_bytes     = 0;
+    for (size_t gid = 0; gid < static_cast<size_t>(config.groupNums()); ++gid) {
+        const auto type = config.typeForGroup(gid);
+        if (type == CacheGroupType::FULL) {
+            expected_full_bytes += config.blockSizeBytesForGroup(gid);
+        } else {
+            expected_non_full_bytes += config.blockSizeBytesForGroup(gid);
+        }
+    }
+
+    EXPECT_GT(expected_non_full_bytes, 0u);
+    EXPECT_GT(expected_full_bytes, 0u);
+
+    EXPECT_EQ(config.block_size_bytes, expected_full_bytes);
+}
+
+TEST_F(HybridPoolKVCacheAllocatorTest, DSV4FinalizeBlockNumsUsesHcaStatePoolBlocks) {
+    auto config = makeDSV4HybridPoolConfig(/*block_num=*/50);
+    const size_t explicit_gid = firstExplicitIndependentGroup(config);
+    setExplicitBlocksForGroup(config, explicit_gid, 50);
+
+    RuntimeConfig rt;  // unused inside finalizeBlockNums today
+    config.finalizeBlockNums(/*global_block_num=*/200, rt);
+
+    for (size_t gid = 0; gid < static_cast<size_t>(config.groupNums()); ++gid) {
+        const uint32_t expected = config.policyForGroup(gid).explicit_block_num > 0 ? 50u : 200u;
+        EXPECT_EQ(config.blockNumForGroup(gid), expected) << "gid=" << gid;
+    }
+
+    const size_t expected_reserve = 50u * config.blockSizeBytesForGroup(explicit_gid);
+    EXPECT_EQ(config.explicitly_sized_pool_reserve_bytes, expected_reserve);
+}
+
+TEST_F(HybridPoolKVCacheAllocatorTest, DSV4FinalizeBlockNumsUsesGlobalBlocksWhenHcaStateBlocksDisabled) {
+    auto config = makeDSV4HybridPoolConfig(/*block_num=*/123);
+    setExplicitBlocksForGroup(config, firstExplicitIndependentGroup(config), 0);
+
+    RuntimeConfig rt;
+    config.finalizeBlockNums(/*global_block_num=*/123, rt);
+
+    for (size_t gid = 0; gid < static_cast<size_t>(config.groupNums()); ++gid) {
+        EXPECT_EQ(config.blockNumForGroup(gid), 123u);
+    }
+    EXPECT_EQ(config.explicitly_sized_pool_reserve_bytes, 0u);
+}
+
+TEST_F(HybridPoolKVCacheAllocatorTest, DSV4GpuHcaStatePoolIncludesFixedReserve) {
+    auto config = makeDSV4HybridPoolConfig(/*block_num=*/50);
+    const size_t explicit_gid = firstExplicitIndependentGroup(config);
+    setExplicitBlocksForGroup(config, explicit_gid, 50);
+
+    RuntimeConfig rt;
+    config.finalizeBlockNums(/*global_block_num=*/200, rt);
+
+    for (size_t gid = 0; gid < static_cast<size_t>(config.groupNums()); ++gid) {
+        const uint32_t expected = config.policyForGroup(gid).explicit_block_num > 0 ? 50u : 200u;
+        EXPECT_EQ(config.blockNumForGroup(gid), expected) << "gid=" << gid;
+    }
+    const size_t expected_reserve = 50u * config.blockSizeBytesForGroup(explicit_gid);
+    EXPECT_EQ(config.explicitly_sized_pool_reserve_bytes, expected_reserve);
+}
+
+TEST_F(HybridPoolKVCacheAllocatorTest, DSV4StateSwaPoolsWithoutExplicitBlocksUseGlobalBlocks) {
+    auto              mc = makeProModelConfig();
+    mc.hybrid_attention_config.enable_hybrid_attention           = true;
+    mc.hybrid_attention_config.enable_independent_kv_cache_pools = true;
+    ParallelismConfig pc;
+    KVCacheConfig     kv_cache_config;
+    kv_cache_config.seq_size_per_block        = 128;
+    kv_cache_config.kernel_seq_size_per_block = 128;
+    setDsv4ExplicitPoolBlocks(mc, "hca_state", 0);
+    auto config                               = CacheConfigCreator::createBasicConfig(mc, pc, kv_cache_config, false, 0);
+    config.linear_step                 = 4;
+
+    RuntimeConfig rt;
+    config.finalizeBlockNums(/*global_block_num=*/128, rt);
+
+    for (size_t gid = 0; gid < static_cast<size_t>(config.groupNums()); ++gid) {
+        EXPECT_EQ(config.blockNumForGroup(gid), 128u) << "gid=" << gid;
+    }
+    EXPECT_EQ(config.explicitly_sized_pool_reserve_bytes, 0u);
+}
+
+TEST_F(HybridPoolKVCacheAllocatorTest, DSV4ConvertIndexToAddrByTagRoutesToCorrectPool) {
+    auto config    = makeDSV4HybridPoolConfig();
+    auto allocator = makeAllocator(config);
+    ASSERT_TRUE(allocator->init());
+
+    // CSA layer (compress_ratio=4) -- pick the first one.
+    int csa_layer = -1;
+    for (size_t l = 0; l < config.layer_all_num; ++l) {
+        if (config.layerTagToGroupIdSnapshot()[l].count("csa_kv") > 0) {
+            csa_layer = static_cast<int>(l);
+            break;
+        }
+    }
+    ASSERT_GE(csa_layer, 0);
+
+    // csa_kv tag routes to gid=0; it must produce a non-null kv address that
+    // matches the CSA group's pool.
+    auto addr_csa = allocator->convertIndexToAddrByTag(csa_layer, "csa_kv", 1);
+    EXPECT_NE(addr_csa.kv_addr, nullptr);
+
+    auto addr_swa = allocator->convertIndexToAddrByTag(csa_layer, "swa_kv", 1);
+    EXPECT_NE(addr_swa.kv_addr, nullptr);
+
+    // The two tags live in different pools, so their addresses cannot alias.
+    EXPECT_NE(addr_csa.kv_addr, addr_swa.kv_addr);
+
+    // Default single-group access is ambiguous for multi-tag layers.
+    EXPECT_ANY_THROW((void)allocator->convertIndexToAddr(csa_layer, /*block_id=*/1));
+}
+
+TEST_F(HybridPoolKVCacheAllocatorTest, DSV4ConvertIndexToBufferByTagAndPartition) {
+    auto config    = makeDSV4HybridPoolConfig();
+    auto allocator = makeAllocator(config);
+    ASSERT_TRUE(allocator->init());
+
+    int csa_layer = -1;
+    for (size_t l = 0; l < config.layer_all_num; ++l) {
+        if (config.layerTagToGroupIdSnapshot()[l].count("csa_kv") > 0) {
+            csa_layer = static_cast<int>(l);
+            break;
+        }
+    }
+    ASSERT_GE(csa_layer, 0);
+
+    auto buf = allocator->convertIndexToBufferByTag(csa_layer, "csa_kv", /*block_id=*/1);
+    ASSERT_FALSE(buf.empty());
+    EXPECT_NE(buf[0].addr, nullptr);
+
+    auto buf_part = allocator->convertIndexToBufferByTag(
+        csa_layer, "csa_kv", /*block_id=*/1, /*partition_count=*/1, /*partition_id=*/0);
+    ASSERT_FALSE(buf_part.empty());
+    EXPECT_NE(buf_part[0].addr, nullptr);
+}
+
+TEST_F(HybridPoolKVCacheAllocatorTest, DSV4AllLayerCacheBaseHasPerGroupTensors) {
+    auto config    = makeDSV4HybridPoolConfig();
+    auto allocator = makeAllocator(config);
+    ASSERT_TRUE(allocator->init());
+
+    auto layout = allocator->allLayerCacheBase();
+    ASSERT_EQ(layout.layers_to_kv_buffer_ptrs.size(), static_cast<size_t>(config.layer_all_num));
+    ASSERT_EQ(layout.layers_to_kv_buffer_ptrs_by_group.size(), static_cast<size_t>(config.layer_all_num));
+
+    const int swa_kv_gid = config.groupIdForTag("swa_kv");
+    for (size_t l = 0; l < static_cast<size_t>(config.layer_all_num); ++l) {
+        EXPECT_FALSE(layout.layers_to_kv_buffer_ptrs[l].defined())
+            << "multi-tag DSV4 layer should not publish a legacy single-group tensor";
+        const auto& swa_t = layout.layers_to_kv_buffer_ptrs_by_group[l][swa_kv_gid];
+        EXPECT_TRUE(swa_t.defined()) << "layer " << l << " missing SWA_KV tensor";
+    }
+    EXPECT_EQ(layout.group_tags.size(), 7u);
+    EXPECT_EQ(layout.group_types.size(), 7u);
+}
+
+TEST_F(HybridPoolKVCacheAllocatorTest, DSV4SharedBlockCacheIsUnifiedAcrossGroups) {
+    auto config    = makeDSV4HybridPoolConfig();
+    auto allocator = makeAllocator(config);
+    ASSERT_TRUE(allocator->init());
+
+    // All groups share a single SharedBlockCache owned by the allocator.
+    auto shared_cache = allocator->sharedBlockCache();
+    ASSERT_NE(shared_cache, nullptr);
+
+    // Inserting a cache item for one group is visible via the shared cache.
+    auto pool0  = allocator->groupBlockPools()[0];
+    auto blocks = pool0->malloc(1);
+    ASSERT_EQ(blocks.size(), 1u);
+    std::vector<BlockIdxType> group_slots(allocator->groupBlockPools().size(), NULL_BLOCK_IDX);
+    group_slots[0] = blocks[0];
+    shared_cache->put(/*cache_key=*/42, group_slots, /*is_resident=*/false);
+    EXPECT_TRUE(shared_cache->contains(42));
+
+    // The same cache is returned by the allocator accessor.
+    EXPECT_EQ(allocator->sharedBlockCache(), shared_cache);
+
+    // Clean up.
+    pool0->requestFree(blocks);
+}
+
+TEST_F(HybridPoolKVCacheAllocatorTest, DSV4CPShardedInsertThenReuseSamePrefix) {
+    auto config    = makeDSV4HybridPoolConfig(/*block_num=*/64);
+    auto allocator = makeAllocator(config);
+    ASSERT_TRUE(allocator->init());
+
+    const int spb     = static_cast<int>(config.seq_size_per_block);
+    const int seq_len = 10 * spb + 17;
+
+    CacheKeysType full_keys;
+    for (int i = 0; i < 10; ++i) {
+        full_keys.push_back(1000 + i);
+    }
+    CacheKeysType request_keys = full_keys;
+    request_keys.push_back(2000);  // partial tail key present on the incoming request.
+
+    auto cp_mapper = std::make_shared<CPSlotMapper>(/*cp_rank=*/0, /*cp_size=*/2, spb);
+    allocator->setCPSlotMapper(cp_mapper);
+
+    auto seed_res = makeBatchResource(/*batch_size=*/1, config);
+    seed_res->setBatchCacheKeys(0, full_keys);
+    auto seed_tokens = makeCompleteTokenIds(/*batch_size=*/1, seq_len, spb);
+
+    MallocInfo seed_malloc{seed_res, seed_tokens};
+    seed_malloc.reuse_cache         = true;
+    seed_malloc.enable_device_cache = false;
+    allocator->setCPSlotMapper(cp_mapper);
+    ASSERT_TRUE(allocator->malloc(seed_malloc).success);
+
+    InsertInfo insert_info{seed_res, seed_tokens, /*is_resident=*/false};
+    allocator->setCPSlotMapper(cp_mapper);
+    allocator->insertIntoCache(insert_info);
+
+    FreeInfo seed_free{seed_res, seed_tokens};
+    allocator->free(seed_free);
+
+    auto hit_res = makeBatchResource(/*batch_size=*/1, config);
+    hit_res->setBatchCacheKeys(0, request_keys);
+    auto hit_tokens = makeCompleteTokenIds(/*batch_size=*/1, seq_len, spb);
+
+    MallocInfo hit_malloc{hit_res, hit_tokens};
+    hit_malloc.reuse_cache         = true;
+    hit_malloc.enable_device_cache = true;
+    allocator->setCPSlotMapper(cp_mapper);
+    auto result = allocator->malloc(hit_malloc);
+
+    ASSERT_TRUE(result.success);
+    EXPECT_EQ(result.reuse_len, 5 * spb * 2);
+
+    FreeInfo hit_free{hit_res, hit_tokens};
+    allocator->free(hit_free);
+}
+
+TEST_F(HybridPoolKVCacheAllocatorTest, DSV4CPShardedEvictionMarksCanonicalResource) {
+    auto config    = makeDSV4HybridPoolConfig(/*block_num=*/64);
+    auto allocator = makeAllocator(config);
+    ASSERT_TRUE(allocator->init());
+
+    const int spb     = static_cast<int>(config.seq_size_per_block);
+    const int seq_len = 10 * spb + 17;
+
+    CacheKeysType full_keys;
+    for (int i = 0; i < 10; ++i) {
+        full_keys.push_back(1000 + i);
+    }
+
+    auto cp_mapper = std::make_shared<CPSlotMapper>(/*cp_rank=*/0, /*cp_size=*/2, spb);
+    allocator->setCPSlotMapper(cp_mapper);
+
+    auto seed_res = makeBatchResource(/*batch_size=*/1, config);
+    seed_res->setBatchCacheKeys(0, full_keys);
+    auto seed_tokens = makeCompleteTokenIds(/*batch_size=*/1, seq_len, spb);
+
+    MallocInfo seed_malloc{seed_res, seed_tokens};
+    seed_malloc.reuse_cache         = true;
+    seed_malloc.enable_device_cache = false;
+    ASSERT_TRUE(allocator->malloc(seed_malloc).success);
+
+    InsertInfo insert_info{seed_res, seed_tokens, /*is_resident=*/false};
+    allocator->insertIntoCache(insert_info);
+
+    FreeInfo seed_free{seed_res, seed_tokens};
+    allocator->free(seed_free);
+
+    auto evicted = allocator->popBlocksFromCache(/*min_blocks_to_free=*/4);
+    ASSERT_NE(evicted, nullptr);
+    ASSERT_TRUE(evicted->hasCacheKeys());
+    EXPECT_TRUE(evicted->cacheResource(0).cacheKeysAreCpCanonical());
+
+    KVCacheResource canonical_source;
+    canonical_source.setCacheKeys(full_keys);
+    const auto expected_canonical = canonical_source.localCacheKeys(cp_mapper->cpSize() - 1, cp_mapper->cpSize());
+    EXPECT_EQ(evicted->cacheKeys(0), expected_canonical);
+    const auto& dependencies = evicted->cacheResource(0).blockDependencies();
+    ASSERT_EQ(dependencies.size(), expected_canonical.size());
+    for (size_t i = 0; i < dependencies.size(); ++i) {
+        EXPECT_EQ(dependencies[i].ordinal, static_cast<uint32_t>(i));
+        if (i == 0) {
+            EXPECT_FALSE(dependencies[i].has_parent);
+        } else {
+            EXPECT_TRUE(dependencies[i].has_parent);
+            EXPECT_EQ(dependencies[i].parent_key, expected_canonical[i - 1]);
+        }
+    }
+}
+
+}  // namespace test
+}  // namespace rtp_llm
+
+int main(int argc, char** argv) {
+    ::testing::InitGoogleTest(&argc, argv);
+    return RUN_ALL_TESTS();
+}
diff --git a/rtp_llm/cpp/cache/test/HybridTypeKVCacheAllocatorTest.cc b/rtp_llm/cpp/cache/test/HybridTypeKVCacheAllocatorTest.cc
index c175826cee..ab99d3f1bc 100644
--- a/rtp_llm/cpp/cache/test/HybridTypeKVCacheAllocatorTest.cc
+++ b/rtp_llm/cpp/cache/test/HybridTypeKVCacheAllocatorTest.cc
@@ -3,13 +3,15 @@
 #include <limits>
 #include <memory>
 #include <optional>
+#include <string>
 #include <vector>
 
 #include "rtp_llm/cpp/cache/BatchKVCacheResource.h"
-#include "rtp_llm/cpp/cache/BlockCache.h"
-#include "rtp_llm/cpp/cache/HybridTypeKVCacheAllocator.h"
-#include "rtp_llm/cpp/cache/CacheConfigCreator.h"
+#include "rtp_llm/cpp/cache/SharedBlockCache.h"
+#include "rtp_llm/cpp/cache/allocator/HybridTypeKVCacheAllocator.h"
+#include "rtp_llm/cpp/cache/config_creator/CacheConfigCreator.h"
 #include "rtp_llm/cpp/cache/test/BlockPoolTestHelper.h"
+#include "rtp_llm/cpp/cache/test/CacheConfigTestUtils.h"
 #include "rtp_llm/cpp/config/ModelConfig.h"
 #include "rtp_llm/cpp/engine_base/stream/CompleteTokenIds.h"
 #include "rtp_llm/cpp/utils/Logger.h"
@@ -33,7 +35,6 @@ static CacheConfig makeTinyHybridConfig() {
     auto linear_spec                = std::make_shared<LinearKVCacheSpec>();
     linear_spec->type               = KVCacheSpecType::LinearAttention;
     linear_spec->dtype              = config.dtype;
-    linear_spec->layer_num          = 2;
     linear_spec->local_num_k_heads  = 1;
     linear_spec->local_num_v_heads  = 1;
     linear_spec->head_k_dim         = 1;
@@ -46,17 +47,15 @@ static CacheConfig makeTinyHybridConfig() {
     auto full_spec                = std::make_shared<MHAKVCacheSpec>();
     full_spec->type               = KVCacheSpecType::MultiHeadAttention;
     full_spec->dtype              = config.dtype;
-    full_spec->layer_num          = 2;
     full_spec->local_head_num_kv  = 1;
     full_spec->size_per_head      = 1;
     full_spec->seq_size_per_block = static_cast<uint32_t>(config.seq_size_per_block);
 
     // Order matters: linear groups first, then full groups (as in CacheConfigCreator).
-    config.layer_ids        = {{0, 1}, {2, 3}};
-    config.global_layer_ids = config.layer_ids;
-    config.cache_specs      = {linear_spec, full_spec};
-    config.linear_group_num = 1;
-    config.full_group_num   = 1;
+    config.fromGroupedSpecs({linear_spec, full_spec},
+                            {{0, 1}, {2, 3}},
+                            {CacheGroupType::LINEAR, CacheGroupType::FULL},
+                            {"linear", "full"});
 
     // Physical block strides: take max between full and linear.
     config.kv_block_stride_bytes = std::max(full_spec->block_size_bytes(), linear_spec->block_size_bytes());
@@ -68,12 +67,6 @@ static CacheConfig makeTinyHybridConfig() {
 
     config.block_size_bytes = config.kv_block_size_bytes + config.kv_scale_size_bytes;
 
-    config.layer_to_group_id.assign(static_cast<size_t>(config.layer_num), 0);
-    for (size_t gid = 0; gid < config.layer_ids.size(); ++gid) {
-        for (int layer_id : config.layer_ids[gid]) {
-            config.layer_to_group_id[static_cast<size_t>(layer_id)] = static_cast<int>(gid);
-        }
-    }
     return config;
 }
 
@@ -105,6 +98,8 @@ static CacheConfig makeTinyHybridMtpConfigByCreateSpConfig() {
     score_model_cfg.linear_attention_config.linear_value_head_dim  = 8;
     score_model_cfg.linear_attention_config.linear_num_key_heads   = 2;
     score_model_cfg.linear_attention_config.linear_num_value_heads = 2;
+    setHybridAttentionKvCacheSpecs(score_model_cfg);
+    setDefaultKvCacheSpec(propose_model_cfg);
 
     ParallelismConfig parallelism_cfg;
     parallelism_cfg.tp_size = 1;
@@ -143,11 +138,12 @@ static CompleteTokenIdsPtr makeCompleteTokenIds(int batch_size, int seq_length,
     return complete_token_ids;
 }
 
-static BatchKVCacheResourcePtr makeBatchResource(
-    int batch_size, int group_nums, int layer_num, const std::vector<int>& layer_to_group_id, CacheKeysType keys) {
+static BatchKVCacheResourcePtr makeBatchResource(int batch_size, const CacheConfig& config, CacheKeysType keys) {
     auto res = std::make_shared<BatchKVCacheResource>();
     res->resetBatchSize(batch_size);
-    res->initGroups(group_nums, layer_num, layer_to_group_id);
+    res->initGroups(config.groupNums(),
+                    static_cast<int>(config.layer_all_num),
+                    config.layerGroupIdsSnapshot());
     for (int b = 0; b < batch_size; ++b) {
         res->setBatchCacheKeys(b, keys);
     }
@@ -155,21 +151,18 @@ static BatchKVCacheResourcePtr makeBatchResource(
 }
 
 static std::vector<BlockIdxType> allocateAndCache(BlockPoolPtr         block_pool,
-                                                  BlockCachePtr        block_cache,
+                                                  SharedBlockCachePtr  shared_cache,
                                                   int                  group_id,
+                                                  int                  group_num,
                                                   const CacheKeysType& keys,
                                                   bool                 is_resident = true) {
     auto blocks = block_pool->malloc(static_cast<int>(keys.size()));
     EXPECT_EQ(blocks.size(), keys.size());
 
     for (size_t i = 0; i < keys.size(); ++i) {
-        BlockCache::CacheItem item;
-        item.cache_key   = keys[i];
-        item.group_id    = group_id;
-        item.block_index = blocks[i];
-        item.is_resident = is_resident;
-        EXPECT_TRUE(block_cache->put(item));
-        block_pool->blockCacheReference(blocks[i]);
+        std::vector<BlockIdxType> group_slots(static_cast<size_t>(group_num), NULL_BLOCK_IDX);
+        group_slots[static_cast<size_t>(group_id)] = blocks[i];
+        shared_cache->put(keys[i], group_slots, is_resident);
     }
 
     // Drop request references so these blocks behave like "cached but available" blocks.
@@ -178,21 +171,18 @@ static std::vector<BlockIdxType> allocateAndCache(BlockPoolPtr         block_poo
 }
 
 static std::vector<BlockIdxType> allocateAndCacheKeepAllocated(BlockPoolPtr         block_pool,
-                                                               BlockCachePtr        block_cache,
+                                                               SharedBlockCachePtr  shared_cache,
                                                                int                  group_id,
+                                                               int                  group_num,
                                                                const CacheKeysType& keys,
                                                                bool                 is_resident = true) {
     auto blocks = block_pool->malloc(static_cast<int>(keys.size()));
     EXPECT_EQ(blocks.size(), keys.size());
 
     for (size_t i = 0; i < keys.size(); ++i) {
-        BlockCache::CacheItem item;
-        item.cache_key   = keys[i];
-        item.group_id    = group_id;
-        item.block_index = blocks[i];
-        item.is_resident = is_resident;
-        EXPECT_TRUE(block_cache->put(item));
-        block_pool->blockCacheReference(blocks[i]);
+        std::vector<BlockIdxType> group_slots(static_cast<size_t>(group_num), NULL_BLOCK_IDX);
+        group_slots[static_cast<size_t>(group_id)] = blocks[i];
+        shared_cache->put(keys[i], group_slots, is_resident);
     }
 
     // NOTE: intentionally keep these blocks allocated/unavailable to avoid accidental reuse via malloc().
@@ -220,6 +210,7 @@ class HybridTypeKVCacheAllocatorTest: public ::testing::Test {
 TEST_F(HybridTypeKVCacheAllocatorTest, InitAndAddressLookupSmoke) {
     auto config    = makeTinyHybridConfig();
     auto allocator = std::make_shared<HybridTypeKVCacheAllocator>(config, AllocationType::DEVICE);
+    allocator->setSharedBlockCache(std::make_shared<SharedBlockCache>());
     ASSERT_TRUE(allocator->init());
 
     EXPECT_EQ(allocator->seqSizePerBlock(), 4);
@@ -247,7 +238,7 @@ TEST_F(HybridTypeKVCacheAllocatorTest, ConvertToGlobalLayerIdHybridNoMtp) {
               std::numeric_limits<uint32_t>::max());
 }
 
-TEST_F(HybridTypeKVCacheAllocatorTest, ConvertToGlobalLayerIdHybridWithMtpSubConfigs) {
+TEST_F(HybridTypeKVCacheAllocatorTest, DISABLED_ConvertToGlobalLayerIdHybridWithMtpSubConfigs) {
     auto config    = makeTinyHybridMtpConfigByCreateSpConfig();
     auto allocator = std::make_shared<HybridTypeKVCacheAllocator>(config, AllocationType::DEVICE);
 
@@ -263,41 +254,34 @@ TEST_F(HybridTypeKVCacheAllocatorTest, ConvertToGlobalLayerIdHybridWithMtpSubCon
 TEST_F(HybridTypeKVCacheAllocatorTest, GetNeedBlocksUsesGroupGetNeedBlocksAndReuseFlag) {
     auto config    = makeTinyHybridConfig();
     auto allocator = std::make_shared<HybridTypeKVCacheAllocator>(config, AllocationType::DEVICE);
+    allocator->setSharedBlockCache(std::make_shared<SharedBlockCache>());
     ASSERT_TRUE(allocator->init());
 
     // batch=2, seq_len=12 (3 slots), reserve_step=2
     auto token_ids = makeCompleteTokenIds(/*batch_size=*/2, /*seq_length=*/12, /*seq_size_per_block=*/4);
     token_ids->setReserveStep(2);
 
-    // Reuse disabled: linear group keeps only tail for common blocks; reserve_step contributes extra blocks.
+    // Reuse disabled: linear group keeps tail and tail-1 for common blocks; reserve_step contributes extra blocks.
     // full group contributes common=3, extra=1.
     {
-        auto       batch_res = makeBatchResource(/*batch_size=*/2,
-                                           /*group_nums=*/2,
-                                           /*layer_num=*/static_cast<int>(config.layer_all_num),
-                                           /*layer_to_group_id=*/config.layer_to_group_id,
-                                           CacheKeysType{100, 101, 102, 103});
+        auto       batch_res = makeBatchResource(/*batch_size=*/2, config, CacheKeysType{100, 101, 102, 103});
         MallocInfo info{batch_res, token_ids};
         info.enable_device_cache = false;
         info.reuse_cache         = false;
-        // common_total = full(3) + linear(1) = 4
+        // common_total = full(3) + linear(2) = 5
         // extra_total  = full(1) + linear(reserve_step-1=1) = 2
-        // total = 4 + 2*2 = 8
-        EXPECT_EQ(allocator->getNeedBlocks(info), 8);
+        // total = 5 + 2*2 = 9
+        EXPECT_EQ(allocator->getNeedBlocks(info), 9);
     }
 
-    // Reuse enabled but no existing blocks: linear group uses sparse counting from begin=0.
+    // Reuse enabled but no existing blocks: linear group keeps step hits plus tail/tail-1.
     {
-        auto       batch_res = makeBatchResource(/*batch_size=*/2,
-                                           /*group_nums=*/2,
-                                           /*layer_num=*/static_cast<int>(config.layer_all_num),
-                                           /*layer_to_group_id=*/config.layer_to_group_id,
-                                           CacheKeysType{100, 101, 102, 103});
+        auto       batch_res = makeBatchResource(/*batch_size=*/2, config, CacheKeysType{100, 101, 102, 103});
         MallocInfo info{batch_res, token_ids};
         info.enable_device_cache = true;
         info.reuse_cache         = true;
         // full: common=3 extra=1
-        // linear: common=count(0,3]=2, extra=reserve_step-1(=1)
+        // linear: common=2, extra=reserve_step-1(=1)
         // common_total = 3 + 2 = 5
         // extra_total  = 1 + 1 = 2
         // total = 5 + 2*2 = 9
@@ -308,32 +292,30 @@ TEST_F(HybridTypeKVCacheAllocatorTest, GetNeedBlocksUsesGroupGetNeedBlocksAndReu
 TEST_F(HybridTypeKVCacheAllocatorTest, JointReuseUsesFullPrefixAndLinearTailOnly) {
     auto config    = makeTinyHybridConfig();
     auto allocator = std::make_shared<HybridTypeKVCacheAllocator>(config, AllocationType::DEVICE);
+    allocator->setSharedBlockCache(std::make_shared<SharedBlockCache>());
     ASSERT_TRUE(allocator->init());
 
-    auto block_pool  = allocator->getBlockPool();
-    auto block_cache = block_pool->blockCache();
+    auto block_pool   = allocator->getBlockPool();
+    auto shared_cache = allocator->sharedBlockCache();
     ASSERT_NE(block_pool, nullptr);
-    ASSERT_NE(block_cache, nullptr);
+    ASSERT_NE(shared_cache, nullptr);
 
     // Config order: gid=0 linear, gid=1 full.
     const int gid_linear = 0;
     const int gid_full   = 1;
+    const int group_num  = 2;
 
     // Full group has prefix matches for {100,101,102}.
     CacheKeysType full_keys   = {100, 101, 102};
-    auto          full_blocks = allocateAndCache(block_pool, block_cache, gid_full, full_keys);
+    auto          full_blocks = allocateAndCache(block_pool, shared_cache, gid_full, group_num, full_keys);
 
     // Linear group only matches key 101 (so joint match should backoff to pos=1 => reuse_blocks_len=2).
     CacheKeysType linear_keys   = {101};
-    auto          linear_blocks = allocateAndCache(block_pool, block_cache, gid_linear, linear_keys);
+    auto          linear_blocks = allocateAndCache(block_pool, shared_cache, gid_linear, group_num, linear_keys);
     ASSERT_EQ(linear_blocks.size(), 1u);
 
     // Request has 4 keys, but allocator drops the last for matching.
-    auto batch_res = makeBatchResource(/*batch_size=*/1,
-                                       /*group_nums=*/2,
-                                       /*layer_num=*/static_cast<int>(config.layer_all_num),
-                                       /*layer_to_group_id=*/config.layer_to_group_id,
-                                       CacheKeysType{100, 101, 102, 103});
+    auto batch_res = makeBatchResource(/*batch_size=*/1, config, CacheKeysType{100, 101, 102, 103});
     // Enable device cache reuse for joint match.
 
     // seq_len=12 => 3 slots (4 tokens per block).
@@ -359,16 +341,13 @@ TEST_F(HybridTypeKVCacheAllocatorTest, JointReuseUsesFullPrefixAndLinearTailOnly
     EXPECT_FALSE(isNullBlockIdx(linear_out[2]));  // allocated tail for common length
 }
 
-TEST_F(HybridTypeKVCacheAllocatorTest, DisableReuseKeepsOnlyLinearTailOnInitMalloc) {
+TEST_F(HybridTypeKVCacheAllocatorTest, DisableReuseKeepsLinearTailAndTailMinusOneOnInitMalloc) {
     auto config    = makeTinyHybridConfig();
     auto allocator = std::make_shared<HybridTypeKVCacheAllocator>(config, AllocationType::DEVICE);
+    allocator->setSharedBlockCache(std::make_shared<SharedBlockCache>());
     ASSERT_TRUE(allocator->init());
 
-    auto batch_res = makeBatchResource(/*batch_size=*/1,
-                                       /*group_nums=*/2,
-                                       /*layer_num=*/static_cast<int>(config.layer_all_num),
-                                       /*layer_to_group_id=*/config.layer_to_group_id,
-                                       CacheKeysType{100, 101, 102, 103});
+    auto batch_res = makeBatchResource(/*batch_size=*/1, config, CacheKeysType{100, 101, 102, 103});
     // Disable device cache reuse.
 
     auto token_ids = makeCompleteTokenIds(/*batch_size=*/1, /*seq_length=*/12, /*seq_size_per_block=*/4);
@@ -379,39 +358,37 @@ TEST_F(HybridTypeKVCacheAllocatorTest, DisableReuseKeepsOnlyLinearTailOnInitMall
     auto result              = allocator->malloc(info);
     ASSERT_TRUE(result.success);
 
-    // Linear group should keep only the tail block across common length slots.
+    // Linear group should keep tail and tail-1 across common length slots.
     const auto& linear_out = batch_res->blocks(0, /*group_id=*/0);
     ASSERT_EQ(linear_out.size(), 3u);
     EXPECT_TRUE(isNullBlockIdx(linear_out[0]));
-    EXPECT_TRUE(isNullBlockIdx(linear_out[1]));
+    EXPECT_FALSE(isNullBlockIdx(linear_out[1]));
     EXPECT_FALSE(isNullBlockIdx(linear_out[2]));
 }
 
-TEST_F(HybridTypeKVCacheAllocatorTest, DisableDeviceCacheSkipsReuseMatchAndAllocatesOnlyLinearTail) {
+TEST_F(HybridTypeKVCacheAllocatorTest, DisableDeviceCacheSkipsReuseMatchAndAllocatesLinearTailAndTailMinusOne) {
     auto config    = makeTinyHybridConfig();
     auto allocator = std::make_shared<HybridTypeKVCacheAllocator>(config, AllocationType::DEVICE);
+    allocator->setSharedBlockCache(std::make_shared<SharedBlockCache>());
     ASSERT_TRUE(allocator->init());
 
-    auto block_pool  = allocator->getBlockPool();
-    auto block_cache = block_pool->blockCache();
+    auto block_pool   = allocator->getBlockPool();
+    auto shared_cache = allocator->sharedBlockCache();
     ASSERT_NE(block_pool, nullptr);
-    ASSERT_NE(block_cache, nullptr);
+    ASSERT_NE(shared_cache, nullptr);
 
     // Config order: gid=0 linear, gid=1 full.
     const int gid_linear = 0;
     const int gid_full   = 1;
+    const int group_num  = 2;
 
     // Prepare cached blocks for full group; keep them allocated so allocator's malloc() cannot accidentally return same
     // ids.
     CacheKeysType full_keys   = {100, 101, 102};
-    auto          full_blocks = allocateAndCacheKeepAllocated(block_pool, block_cache, gid_full, full_keys);
+    auto          full_blocks = allocateAndCacheKeepAllocated(block_pool, shared_cache, gid_full, group_num, full_keys);
     ASSERT_EQ(full_blocks.size(), 3u);
 
-    auto batch_res = makeBatchResource(/*batch_size=*/1,
-                                       /*group_nums=*/2,
-                                       /*layer_num=*/static_cast<int>(config.layer_all_num),
-                                       /*layer_to_group_id=*/config.layer_to_group_id,
-                                       CacheKeysType{100, 101, 102, 103});
+    auto batch_res = makeBatchResource(/*batch_size=*/1, config, CacheKeysType{100, 101, 102, 103});
     // Disable device cache reuse: allocator should skip reuse match even if cache exists.
 
     auto token_ids = makeCompleteTokenIds(/*batch_size=*/1, /*seq_length=*/12, /*seq_size_per_block=*/4);  // 3 slots
@@ -435,18 +412,19 @@ TEST_F(HybridTypeKVCacheAllocatorTest, DisableDeviceCacheSkipsReuseMatchAndAlloc
     EXPECT_NE(full_out[1], full_blocks[1]);
     EXPECT_NE(full_out[2], full_blocks[2]);
 
-    // Linear group keeps only tail block (others NULL) when reuse is disabled.
+    // Linear group keeps tail and tail-1 when reuse is disabled.
     const auto& linear_out = batch_res->blocks(0, gid_linear);
     ASSERT_EQ(linear_out.size(), 3u);
     EXPECT_TRUE(isNullBlockIdx(linear_out[0]));
-    EXPECT_TRUE(isNullBlockIdx(linear_out[1]));
+    EXPECT_FALSE(isNullBlockIdx(linear_out[1]));
     EXPECT_FALSE(isNullBlockIdx(linear_out[2]));
-    EXPECT_EQ(countValidBlocks(linear_out), 1u);
+    EXPECT_EQ(countValidBlocks(linear_out), 2u);
 }
 
 TEST_F(HybridTypeKVCacheAllocatorTest, IncrDecrKVCacheRefReferencesOnlyMatchedValidBlocksAcrossGroups) {
     auto config    = makeTinyHybridConfig();
     auto allocator = std::make_shared<HybridTypeKVCacheAllocator>(config, AllocationType::HOST);
+    allocator->setSharedBlockCache(std::make_shared<SharedBlockCache>());
     ASSERT_TRUE(allocator->init());
 
     auto block_pool = allocator->getBlockPool();
@@ -458,9 +436,12 @@ TEST_F(HybridTypeKVCacheAllocatorTest, IncrDecrKVCacheRefReferencesOnlyMatchedVa
     EXPECT_EQ(allocator->freeBlocksNum(), free_before - 4);
 
     KVCacheResource resource;
+    std::vector<CacheGroupType> group_types = {CacheGroupType::LINEAR, CacheGroupType::FULL};
     resource.initGroups(/*group_nums=*/2,
                         /*layer_num=*/static_cast<int>(config.layer_all_num),
-                        /*layer_to_group_id=*/config.layer_to_group_id);
+                        /*layer_group_ids=*/config.layerGroupIdsSnapshot(),
+                        /*kernel_blocks_per_kv_block=*/config.kernelBlocksPerKvBlock(),
+                        /*group_types=*/group_types);
     resource.cacheKeys() = CacheKeysType{100, 101, 102};
     resource.mutableBlockIds(/*gid=*/0).assign(
         BlockIndicesType{blocks[0], 0, blocks[1]});  // linear group (contains a 0)
@@ -470,7 +451,7 @@ TEST_F(HybridTypeKVCacheAllocatorTest, IncrDecrKVCacheRefReferencesOnlyMatchedVa
     auto ref = allocator->incrKVCacheRef(resource, CacheKeysType{101, 999, 102});
     ASSERT_NE(ref, nullptr);
     ASSERT_EQ(ref->groupNums(), 2);
-    ASSERT_EQ(ref->cacheKeys().size(), 3u);
+    ASSERT_EQ(ref->cacheKeys(), (CacheKeysType{101, 102}));
     ASSERT_EQ(ref->blocks(0).size(), 2u);
     ASSERT_EQ(ref->blocks(1).size(), 2u);
 
@@ -481,28 +462,65 @@ TEST_F(HybridTypeKVCacheAllocatorTest, IncrDecrKVCacheRefReferencesOnlyMatchedVa
     EXPECT_EQ(allocator->freeBlocksNum(), free_before);
 }
 
-TEST_F(HybridTypeKVCacheAllocatorTest, InsertIntoCacheInsertsOnlyFullBlocks) {
+TEST_F(HybridTypeKVCacheAllocatorTest, IncrKVCacheRefPreservesConnectorDummyTail) {
+    auto config    = makeTinyHybridConfig();
+    auto allocator = std::make_shared<HybridTypeKVCacheAllocator>(config, AllocationType::HOST);
+    allocator->setSharedBlockCache(std::make_shared<SharedBlockCache>());
+    ASSERT_TRUE(allocator->init());
+
+    auto block_pool = allocator->getBlockPool();
+    ASSERT_NE(block_pool, nullptr);
+
+    const size_t free_before = allocator->freeBlocksNum();
+    auto         blocks      = block_pool->malloc(2);
+    ASSERT_EQ(blocks.size(), 2u);
+
+    KVCacheResource resource;
+    resource.initGroups(/*group_nums=*/2,
+                        /*layer_num=*/static_cast<int>(config.layer_all_num),
+                        /*layer_group_ids=*/config.layerGroupIdsSnapshot());
+    resource.cacheKeys() = CacheKeysType{101, 103, 999};
+    resource.rebuildLinearBlockDependencies();
+    resource.setLastBlockAligned(false);
+    resource.mutableBlockIds(/*gid=*/0).assign(BlockIndicesType{NULL_BLOCK_IDX, NULL_BLOCK_IDX});
+    resource.mutableBlockIds(/*gid=*/1).assign(BlockIndicesType{blocks[0], blocks[1]});
+
+    auto ref = allocator->incrKVCacheRef(resource, CacheKeysType{101, 103, 999}, /*is_connector=*/true);
+    ASSERT_NE(ref, nullptr);
+    EXPECT_FALSE(ref->lastBlockAligned());
+    EXPECT_EQ(ref->cacheKeys(), (CacheKeysType{101, 103, 999}));
+    ASSERT_EQ(ref->blocks(0).size(), 3u);
+    ASSERT_EQ(ref->blocks(1).size(), 3u);
+    EXPECT_TRUE(isNullBlockIdx(ref->blocks(0)[2]));
+    EXPECT_TRUE(isNullBlockIdx(ref->blocks(1)[2]));
+
+    block_pool->requestFree(blocks);
+    EXPECT_EQ(allocator->freeBlocksNum(), free_before - 2);
+
+    ref.reset();
+    EXPECT_EQ(allocator->freeBlocksNum(), free_before);
+}
+
+TEST_F(HybridTypeKVCacheAllocatorTest, InsertIntoCachePreservesLegacyNonCpAggregateSurface) {
     auto config    = makeTinyHybridConfig();
     auto allocator = std::make_shared<HybridTypeKVCacheAllocator>(config, AllocationType::DEVICE);
+    allocator->setSharedBlockCache(std::make_shared<SharedBlockCache>());
     ASSERT_TRUE(allocator->init());
 
-    auto block_pool  = allocator->getBlockPool();
-    auto block_cache = block_pool->blockCache();
+    auto block_pool   = allocator->getBlockPool();
+    auto shared_cache = allocator->sharedBlockCache();
     ASSERT_NE(block_pool, nullptr);
-    ASSERT_NE(block_cache, nullptr);
+    ASSERT_NE(shared_cache, nullptr);
 
     // gid=0 linear, gid=1 full.
     const int gid_linear = 0;
     const int gid_full   = 1;
 
-    auto batch_res = makeBatchResource(/*batch_size=*/1,
-                                       /*group_nums=*/2,
-                                       /*layer_num=*/static_cast<int>(config.layer_all_num),
-                                       /*layer_to_group_id=*/config.layer_to_group_id,
-                                       CacheKeysType{100, 101, 102});
+    auto batch_res = makeBatchResource(/*batch_size=*/1, config, CacheKeysType{100, 101, 102});
     // Disable device cache reuse.
 
-    // seq_len=10 => 3 slots, full_blocks_num = floor(10/4)=2 -> only first 2 keys inserted.
+    // Non-CP insert keeps the legacy aggregate surface: every materialized
+    // group slot is merged under its key, including hybrid tail slots.
     auto token_ids = makeCompleteTokenIds(/*batch_size=*/1, /*seq_length=*/10, /*seq_size_per_block=*/4);
 
     MallocInfo malloc_info{batch_res, token_ids};
@@ -516,19 +534,21 @@ TEST_F(HybridTypeKVCacheAllocatorTest, InsertIntoCacheInsertsOnlyFullBlocks) {
     InsertInfo insert_info{batch_res, token_ids, /*is_resident=*/false};
     allocator->insertIntoCache(insert_info);
 
-    // Full group should have cached first two keys.
-    EXPECT_TRUE(block_cache->contains(100, gid_full));
-    EXPECT_TRUE(block_cache->contains(101, gid_full));
-    EXPECT_FALSE(block_cache->contains(102, gid_full));
+    // Full group has all allocated slots cached, including the trailing block.
+    EXPECT_FALSE(isNullBlockIdx(shared_cache->matchGroup(100, gid_full)));
+    EXPECT_FALSE(isNullBlockIdx(shared_cache->matchGroup(101, gid_full)));
+    EXPECT_FALSE(isNullBlockIdx(shared_cache->matchGroup(102, gid_full)));
 
-    // Linear group has NULL in early slots when reuse disabled, thus should not insert these full blocks.
-    EXPECT_FALSE(block_cache->contains(100, gid_linear));
-    EXPECT_FALSE(block_cache->contains(101, gid_linear));
+    // Linear group keeps its tail and tail-minus-one slots.
+    EXPECT_TRUE(isNullBlockIdx(shared_cache->matchGroup(100, gid_linear)));
+    EXPECT_FALSE(isNullBlockIdx(shared_cache->matchGroup(101, gid_linear)));
+    EXPECT_FALSE(isNullBlockIdx(shared_cache->matchGroup(102, gid_linear)));
 }
 
 TEST_F(HybridTypeKVCacheAllocatorTest, ConvertIndexToBufferAndAllLayerCacheBaseSmoke) {
     auto config    = makeTinyHybridConfig();
     auto allocator = std::make_shared<HybridTypeKVCacheAllocator>(config, AllocationType::DEVICE);
+    allocator->setSharedBlockCache(std::make_shared<SharedBlockCache>());
     ASSERT_TRUE(allocator->init());
 
     KVCacheAllocator* base = allocator.get();
@@ -547,17 +567,14 @@ TEST_F(HybridTypeKVCacheAllocatorTest, IncrMallocRollbackFreesPartiallyAllocated
     auto config      = makeTinyHybridConfig();
     config.block_num = 6;  // free=5
     auto allocator   = std::make_shared<HybridTypeKVCacheAllocator>(config, AllocationType::DEVICE);
+    allocator->setSharedBlockCache(std::make_shared<SharedBlockCache>());
     ASSERT_TRUE(allocator->init());
 
     auto block_pool = allocator->getBlockPool();
     ASSERT_NE(block_pool, nullptr);
 
-    auto batch_res = makeBatchResource(/*batch_size=*/1,
-                                       /*group_nums=*/2,
-                                       /*layer_num=*/static_cast<int>(config.layer_all_num),
-                                       /*layer_to_group_id=*/config.layer_to_group_id,
-                                       CacheKeysType{100, 101, 102});
-    // Disable device cache reuse (makes linear group allocate only tail for new slots).
+    auto batch_res = makeBatchResource(/*batch_size=*/1, config, CacheKeysType{100, 101, 102});
+    // Disable device cache reuse (linear group still materializes tail and tail-1).
 
     // Initial small allocation: seq_len=4 => 1 slot per group.
     auto       token_ids = makeCompleteTokenIds(/*batch_size=*/1, /*seq_length=*/4, /*seq_size_per_block=*/4);
@@ -597,114 +614,6 @@ TEST_F(HybridTypeKVCacheAllocatorTest, IncrMallocRollbackFreesPartiallyAllocated
     block_pool->requestFree(keep);
 }
 
-// Prefill init path (StreamCacheResource::initKVBlock sets enable_remove_skipped_blocks=false).
-// With step=2 and reuse_blocks_len=3, the reused linear tail lands at pos 2, which is NOT
-// a step hit ((2+1)%2==1). Without sparse cleanup, that slot must survive so that
-// causal_conv1d can still read it by prefix_length.
-TEST_F(HybridTypeKVCacheAllocatorTest, PrefillInitSkipsSparseCleanupAndPreservesReusedLinearTail) {
-    auto config      = makeTinyHybridConfig();
-    config.block_num = 16;  // 6 cached (resident, non-evictable) + 4 new + 1 null reserved
-    auto allocator   = std::make_shared<HybridTypeKVCacheAllocator>(config, AllocationType::DEVICE);
-    ASSERT_TRUE(allocator->init());
-
-    auto block_pool  = allocator->getBlockPool();
-    auto block_cache = block_pool->blockCache();
-    ASSERT_NE(block_pool, nullptr);
-    ASSERT_NE(block_cache, nullptr);
-
-    const int gid_linear = 0;
-    const int gid_full   = 1;
-
-    CacheKeysType shared_keys          = {100, 101, 102};
-    auto          cached_full_blocks   = allocateAndCache(block_pool, block_cache, gid_full, shared_keys);
-    auto          cached_linear_blocks = allocateAndCache(block_pool, block_cache, gid_linear, shared_keys);
-    ASSERT_EQ(cached_linear_blocks.size(), 3u);
-
-    // Request has 5 keys; allocator drops the last before matching, leaving {100,101,102,103}.
-    // Full matches the first 3 (103 is absent); linear joint backoff stops at pos=2 => reuse_blocks_len=3.
-    auto batch_res = makeBatchResource(/*batch_size=*/1,
-                                       /*group_nums=*/2,
-                                       /*layer_num=*/static_cast<int>(config.layer_all_num),
-                                       /*layer_to_group_id=*/config.layer_to_group_id,
-                                       CacheKeysType{100, 101, 102, 103, 104});
-
-    // seq_len=20 => 5 slots. block_size-3-reserve_step = 2, so removeSkippedBlocks would scan pos 2.
-    auto token_ids = makeCompleteTokenIds(/*batch_size=*/1, /*seq_length=*/20, /*seq_size_per_block=*/4);
-
-    MallocInfo info{batch_res, token_ids};
-    info.enable_device_cache   = true;
-    info.reuse_cache           = true;
-    info.enable_remove_skipped_blocks = false;  // prefill init path
-    auto result                = allocator->malloc(info);
-    ASSERT_TRUE(result.success);
-
-    const auto& linear_out = batch_res->blocks(0, gid_linear);
-    ASSERT_EQ(linear_out.size(), 5u);
-    EXPECT_TRUE(isNullBlockIdx(linear_out[0]));
-    EXPECT_TRUE(isNullBlockIdx(linear_out[1]));
-    EXPECT_EQ(linear_out[2], cached_linear_blocks[2]) << "reused linear tail must survive prefill init";
-    EXPECT_FALSE(isNullBlockIdx(linear_out[3]));
-    EXPECT_FALSE(isNullBlockIdx(linear_out[4]));
-}
-
-// Decode path (StreamCacheResource::incrKVBlock sets enable_remove_skipped_blocks=true).
-// The allocator is invoked on an already-populated resource, so malloc() dispatches directly
-// to incrMalloc(). Sparse cleanup must prune non-step blocks while preserving step hits and
-// the last two slots.
-TEST_F(HybridTypeKVCacheAllocatorTest, DecodeIncrMallocAppliesSparseCleanupOnLinearGroups) {
-    auto config      = makeTinyHybridConfig();
-    config.block_num = 16;  // pre-allocates 6 + 6 = 12 blocks plus the reserved null block
-    auto allocator   = std::make_shared<HybridTypeKVCacheAllocator>(config, AllocationType::DEVICE);
-    ASSERT_TRUE(allocator->init());
-
-    auto block_pool = allocator->getBlockPool();
-    ASSERT_NE(block_pool, nullptr);
-
-    const int gid_linear = 0;
-    const int gid_full   = 1;
-
-    auto linear_alloc = block_pool->malloc(6);
-    auto full_alloc   = block_pool->malloc(6);
-    ASSERT_EQ(linear_alloc.size(), 6u);
-    ASSERT_EQ(full_alloc.size(), 6u);
-
-    auto batch_res = makeBatchResource(/*batch_size=*/1,
-                                       /*group_nums=*/2,
-                                       /*layer_num=*/static_cast<int>(config.layer_all_num),
-                                       /*layer_to_group_id=*/config.layer_to_group_id,
-                                       CacheKeysType{});
-    batch_res->mutableBlockIds(0, gid_linear).assign(linear_alloc);
-    batch_res->mutableBlockIds(0, gid_full).assign(full_alloc);
-    ASSERT_GT(batch_res->curBlocksNum(), 0);
-
-    // seq_len=24 => 6 slots; current_blocks==6 so group malloc is a no-op and only cleanup runs.
-    auto token_ids = makeCompleteTokenIds(/*batch_size=*/1, /*seq_length=*/24, /*seq_size_per_block=*/4);
-
-    MallocInfo info{batch_res, token_ids};
-    info.enable_device_cache   = false;
-    info.reuse_cache           = true;
-    info.enable_remove_skipped_blocks = true;  // decode path
-    auto result                = allocator->malloc(info);
-    ASSERT_TRUE(result.success);
-
-    // For step=2 and size=6: keep pos 1, 3 (step hits) and last two (4, 5); null pos 0, 2.
-    const auto& linear_out = batch_res->blocks(0, gid_linear);
-    ASSERT_EQ(linear_out.size(), 6u);
-    EXPECT_TRUE(isNullBlockIdx(linear_out[0]));
-    EXPECT_FALSE(isNullBlockIdx(linear_out[1]));
-    EXPECT_TRUE(isNullBlockIdx(linear_out[2]));
-    EXPECT_FALSE(isNullBlockIdx(linear_out[3]));
-    EXPECT_FALSE(isNullBlockIdx(linear_out[4]));
-    EXPECT_FALSE(isNullBlockIdx(linear_out[5]));
-
-    // Full group is untouched by sparse cleanup.
-    const auto& full_out = batch_res->blocks(0, gid_full);
-    ASSERT_EQ(full_out.size(), 6u);
-    for (size_t i = 0; i < full_out.size(); ++i) {
-        EXPECT_EQ(full_out[i], full_alloc[i]);
-    }
-}
-
 }  // namespace test
 }  // namespace rtp_llm
 
diff --git a/rtp_llm/cpp/cache/test/KVCacheGroupTest.cc b/rtp_llm/cpp/cache/test/KVCacheGroupTest.cc
index 438f4ab870..77cb7b4881 100644
--- a/rtp_llm/cpp/cache/test/KVCacheGroupTest.cc
+++ b/rtp_llm/cpp/cache/test/KVCacheGroupTest.cc
@@ -4,7 +4,7 @@
 #include <thread>
 #include <atomic>
 #include <algorithm>
-#include "rtp_llm/cpp/cache/FullKVCacheGroup.h"
+#include "rtp_llm/cpp/cache/group/FullKVCacheGroup.h"
 #include "rtp_llm/cpp/cache/test/BlockPoolTestHelper.h"
 
 namespace rtp_llm {
diff --git a/rtp_llm/cpp/cache/test/KVCacheManagerCPSlotMapperTest.cc b/rtp_llm/cpp/cache/test/KVCacheManagerCPSlotMapperTest.cc
new file mode 100644
index 0000000000..1070266c64
--- /dev/null
+++ b/rtp_llm/cpp/cache/test/KVCacheManagerCPSlotMapperTest.cc
@@ -0,0 +1,309 @@
+#include <gtest/gtest.h>
+#include <memory>
+#include <vector>
+
+#include "rtp_llm/cpp/cache/KVCacheManager.h"
+#include "rtp_llm/cpp/cache/CPSlotMapper.h"
+#include "rtp_llm/cpp/cache/test/CacheConfigTestUtils.h"
+#include "rtp_llm/cpp/cache/test/BlockPoolTestHelper.h"
+#include "rtp_llm/cpp/cache/BatchKVCacheResource.h"
+#include "rtp_llm/cpp/engine_base/stream/CompleteTokenIds.h"
+#include "rtp_llm/cpp/utils/Logger.h"
+
+namespace rtp_llm {
+namespace test {
+
+static CacheConfig makeTestConfig(int block_num = 20, int seq_size_per_block = 4) {
+    return makeSimpleMhaCacheConfig(
+        /*layer_num=*/2,
+        block_num,
+        /*tokens_per_block=*/static_cast<size_t>(seq_size_per_block),
+        rtp_llm::DataType::TYPE_FP16,
+        /*local_head_num_kv=*/1,
+        /*size_per_head=*/16);
+}
+
+static CompleteTokenIdsPtr makeTokenIds(int batch_size, int seq_len, int block_size) {
+    auto  ids       = std::make_shared<CompleteTokenIds>(batch_size, batch_size, seq_len + 100, block_size);
+    auto  input_ids = torch::empty({(int64_t)seq_len}, torch::kInt32);
+    auto* ptr       = input_ids.data_ptr<int32_t>();
+    for (int i = 0; i < seq_len; ++i)
+        ptr[i] = i + 1;
+    auto gi             = std::make_shared<GenerateInput>();
+    gi->input_ids       = input_ids;
+    gi->generate_config = std::make_shared<GenerateConfig>();
+    ids->init(gi);
+    return ids;
+}
+
+static BatchKVCacheResourcePtr makeResource(int batch_size, int layer_num) {
+    auto res = std::make_shared<BatchKVCacheResource>();
+    res->resetBatchSize(batch_size);
+    std::vector<std::vector<int>> layer_group_ids(static_cast<size_t>(layer_num), std::vector<int>{0});
+    res->initGroups(/*group_nums=*/1, layer_num, layer_group_ids);
+    return res;
+}
+
+class KVCacheManagerCPSlotMapperTest: public ::testing::Test {
+protected:
+    void SetUp() override {
+        rtp_llm::initLogger();
+        createDevice();
+    }
+};
+
+// When kv_cache_sharded is false (default), cpSlotMapper() should return nullptr.
+TEST_F(KVCacheManagerCPSlotMapperTest, NoCPSharding_ReturnsNullMapper) {
+    auto              config = makeTestConfig();
+    ParallelismConfig par;
+    par.tp_rank                            = 0;
+    par.tp_size                            = 2;
+    par.prefill_cp_config.kv_cache_sharded = false;
+
+    // warmup=true skips allocateAndSync (which would NCCL all-gather across the
+    // tp_size process group; in single-process UT there are no peers).  cp_slot_mapper_
+    // is constructed regardless of warmup, so cpSlotMapper() check is unaffected.
+    auto mgr = std::make_shared<KVCacheManager>(config, /*warmup=*/true, nullptr, KVCacheConfig{}, par);
+    ASSERT_TRUE(mgr->init());
+
+    EXPECT_EQ(mgr->cpSlotMapper(), nullptr);
+}
+
+// When tp_size == 1, cpSlotMapper() should return nullptr even if kv_cache_sharded is true.
+TEST_F(KVCacheManagerCPSlotMapperTest, SingleRank_ReturnsNullMapper) {
+    auto              config = makeTestConfig();
+    ParallelismConfig par;
+    par.tp_rank                            = 0;
+    par.tp_size                            = 1;
+    par.prefill_cp_config.kv_cache_sharded = true;
+
+    // warmup=true skips allocateAndSync (which would NCCL all-gather across the
+    // tp_size process group; in single-process UT there are no peers).  cp_slot_mapper_
+    // is constructed regardless of warmup, so cpSlotMapper() check is unaffected.
+    auto mgr = std::make_shared<KVCacheManager>(config, /*warmup=*/true, nullptr, KVCacheConfig{}, par);
+    ASSERT_TRUE(mgr->init());
+
+    EXPECT_EQ(mgr->cpSlotMapper(), nullptr);
+}
+
+// When kv_cache_sharded is true and tp_size > 1, cpSlotMapper() should return a valid mapper.
+TEST_F(KVCacheManagerCPSlotMapperTest, CPShardingEnabled_ReturnsValidMapper) {
+    const int seq_size_per_block = 4;
+    auto      config             = makeTestConfig(/*block_num=*/20, seq_size_per_block);
+
+    ParallelismConfig par;
+    par.tp_rank                            = 1;
+    par.tp_size                            = 2;
+    par.prefill_cp_config.kv_cache_sharded = true;
+
+    // warmup=true skips allocateAndSync (which would NCCL all-gather across the
+    // tp_size process group; in single-process UT there are no peers).  cp_slot_mapper_
+    // is constructed regardless of warmup, so cpSlotMapper() check is unaffected.
+    auto mgr = std::make_shared<KVCacheManager>(config, /*warmup=*/true, nullptr, KVCacheConfig{}, par);
+    ASSERT_TRUE(mgr->init());
+
+    auto mapper = mgr->cpSlotMapper();
+    ASSERT_NE(mapper, nullptr);
+    EXPECT_TRUE(mapper->isSharded());
+    EXPECT_EQ(mapper->cpRank(), 1);
+    EXPECT_EQ(mapper->cpSize(), 2);
+    EXPECT_EQ(mapper->blockSize(), seq_size_per_block);
+    EXPECT_EQ(mapper->virtualBlockSize(), seq_size_per_block * 2);
+}
+
+TEST_F(KVCacheManagerCPSlotMapperTest, CPShardingEnabled_CacheInfoReportsVirtualBlockSize) {
+    const int seq_size_per_block = 4;
+    auto      config             = makeTestConfig(/*block_num=*/20, seq_size_per_block);
+
+    ParallelismConfig par;
+    par.tp_rank                            = 0;
+    par.tp_size                            = 4;
+    par.prefill_cp_config.kv_cache_sharded = true;
+
+    auto mgr = std::make_shared<KVCacheManager>(config, /*warmup=*/true, nullptr, KVCacheConfig{}, par);
+    ASSERT_TRUE(mgr->init());
+
+    auto info = mgr->getKVCacheInfo(/*latest_version=*/-1, /*need_cache_keys=*/false);
+    EXPECT_EQ(info.block_size, static_cast<size_t>(seq_size_per_block * par.tp_size));
+}
+
+// Partial tails may be allocated as live KV blocks before they become cacheable
+// full blocks. CP invariants must therefore be based on logical sequence length,
+// not cacheKeys().size().
+TEST_F(KVCacheManagerCPSlotMapperTest, CPShardedMallocAllowsPartialTailWithoutCacheKey) {
+    const int seq_size_per_block = 4;
+    auto      config             = makeTestConfig(/*block_num=*/20, seq_size_per_block);
+
+    ParallelismConfig par;
+
+    auto mgr = std::make_shared<KVCacheManager>(config, /*warmup=*/false, nullptr, KVCacheConfig{}, par);
+    ASSERT_TRUE(mgr->init());
+
+    auto resource  = makeResource(1, config.layer_num);
+    auto token_ids = makeTokenIds(1, /*seq_len=*/1, seq_size_per_block);
+
+    MallocInfo info{resource, token_ids};
+    auto       cp_mapper = std::make_shared<CPSlotMapper>(0, 2, seq_size_per_block);
+    mgr->cp_slot_mapper_ = cp_mapper;
+    mgr->allocator_->setCPSlotMapper(cp_mapper);
+
+    auto result = mgr->malloc(info);
+    ASSERT_TRUE(result.success);
+    EXPECT_EQ(resource->blocksNum(0, 0), 1);
+
+    token_ids->setSeqLength(2);
+    result = mgr->malloc(info);
+    ASSERT_TRUE(result.success);
+    EXPECT_EQ(resource->blocksNum(0, 0), 1);
+    EXPECT_EQ(resource->cacheKeys(0).size(), 0);
+}
+
+// malloc() should use the manager-level cpSlotMapper.
+// With CP sharding (cp_size=2, block_size=4), virtual_block_size=8.
+// A sequence of 16 tokens needs ceil(16/8)=2 physical blocks per batch (not 4).
+// DISABLED: needs multi-rank NCCL harness (KVCacheManager::allocateAndSync calls
+// execAllGather across the tp_size group); covered end-to-end in Stage 6 smoke.
+TEST_F(KVCacheManagerCPSlotMapperTest, DISABLED_MallocAutoInjectReducesBlockCount) {
+    const int seq_size_per_block = 4;
+    auto      config             = makeTestConfig(/*block_num=*/20, seq_size_per_block);
+
+    ParallelismConfig par;
+    par.tp_rank                            = 0;
+    par.tp_size                            = 2;
+    par.prefill_cp_config.kv_cache_sharded = true;
+
+    // warmup=true skips allocateAndSync (which would NCCL all-gather across the
+    // tp_size process group; in single-process UT there are no peers).  cp_slot_mapper_
+    // is constructed regardless of warmup, so cpSlotMapper() check is unaffected.
+    auto mgr = std::make_shared<KVCacheManager>(config, /*warmup=*/true, nullptr, KVCacheConfig{}, par);
+    ASSERT_TRUE(mgr->init());
+
+    const int seq_len   = 16;
+    auto      resource  = makeResource(1, config.layer_num);
+    auto      token_ids = makeTokenIds(1, seq_len, seq_size_per_block);
+
+    MallocInfo info{resource, token_ids};
+    auto result = mgr->malloc(info);
+    ASSERT_TRUE(result.success);
+
+    // virtual_block_size = 4 * 2 = 8
+    // effectiveSeqLenForAlloc(16) = ceil(16/8) * 4 = 8 tokens worth => ceil(8/4) = 2 blocks
+    EXPECT_EQ(resource->blocksNum(0, 0), 2);
+}
+
+// Without CP sharding, the same seq_len should allocate more blocks.
+// DISABLED: needs multi-rank NCCL harness (KVCacheManager::allocateAndSync calls
+// execAllGather across the tp_size group); covered end-to-end in Stage 6 smoke.
+TEST_F(KVCacheManagerCPSlotMapperTest, DISABLED_MallocWithoutCPAllocatesFullBlocks) {
+    const int seq_size_per_block = 4;
+    auto      config             = makeTestConfig(/*block_num=*/20, seq_size_per_block);
+
+    ParallelismConfig par;
+    par.tp_rank                            = 0;
+    par.tp_size                            = 2;
+    par.prefill_cp_config.kv_cache_sharded = false;
+
+    // warmup=true skips allocateAndSync (which would NCCL all-gather across the
+    // tp_size process group; in single-process UT there are no peers).  cp_slot_mapper_
+    // is constructed regardless of warmup, so cpSlotMapper() check is unaffected.
+    auto mgr = std::make_shared<KVCacheManager>(config, /*warmup=*/true, nullptr, KVCacheConfig{}, par);
+    ASSERT_TRUE(mgr->init());
+
+    const int seq_len   = 16;
+    auto      resource  = makeResource(1, config.layer_num);
+    auto      token_ids = makeTokenIds(1, seq_len, seq_size_per_block);
+
+    MallocInfo info{resource, token_ids};
+    auto       result = mgr->malloc(info);
+    ASSERT_TRUE(result.success);
+
+    // Without CP: ceil(16/4) = 4 blocks
+    EXPECT_EQ(resource->blocksNum(0, 0), 4);
+}
+
+// Allocator-level cp_slot_mapper should drive malloc sharding.
+// DISABLED: needs multi-rank NCCL harness (KVCacheManager::allocateAndSync calls
+// execAllGather across the tp_size group); covered end-to-end in Stage 6 smoke.
+TEST_F(KVCacheManagerCPSlotMapperTest, DISABLED_AllocatorMapperControlsMalloc) {
+    const int seq_size_per_block = 4;
+    auto      config             = makeTestConfig(/*block_num=*/30, seq_size_per_block);
+
+    ParallelismConfig par;
+    par.tp_rank                            = 0;
+    par.tp_size                            = 2;
+    par.prefill_cp_config.kv_cache_sharded = true;
+
+    // warmup=true skips allocateAndSync (which would NCCL all-gather across the
+    // tp_size process group; in single-process UT there are no peers).  cp_slot_mapper_
+    // is constructed regardless of warmup, so cpSlotMapper() check is unaffected.
+    auto mgr = std::make_shared<KVCacheManager>(config, /*warmup=*/true, nullptr, KVCacheConfig{}, par);
+    ASSERT_TRUE(mgr->init());
+
+    const int seq_len   = 64;
+    auto      resource  = makeResource(1, config.layer_num);
+    auto      token_ids = makeTokenIds(1, seq_len, seq_size_per_block);
+
+    auto explicit_mapper = std::make_shared<CPSlotMapper>(0, 4, seq_size_per_block);
+    // virtual_block_size = 4 * 4 = 16
+    // effectiveSeqLenForAlloc(64) = ceil(64/16)*4 = 16 tokens => ceil(16/4) = 4 blocks
+
+    MallocInfo info{resource, token_ids};
+    mgr->cp_slot_mapper_ = explicit_mapper;
+    mgr->allocator_->setCPSlotMapper(explicit_mapper);
+    auto result         = mgr->malloc(info);
+    ASSERT_TRUE(result.success);
+
+    EXPECT_EQ(resource->blocksNum(0, 0), 4);
+}
+
+// insertIntoCache() should also use the manager-level mapper.
+// DISABLED: same reason as above (multi-rank harness needed).
+TEST_F(KVCacheManagerCPSlotMapperTest, DISABLED_InsertAutoInjectsMapper) {
+    const int seq_size_per_block = 4;
+    auto      config             = makeTestConfig(/*block_num=*/20, seq_size_per_block);
+
+    ParallelismConfig par;
+    par.tp_rank                            = 0;
+    par.tp_size                            = 2;
+    par.prefill_cp_config.kv_cache_sharded = true;
+
+    KVCacheConfig kv_cfg;
+    kv_cfg.reuse_cache         = true;
+    kv_cfg.enable_device_cache = true;
+
+    auto mgr = std::make_shared<KVCacheManager>(config, false, nullptr, kv_cfg, par);
+    ASSERT_TRUE(mgr->init());
+    // virtual_block_size = 4 * 2 = 8
+    // effectiveSeqLenForAlloc(16) = ceil(16/8) * 4 = 8 tokens worth => ceil(8/4) = 2 blocks
+
+    const int seq_len   = 16;
+    auto      resource  = makeResource(1, config.layer_num);
+    auto      token_ids = makeTokenIds(1, seq_len, seq_size_per_block);
+
+    MallocInfo malloc_info{resource, token_ids};
+    malloc_info.reuse_cache         = true;
+    malloc_info.enable_device_cache = true;
+    auto result                     = mgr->malloc(malloc_info);
+    ASSERT_TRUE(result.success);
+
+    // Insert into cache using the allocator-level cp_slot_mapper.
+    // This should not crash and should use sharded insert logic.
+    InsertInfo insert_info{resource, token_ids, /*is_resident=*/false};
+    EXPECT_NO_THROW(mgr->insertIntoCache(insert_info));
+
+    // Now try to malloc again with the same token_ids -- should get reuse hit.
+    auto       resource2 = makeResource(1, config.layer_num);
+    MallocInfo malloc_info2{resource2, token_ids};
+    malloc_info2.reuse_cache         = true;
+    malloc_info2.enable_device_cache = true;
+    auto result2                     = mgr->malloc(malloc_info2);
+    ASSERT_TRUE(result2.success);
+    // With CP sharding (cp_size=2, block_size=4), virtual_block_size=8.
+    // seq_len=16 produces 2 cache keys (each covering 8 tokens).
+    // match drops the last key → 1 matched key → reuse_len = 1 * virtual_block_size = 8.
+    // The sharded reuse_length adjustment ensures this is 1 * virtual_block_size = 8, not 1 * seq_size_per_block = 4.
+    EXPECT_EQ(result2.reuse_len, seq_size_per_block * par.tp_size);  // = 4 * 2 = 8
+}
+
+}  // namespace test
+}  // namespace rtp_llm
diff --git a/rtp_llm/cpp/cache/test/KVCacheManagerTest.cc b/rtp_llm/cpp/cache/test/KVCacheManagerTest.cc
index 36d45115e8..d55a76e97e 100644
--- a/rtp_llm/cpp/cache/test/KVCacheManagerTest.cc
+++ b/rtp_llm/cpp/cache/test/KVCacheManagerTest.cc
@@ -4,10 +4,14 @@
 #include <memory>
 #include <optional>
 #include <algorithm>
+#include <limits>
 #include <thread>
 
 #include "kmonitor/client/MetricsReporter.h"
-#include "rtp_llm/cpp/cache/BlockCache.h"
+#include "rtp_llm/cpp/cache/SharedBlockCache.h"
+#include "rtp_llm/cpp/cache/config_creator/CacheConfigCreator.h"
+#include "rtp_llm/cpp/cache/allocator/HybridPoolKVCacheAllocator.h"
+#include "rtp_llm/cpp/cache/config_creator/HybridPoolConfigCreator.h"
 #include "rtp_llm/cpp/cache/KVCacheManager.h"
 #include "rtp_llm/cpp/cache/test/CacheConfigTestUtils.h"
 #include "rtp_llm/cpp/cache/test/BlockPoolTestHelper.h"
@@ -16,19 +20,35 @@
 #include "rtp_llm/cpp/cache/connector/test/mock/MockAsyncContext.h"
 #include "rtp_llm/cpp/cache/connector/test/mock/MockKVCacheConnectorCoordinator.h"
 #include "rtp_llm/cpp/cache/connector/test/mock/MockKVCacheConnectorReadWriteContext.h"
+#include "rtp_llm/cpp/config/ModelConfig.h"
+#include "rtp_llm/cpp/config/StaticConfig.h"
+#include "rtp_llm/cpp/engine_base/stream/CompleteTokenIds.h"
 #include "rtp_llm/cpp/utils/Logger.h"
 
 namespace rtp_llm {
 namespace test {
 
+namespace {
+constexpr int kDsv4PoolNum = 7;
+const std::vector<std::string> kDsv4Tags = {
+    "swa_kv", "csa_kv", "indexer_kv", "indexer_state", "csa_state", "hca_kv", "hca_state"};
+}
+
 class KVCacheManagerTest: public ::testing::Test {
 protected:
     void SetUp() override {
+        old_core_dump_on_exception_                 = StaticConfig::user_ft_core_dump_on_exception;
+        StaticConfig::user_ft_core_dump_on_exception = false;
         rtp_llm::initLogger();
         createDevice();
     }
 
-protected:
+    void TearDown() override {
+        StaticConfig::user_ft_core_dump_on_exception = old_core_dump_on_exception_;
+    }
+
+private:
+    bool old_core_dump_on_exception_{false};
 };
 
 static void assertBlockBytesEq(const std::shared_ptr<rtp_llm::KVCacheManager>& cache_manager,
@@ -79,6 +99,207 @@ static void assertScaleEq(const std::shared_ptr<rtp_llm::KVCacheManager>& cache_
     }
 }
 
+static ModelConfig makeDSV4ManagerFlashModelConfig() {
+    ModelConfig mc;
+    mc.num_layers                   = 43;
+    mc.hidden_size                  = 4096;
+    mc.attn_config.head_num         = 64;
+    mc.attn_config.kv_head_num      = 1;
+    mc.attn_config.size_per_head    = 512;
+    mc.attn_config.rope_head_dim    = 64;
+    mc.attn_config.sliding_window   = 128;
+    mc.attn_config.indexer_head_dim = 128;
+    mc.attn_config.indexer_head_num = 64;
+    mc.attn_config.indexer_topk     = 512;
+    mc.attn_config.o_groups         = 8;
+    mc.attn_config.o_lora_rank      = 1024;
+    std::vector<int> ratios         = {0, 0};
+    for (int i = 2; i < 43; i++) {
+        ratios.push_back((i % 2 == 0) ? 4 : 128);
+    }
+    ratios.push_back(0);
+    mc.attn_config.layer_compress_ratios = ratios;
+    mc.hybrid_attention_config.enable_hybrid_attention           = true;
+    mc.hybrid_attention_config.enable_independent_kv_cache_pools = true;
+    setDsv4KvCacheSpecs(mc);
+    return mc;
+}
+
+static void setGroupBlockNumsForTest(CacheConfig& config, const std::vector<uint32_t>& block_nums) {
+    std::vector<size_t> kv_strides;
+    std::vector<size_t> scale_strides;
+    kv_strides.reserve(static_cast<size_t>(config.groupNums()));
+    scale_strides.reserve(static_cast<size_t>(config.groupNums()));
+    for (size_t gid = 0; gid < static_cast<size_t>(config.groupNums()); ++gid) {
+        kv_strides.push_back(config.kvBlockStrideBytesForGroup(gid));
+        scale_strides.push_back(config.kvScaleStrideBytesForGroup(gid));
+    }
+    config.setGroupBlockLayout(block_nums, kv_strides, scale_strides);
+}
+
+static CacheConfig makeCompactDSV4ManagerConfig(uint32_t block_num = 16) {
+    ParallelismConfig pc;
+    KVCacheConfig     kv_cache_config;
+    kv_cache_config.seq_size_per_block     = 128;
+    auto              mc = makeDSV4ManagerFlashModelConfig();
+    setDsv4ExplicitPoolBlocks(mc, "hca_state", 0);
+    auto              config = CacheConfigCreator::createBasicConfig(mc, pc, kv_cache_config, false, 0);
+    config.block_num         = block_num;
+    setGroupBlockNumsForTest(config, std::vector<uint32_t>(static_cast<size_t>(config.groupNums()), block_num));
+    return config;
+}
+
+static bool isValidGroup(const CacheConfig& config, int gid) {
+    return gid >= 0 && static_cast<size_t>(gid) < static_cast<size_t>(config.groupNums());
+}
+
+static bool isFullGroup(const CacheConfig& config, int gid) {
+    return isValidGroup(config, gid) && config.typeForGroup(static_cast<size_t>(gid)) == CacheGroupType::FULL;
+}
+
+static bool isFixedTailGroup(const CacheConfig& config, int gid) {
+    return isValidGroup(config, gid) && config.typeForGroup(static_cast<size_t>(gid)) != CacheGroupType::FULL;
+}
+
+static bool isHcaStateGroup(const CacheConfig& config, int gid) {
+    return isValidGroup(config, gid) && config.tagForGroup(static_cast<size_t>(gid)) == "hca_state";
+}
+
+static std::vector<int> dsv4GroupIdsByType(const CacheConfig& config, CacheGroupType type) {
+    std::vector<int> group_ids;
+    for (int gid = 0; gid < config.groupNums(); ++gid) {
+        if (config.typeForGroup(static_cast<size_t>(gid)) == type) {
+            group_ids.push_back(gid);
+        }
+    }
+    return group_ids;
+}
+
+static std::vector<int> dsv4FixedTailGroupIds(const CacheConfig& config) {
+    std::vector<int> group_ids;
+    for (int gid = 0; gid < config.groupNums(); ++gid) {
+        if (isFixedTailGroup(config, gid)) {
+            group_ids.push_back(gid);
+        }
+    }
+    return group_ids;
+}
+
+static int dsv4ActiveTailBlocks(const CacheConfig& config, int gid) {
+    return isHcaStateGroup(config, gid) ? 1 : 2;
+}
+
+static void expectDsv4SwaAllocatedBlocks(const CacheConfig& config,
+                                         const BlockIndicesType& blocks,
+                                         int gid,
+                                         const std::string& label,
+                                         bool enable_reuse_cache = false) {
+    const int active_tail_blocks = dsv4ActiveTailBlocks(config, gid);
+    const int tail_begin         = std::max(static_cast<int>(blocks.size()) - active_tail_blocks, 0);
+    const int linear_step        = std::max(1, config.linear_step);
+    const bool effective_reuse   = enable_reuse_cache && !isHcaStateGroup(config, gid);
+    for (int i = 0; i < static_cast<int>(blocks.size()); ++i) {
+        const bool should_allocate = i >= tail_begin || (effective_reuse && ((i + 1) % linear_step == 0));
+        if (should_allocate) {
+            EXPECT_FALSE(isNullBlockIdx(blocks[static_cast<size_t>(i)]))
+                << label << " group " << gid << " pos " << i;
+        } else {
+            EXPECT_TRUE(isNullBlockIdx(blocks[static_cast<size_t>(i)]))
+                << label << " group " << gid << " pos " << i;
+        }
+    }
+}
+
+// Creates an intentionally tight DSV4 config for eviction stress tests: FULL
+// groups use a large paged pool, while SWA groups use a small independent pool.
+static CacheConfig makeDSV4ConfigWithConcurrencyPool(uint32_t full_block_num, uint32_t swa_batch_size) {
+    ParallelismConfig pc;
+    KVCacheConfig     kv_cache_config;
+    kv_cache_config.seq_size_per_block     = 128;
+    auto              mc = makeDSV4ManagerFlashModelConfig();
+    setDsv4ExplicitPoolBlocks(mc, "hca_state", 0);
+    auto              config = CacheConfigCreator::createBasicConfig(mc, pc, kv_cache_config, false, 0);
+    config.block_num         = full_block_num;
+    std::vector<uint32_t> block_nums(static_cast<size_t>(config.groupNums()), full_block_num);
+    for (int gid = 0; gid < config.groupNums(); ++gid) {
+        block_nums[static_cast<size_t>(gid)] = isFullGroup(config, gid) ? full_block_num : (2u * swa_batch_size);
+    }
+    setGroupBlockNumsForTest(config, block_nums);
+    return config;
+}
+
+static CacheConfig
+makeProductionDSV4Config(uint32_t full_block_num, uint32_t max_concurrency, uint32_t hca_state_pool_blocks = 4) {
+    ParallelismConfig pc;
+    RuntimeConfig     runtime_config;
+    KVCacheConfig     kv_cache_config;
+    kv_cache_config.seq_size_per_block                         = 128;
+    kv_cache_config.test_block_num                              = full_block_num;
+    auto              mc = makeDSV4ManagerFlashModelConfig();
+    setDsv4ExplicitPoolBlocks(mc, "hca_state", hca_state_pool_blocks);
+    runtime_config.max_generate_batch_size                      = max_concurrency;
+    runtime_config.fifo_scheduler_config.max_context_batch_size = max_concurrency;
+    return CacheConfigCreator::createConfig(mc, pc, runtime_config, kv_cache_config);
+}
+
+static BatchKVCacheResourcePtr makeDSV4BatchResource(const CacheConfig& config) {
+    auto res = std::make_shared<BatchKVCacheResource>();
+    res->resetBatchSize(1);
+    res->initGroups(config.groupNums(),
+                    static_cast<int>(config.layer_all_num),
+                    config.layerGroupIdsSnapshot(),
+                    config.kernelBlocksPerKvBlock(),
+                    config.groupTypesSnapshot());
+    return res;
+}
+
+static CompleteTokenIdsPtr makeDSV4CompleteTokenIds(int initial_seq_len, int max_seq_len, int seq_size_per_block) {
+    auto input_ids      = torch::arange(max_seq_len, torch::kInt32);
+    auto gi             = std::make_shared<GenerateInput>();
+    gi->input_ids       = input_ids;
+    gi->generate_config = std::make_shared<GenerateConfig>();
+
+    auto complete_token_ids = std::make_shared<CompleteTokenIds>(1, 1, max_seq_len + 16, seq_size_per_block);
+    complete_token_ids->init(gi);
+    complete_token_ids->setSeqLength(initial_seq_len);
+    return complete_token_ids;
+}
+
+static void writeDsv4RegionPattern(const std::shared_ptr<KVCacheManager>& manager,
+                                   int                                    block_id,
+                                   int                                    layer_id,
+                                   int                                    group_id,
+                                   size_t                                 bytes,
+                                   uint8_t                                pattern) {
+    auto addr_info = manager->convertIndexToAddr(block_id, layer_id, group_id);
+    ASSERT_NE(addr_info.kv_addr, nullptr);
+
+    auto dst =
+        torch::from_blob(addr_info.kv_addr, {(int64_t)bytes}, torch::TensorOptions(torch::kUInt8).device(torch::kCUDA));
+    auto src = torch::full({(int64_t)bytes}, pattern, torch::TensorOptions(torch::kUInt8).device(torch::kCPU));
+    dst.copy_(src);
+    runtimeSyncAndCheck();
+}
+
+static void assertDsv4RegionPatternEq(const std::shared_ptr<KVCacheManager>& manager,
+                                      int                                    block_id,
+                                      int                                    layer_id,
+                                      int                                    group_id,
+                                      size_t                                 bytes,
+                                      uint8_t                                expected) {
+    auto addr_info = manager->convertIndexToAddr(block_id, layer_id, group_id);
+    ASSERT_NE(addr_info.kv_addr, nullptr);
+
+    auto dev_t =
+        torch::from_blob(addr_info.kv_addr, {(int64_t)bytes}, torch::TensorOptions(torch::kUInt8).device(torch::kCUDA));
+    auto        host_t = dev_t.cpu();
+    const auto* ptr    = host_t.data_ptr<uint8_t>();
+    for (size_t i = 0; i < bytes; ++i) {
+        ASSERT_EQ(ptr[i], expected) << "mismatch at byte " << i << " layer=" << layer_id << " block=" << block_id
+                                    << " group=" << group_id;
+    }
+}
+
 TEST_F(KVCacheManagerTest, WarmupConfigSmoke) {
     auto cache_config = makeSimpleMhaCacheConfig(
         /*layer_num=*/1, /*block_num=*/4, /*tokens_per_block=*/2, rtp_llm::DataType::TYPE_INT8);
@@ -92,6 +313,39 @@ TEST_F(KVCacheManagerTest, WarmupConfigSmoke) {
     EXPECT_EQ(cache_manager->freeBlocksNum(), 0);
 }
 
+TEST_F(KVCacheManagerTest, DSV4IndependentPoolsUseGpuBacking) {
+    auto expect_pool_backing = [](RoleType role_type) {
+        auto config = makeCompactDSV4ManagerConfig(/*block_num=*/8);
+
+        PDSepConfig pd_sep_config;
+        pd_sep_config.role_type = role_type;
+        KVCacheConfig kv_cache_config;
+        auto          cache_manager = std::make_shared<KVCacheManager>(config,
+                                                              /*warmup=*/false,
+                                                              nullptr,
+                                                              kv_cache_config,
+                                                              ParallelismConfig{},
+                                                              RuntimeConfig{},
+                                                              SpeculativeExecutionConfig{},
+                                                              pd_sep_config);
+        ASSERT_TRUE(cache_manager->init());
+
+        auto allocator = std::dynamic_pointer_cast<HybridPoolKVCacheAllocator>(cache_manager->allocator_);
+        ASSERT_NE(allocator, nullptr);
+        ASSERT_EQ(allocator->groupBlockPools().size(), static_cast<size_t>(config.groupNums()));
+
+        for (size_t gid = 0; gid < allocator->groupBlockPools().size(); ++gid) {
+            const auto& tag = config.tagForGroup(gid);
+            EXPECT_EQ(allocator->groupBlockPools()[gid]->where(), MemoryType::MEMORY_GPU)
+                << "role=" << static_cast<int>(role_type) << " gid=" << gid << " tag=" << tag;
+        }
+    };
+
+    expect_pool_backing(RoleType::PREFILL);
+    expect_pool_backing(RoleType::DECODE);
+    expect_pool_backing(RoleType::PDFUSION);
+}
+
 TEST_F(KVCacheManagerTest, MetricsThreadSmoke) {
     auto cache_config = makeSimpleMhaCacheConfig(
         /*layer_num=*/1, /*block_num=*/4, /*tokens_per_block=*/2, rtp_llm::DataType::TYPE_INT8);
@@ -115,7 +369,7 @@ TEST_F(KVCacheManagerTest, SetKVBlockValueAndBlockCopy) {
     auto cache_manager = std::make_shared<KVCacheManager>(cache_config, /*warmup=*/false);
     ASSERT_TRUE(cache_manager->init());
 
-    auto&        spec    = cache_manager->cacheConfig().cache_specs[0];
+    auto&        spec    = cache_manager->cacheConfig().specForGroup(0);
     const size_t k_bytes = spec->k_block_size_bytes();
     const size_t v_bytes = spec->v_block_size_bytes();
     ASSERT_GT(k_bytes, 0u);
@@ -129,7 +383,7 @@ TEST_F(KVCacheManagerTest, SetKVBlockValueAndBlockCopy) {
     auto                k_t = torch::from_blob(k_vec.data(), {(int64_t)k_bytes}, torch::kInt8).clone();
     auto                v_t = torch::from_blob(v_vec.data(), {(int64_t)v_bytes}, torch::kInt8).clone();
 
-    ASSERT_TRUE(cache_manager->setKVBlockValue(block_src, k_t, v_t));
+    ASSERT_TRUE(cache_manager->writeKVBlockForTest(block_src, k_t, v_t));
 
     std::vector<int8_t> expected_block(k_bytes + v_bytes, 0);
     std::fill(expected_block.begin(), expected_block.begin() + k_bytes, 7);
@@ -149,7 +403,7 @@ TEST_F(KVCacheManagerTest, SetKVBlockValueAndBlockCopy) {
     std::vector<int8_t> v2_vec(v_bytes, 2);
     auto                k2_t = torch::from_blob(k2_vec.data(), {(int64_t)k_bytes}, torch::kInt8).clone();
     auto                v2_t = torch::from_blob(v2_vec.data(), {(int64_t)v_bytes}, torch::kInt8).clone();
-    ASSERT_TRUE(cache_manager->setKVBlockValue(block_dst, /*layer_id=*/0, k2_t, v2_t));
+    ASSERT_TRUE(cache_manager->writeKVBlockForTest(block_dst, /*layer_id=*/0, k2_t, v2_t));
 
     std::vector<int8_t> expected_layer0(k_bytes + v_bytes, 0);
     std::fill(expected_layer0.begin(), expected_layer0.begin() + k_bytes, 1);
@@ -212,7 +466,7 @@ TEST_F(KVCacheManagerTest, BlockBatchCopy) {
     auto cache_manager = std::make_shared<KVCacheManager>(cache_config, /*warmup=*/false);
     ASSERT_TRUE(cache_manager->init());
 
-    auto&        spec    = cache_manager->cacheConfig().cache_specs[0];
+    auto&        spec    = cache_manager->cacheConfig().specForGroup(0);
     const size_t k_bytes = spec->k_block_size_bytes();
     const size_t v_bytes = spec->v_block_size_bytes();
 
@@ -226,7 +480,7 @@ TEST_F(KVCacheManagerTest, BlockBatchCopy) {
         std::vector<int8_t> v_vec(v_bytes, static_cast<int8_t>(block_id + 10));
         auto                k_t = torch::from_blob(k_vec.data(), {(int64_t)k_bytes}, torch::kInt8).clone();
         auto                v_t = torch::from_blob(v_vec.data(), {(int64_t)v_bytes}, torch::kInt8).clone();
-        ASSERT_TRUE(cache_manager->setKVBlockValue(block_id, k_t, v_t));
+        ASSERT_TRUE(cache_manager->writeKVBlockForTest(block_id, k_t, v_t));
     }
 
     std::vector<BlockIdPair> mapping;
@@ -253,6 +507,362 @@ TEST_F(KVCacheManagerTest, BlockBatchCopy) {
     }
 }
 
+TEST_F(KVCacheManagerTest, DSV4MallocIncrFreeExposesSevenTypedRegions) {
+    auto manager_config = makeCompactDSV4ManagerConfig(/*block_num=*/16);
+    auto manager        = std::make_shared<KVCacheManager>(manager_config, /*warmup=*/false);
+    ASSERT_TRUE(manager->init());
+
+    const size_t free_before = manager->freeBlocksNum();
+    const int    spb         = static_cast<int>(manager_config.seq_size_per_block);
+    auto         resource    = makeDSV4BatchResource(manager_config);
+    auto         tokens      = makeDSV4CompleteTokenIds(/*initial_seq_len=*/2 * spb + 17,
+                                           /*max_seq_len=*/4 * spb + 32,
+                                           spb);
+
+    MallocInfo malloc_info{resource, tokens};
+    malloc_info.reuse_cache         = false;
+    malloc_info.enable_device_cache = false;
+    auto malloc_result              = manager->malloc(malloc_info);
+    ASSERT_TRUE(malloc_result.success);
+    ASSERT_EQ(resource->groupNums(), kDsv4PoolNum);
+
+    for (int gid = 0; gid < kDsv4PoolNum; ++gid) {
+        ASSERT_EQ(resource->blocksNum(0, gid), 3) << "group " << gid;
+        const auto& blocks = resource->blocks(0, gid);
+        if (isFullGroup(manager_config, gid)) {
+            EXPECT_FALSE(isNullBlockIdx(blocks[0])) << "paged group " << gid;
+            EXPECT_FALSE(isNullBlockIdx(blocks[1])) << "paged group " << gid;
+            EXPECT_FALSE(isNullBlockIdx(blocks[2])) << "paged group " << gid;
+        } else {
+            expectDsv4SwaAllocatedBlocks(manager_config, blocks, gid, "tail group");
+        }
+    }
+
+    tokens->setSeqLength(4 * spb);
+    MallocInfo incr_info{resource, tokens};
+    incr_info.reuse_cache         = false;
+    incr_info.enable_device_cache = false;
+    auto incr_result              = manager->malloc(incr_info);
+    ASSERT_TRUE(incr_result.success);
+
+    for (int gid = 0; gid < kDsv4PoolNum; ++gid) {
+        EXPECT_EQ(resource->blocksNum(0, gid), 4) << "group " << gid;
+    }
+
+    auto layout = manager->getMainModelCacheLayerLayout();
+    ASSERT_EQ(layout.group_tags.size(), static_cast<size_t>(kDsv4PoolNum));
+    EXPECT_EQ(layout.group_tags, kDsv4Tags);
+    ASSERT_EQ(layout.group_seq_size_per_block, manager_config.group_seq_size_per_block);
+    EXPECT_EQ(layout.layers_to_kv_buffer_ptrs_by_group.size(), static_cast<size_t>(manager_config.layer_num));
+
+    const int swa_gid           = manager_config.groupIdForTag("swa_kv");
+    const int csa_gid           = manager_config.groupIdForTag("csa_kv");
+    const int indexer_gid       = manager_config.groupIdForTag("indexer_kv");
+    const int csa_state_gid     = manager_config.groupIdForTag("csa_state");
+    const int hca_gid           = manager_config.groupIdForTag("hca_kv");
+    const int hca_state_gid     = manager_config.groupIdForTag("hca_state");
+    const int csa_layer         = manager_config.layerIdsForGroup(static_cast<size_t>(csa_gid))[0];
+    const int hca_layer         = manager_config.layerIdsForGroup(static_cast<size_t>(hca_gid))[0];
+    EXPECT_NE(manager->convertIndexToAddr(resource->blocks(0, csa_gid)[0], csa_layer, csa_gid).kv_addr, nullptr);
+    EXPECT_NE(manager->convertIndexToAddr(resource->blocks(0, indexer_gid)[0], csa_layer, indexer_gid).kv_addr,
+              nullptr);
+    EXPECT_NE(manager->convertIndexToAddr(resource->blocks(0, csa_state_gid)[2], csa_layer, csa_state_gid).kv_addr,
+              nullptr);
+    EXPECT_NE(manager->convertIndexToAddr(resource->blocks(0, hca_state_gid).back(), hca_layer, hca_state_gid).kv_addr,
+              nullptr);
+    EXPECT_NE(manager->convertIndexToAddr(resource->blocks(0, hca_gid)[0], hca_layer, hca_gid).kv_addr, nullptr);
+    EXPECT_NE(manager->convertIndexToAddr(resource->blocks(0, swa_gid)[2], csa_layer, swa_gid).kv_addr, nullptr);
+    EXPECT_ANY_THROW((void)manager->convertIndexToAddr(resource->blocks(0, hca_gid)[0], csa_layer, hca_gid));
+
+    FreeInfo free_info{resource, tokens};
+    manager->free(free_info);
+    EXPECT_EQ(manager->freeBlocksNum(), free_before);
+}
+
+TEST_F(KVCacheManagerTest, DSV4LayerRegionBlockTablesMatchInferenceAccessPattern) {
+    auto manager_config = makeCompactDSV4ManagerConfig(/*block_num=*/16);
+    auto manager        = std::make_shared<KVCacheManager>(manager_config, /*warmup=*/false);
+    ASSERT_TRUE(manager->init());
+
+    const int spb      = static_cast<int>(manager_config.seq_size_per_block);
+    auto      resource = makeDSV4BatchResource(manager_config);
+    auto      tokens   = makeDSV4CompleteTokenIds(/*initial_seq_len=*/3 * spb + 17,
+                                           /*max_seq_len=*/4 * spb + 32,
+                                           spb);
+
+    MallocInfo malloc_info{resource, tokens};
+    malloc_info.reuse_cache         = false;
+    malloc_info.enable_device_cache = false;
+    ASSERT_TRUE(manager->malloc(malloc_info).success);
+
+    auto expectTagGroup = [&](int layer_id, const std::string& tag, int expected_gid) {
+        EXPECT_EQ(manager_config.groupIdForLayerTag(layer_id, tag), expected_gid)
+            << "layer=" << layer_id << " tag=" << tag;
+        EXPECT_EQ(resource->groupId(/*batch_id=*/0, layer_id, expected_gid), expected_gid)
+            << "layer=" << layer_id << " tag=" << tag;
+        EXPECT_EQ(resource->blocks(/*batch_id=*/0, layer_id, expected_gid), resource->blocks(0, expected_gid))
+            << "layer=" << layer_id << " tag=" << tag;
+        EXPECT_EQ(resource->kernelBlocks(/*batch_id=*/0, layer_id, expected_gid),
+                  resource->kernelBlocks(0, expected_gid))
+            << "layer=" << layer_id << " tag=" << tag;
+    };
+
+    // Flash DSV4 layers 0/1 are SWA-only. Inference resolves typed block tables by semantic tag.
+    expectTagGroup(/*layer_id=*/0, "swa_kv", manager_config.groupIdForTag("swa_kv"));
+    EXPECT_THROW((void)manager_config.groupIdForLayerTag(/*layer_id=*/0, "csa_kv"), std::exception);
+    EXPECT_THROW((void)manager_config.groupIdForLayerTag(/*layer_id=*/0, "hca_kv"), std::exception);
+
+    // Layer 2 is CSA: CSA_KV + INDEXER_KV + INDEXER_STATE + CSA_STATE + SWA_KV.
+    const int csa_layer = manager_config.layerIdsForGroup(static_cast<size_t>(manager_config.groupIdForTag("csa_kv")))[0];
+    expectTagGroup(csa_layer, "csa_kv", manager_config.groupIdForTag("csa_kv"));
+    expectTagGroup(csa_layer, "indexer_kv", manager_config.groupIdForTag("indexer_kv"));
+    expectTagGroup(csa_layer, "indexer_state", manager_config.groupIdForTag("indexer_state"));
+    expectTagGroup(csa_layer, "csa_state", manager_config.groupIdForTag("csa_state"));
+    expectTagGroup(csa_layer, "swa_kv", manager_config.groupIdForTag("swa_kv"));
+    EXPECT_THROW((void)manager_config.groupIdForLayerTag(csa_layer, "hca_kv"), std::exception);
+
+    // Layer 3 is HCA: HCA_KV + HCA_STATE + SWA_KV.
+    const int hca_layer = manager_config.layerIdsForGroup(static_cast<size_t>(manager_config.groupIdForTag("hca_kv")))[0];
+    expectTagGroup(hca_layer, "hca_kv", manager_config.groupIdForTag("hca_kv"));
+    expectTagGroup(hca_layer, "hca_state", manager_config.groupIdForTag("hca_state"));
+    expectTagGroup(hca_layer, "swa_kv", manager_config.groupIdForTag("swa_kv"));
+    EXPECT_THROW((void)manager_config.groupIdForLayerTag(hca_layer, "csa_kv"), std::exception);
+
+    FreeInfo free_info{resource, tokens};
+    manager->free(free_info);
+}
+
+TEST_F(KVCacheManagerTest, DSV4BlockCopyPreservesTypedRegionBytes) {
+    auto manager_config = makeCompactDSV4ManagerConfig(/*block_num=*/8);
+    auto manager        = std::make_shared<KVCacheManager>(manager_config, /*warmup=*/false);
+    ASSERT_TRUE(manager->init());
+
+    const int spb      = static_cast<int>(manager_config.seq_size_per_block);
+    const int seq_len  = 3 * spb + 1;
+    auto      resource = makeDSV4BatchResource(manager_config);
+    auto      tokens   = makeDSV4CompleteTokenIds(seq_len, seq_len, spb);
+
+    MallocInfo malloc_info{resource, tokens};
+    malloc_info.reuse_cache         = false;
+    malloc_info.enable_device_cache = false;
+    ASSERT_TRUE(manager->malloc(malloc_info).success);
+
+    const int src_block      = 1;
+    const int dst_block      = 2;
+    const int swa_gid        = manager_config.groupIdForTag("swa_kv");
+    const int csa_gid        = manager_config.groupIdForTag("csa_kv");
+    const int indexer_gid    = manager_config.groupIdForTag("indexer_kv");
+    const int indexer_state_gid = manager_config.groupIdForTag("indexer_state");
+    const int csa_state_gid  = manager_config.groupIdForTag("csa_state");
+    const int hca_gid        = manager_config.groupIdForTag("hca_kv");
+    const int hca_state_gid  = manager_config.groupIdForTag("hca_state");
+    const int csa_layer      = manager_config.layerIdsForGroup(static_cast<size_t>(csa_gid))[0];
+    const int hca_layer      = manager_config.layerIdsForGroup(static_cast<size_t>(hca_gid))[0];
+    const int swa_only_layer = 0;
+
+    for (int gid = 0; gid < kDsv4PoolNum; ++gid) {
+        const auto& blocks = resource->blocks(0, gid);
+        EXPECT_NE(std::find(blocks.begin(), blocks.end(), src_block), blocks.end()) << "group " << gid;
+        if (!isHcaStateGroup(manager_config, gid)) {
+            EXPECT_NE(std::find(blocks.begin(), blocks.end(), dst_block), blocks.end()) << "group " << gid;
+        }
+    }
+
+    struct RegionCase {
+        int               gid;
+        int               layer_id;
+        uint8_t           pattern;
+    };
+
+    const std::vector<RegionCase> cases = {
+        {swa_gid, csa_layer, 0x11},
+        {csa_gid, csa_layer, 0x22},
+        {indexer_gid, csa_layer, 0x33},
+        {indexer_state_gid, csa_layer, 0x44},
+        {csa_state_gid, csa_layer, 0x55},
+        {hca_gid, hca_layer, 0x66},
+        {hca_state_gid, hca_layer, 0x77},
+        {swa_gid, swa_only_layer, 0x88},
+    };
+
+    for (const auto& region_case : cases) {
+        const size_t bytes = manager_config.specForGroup(static_cast<size_t>(region_case.gid))->block_size_bytes();
+        ASSERT_GT(bytes, 0u);
+        writeDsv4RegionPattern(manager, src_block, region_case.layer_id, region_case.gid, bytes, region_case.pattern);
+        writeDsv4RegionPattern(manager, dst_block, region_case.layer_id, region_case.gid, bytes, 0);
+        assertDsv4RegionPatternEq(
+            manager, src_block, region_case.layer_id, region_case.gid, bytes, region_case.pattern);
+        assertDsv4RegionPatternEq(manager, dst_block, region_case.layer_id, region_case.gid, bytes, 0);
+    }
+
+    manager->blockCopy(src_block, dst_block);
+    runtimeSyncAndCheck();
+
+    for (const auto& region_case : cases) {
+        const size_t bytes = manager_config.specForGroup(static_cast<size_t>(region_case.gid))->block_size_bytes();
+        assertDsv4RegionPatternEq(manager, dst_block, region_case.layer_id, region_case.gid, bytes, region_case.pattern);
+    }
+
+    FreeInfo free_info{resource, tokens};
+    manager->free(free_info);
+}
+
+TEST_F(KVCacheManagerTest, DSV4InsertIntoDeviceBlockCacheThenReuseSamePrefix) {
+    auto manager_config = makeCompactDSV4ManagerConfig(/*block_num=*/16);
+    auto manager        = std::make_shared<KVCacheManager>(manager_config, /*warmup=*/false);
+    ASSERT_TRUE(manager->init());
+
+    const int spb     = static_cast<int>(manager_config.seq_size_per_block);
+    const int seq_len = 3 * spb + 17;
+
+    auto first_resource = makeDSV4BatchResource(manager_config);
+    auto first_tokens   = makeDSV4CompleteTokenIds(seq_len, seq_len, spb);
+
+    MallocInfo first_malloc{first_resource, first_tokens};
+    first_malloc.reuse_cache         = true;
+    first_malloc.enable_device_cache = false;
+    ASSERT_TRUE(manager->malloc(first_malloc).success);
+
+    std::vector<BlockIndicesType> first_blocks;
+    first_blocks.reserve(kDsv4PoolNum);
+    for (int gid = 0; gid < kDsv4PoolNum; ++gid) {
+        first_blocks.push_back(first_resource->blocks(0, gid));
+    }
+
+    InsertInfo insert_info{first_resource, first_tokens, /*is_resident=*/false};
+    manager->insertIntoCache(insert_info);
+
+    FreeInfo first_free{first_resource, first_tokens};
+    manager->free(first_free);
+
+    auto second_resource = makeDSV4BatchResource(manager_config);
+    auto second_tokens   = makeDSV4CompleteTokenIds(seq_len, seq_len, spb);
+
+    MallocInfo second_malloc{second_resource, second_tokens};
+    second_malloc.reuse_cache         = true;
+    second_malloc.enable_device_cache = true;
+    auto reuse_result                 = manager->malloc(second_malloc);
+    ASSERT_TRUE(reuse_result.success);
+    EXPECT_GE(reuse_result.reuse_len, spb);
+
+    for (int gid : dsv4GroupIdsByType(manager_config, CacheGroupType::FULL)) {
+        ASSERT_GE(second_resource->blocksNum(0, gid), 3) << "paged group " << gid;
+        EXPECT_EQ(second_resource->blocks(0, gid)[0], first_blocks[gid][0]);
+        EXPECT_EQ(second_resource->blocks(0, gid)[1], first_blocks[gid][1]);
+    }
+    for (int gid : dsv4FixedTailGroupIds(manager_config)) {
+        if (manager_config.policyForGroup(static_cast<size_t>(gid)).reuse_policy == CacheReusePolicy::NON_REUSABLE) {
+            continue;
+        }
+        ASSERT_GE(second_resource->blocksNum(0, gid), 3) << "tail group " << gid;
+        EXPECT_EQ(second_resource->blocks(0, gid)[2], first_blocks[gid][2]);
+    }
+
+    FreeInfo second_free{second_resource, second_tokens};
+    manager->free(second_free);
+}
+
+TEST_F(KVCacheManagerTest, DSV4InitReuseKeepsSWAPrefixTailBlock) {
+    auto manager_config = makeCompactDSV4ManagerConfig(/*block_num=*/64);
+    auto manager        = std::make_shared<KVCacheManager>(manager_config, /*warmup=*/false);
+    ASSERT_TRUE(manager->init());
+
+    const int spb = static_cast<int>(manager_config.seq_size_per_block);
+
+    auto first_resource = makeDSV4BatchResource(manager_config);
+    auto first_tokens   = makeDSV4CompleteTokenIds(/*initial_seq_len=*/4 * spb, /*max_seq_len=*/4 * spb + 1, spb);
+
+    MallocInfo first_malloc{first_resource, first_tokens};
+    first_malloc.reuse_cache         = false;
+    first_malloc.enable_device_cache = false;
+    ASSERT_TRUE(manager->malloc(first_malloc).success);
+
+    std::vector<BlockIdxType> first_swa_tail_blocks(static_cast<size_t>(kDsv4PoolNum), NULL_BLOCK_IDX);
+    for (int gid : dsv4FixedTailGroupIds(manager_config)) {
+        ASSERT_EQ(first_resource->blocksNum(0, gid), 4) << "first SWA group " << gid;
+        expectDsv4SwaAllocatedBlocks(manager_config, first_resource->blocks(0, gid), gid, "first SWA");
+        first_swa_tail_blocks[static_cast<size_t>(gid)] = first_resource->blocks(0, gid)[3];
+    }
+
+    // Simulate one generated token before inserting into the device cache, so
+    // the fourth full block is cached and can be reused by the next prefill.
+    first_tokens->setSeqLength(4 * spb + 1);
+    manager->insertIntoCache(InsertInfo{first_resource, first_tokens, /*is_resident=*/false});
+    manager->free(FreeInfo{first_resource, first_tokens});
+
+    auto second_resource = makeDSV4BatchResource(manager_config);
+    auto second_tokens   = makeDSV4CompleteTokenIds(/*initial_seq_len=*/24 * spb, /*max_seq_len=*/24 * spb, spb);
+
+    MallocInfo second_malloc{second_resource, second_tokens};
+    second_malloc.reuse_cache                  = true;
+    second_malloc.enable_device_cache          = true;
+    second_malloc.enable_remove_skipped_blocks = false;
+    auto reuse_result                          = manager->malloc(second_malloc);
+    ASSERT_TRUE(reuse_result.success);
+    EXPECT_EQ(reuse_result.reuse_len, 4 * spb);
+
+    for (int gid : dsv4FixedTailGroupIds(manager_config)) {
+        if (manager_config.policyForGroup(static_cast<size_t>(gid)).reuse_policy == CacheReusePolicy::NON_REUSABLE) {
+            continue;
+        }
+        const auto& blocks = second_resource->blocks(0, gid);
+        ASSERT_EQ(blocks.size(), 24u) << "second SWA group " << gid;
+        EXPECT_TRUE(isNullBlockIdx(blocks[2])) << "SWA reuse prefix penultimate block is NULL (no prev lookup)";
+        EXPECT_EQ(blocks[3], first_swa_tail_blocks[static_cast<size_t>(gid)])
+            << "SWA reuse prefix tail block must stay readable";
+        EXPECT_FALSE(isNullBlockIdx(blocks[22])) << "second SWA group " << gid << " fresh tail block 22";
+        EXPECT_FALSE(isNullBlockIdx(blocks[23])) << "second SWA group " << gid << " fresh tail block 23";
+    }
+
+    manager->free(FreeInfo{second_resource, second_tokens});
+}
+
+TEST_F(KVCacheManagerTest, DSV4PopCachedBlocksPreservesGroupShape) {
+    auto manager_config = makeCompactDSV4ManagerConfig(/*block_num=*/16);
+    auto manager        = std::make_shared<KVCacheManager>(manager_config, /*warmup=*/false);
+    ASSERT_TRUE(manager->init());
+
+    const int spb      = static_cast<int>(manager_config.seq_size_per_block);
+    const int seq_len  = 3 * spb + 1;
+    auto      resource = makeDSV4BatchResource(manager_config);
+    auto      tokens   = makeDSV4CompleteTokenIds(seq_len, seq_len, spb);
+
+    MallocInfo malloc_info{resource, tokens};
+    malloc_info.reuse_cache         = true;
+    malloc_info.enable_device_cache = false;
+    ASSERT_TRUE(manager->malloc(malloc_info).success);
+
+    InsertInfo insert_info{resource, tokens, /*is_resident=*/false};
+    manager->insertIntoCache(insert_info);
+    FreeInfo free_info{resource, tokens};
+    manager->free(free_info);
+
+    auto evicted = manager->popBlocksFromCache(/*min_blocks_to_free=*/10);
+    ASSERT_NE(evicted, nullptr);
+    ASSERT_TRUE(evicted->hasCacheKeys());
+    EXPECT_EQ(evicted->groupNums(), kDsv4PoolNum);
+    EXPECT_EQ(evicted->cacheResource(0).layerGroupBlocks().size(), static_cast<size_t>(manager_config.layer_num));
+
+    bool saw_paged_block = false;
+    bool saw_tail_block  = false;
+    for (int gid = 0; gid < kDsv4PoolNum; ++gid) {
+        ASSERT_EQ(evicted->blocksNum(0, gid), static_cast<int>(evicted->cacheKeys(0).size())) << "group " << gid;
+        for (auto block : evicted->blocks(0, gid)) {
+            if (!isNullBlockIdx(block)) {
+                if (isFullGroup(manager_config, gid)) {
+                    saw_paged_block = true;
+                } else {
+                    saw_tail_block = true;
+                }
+            }
+        }
+    }
+    EXPECT_TRUE(saw_paged_block);
+    EXPECT_TRUE(saw_tail_block);
+
+    manager->blockCacheFree(evicted);
+}
+
 TEST_F(KVCacheManagerTest, Init_ReturnTrue_WhenMemoryCacheDisabled) {
     auto          cache_config = makeSimpleMhaCacheConfig(1, 4, 2, rtp_llm::DataType::TYPE_INT8);
     KVCacheConfig kv_cache_config;
@@ -405,28 +1015,23 @@ TEST_F(KVCacheManagerTest, GetKVCacheInfo_MergesDeviceAndMemoryKeys_Dedup) {
     ASSERT_NE(kv_cache_manager->coordinator_, nullptr);
 
     // Seed device block cache with keys: 10, 11, 12 (put makes MRU at front => snapshot order: 12,11,10)
-    auto block_cache = kv_cache_manager->allocator_->getBlockPool()->blockCache();
-    ASSERT_NE(block_cache, nullptr);
+    auto shared_cache = kv_cache_manager->allocator_->sharedBlockCache();
+    ASSERT_NE(shared_cache, nullptr);
     {
-        BlockCache::CacheItem item;
-        item.group_id    = 0;
-        item.is_resident = false;
-        item.cache_key   = 10;
-        item.block_index = 1;
-        ASSERT_TRUE(block_cache->put(item));
-        item.cache_key   = 11;
-        item.block_index = 2;
-        ASSERT_TRUE(block_cache->put(item));
-        item.cache_key   = 12;
-        item.block_index = 3;
-        ASSERT_TRUE(block_cache->put(item));
+        std::vector<BlockIdxType> group_slots(1);
+        group_slots[0] = 1;
+        shared_cache->put(10, group_slots, false);
+        group_slots[0] = 2;
+        shared_cache->put(11, group_slots, false);
+        group_slots[0] = 3;
+        shared_cache->put(12, group_slots, false);
     }
 
     // Inject a lightweight memory connector with a MemoryBlockCache snapshot:
     // put 11 then 13 => MRU order: 13,11 (11 duplicates device key)
     auto mem_connector = std::make_shared<KVCacheMemoryConnector>(
         cache_config, kv_cache_config, kv_cache_manager->allocator_, std::vector<std::string>{});
-    mem_connector->block_cache_ = std::make_shared<MemoryBlockCache>();
+    mem_connector->block_cache_ = std::make_shared<MemoryDiskBlockCache>();
     {
         MemoryBlockCache::CacheItem item;
         item.cache_key   = 11;
@@ -440,7 +1045,7 @@ TEST_F(KVCacheManagerTest, GetKVCacheInfo_MergesDeviceAndMemoryKeys_Dedup) {
     }
     kv_cache_manager->coordinator_->memory_connector_ = mem_connector;
 
-    // latest_version=-1 forces BlockCache snapshot to return all current keys.
+    // latest_version=-1 forces SharedBlockCache snapshot to return all current keys.
     auto info = kv_cache_manager->getKVCacheInfo(/*latest_version=*/-1, /*need_cache_keys=*/true);
 
     // Current implementation uses unordered_set -> assign, so order is not stable.
@@ -452,6 +1057,68 @@ TEST_F(KVCacheManagerTest, GetKVCacheInfo_MergesDeviceAndMemoryKeys_Dedup) {
     EXPECT_EQ(got, expected);
 }
 
+TEST_F(KVCacheManagerTest, GetKVCacheInfo_UsesSmallestHybridPoolTokenCapacity) {
+    auto cache_config = makeDSV4ConfigWithConcurrencyPool(/*full_block_num=*/16, /*swa_batch_size=*/3);
+
+    auto kv_cache_manager = std::make_shared<KVCacheManager>(cache_config);
+    ASSERT_TRUE(kv_cache_manager->init());
+
+    auto hybrid_allocator = std::dynamic_pointer_cast<HybridPoolKVCacheAllocator>(kv_cache_manager->allocator_);
+    ASSERT_NE(hybrid_allocator, nullptr);
+
+    size_t      expected_total_tokens     = std::numeric_limits<size_t>::max();
+    size_t      expected_available_tokens = std::numeric_limits<size_t>::max();
+    const auto& pools                     = hybrid_allocator->groupBlockPools();
+    ASSERT_GT(pools.size(), 1u);
+
+    for (size_t gid = 0; gid < pools.size(); ++gid) {
+        ASSERT_NE(pools[gid], nullptr);
+        const size_t seq_size =
+            (gid < cache_config.group_seq_size_per_block.size() && cache_config.group_seq_size_per_block[gid] > 0) ?
+                cache_config.group_seq_size_per_block[gid] :
+                cache_config.seq_size_per_block;
+        expected_total_tokens     = std::min(expected_total_tokens, pools[gid]->totalBlocksNum() * seq_size);
+        expected_available_tokens = std::min(expected_available_tokens, pools[gid]->availableBlocksNum() * seq_size);
+    }
+
+    auto info = kv_cache_manager->getKVCacheInfo(/*latest_version=*/-1, /*need_cache_keys=*/false);
+
+    EXPECT_EQ(info.total_kv_cache, expected_total_tokens);
+    EXPECT_EQ(info.available_kv_cache, expected_available_tokens);
+    EXPECT_LT(info.total_kv_cache, kv_cache_manager->totalBlocksNum() * cache_config.seq_size_per_block);
+}
+
+TEST_F(KVCacheManagerTest, MaxAvailableTokensNumUsesCPVirtualBlockSizeForHybridPoolFullGroups) {
+    auto cache_config = makeDSV4ConfigWithConcurrencyPool(/*full_block_num=*/16, /*swa_batch_size=*/3);
+
+    auto kv_cache_manager = std::make_shared<KVCacheManager>(cache_config);
+    ASSERT_TRUE(kv_cache_manager->init());
+
+    auto hybrid_allocator = std::dynamic_pointer_cast<HybridPoolKVCacheAllocator>(kv_cache_manager->allocator_);
+    ASSERT_NE(hybrid_allocator, nullptr);
+
+    const size_t physical_capacity = hybrid_allocator->maxAvailableTokensNum();
+    auto cp_slot_mapper =
+        std::make_shared<CPSlotMapper>(/*cp_rank=*/0, /*cp_size=*/2, static_cast<int>(cache_config.seq_size_per_block));
+    kv_cache_manager->cp_slot_mapper_ = cp_slot_mapper;
+    hybrid_allocator->setCPSlotMapper(cp_slot_mapper);
+
+    size_t      expected_logical_capacity = std::numeric_limits<size_t>::max();
+    const auto& pools                     = hybrid_allocator->groupBlockPools();
+    for (size_t gid = 0; gid < pools.size(); ++gid) {
+        if (gid < static_cast<size_t>(cache_config.groupNums())
+            && cache_config.typeForGroup(gid) != CacheGroupType::FULL) {
+            continue;
+        }
+        expected_logical_capacity =
+            std::min(expected_logical_capacity,
+                     pools[gid]->totalBlocksNum() * static_cast<size_t>(cache_config.seq_size_per_block * 2));
+    }
+
+    EXPECT_EQ(kv_cache_manager->maxAvailableTokensNum(), expected_logical_capacity);
+    EXPECT_GT(kv_cache_manager->maxAvailableTokensNum(), physical_capacity);
+}
+
 TEST_F(KVCacheManagerTest, GetKVCacheInfo_IncludesMemoryBlocksInTotalAndAvailable) {
     auto          cache_config = makeSimpleMhaCacheConfig(1, 8, 2, rtp_llm::DataType::TYPE_INT8);
     KVCacheConfig kv_cache_config;
@@ -481,5 +1148,508 @@ TEST_F(KVCacheManagerTest, GetKVCacheInfo_IncludesMemoryBlocksInTotalAndAvailabl
     EXPECT_GE(info.available_kv_cache, device_only_available);
 }
 
+TEST_F(KVCacheManagerTest, DSV4EvictionTriggeredWhenPoolExhaustedByCache) {
+    // This test verifies that when block pools are exhausted by cached (but freed) requests,
+    // a new allocation correctly triggers LRU eviction from each group's independent BlockCache.
+    //
+    // Setup: block_num=8 → 7 usable blocks per group (block 0 reserved).
+    // Request seq_len = 3*spb. FULL groups allocate 3 blocks. Reusable SWA groups allocate
+    // linear-step blocks (step=1 here, so all 3), while HCA_STATE keeps only its active tail block.
+    // insertIntoCache drops the active tail slot, so each completed request caches:
+    //   FULL groups: 2 blocks per group
+    //   SWA/state groups: fixed-window cached blocks; HCA_STATE skips reuse.
+    //
+    // After 3 requests are cached and request-freed:
+    //   FULL groups (0,1,2): 6 blocks cached, 1 free → new request needs 3, triggers eviction
+    //   SWA/state groups (3,4,5,6): reusable groups may also evict under their independent pools.
+    //
+    // The fourth allocation MUST succeed via eviction on FULL groups.
+    auto manager_config = makeCompactDSV4ManagerConfig(/*block_num=*/8);
+    auto manager        = std::make_shared<KVCacheManager>(manager_config, /*warmup=*/false);
+    ASSERT_TRUE(manager->init());
+
+    const int    spb         = static_cast<int>(manager_config.seq_size_per_block);
+    const int    seq_len     = 3 * spb;
+    const size_t free_before = manager->freeBlocksNum();
+    // 7 groups × 7 usable blocks = 49 total free.
+    EXPECT_EQ(free_before, 7u * 7u);
+
+    // Helper: create CompleteTokenIds with a token-value offset so each request gets unique cache keys.
+    auto makeTokens = [&](int offset) {
+        auto input_ids      = torch::arange(offset, offset + seq_len, torch::kInt32);
+        auto gi             = std::make_shared<GenerateInput>();
+        gi->input_ids       = input_ids;
+        gi->generate_config = std::make_shared<GenerateConfig>();
+        auto cti            = std::make_shared<CompleteTokenIds>(1, 1, seq_len + 16, spb);
+        cti->init(gi);
+        cti->setSeqLength(seq_len);
+        return cti;
+    };
+
+    // --- Request A: allocate, cache, free request reference ---
+    auto       res_a    = makeDSV4BatchResource(manager_config);
+    auto       tokens_a = makeTokens(/*offset=*/0);
+    MallocInfo malloc_a{res_a, tokens_a};
+    malloc_a.reuse_cache         = true;
+    malloc_a.enable_device_cache = false;
+    ASSERT_TRUE(manager->malloc(malloc_a).success);
+
+    InsertInfo insert_a{res_a, tokens_a, /*is_resident=*/false};
+    manager->insertIntoCache(insert_a);
+    FreeInfo free_a{res_a, tokens_a};
+    manager->free(free_a);
+
+    const size_t free_after_a = manager->freeBlocksNum();
+    EXPECT_LT(free_after_a, free_before);
+
+    // --- Request B: different tokens → different cache keys ---
+    auto       res_b    = makeDSV4BatchResource(manager_config);
+    auto       tokens_b = makeTokens(/*offset=*/10000);
+    MallocInfo malloc_b{res_b, tokens_b};
+    malloc_b.reuse_cache         = true;
+    malloc_b.enable_device_cache = false;
+    ASSERT_TRUE(manager->malloc(malloc_b).success);
+
+    InsertInfo insert_b{res_b, tokens_b, /*is_resident=*/false};
+    manager->insertIntoCache(insert_b);
+    FreeInfo free_b{res_b, tokens_b};
+    manager->free(free_b);
+
+    const size_t free_after_b = manager->freeBlocksNum();
+    EXPECT_LT(free_after_b, free_after_a);
+
+    // --- Request C: still fits, but leaves FULL groups with only one free block ---
+    auto       res_c    = makeDSV4BatchResource(manager_config);
+    auto       tokens_c = makeTokens(/*offset=*/20000);
+    MallocInfo malloc_c{res_c, tokens_c};
+    malloc_c.reuse_cache         = true;
+    malloc_c.enable_device_cache = false;
+    ASSERT_TRUE(manager->malloc(malloc_c).success);
+
+    InsertInfo insert_c{res_c, tokens_c, /*is_resident=*/false};
+    manager->insertIntoCache(insert_c);
+    FreeInfo free_c{res_c, tokens_c};
+    manager->free(free_c);
+
+    const size_t free_after_c = manager->freeBlocksNum();
+    EXPECT_LE(free_after_c, free_after_b);
+
+    // --- Request D: triggers eviction on FULL groups ---
+    auto       res_d    = makeDSV4BatchResource(manager_config);
+    auto       tokens_d = makeTokens(/*offset=*/30000);
+    MallocInfo malloc_d{res_d, tokens_d};
+    malloc_d.reuse_cache         = true;
+    malloc_d.enable_device_cache = false;
+
+    // This allocation MUST succeed — FULL groups trigger ensureFreeBlocks → evict from cache.
+    auto result_d = manager->malloc(malloc_d);
+    ASSERT_TRUE(result_d.success) << "Fourth allocation should succeed via eviction";
+
+    // Verify block structure for request D.
+    ASSERT_EQ(res_d->groupNums(), kDsv4PoolNum);
+    for (int gid = 0; gid < kDsv4PoolNum; ++gid) {
+        ASSERT_EQ(res_d->blocksNum(0, gid), 3) << "group " << gid;
+        const auto& blocks = res_d->blocks(0, gid);
+        if (isFullGroup(manager_config, gid)) {
+            for (int i = 0; i < 3; ++i) {
+                EXPECT_FALSE(isNullBlockIdx(blocks[i])) << "FULL group " << gid << " pos " << i;
+            }
+        } else {
+            expectDsv4SwaAllocatedBlocks(manager_config, blocks, gid, "fixed group", /*enable_reuse_cache=*/true);
+        }
+    }
+
+    EXPECT_LE(manager->freeBlocksNum(), free_after_c) << "Pool should be tighter after D allocated";
+
+    // --- Free D and verify blocks return to pool ---
+    FreeInfo free_d{res_d, tokens_d};
+    manager->free(free_d);
+
+    // After freeing D, its blocks (request_ref→0, cache_ref=0 since we did not insert D into cache)
+    // return to the free pool.
+    // But cached blocks from eviction of A are fully freed (both refs=0) so they also count.
+    // Expect freeBlocksNum >= free_after_c (at least as good as before D was allocated).
+    EXPECT_GE(manager->freeBlocksNum(), free_after_c);
+
+    // --- Pop all remaining cached blocks and verify full pool recovery ---
+    auto evicted = manager->popBlocksFromCache(/*min_blocks_to_free=*/100);
+    if (evicted) {
+        manager->blockCacheFree(evicted);
+    }
+    EXPECT_EQ(manager->freeBlocksNum(), free_before);
+}
+
+TEST_F(KVCacheManagerTest, DSV4MaxConcurrencyOneReuseOneBlockAndAllocTwoTailBlocks) {
+    auto manager_config =
+        makeProductionDSV4Config(/*full_block_num=*/8, /*max_concurrency=*/1, /*hca_state_pool_blocks=*/12);
+    ASSERT_EQ(manager_config.groupBlockNumsSnapshot().size(), static_cast<size_t>(kDsv4PoolNum));
+    for (int gid : dsv4FixedTailGroupIds(manager_config)) {
+        const uint32_t expected = isHcaStateGroup(manager_config, gid) ? 12u : 8u;
+        ASSERT_EQ(manager_config.blockNumForGroup(static_cast<size_t>(gid)), expected) << "group " << gid;
+    }
+
+    auto manager = std::make_shared<KVCacheManager>(manager_config, /*warmup=*/false);
+    ASSERT_TRUE(manager->init());
+
+    const size_t free_before = manager->freeBlocksNum();
+    EXPECT_EQ(free_before, 6u * 7u + 11u);
+    const int spb = static_cast<int>(manager_config.seq_size_per_block);
+
+    auto makeTokens = [&](int seq_len) {
+        auto input_ids      = torch::arange(0, seq_len, torch::kInt32);
+        auto gi             = std::make_shared<GenerateInput>();
+        gi->input_ids       = input_ids;
+        gi->generate_config = std::make_shared<GenerateConfig>();
+        auto cti            = std::make_shared<CompleteTokenIds>(1, 1, /*max_seq_len=*/4 * spb, spb);
+        cti->init(gi);
+        cti->setSeqLength(seq_len);
+        return cti;
+    };
+
+    // Seed one reusable SWA/state block per independent pool. For a 2-block request,
+    // insertIntoCache keeps only the first full block; the active tail is not cached.
+    auto       seed_res    = makeDSV4BatchResource(manager_config);
+    auto       seed_tokens = makeTokens(2 * spb);
+    MallocInfo seed_malloc{seed_res, seed_tokens};
+    seed_malloc.reuse_cache         = false;
+    seed_malloc.enable_device_cache = false;
+    ASSERT_TRUE(manager->malloc(seed_malloc).success);
+
+    for (int gid : dsv4FixedTailGroupIds(manager_config)) {
+        ASSERT_EQ(seed_res->blocksNum(0, gid), 2) << "seed group " << gid;
+        expectDsv4SwaAllocatedBlocks(manager_config, seed_res->blocks(0, gid), gid, "seed group");
+    }
+
+    manager->insertIntoCache(InsertInfo{seed_res, seed_tokens, /*is_resident=*/false});
+    manager->free(FreeInfo{seed_res, seed_tokens});
+
+    // Same prefix, one more block. This hits one cached independent-pool block and
+    // must still have room for the two fresh tail blocks.  The matched block is
+    // then skipped out of the active SWA tail by the decode allocation path.
+    auto       reuse_res    = makeDSV4BatchResource(manager_config);
+    auto       reuse_tokens = makeTokens(3 * spb);
+    MallocInfo reuse_malloc{reuse_res, reuse_tokens};
+    reuse_malloc.reuse_cache         = true;
+    reuse_malloc.enable_device_cache = true;
+    auto reuse_result                = manager->malloc(reuse_malloc);
+    ASSERT_TRUE(reuse_result.success);
+    EXPECT_EQ(reuse_result.reuse_len, 2 * spb);
+
+    for (int gid : dsv4FixedTailGroupIds(manager_config)) {
+        if (manager_config.policyForGroup(static_cast<size_t>(gid)).reuse_policy == CacheReusePolicy::NON_REUSABLE) {
+            continue;
+        }
+        const auto& blocks = reuse_res->blocks(0, gid);
+        ASSERT_EQ(blocks.size(), 3u) << "reuse group " << gid;
+        EXPECT_TRUE(isNullBlockIdx(blocks[0])) << "reuse group " << gid << " skipped reused prefix";
+        EXPECT_FALSE(isNullBlockIdx(blocks[1])) << "reuse group " << gid << " tail block 1";
+        EXPECT_FALSE(isNullBlockIdx(blocks[2])) << "reuse group " << gid << " tail block 2";
+    }
+
+    manager->free(FreeInfo{reuse_res, reuse_tokens});
+    auto evicted = manager->popBlocksFromCache(/*min_blocks_to_free=*/100);
+    if (evicted) {
+        manager->blockCacheFree(evicted);
+    }
+    EXPECT_EQ(manager->freeBlocksNum(), free_before);
+}
+
+TEST_F(KVCacheManagerTest, DSV4EvictionOnSWAGroupsDuringInferenceWithDecodeContinuation) {
+    // This test simulates full DSV4 inference including SWA group eviction.
+    //
+    // Tight stress layout:
+    //   FULL groups (0,1,2): large paged pool (block_num=8, 7 usable)
+    //   SWA  groups (3,4,5,6): small independent pool with 3 usable blocks
+    //
+    // SWA pools are sized by concurrency, NOT by global block_num. This test verifies that
+    // eviction is triggered independently on SWA groups when concurrent requests exhaust
+    // the independent pool, and that decode-phase removeSkippedBlocks interacts correctly with eviction.
+    //
+    // Lifecycle:
+    //   Phase 1: 2 requests complete and get cached → SWA pools nearly full (2 of 3 cached)
+    //   Phase 2: 3rd request triggers eviction on SWA groups
+    //   Phase 3: Decode-phase incrKVBlock triggers further FULL/SWA eviction + removeSkippedBlocks
+    //   Phase 4: Free and verify pool recovery
+    auto manager_config = makeDSV4ConfigWithConcurrencyPool(/*full_block_num=*/8, /*swa_batch_size=*/4);
+    auto manager        = std::make_shared<KVCacheManager>(manager_config, /*warmup=*/false);
+    ASSERT_TRUE(manager->init());
+
+    const int spb     = static_cast<int>(manager_config.seq_size_per_block);
+    const int seq_len = 3 * spb;
+
+    // Verify differentiated pool sizes.
+    const size_t free_before = manager->freeBlocksNum();
+    EXPECT_EQ(free_before, 3u * 7u + 4u * 7u);
+
+    // Helper: create tokens with unique offset for distinct cache keys.
+    auto makeTokens = [&](int offset) {
+        auto input_ids      = torch::arange(offset, offset + seq_len, torch::kInt32);
+        auto gi             = std::make_shared<GenerateInput>();
+        gi->input_ids       = input_ids;
+        gi->generate_config = std::make_shared<GenerateConfig>();
+        auto cti            = std::make_shared<CompleteTokenIds>(1, 1, /*max_seq_len=*/10 * spb, spb);
+        cti->init(gi);
+        cti->setSeqLength(seq_len);
+        return cti;
+    };
+
+    // === Phase 1: Fill caches with 2 completed requests ===
+    auto       res_a    = makeDSV4BatchResource(manager_config);
+    auto       tokens_a = makeTokens(/*offset=*/0);
+    MallocInfo malloc_a{res_a, tokens_a};
+    malloc_a.reuse_cache         = true;
+    malloc_a.enable_device_cache = false;
+    ASSERT_TRUE(manager->malloc(malloc_a).success);
+    InsertInfo insert_a{res_a, tokens_a, /*is_resident=*/false};
+    manager->insertIntoCache(insert_a);
+    manager->free(FreeInfo{res_a, tokens_a});
+
+    auto       res_b    = makeDSV4BatchResource(manager_config);
+    auto       tokens_b = makeTokens(/*offset=*/10000);
+    MallocInfo malloc_b{res_b, tokens_b};
+    malloc_b.reuse_cache         = true;
+    malloc_b.enable_device_cache = false;
+    ASSERT_TRUE(manager->malloc(malloc_b).success);
+    InsertInfo insert_b{res_b, tokens_b, /*is_resident=*/false};
+    manager->insertIntoCache(insert_b);
+    manager->free(FreeInfo{res_b, tokens_b});
+
+    const size_t free_after_cache = manager->freeBlocksNum();
+    EXPECT_LT(free_after_cache, free_before);
+
+    // === Phase 2: 3rd request triggers eviction on SWA groups ===
+    auto       res_c    = makeDSV4BatchResource(manager_config);
+    auto       tokens_c = makeTokens(/*offset=*/20000);
+    MallocInfo malloc_c{res_c, tokens_c};
+    malloc_c.reuse_cache         = true;
+    malloc_c.enable_device_cache = false;
+
+    // FULL needs 3, has exactly 3 free → no FULL eviction yet.
+    // SWA needs 2, only 1 free → ensureFreeBlocks evicts 1 from SWA cache.
+    auto result_c = manager->malloc(malloc_c);
+    ASSERT_TRUE(result_c.success) << "3rd allocation must succeed via SWA eviction";
+
+    // Verify block structure.
+    ASSERT_EQ(res_c->groupNums(), kDsv4PoolNum);
+    for (int gid = 0; gid < kDsv4PoolNum; ++gid) {
+        ASSERT_EQ(res_c->blocksNum(0, gid), 3) << "group " << gid;
+        const auto& blocks = res_c->blocks(0, gid);
+        if (isFullGroup(manager_config, gid)) {
+            for (int i = 0; i < 3; ++i) {
+                EXPECT_FALSE(isNullBlockIdx(blocks[i])) << "FULL group " << gid << " pos " << i;
+            }
+        } else {
+            expectDsv4SwaAllocatedBlocks(manager_config, blocks, gid, "SWA group", /*enable_reuse_cache=*/true);
+        }
+    }
+
+    // === Phase 3: Decode incrKVBlock → SWA removeSkippedBlocks + further SWA eviction ===
+
+    // --- Incr to 4*spb ---
+    // Non-HCA SWA state starts from the reusable linear-step allocation and then keeps the active tail window.
+    // HCA_STATE skips reuse and keeps only its active tail block.
+    // FULL pool after Phase 2: 4 cached + 3 request = 7 used, 0 free → ensureFreeBlocks evicts 1.
+    tokens_c->setSeqLength(4 * spb);
+    MallocInfo incr1{res_c, tokens_c};
+    incr1.reuse_cache         = false;
+    incr1.enable_device_cache = false;
+    ASSERT_TRUE(manager->malloc(incr1).success) << "First incr must succeed via eviction";
+
+    for (int gid = 0; gid < kDsv4PoolNum; ++gid) {
+        ASSERT_EQ(res_c->blocksNum(0, gid), 4) << "group " << gid << " after incr to 4*spb";
+    }
+    // SWA/state fixed groups retain the current tail window.
+    for (int gid : dsv4FixedTailGroupIds(manager_config)) {
+        expectDsv4SwaAllocatedBlocks(manager_config, res_c->blocks(0, gid), gid, "SWA group");
+    }
+
+    // --- Incr to 5*spb ---
+    // Non-HCA SWA removes blocks before the active two-block tail; HCA_STATE keeps a one-block tail.
+    // SWA pools may need another eviction if no free block remains.
+    tokens_c->setSeqLength(5 * spb);
+    MallocInfo incr2{res_c, tokens_c};
+    incr2.reuse_cache         = false;
+    incr2.enable_device_cache = false;
+    ASSERT_TRUE(manager->malloc(incr2).success) << "Second incr must succeed (removeSkipped frees block)";
+
+    for (int gid = 0; gid < kDsv4PoolNum; ++gid) {
+        ASSERT_EQ(res_c->blocksNum(0, gid), 5) << "group " << gid << " after incr to 5*spb";
+    }
+    // SWA/state fixed groups keep only the active tail window.
+    for (int gid : dsv4FixedTailGroupIds(manager_config)) {
+        expectDsv4SwaAllocatedBlocks(manager_config, res_c->blocks(0, gid), gid, "SWA group");
+    }
+
+    // === Phase 4: Free all and verify full pool recovery ===
+    manager->free(FreeInfo{res_c, tokens_c});
+
+    // Pop remaining cached blocks to restore pool.
+    auto evicted = manager->popBlocksFromCache(/*min_blocks_to_free=*/100);
+    if (evicted) {
+        manager->blockCacheFree(evicted);
+    }
+    EXPECT_EQ(manager->freeBlocksNum(), free_before);
+}
+TEST_F(KVCacheManagerTest, DSV4InitThenIncrWithRemoveSkippedBlocksFullLifecycle) {
+    // This test exercises the full lifecycle of a DSV4 request:
+    //   1. initKVBlock (first malloc with 4 blocks)
+    //   2. Multiple incrKVBlock calls (decode phase) that trigger removeSkippedBlocks
+    //   3. Verify SWA groups free old non-tail blocks during incr
+    //   4. Final free returns all blocks to pool
+    auto manager_config = makeCompactDSV4ManagerConfig(/*block_num=*/32);
+    auto manager        = std::make_shared<KVCacheManager>(manager_config, /*warmup=*/false);
+    ASSERT_TRUE(manager->init());
+
+    const size_t free_before = manager->freeBlocksNum();
+    const int    spb         = static_cast<int>(manager_config.seq_size_per_block);
+    auto         resource    = makeDSV4BatchResource(manager_config);
+
+    // --- Phase 1: initKVBlock with 4 blocks (simulates prefill completion) ---
+    const int init_seq_len = 4 * spb;
+    auto      tokens       = makeDSV4CompleteTokenIds(init_seq_len, /*max_seq_len=*/10 * spb, spb);
+
+    MallocInfo init_info{resource, tokens};
+    init_info.reuse_cache         = false;
+    init_info.enable_device_cache = false;
+    auto init_result              = manager->malloc(init_info);
+    ASSERT_TRUE(init_result.success);
+    ASSERT_EQ(resource->groupNums(), kDsv4PoolNum);
+
+    // After init: FULL groups (0,1,2) have 4 real blocks each.
+    //             SWA groups keep the active tail window; HCA_STATE keeps a one-block tail.
+    for (int gid = 0; gid < kDsv4PoolNum; ++gid) {
+        ASSERT_EQ(resource->blocksNum(0, gid), 4) << "group " << gid;
+        const auto& blocks = resource->blocks(0, gid);
+        if (isFullGroup(manager_config, gid)) {
+            for (int i = 0; i < 4; ++i) {
+                EXPECT_FALSE(isNullBlockIdx(blocks[i])) << "FULL group " << gid << " pos " << i;
+            }
+        } else {
+            expectDsv4SwaAllocatedBlocks(manager_config, blocks, gid, "SWA group");
+        }
+    }
+
+    // Record block IDs allocated after init for later validation.
+    std::vector<BlockIndicesType> init_blocks(kDsv4PoolNum);
+    for (int gid = 0; gid < kDsv4PoolNum; ++gid) {
+        init_blocks[gid] = resource->blocks(0, gid);
+    }
+    const size_t free_after_init = manager->freeBlocksNum();
+
+    // --- Phase 2: First incrKVBlock (4 → 5 blocks) ---
+    // removeSkippedBlocks on SWA groups: [NULL, NULL, A, B] → keep_begin=2, loop i=0..1 both NULL → no free.
+    // Then allocate 1 new block per group.
+    tokens->setSeqLength(5 * spb);
+    MallocInfo incr1_info{resource, tokens};
+    incr1_info.reuse_cache         = false;
+    incr1_info.enable_device_cache = false;
+    ASSERT_TRUE(manager->malloc(incr1_info).success);
+
+    for (int gid = 0; gid < kDsv4PoolNum; ++gid) {
+        ASSERT_EQ(resource->blocksNum(0, gid), 5) << "group " << gid << " after incr1";
+    }
+    // FULL groups: all 5 blocks should be real.
+    for (int gid : dsv4GroupIdsByType(manager_config, CacheGroupType::FULL)) {
+        const auto& blocks = resource->blocks(0, gid);
+        for (int i = 0; i < 5; ++i) {
+            EXPECT_FALSE(isNullBlockIdx(blocks[i])) << "FULL group " << gid << " pos " << i << " after incr1";
+        }
+        // Original init blocks should be preserved.
+        for (int i = 0; i < 4; ++i) {
+            EXPECT_EQ(blocks[i], init_blocks[gid][i]) << "FULL group " << gid << " pos " << i << " changed";
+        }
+    }
+    // SWA/state fixed groups keep the current tail window.
+    for (int gid : dsv4FixedTailGroupIds(manager_config)) {
+        const auto& blocks = resource->blocks(0, gid);
+        expectDsv4SwaAllocatedBlocks(manager_config, blocks, gid, "SWA group after incr1");
+        if (!isHcaStateGroup(manager_config, gid)) {
+            EXPECT_EQ(blocks[3], init_blocks[gid][3]) << "SWA group " << gid << " old tail pos 3";
+        }
+    }
+
+    // Four fixed groups freed one stale block and all seven groups allocated one new block.
+    EXPECT_EQ(manager->freeBlocksNum(), free_after_init - 7 + 4);
+    const size_t free_after_incr1 = manager->freeBlocksNum();
+
+    // Record SWA tail blocks after incr1 for the next step.
+    std::vector<BlockIdxType> swa_new_C(static_cast<size_t>(manager_config.groupNums()), NULL_BLOCK_IDX);
+    for (int gid : dsv4FixedTailGroupIds(manager_config)) {
+        swa_new_C[static_cast<size_t>(gid)] = resource->blocks(0, gid)[4];
+    }
+
+    // --- Phase 3: Second incrKVBlock (5 → 6 blocks) — triggers removeSkippedBlocks ---
+    // SWA removeSkippedBlocks on [NULL, NULL, A, B, C] (size=5): keep_begin = 5-2 = 3.
+    //   Loop i=0: NULL → skip.
+    //   Loop i=1: NULL → skip.
+    //   Loop i=2: A (real block) → FREE it, set to NULL.
+    // After remove: [NULL, NULL, NULL, B, C]
+    // Then malloc allocates 1 new block D → [NULL, NULL, NULL, B, C, D]
+    tokens->setSeqLength(6 * spb);
+    MallocInfo incr2_info{resource, tokens};
+    incr2_info.reuse_cache         = false;
+    incr2_info.enable_device_cache = false;
+    ASSERT_TRUE(manager->malloc(incr2_info).success);
+
+    for (int gid = 0; gid < kDsv4PoolNum; ++gid) {
+        ASSERT_EQ(resource->blocksNum(0, gid), 6) << "group " << gid << " after incr2";
+    }
+
+    // FULL groups: all 6 blocks real, first 4 unchanged.
+    for (int gid : dsv4GroupIdsByType(manager_config, CacheGroupType::FULL)) {
+        const auto& blocks = resource->blocks(0, gid);
+        for (int i = 0; i < 6; ++i) {
+            EXPECT_FALSE(isNullBlockIdx(blocks[i])) << "FULL group " << gid << " pos " << i << " after incr2";
+        }
+        for (int i = 0; i < 4; ++i) {
+            EXPECT_EQ(blocks[i], init_blocks[gid][i]) << "FULL group " << gid << " init block preserved";
+        }
+    }
+
+    // SWA/state fixed groups after incr2 keep their configured active tail window.
+    for (int gid : dsv4FixedTailGroupIds(manager_config)) {
+        const auto& blocks = resource->blocks(0, gid);
+        expectDsv4SwaAllocatedBlocks(manager_config, blocks, gid, "SWA group after incr2");
+        if (!isHcaStateGroup(manager_config, gid)) {
+            EXPECT_EQ(blocks[4], swa_new_C[static_cast<size_t>(gid)]) << "SWA group " << gid << " pos 4 = old C";
+        }
+    }
+
+    // Free block accounting: SWA freed 1 block per SWA group (4 groups) at removeSkippedBlocks,
+    // then allocated 1 new block per group (7 groups). Net change: -7 + 4 = -3.
+    EXPECT_EQ(manager->freeBlocksNum(), free_after_incr1 - 7 + 4);
+    const size_t free_after_incr2 = manager->freeBlocksNum();
+
+    // --- Phase 4: Third incrKVBlock (6 → 7 blocks) — triggers another removeSkippedBlocks ---
+    // SWA removeSkippedBlocks on [NULL, NULL, NULL, B, C, D] (size=6): keep_begin = 6-2 = 4.
+    //   Loop i=0..2: all NULL → skip.
+    //   Loop i=3: B (real block) → FREE it, set to NULL.
+    // After remove: [NULL, NULL, NULL, NULL, C, D]
+    // Then malloc allocates 1 new block E → [NULL, NULL, NULL, NULL, C, D, E]
+    tokens->setSeqLength(7 * spb);
+    MallocInfo incr3_info{resource, tokens};
+    incr3_info.reuse_cache         = false;
+    incr3_info.enable_device_cache = false;
+    ASSERT_TRUE(manager->malloc(incr3_info).success);
+
+    for (int gid = 0; gid < kDsv4PoolNum; ++gid) {
+        ASSERT_EQ(resource->blocksNum(0, gid), 7) << "group " << gid << " after incr3";
+    }
+
+    // SWA/state fixed groups after incr3 keep their configured active tail window.
+    for (int gid : dsv4FixedTailGroupIds(manager_config)) {
+        const auto& blocks = resource->blocks(0, gid);
+        expectDsv4SwaAllocatedBlocks(manager_config, blocks, gid, "SWA group after incr3");
+    }
+
+    // SWA freed 1 block per SWA group (4) and allocated 1 per all groups (7). Net: -7+4 = -3.
+    EXPECT_EQ(manager->freeBlocksNum(), free_after_incr2 - 7 + 4);
+
+    // --- Phase 5: Free all — all blocks should return to pool ---
+    FreeInfo free_info{resource, tokens};
+    manager->free(free_info);
+    EXPECT_EQ(manager->freeBlocksNum(), free_before);
+}
+
 }  // namespace test
 }  // namespace rtp_llm
diff --git a/rtp_llm/cpp/cache/test/KVCacheResourceLocalCacheKeysTest.cc b/rtp_llm/cpp/cache/test/KVCacheResourceLocalCacheKeysTest.cc
new file mode 100644
index 0000000000..634021e130
--- /dev/null
+++ b/rtp_llm/cpp/cache/test/KVCacheResourceLocalCacheKeysTest.cc
@@ -0,0 +1,75 @@
+#include <gtest/gtest.h>
+
+#include "rtp_llm/cpp/cache/KVCacheResource.h"
+
+namespace rtp_llm {
+namespace test {
+
+class LocalCacheKeysTest: public ::testing::Test {
+protected:
+    KVCacheResource make(const CacheKeysType& keys) {
+        KVCacheResource r;
+        r.cacheKeys() = keys;
+        return r;
+    }
+};
+
+TEST_F(LocalCacheKeysTest, CpSize1Passthrough) {
+    auto r   = make({10, 20, 30, 40});
+    auto out = r.localCacheKeys(0, 1);
+    ASSERT_EQ(out.size(), 4u);
+    EXPECT_EQ(out[0], 10);
+    EXPECT_EQ(out[1], 20);
+    EXPECT_EQ(out[2], 30);
+    EXPECT_EQ(out[3], 40);
+}
+
+TEST_F(LocalCacheKeysTest, CpSize2EvenLengthLastRank) {
+    auto r = make({100, 101, 200, 201, 300, 301, 400, 401});
+    // last-rank stride: rank=1, size=2 → idx 1,3,5,7
+    auto out = r.localCacheKeys(1, 2);
+    ASSERT_EQ(out.size(), 4u);
+    EXPECT_EQ(out[0], 101);
+    EXPECT_EQ(out[1], 201);
+    EXPECT_EQ(out[2], 301);
+    EXPECT_EQ(out[3], 401);
+}
+
+TEST_F(LocalCacheKeysTest, CpSize2Rank0) {
+    auto r   = make({100, 101, 200, 201});
+    auto out = r.localCacheKeys(0, 2);
+    ASSERT_EQ(out.size(), 2u);
+    EXPECT_EQ(out[0], 100);
+    EXPECT_EQ(out[1], 200);
+}
+
+TEST_F(LocalCacheKeysTest, CpSize4NonDivisibleLastRankShorter) {
+    // 10 keys, cp_size=4 → last-rank (3) takes idx 3, 7 → length 2 (vs blocks=ceil(10/4)=3)
+    auto r   = make({0, 1, 2, 3, 4, 5, 6, 7, 8, 9});
+    auto out = r.localCacheKeys(3, 4);
+    ASSERT_EQ(out.size(), 2u);
+    EXPECT_EQ(out[0], 3);
+    EXPECT_EQ(out[1], 7);
+}
+
+TEST_F(LocalCacheKeysTest, EmptyKeys) {
+    auto r   = make({});
+    auto out = r.localCacheKeys(0, 4);
+    EXPECT_TRUE(out.empty());
+}
+
+TEST_F(LocalCacheKeysTest, KeysShorterThanCpSizeReturnsEmptyForLastRank) {
+    auto r   = make({42});
+    auto out = r.localCacheKeys(3, 4);  // last-rank starts at idx 3, but only 1 key
+    EXPECT_TRUE(out.empty());
+}
+
+TEST_F(LocalCacheKeysTest, KeysShorterThanCpSizeRank0HasOne) {
+    auto r   = make({42});
+    auto out = r.localCacheKeys(0, 4);
+    ASSERT_EQ(out.size(), 1u);
+    EXPECT_EQ(out[0], 42);
+}
+
+}  // namespace test
+}  // namespace rtp_llm
diff --git a/rtp_llm/cpp/cache/test/KVCacheResourceTest.cc b/rtp_llm/cpp/cache/test/KVCacheResourceTest.cc
index 283461ddbf..d4b57012d4 100644
--- a/rtp_llm/cpp/cache/test/KVCacheResourceTest.cc
+++ b/rtp_llm/cpp/cache/test/KVCacheResourceTest.cc
@@ -59,7 +59,7 @@ TEST(KVCacheResourceTest, InitGroups_RespectsGroupTypesAndBlocksPerKvBlock) {
     KVCacheResource resource;
     resource.initGroups(/*group_num=*/2,
                         /*layer_num=*/3,
-                        /*layer_to_group_id=*/{0, 1, 0},
+                        /*layer_group_ids=*/{{0}, {1}, {0}},
                         /*kernel_blocks_per_kv_block=*/4,
                         /*group_types=*/{CacheGroupType::FULL, CacheGroupType::LINEAR});
 
@@ -82,6 +82,38 @@ TEST(KVCacheResourceTest, InitGroups_RespectsGroupTypesAndBlocksPerKvBlock) {
     ASSERT_EQ(resource.kernelBlocks(1), (BlockIndicesType{1}));
 }
 
+TEST(KVCacheResourceTest, CacheKeysMaintainLinearDependencies) {
+    KVCacheResource resource;
+    resource.setCacheKeys(CacheKeysType{10, 20, 30});
+
+    ASSERT_EQ(resource.blockDependencies().size(), 3u);
+    EXPECT_FALSE(resource.blockDependencies()[0].has_parent);
+    EXPECT_EQ(resource.blockDependencies()[0].ordinal, 0u);
+    EXPECT_TRUE(resource.blockDependencies()[1].has_parent);
+    EXPECT_EQ(resource.blockDependencies()[1].parent_key, 10);
+    EXPECT_EQ(resource.blockDependencies()[1].ordinal, 1u);
+    EXPECT_TRUE(resource.blockDependencies()[2].has_parent);
+    EXPECT_EQ(resource.blockDependencies()[2].parent_key, 20);
+    EXPECT_EQ(resource.blockDependencies()[2].ordinal, 2u);
+
+    BlockDependenciesType custom = {
+        BlockDependency{false, 0, 7},
+        BlockDependency{true, 100, 8},
+    };
+    resource.setCacheKeys(CacheKeysType{100, 200});
+    resource.setBlockDependencies(custom);
+    resource.ensureLinearBlockDependencies();
+    ASSERT_EQ(resource.blockDependencies().size(), 2u);
+    EXPECT_EQ(resource.blockDependencies()[0].ordinal, 7u);
+    EXPECT_EQ(resource.blockDependencies()[1].parent_key, 100);
+
+    resource.cacheKeys().push_back(300);
+    resource.ensureLinearBlockDependencies();
+    ASSERT_EQ(resource.blockDependencies().size(), 3u);
+    EXPECT_EQ(resource.blockDependencies()[2].parent_key, 200);
+    EXPECT_EQ(resource.blockDependencies()[2].ordinal, 2u);
+}
+
 TEST(CacheConfigTest, KernelBlocksPerKvBlockSafeByDefault) {
     CacheConfig config;
     config.seq_size_per_block        = 1;
@@ -98,7 +130,7 @@ TEST(BatchKVCacheResourceTest, BasicBatchOperations_WorkAsExpected) {
     batch.resetBatchSize(2);
     batch.initGroups(/*group_nums=*/2,
                      /*layer_num=*/3,
-                     /*layer_to_group_id=*/{0, 1, 0},
+                     /*layer_group_ids=*/{{0}, {1}, {0}},
                      /*kernel_blocks_per_kv_block=*/4,
                      /*group_types=*/{CacheGroupType::FULL, CacheGroupType::LINEAR});
 
@@ -139,7 +171,7 @@ TEST(BatchKVCacheResourceTest, BasicBatchOperations_WorkAsExpected) {
     KVCacheResource moved;
     moved.initGroups(/*group_num=*/1,
                      /*layer_num=*/1,
-                     /*layer_to_group_id=*/{0},
+                     /*layer_group_ids=*/{{0}},
                      /*kernel_blocks_per_kv_block=*/2,
                      /*group_types=*/{CacheGroupType::FULL});
     moved.mutableBlockIds(0).add(BlockIndicesType{3});
diff --git a/rtp_llm/cpp/cache/test/LinearKVCacheGroupTest.cc b/rtp_llm/cpp/cache/test/LinearKVCacheGroupTest.cc
index 4a1db05b18..ba2207f585 100644
--- a/rtp_llm/cpp/cache/test/LinearKVCacheGroupTest.cc
+++ b/rtp_llm/cpp/cache/test/LinearKVCacheGroupTest.cc
@@ -3,8 +3,8 @@
 #include <memory>
 #include <vector>
 
-#include "rtp_llm/cpp/cache/BlockCache.h"
-#include "rtp_llm/cpp/cache/LinearKVCacheGroup.h"
+#include "rtp_llm/cpp/cache/SharedBlockCache.h"
+#include "rtp_llm/cpp/cache/group/LinearKVCacheGroup.h"
 #include "rtp_llm/cpp/cache/test/BlockPoolTestHelper.h"
 
 namespace rtp_llm {
@@ -14,7 +14,6 @@ static std::shared_ptr<LinearKVCacheSpec> makeLinearSpec(uint32_t seq_size_per_b
     auto spec                = std::make_shared<LinearKVCacheSpec>();
     spec->type               = KVCacheSpecType::LinearAttention;
     spec->dtype              = rtp_llm::DataType::TYPE_FP16;
-    spec->layer_num          = 2;
     spec->local_num_k_heads  = 1;
     spec->local_num_v_heads  = 1;
     spec->head_k_dim         = 1;
@@ -27,7 +26,24 @@ static std::shared_ptr<LinearKVCacheSpec> makeLinearSpec(uint32_t seq_size_per_b
 
 class LinearKVCacheGroupTest: public ::testing::Test {};
 
-TEST_F(LinearKVCacheGroupTest, GetNeedBlocksReuseDisabledCountsOnlyReserveStep) {
+TEST_F(LinearKVCacheGroupTest, DefaultPolicyDrivesBehaviorInterfaces) {
+    auto block_pool = createBlockPool();
+    ASSERT_TRUE(block_pool->init());
+
+    auto               spec = makeLinearSpec(/*seq_size_per_block=*/4);
+    LinearKVCacheGroup group(/*layer_ids=*/{}, spec, block_pool, /*group_id=*/0, /*linear_step=*/2);
+
+    EXPECT_FALSE(group.prefixReusable());
+    EXPECT_FALSE(group.isCpShardable());
+    EXPECT_TRUE(group.hasSparseSlots());
+    EXPECT_FALSE(group.hasKernelBlockSubdiv());
+    EXPECT_TRUE(group.transferTailBlocks());
+    EXPECT_FALSE(group.cpCompactTailBlocks());
+    EXPECT_TRUE(group.isReservable());
+    EXPECT_FALSE(group.usesPinnedCpuBacking());
+}
+
+TEST_F(LinearKVCacheGroupTest, GetNeedBlocksReuseDisabledCountsLastTwoTailAndReserveStep) {
     auto block_pool = createBlockPool();
     ASSERT_TRUE(block_pool->init());
 
@@ -35,11 +51,11 @@ TEST_F(LinearKVCacheGroupTest, GetNeedBlocksReuseDisabledCountsOnlyReserveStep)
     LinearKVCacheGroup group(/*layer_ids=*/{}, spec, block_pool, /*group_id=*/0, /*linear_step=*/2);
     ASSERT_TRUE(group.init());
 
-    // common_slots=2, seq_slots=3, total_slots=5 => when reuse disabled, common=1(tail),
-    // extra=1(tail)+1(reserve_step-1)=2
+    // common_slots=2, seq_slots=3, total_slots=4 => common phase materializes
+    // its last two slots; incremental phase adds final tail and reserve slots.
     const auto need =
         group.getNeedBlocks(/*common_seq_len=*/8, /*seq_len=*/12, /*reserve_step=*/2, /*reuse_blocks_len=*/0, false);
-    EXPECT_EQ(need.common_blocks, 1);
+    EXPECT_EQ(need.common_blocks, 2);
     EXPECT_EQ(need.extra_blocks, 2);
 }
 
@@ -51,8 +67,8 @@ TEST_F(LinearKVCacheGroupTest, GetNeedBlocksReuseEnabledUsesSparseCountingAndRes
     LinearKVCacheGroup group(/*layer_ids=*/{}, spec, block_pool, /*group_id=*/0, /*linear_step=*/2);
     ASSERT_TRUE(group.init());
 
-    // common_slots=2:
-    // count(0,2]=2; count(2,3]=1; reserve_step=2 => extra=2
+    // common_slots=2, seq_slots=3, total_slots=4. Reuse enabled keeps step
+    // hits plus the last two seq slots, so this matches the disabled case here.
     const auto need =
         group.getNeedBlocks(/*common_seq_len=*/8, /*seq_len=*/12, /*reserve_step=*/2, /*reuse_blocks_len=*/0, true);
     EXPECT_EQ(need.common_blocks, 2);
@@ -74,14 +90,14 @@ TEST_F(LinearKVCacheGroupTest, MallocAllocatesStepHitsAndTailWhenReuseEnabled) {
     ASSERT_EQ(blocks.blocksNum(), 4u);
     EXPECT_TRUE(isNullBlockIdx(blocks.blocks()[0]));
     EXPECT_FALSE(isNullBlockIdx(blocks.blocks()[1]));
-    EXPECT_TRUE(isNullBlockIdx(blocks.blocks()[2]));
+    EXPECT_FALSE(isNullBlockIdx(blocks.blocks()[2]));  // tail-1 protects causal_conv1d boundary read
     EXPECT_FALSE(isNullBlockIdx(blocks.blocks()[3]));
 
-    // Only 2 real blocks allocated.
-    EXPECT_EQ(block_pool->freeBlocksNum(), 7u);
+    // Step hit + tail-1 + tail.
+    EXPECT_EQ(block_pool->freeBlocksNum(), 6u);
 }
 
-TEST_F(LinearKVCacheGroupTest, MallocAllocatesOnlyTailWhenReuseDisabled) {
+TEST_F(LinearKVCacheGroupTest, MallocAllocatesLastTwoTailBlocksWhenReuseDisabled) {
     auto block_pool = createBlockPool();
     ASSERT_TRUE(block_pool->init());
     ASSERT_EQ(block_pool->freeBlocksNum(), 9u);
@@ -96,11 +112,10 @@ TEST_F(LinearKVCacheGroupTest, MallocAllocatesOnlyTailWhenReuseDisabled) {
     ASSERT_EQ(blocks.blocksNum(), 4u);
     EXPECT_TRUE(isNullBlockIdx(blocks.blocks()[0]));
     EXPECT_TRUE(isNullBlockIdx(blocks.blocks()[1]));
-    EXPECT_TRUE(isNullBlockIdx(blocks.blocks()[2]));
+    EXPECT_FALSE(isNullBlockIdx(blocks.blocks()[2]));  // tail-1 protects causal_conv1d boundary read
     EXPECT_FALSE(isNullBlockIdx(blocks.blocks()[3]));
 
-    // Only 1 real block allocated.
-    EXPECT_EQ(block_pool->freeBlocksNum(), 8u);
+    EXPECT_EQ(block_pool->freeBlocksNum(), 7u);
 }
 
 TEST_F(LinearKVCacheGroupTest, MallocAllocatesReserveTailBlocksWhenReuseDisabled) {
@@ -119,12 +134,95 @@ TEST_F(LinearKVCacheGroupTest, MallocAllocatesReserveTailBlocksWhenReuseDisabled
     ASSERT_EQ(blocks.blocksNum(), 5u);
     EXPECT_TRUE(isNullBlockIdx(blocks.blocks()[0]));
     EXPECT_TRUE(isNullBlockIdx(blocks.blocks()[1]));
-    EXPECT_TRUE(isNullBlockIdx(blocks.blocks()[2]));
+    EXPECT_FALSE(isNullBlockIdx(blocks.blocks()[2]));  // tail-1 protects causal_conv1d boundary read
     EXPECT_FALSE(isNullBlockIdx(blocks.blocks()[3]));  // seq tail
     EXPECT_FALSE(isNullBlockIdx(blocks.blocks()[4]));  // reserve tail
 
-    // Tail + reserve_step blocks are allocated.
-    EXPECT_EQ(block_pool->freeBlocksNum(), 7u);
+    EXPECT_EQ(block_pool->freeBlocksNum(), 6u);
+}
+
+TEST_F(LinearKVCacheGroupTest, MallocBackfillsExistingNullReadSlot) {
+    auto block_pool = createBlockPool();
+    ASSERT_TRUE(block_pool->init());
+    ASSERT_EQ(block_pool->freeBlocksNum(), 9u);
+
+    auto               spec = makeLinearSpec(/*seq_size_per_block=*/4);
+    LinearKVCacheGroup group(/*layer_ids=*/{}, spec, block_pool, /*group_id=*/0, /*linear_step=*/2);
+    ASSERT_TRUE(group.init());
+
+    auto allocated = block_pool->malloc(2);
+    ASSERT_EQ(allocated.size(), 2u);
+
+    BlockIds blocks;
+    blocks.assign(BlockIndicesType{allocated[0], NULL_BLOCK_IDX, allocated[1]});
+    const size_t free_before = block_pool->freeBlocksNum();
+
+    // seq_len=12 => seq_slots=3. Position 1 is tail-1 and is the read slot
+    // for sequence_length=13, so it must be materialized even though no new
+    // slots are appended.
+    ASSERT_TRUE(group.malloc(blocks, /*seq_len=*/12, /*enable_reuse_cache=*/false));
+
+    ASSERT_EQ(blocks.blocksNum(), 3u);
+    EXPECT_EQ(blocks.blocks()[0], allocated[0]);
+    EXPECT_FALSE(isNullBlockIdx(blocks.blocks()[1]));
+    EXPECT_EQ(blocks.blocks()[2], allocated[1]);
+    EXPECT_EQ(block_pool->freeBlocksNum(), free_before - 1);
+}
+
+TEST_F(LinearKVCacheGroupTest, MallocMaterializesCausalConvReadSlotAtBoundaries) {
+    const std::vector<int> seq_lens = {4, 5, 8, 9};
+
+    for (bool enable_reuse_cache : {false, true}) {
+        for (int seq_len : seq_lens) {
+            auto block_pool = createBlockPool();
+            ASSERT_TRUE(block_pool->init());
+
+            auto               spec = makeLinearSpec(/*seq_size_per_block=*/4);
+            LinearKVCacheGroup group(/*layer_ids=*/{}, spec, block_pool, /*group_id=*/0, /*linear_step=*/2);
+            ASSERT_TRUE(group.init());
+
+            BlockIds blocks;
+            ASSERT_TRUE(group.malloc(blocks, seq_len, enable_reuse_cache)) << "seq_len=" << seq_len;
+
+            const int read_pos = (seq_len - 2) / 4;
+            ASSERT_GE(read_pos, 0);
+            ASSERT_LT(static_cast<size_t>(read_pos), blocks.blocksNum());
+            EXPECT_FALSE(isNullBlockIdx(blocks.blocks()[static_cast<size_t>(read_pos)]))
+                << "seq_len=" << seq_len << " reuse=" << enable_reuse_cache << " read_pos=" << read_pos;
+        }
+    }
+}
+
+TEST_F(LinearKVCacheGroupTest, GetNeedBlocksMatchesMallocForReserveSteps) {
+    for (bool enable_reuse_cache : {false, true}) {
+        for (int reserve_step : {0, 1, 2, 3}) {
+            auto block_pool = createBlockPool();
+            ASSERT_TRUE(block_pool->init());
+
+            auto               spec = makeLinearSpec(/*seq_size_per_block=*/4);
+            LinearKVCacheGroup group(/*layer_ids=*/{}, spec, block_pool, /*group_id=*/0, /*linear_step=*/2);
+            ASSERT_TRUE(group.init());
+
+            const auto need = group.getNeedBlocks(/*common_seq_len=*/8,
+                                                  /*seq_len=*/12,
+                                                  reserve_step,
+                                                  /*reuse_blocks_len=*/0,
+                                                  enable_reuse_cache);
+
+            BlockIds blocks;
+            ASSERT_TRUE(group.malloc(blocks, /*seq_len=*/8, enable_reuse_cache));
+            ASSERT_TRUE(group.malloc(blocks, /*seq_len=*/12, enable_reuse_cache, reserve_step));
+
+            size_t valid_count = 0;
+            for (auto block : blocks.blocks()) {
+                if (!isNullBlockIdx(block)) {
+                    valid_count++;
+                }
+            }
+            EXPECT_EQ(valid_count, static_cast<size_t>(need.common_blocks + need.extra_blocks))
+                << "reserve_step=" << reserve_step << " reuse=" << enable_reuse_cache;
+        }
+    }
 }
 
 TEST_F(LinearKVCacheGroupTest, RemoveSkippedBlocksFreesNonStepBlocksButKeepsLastTwo) {
@@ -158,53 +256,51 @@ TEST_F(LinearKVCacheGroupTest, RemoveSkippedBlocksFreesNonStepBlocksButKeepsLast
     EXPECT_EQ(block_pool->freeBlocksNum(), free_before + 2);
 }
 
-TEST_F(LinearKVCacheGroupTest, InsertIntoCacheSkipsNullBlocks) {
+TEST_F(LinearKVCacheGroupTest, PutIntoCacheSkipsNullBlocks) {
     auto block_pool = createBlockPool();
     ASSERT_TRUE(block_pool->init());
 
-    auto               spec = makeLinearSpec(/*seq_size_per_block=*/4);
-    LinearKVCacheGroup group(/*layer_ids=*/{}, spec, block_pool, /*group_id=*/3, /*linear_step=*/2);
-    ASSERT_TRUE(group.init());
+    auto                      shared_cache = std::make_shared<SharedBlockCache>();
+    std::vector<BlockPoolPtr> group_pools(4, block_pool);
+    shared_cache->init(4, group_pools);
 
-    auto block_cache = block_pool->blockCache();
-    ASSERT_NE(block_cache, nullptr);
+    auto block1 = block_pool->malloc(1)[0];
+    auto block2 = block_pool->malloc(1)[0];
 
-    BlockIndicesType blocks;
-    blocks.push_back(NULL_BLOCK_IDX);
-    blocks.push_back(block_pool->malloc(1)[0]);
-    blocks.push_back(NULL_BLOCK_IDX);
-    blocks.push_back(block_pool->malloc(1)[0]);
+    // Only put entries with non-NULL blocks (simulating allocator-level filtering)
+    std::vector<BlockIdxType> slots1(4, NULL_BLOCK_IDX);
+    slots1[3] = block1;
+    shared_cache->put(101, slots1, /*is_resident=*/false);
 
-    CacheKeysType keys = {100, 101, 102, 103};
-    group.insertIntoCache(keys, blocks, /*is_resident=*/false);
+    std::vector<BlockIdxType> slots2(4, NULL_BLOCK_IDX);
+    slots2[3] = block2;
+    shared_cache->put(103, slots2, /*is_resident=*/false);
 
-    EXPECT_FALSE(block_cache->contains(100, /*group_id=*/3));
-    EXPECT_TRUE(block_cache->contains(101, /*group_id=*/3));
-    EXPECT_FALSE(block_cache->contains(102, /*group_id=*/3));
-    EXPECT_TRUE(block_cache->contains(103, /*group_id=*/3));
+    EXPECT_FALSE(shared_cache->contains(100));
+    EXPECT_TRUE(shared_cache->contains(101));
+    EXPECT_FALSE(shared_cache->contains(102));
+    EXPECT_TRUE(shared_cache->contains(103));
 }
 
 TEST_F(LinearKVCacheGroupTest, MatchSingleKeyReturnsMatchedBlockOrEmpty) {
     auto block_pool = createBlockPool();
     ASSERT_TRUE(block_pool->init());
 
+    auto                      shared_cache = std::make_shared<SharedBlockCache>();
+    std::vector<BlockPoolPtr> group_pools(8, block_pool);
+    shared_cache->init(8, group_pools);
+
     auto               spec = makeLinearSpec(/*seq_size_per_block=*/4);
-    LinearKVCacheGroup group(/*layer_ids=*/{}, spec, block_pool, /*group_id=*/7, /*linear_step=*/2);
+    LinearKVCacheGroup group(/*layer_ids=*/{}, spec, block_pool, /*group_id=*/7, /*linear_step=*/2, shared_cache.get());
     ASSERT_TRUE(group.init());
 
-    auto block_cache = block_pool->blockCache();
-    ASSERT_NE(block_cache, nullptr);
-
     // Allocate a block, then put it into cache for group_id=7.
     auto blocks = block_pool->malloc(1);
     ASSERT_EQ(blocks.size(), 1u);
 
-    BlockCache::CacheItem item;
-    item.cache_key   = 123;
-    item.group_id    = 7;
-    item.block_index = blocks[0];
-    item.is_resident = false;
-    ASSERT_TRUE(block_cache->put(item));
+    std::vector<BlockIdxType> group_slots(8, NULL_BLOCK_IDX);
+    group_slots[7] = blocks[0];
+    shared_cache->put(123, group_slots, /*is_resident=*/false);
 
     auto hit = group.matchSingleKey(123);
     ASSERT_EQ(hit.block_indices.size(), 1u);
@@ -259,14 +355,19 @@ TEST_F(LinearKVCacheGroupTest, MallocEnsuresFreeBlocksByEvictingCache) {
     ASSERT_TRUE(block_pool->init());
     ASSERT_EQ(block_pool->freeBlocksNum(), 9u);
 
+    auto                      shared_cache = std::make_shared<SharedBlockCache>();
+    std::vector<BlockPoolPtr> group_pools  = {block_pool};
+    shared_cache->init(1, group_pools);
+
     auto               spec = makeLinearSpec(/*seq_size_per_block=*/4);
-    LinearKVCacheGroup group(/*layer_ids=*/{}, spec, block_pool, /*group_id=*/0, /*linear_step=*/2);
+    LinearKVCacheGroup group(/*layer_ids=*/{}, spec, block_pool, /*group_id=*/0, /*linear_step=*/2, shared_cache.get());
     ASSERT_TRUE(group.init());
 
     // Put one block into cache (non-resident) and release request reference so it becomes evictable.
     auto cached = block_pool->malloc(1);
     ASSERT_EQ(cached.size(), 1u);
-    group.insertIntoCache(CacheKeysType{123}, cached, /*is_resident=*/false);
+    std::vector<BlockIdxType> slots = {cached[0]};
+    shared_cache->put(123, slots, /*is_resident=*/false);
     block_pool->requestFree(cached);
 
     // Exhaust the remaining free blocks so malloc must evict from cache to proceed.
@@ -357,23 +458,6 @@ TEST_F(LinearKVCacheGroupTest, ReferenceAppendsAndIncrementsRefCountForValidBloc
     EXPECT_EQ(block_pool->freeBlocksNum(), free_before + 1);
 }
 
-TEST_F(LinearKVCacheGroupTest, InsertIntoCacheWithEmptyInputsIsNoop) {
-    auto block_pool = createBlockPool();
-    ASSERT_TRUE(block_pool->init());
-
-    auto               spec = makeLinearSpec(/*seq_size_per_block=*/4);
-    LinearKVCacheGroup group(/*layer_ids=*/{}, spec, block_pool, /*group_id=*/3, /*linear_step=*/2);
-    ASSERT_TRUE(group.init());
-
-    auto block_cache = block_pool->blockCache();
-    ASSERT_NE(block_cache, nullptr);
-    ASSERT_EQ(block_cache->size(), 0u);
-
-    group.insertIntoCache(CacheKeysType{}, BlockIndicesType{1, 2}, /*is_resident=*/false);
-    group.insertIntoCache(CacheKeysType{100, 101}, BlockIndicesType{}, /*is_resident=*/false);
-    EXPECT_EQ(block_cache->size(), 0u);
-}
-
 }  // namespace test
 }  // namespace rtp_llm
 
diff --git a/rtp_llm/cpp/cache/test/MemoryLayoutStrategyTest.cc b/rtp_llm/cpp/cache/test/MemoryLayoutStrategyTest.cc
index 2407179f83..7354ee4b59 100644
--- a/rtp_llm/cpp/cache/test/MemoryLayoutStrategyTest.cc
+++ b/rtp_llm/cpp/cache/test/MemoryLayoutStrategyTest.cc
@@ -1,5 +1,6 @@
 #include <gtest/gtest.h>
 #include <memory>
+#include <numeric>
 #include <vector>
 #include <torch/torch.h>
 #include "rtp_llm/cpp/cache/MemoryLayoutStrategy.h"
@@ -9,6 +10,7 @@
 #include "rtp_llm/models_py/bindings/core/ExecOps.h"
 #include "rtp_llm/cpp/config/ConfigModules.h"
 #include "rtp_llm/cpp/config/ModelConfig.h"
+#include "rtp_llm/cpp/config/StaticConfig.h"
 #include "rtp_llm/cpp/utils/Logger.h"
 
 namespace rtp_llm {
@@ -29,6 +31,8 @@ class MemoryLayoutStrategyTest: public ::testing::Test {
     };
 
     void SetUp() override {
+        old_core_dump_on_exception_                  = StaticConfig::user_ft_core_dump_on_exception;
+        StaticConfig::user_ft_core_dump_on_exception = false;
         rtp_llm::initLogger();
         torch::manual_seed(114514);
 
@@ -39,7 +43,9 @@ class MemoryLayoutStrategyTest: public ::testing::Test {
         ASSERT_TRUE(rtp_llm::isRuntimeInitialized());
     }
 
-    void TearDown() override {}
+    void TearDown() override {
+        StaticConfig::user_ft_core_dump_on_exception = old_core_dump_on_exception_;
+    }
 
     static KVCacheSpecPtr createTestKvCacheSpec(uint32_t          layer_num,
                                                 rtp_llm::DataType dtype,
@@ -70,8 +76,7 @@ class MemoryLayoutStrategyTest: public ::testing::Test {
             auto spec                = std::make_shared<MHAKVCacheSpec>();
             spec->type               = KVCacheSpecType::MultiHeadAttention;
             spec->dtype              = dtype;
-            spec->layer_num          = layer_num;
-            spec->local_head_num_kv  = local_head_num_kv;
+                spec->local_head_num_kv  = local_head_num_kv;
             spec->seq_size_per_block = seq_size_per_block;
             spec->size_per_head      = static_cast<uint32_t>(k_elems / denom);
             return spec;
@@ -79,8 +84,7 @@ class MemoryLayoutStrategyTest: public ::testing::Test {
             auto spec                = std::make_shared<MLAKVCacheSpec>();
             spec->type               = KVCacheSpecType::MultiHeadLatentAttention;
             spec->dtype              = dtype;
-            spec->layer_num          = layer_num;
-            spec->local_head_num_kv  = local_head_num_kv;
+                spec->local_head_num_kv  = local_head_num_kv;
             spec->seq_size_per_block = seq_size_per_block;
             spec->kv_lora_rank       = static_cast<uint32_t>(k_elems / denom);
             spec->rope_head_dim      = static_cast<uint32_t>(v_elems / denom);
@@ -99,14 +103,14 @@ class MemoryLayoutStrategyTest: public ::testing::Test {
                                           /*k_block_stride_bytes=*/k_block_bytes,
                                           /*v_block_stride_bytes=*/v_block_bytes);
 
-        // Create CacheConfig with the spec
         rtp_llm::CacheConfig cache_config;
-        cache_config.cache_specs           = {spec};
         cache_config.layer_num             = layer_num;
+        cache_config.layer_all_num         = layer_num;
         cache_config.block_num             = block_num;
         cache_config.dtype                 = rtp_llm::DataType::TYPE_INT8;
         cache_config.seq_size_per_block    = 1;
         cache_config.kv_block_stride_bytes = spec->block_size_bytes();
+        initializeSingleGroup(cache_config, spec);
 
         auto pool_cfg   = BlockPoolConfigHelper::createConfig(cache_config);
         auto layout_cfg = pool_cfg.memory_layouts[0];
@@ -121,6 +125,12 @@ class MemoryLayoutStrategyTest: public ::testing::Test {
         return layout_cfg;
     }
 
+    static void initializeSingleGroup(rtp_llm::CacheConfig& cache_config, const KVCacheSpecPtr& spec) {
+        std::vector<int> layer_ids(cache_config.layer_num);
+        std::iota(layer_ids.begin(), layer_ids.end(), 0);
+        cache_config.fromGroupedSpecs({spec}, {layer_ids}, {CacheGroupType::FULL}, {"default"});
+    }
+
     static MemoryLayoutConfig createTestConfig(size_t k_block_bytes = 512, size_t v_block_bytes = 512) {
         return createTestConfig(/*layer_num=*/4, /*block_num=*/8, k_block_bytes, v_block_bytes);
     }
@@ -166,6 +176,8 @@ class MemoryLayoutStrategyTest: public ::testing::Test {
                                          BufferInitMode       init_mode     = BufferInitMode::Zeros) {
         return createTestContext(createTestConfig(k_block_bytes, v_block_bytes), device, init_mode);
     }
+
+    bool old_core_dump_on_exception_{false};
 };
 
 TEST_F(MemoryLayoutStrategyTest, Initialization) {
@@ -186,15 +198,15 @@ TEST_F(MemoryLayoutStrategyTest, InitializationWithScaleTensor) {
                                       /*seq_size_per_block=*/4,
                                       /*k_block_stride_bytes=*/512,
                                       /*v_block_stride_bytes=*/512);
-    // Create CacheConfig with the spec
     rtp_llm::CacheConfig cache_config;
-    cache_config.cache_specs           = {spec};
     cache_config.layer_num             = 4;
+    cache_config.layer_all_num         = 4;
     cache_config.block_num             = 8;
     cache_config.dtype                 = rtp_llm::DataType::TYPE_INT8;
     cache_config.seq_size_per_block    = 4;
     cache_config.kv_block_stride_bytes = spec->block_size_bytes();
     cache_config.kv_scale_stride_bytes = spec->scale_block_size_bytes();
+    initializeSingleGroup(cache_config, spec);
 
     auto pool_cfg = BlockPoolConfigHelper::createConfig(cache_config);
     auto config   = pool_cfg.memory_layouts[0];  // keep enable_kv_scale=true
@@ -351,14 +363,14 @@ TEST_F(MemoryLayoutStrategyTest, ConvertIndexToBufferPartitionedByHeadFp16UsesBy
                                       /*seq_size_per_block=*/64,
                                       /*k_block_stride_bytes=*/1024,
                                       /*v_block_stride_bytes=*/1024);
-    // Create CacheConfig with the spec
     rtp_llm::CacheConfig cache_config;
-    cache_config.cache_specs           = {spec};
     cache_config.layer_num             = 4;
+    cache_config.layer_all_num         = 4;
     cache_config.block_num             = 8;
     cache_config.dtype                 = rtp_llm::DataType::TYPE_FP16;
     cache_config.seq_size_per_block    = 64;
     cache_config.kv_block_stride_bytes = spec->block_size_bytes();
+    initializeSingleGroup(cache_config, spec);
 
     auto pool_cfg = BlockPoolConfigHelper::createConfig(cache_config);
     auto config   = pool_cfg.memory_layouts[0];
@@ -422,15 +434,15 @@ TEST_F(MemoryLayoutStrategyTest, ConvertIndexToBufferPartitionedByHeadWithScale)
                                       /*seq_size_per_block=*/64,
                                       /*k_block_stride_bytes=*/512,
                                       /*v_block_stride_bytes=*/512);
-    // Create CacheConfig with the spec
     rtp_llm::CacheConfig cache_config;
-    cache_config.cache_specs           = {spec};
     cache_config.layer_num             = 4;
+    cache_config.layer_all_num         = 4;
     cache_config.block_num             = 8;
     cache_config.dtype                 = rtp_llm::DataType::TYPE_INT8;
     cache_config.seq_size_per_block    = 64;
     cache_config.kv_block_stride_bytes = spec->block_size_bytes();
     cache_config.kv_scale_stride_bytes = spec->scale_block_size_bytes();
+    initializeSingleGroup(cache_config, spec);
 
     auto pool_cfg = BlockPoolConfigHelper::createConfig(cache_config);
     auto config   = pool_cfg.memory_layouts[0];  // keep enable_kv_scale=true
@@ -582,6 +594,32 @@ TEST_F(MemoryLayoutStrategyTest, AddressSequentiality) {
     EXPECT_EQ(addr2_val - addr1_val, ctx.config.kv_block_stride_bytes);
 }
 
+TEST_F(MemoryLayoutStrategyTest, ConvertIndexToBufferUsesPhysicalStrideForKernelView) {
+    auto config                       = createTestConfig(/*layer_num=*/2, /*block_num=*/4, 64, 64);
+    config.kernel_blocks_per_kv_block = 4;
+    auto ctx                          = createTestContext(std::move(config), torch::kCPU, BufferInitMode::Arange);
+
+    auto          strategy = std::make_unique<MemoryLayoutStrategy>();
+    torch::Tensor empty_scale;
+    ASSERT_TRUE(strategy->init(ctx.config, ctx.kv_cache_buffer, empty_scale, ctx.cache_ptr));
+
+    auto layer_tensors = strategy->getLayerCacheTensors();
+    ASSERT_EQ(layer_tensors[0].size(0), static_cast<int64_t>(ctx.config.block_num * 4));
+    ASSERT_EQ(static_cast<size_t>(layer_tensors[0].stride(0) * layer_tensors[0].element_size()),
+              ctx.config.kv_block_stride_bytes / 4);
+
+    auto block0 = strategy->convertIndexToBuffer(/*layer_id=*/0, /*block_id=*/0);
+    auto block1 = strategy->convertIndexToBuffer(/*layer_id=*/0, /*block_id=*/1);
+    ASSERT_EQ(block0.size(), 1u);
+    ASSERT_EQ(block1.size(), 1u);
+    EXPECT_EQ(block0[0].size_bytes, ctx.config.kv_block_stride_bytes);
+    EXPECT_EQ(block1[0].size_bytes, ctx.config.kv_block_stride_bytes);
+
+    const auto addr0 = reinterpret_cast<uintptr_t>(block0[0].addr);
+    const auto addr1 = reinterpret_cast<uintptr_t>(block1[0].addr);
+    EXPECT_EQ(addr1 - addr0, ctx.config.kv_block_stride_bytes);
+}
+
 // Layout Comparison Test
 class LayoutComparisonTest: public MemoryLayoutStrategyTest {};
 
diff --git a/rtp_llm/cpp/cache/test/SWAKVCacheGroupMallocRangeTest.cc b/rtp_llm/cpp/cache/test/SWAKVCacheGroupMallocRangeTest.cc
new file mode 100644
index 0000000000..547fde3aac
--- /dev/null
+++ b/rtp_llm/cpp/cache/test/SWAKVCacheGroupMallocRangeTest.cc
@@ -0,0 +1,122 @@
+#include <gtest/gtest.h>
+
+#include <algorithm>
+#include <cstdlib>
+#include <memory>
+#include <string>
+
+#include "rtp_llm/cpp/cache/BlockPool.h"
+#include "rtp_llm/cpp/cache/group/SWAKVCacheGroup.h"
+
+namespace rtp_llm {
+namespace test {
+namespace {
+
+class ScopedEnvVar {
+public:
+    ScopedEnvVar(const char* name, const char* value): name_(name) {
+        const char* old_value = std::getenv(name_);
+        if (old_value != nullptr) {
+            old_value_ = old_value;
+            had_value_ = true;
+        }
+        setenv(name_, value, 1);
+    }
+
+    ~ScopedEnvVar() {
+        if (had_value_) {
+            setenv(name_, old_value_.c_str(), 1);
+        } else {
+            unsetenv(name_);
+        }
+    }
+
+private:
+    const char* name_;
+    std::string old_value_;
+    bool        had_value_ = false;
+};
+
+BlockPoolConfig makeHostBlockPoolConfig() {
+    constexpr uint32_t kLayerNum        = 1;
+    constexpr uint32_t kBlockNum        = 4;
+    constexpr size_t   kKvBlockStride   = 1024;
+    constexpr size_t   kHalfBlockStride = kKvBlockStride / 2;
+
+    MemoryLayoutConfig layout;
+    layout.layer_num                = kLayerNum;
+    layout.block_num                = kBlockNum;
+    layout.dtype                    = rtp_llm::DataType::TYPE_FP16;
+    layout.kv_cache_offset_bytes    = 0;
+    layout.kv_scale_offset_bytes    = kLayerNum * kBlockNum * kKvBlockStride;
+    layout.kv_block_stride_bytes    = kKvBlockStride;
+    layout.k_block_stride_bytes     = kHalfBlockStride;
+    layout.v_block_stride_bytes     = kHalfBlockStride;
+    layout.kv_block_pool_size_bytes = kLayerNum * kBlockNum * kKvBlockStride;
+    layout.kv_scale_pool_size_bytes = 0;
+    layout.total_size_bytes         = layout.kv_block_pool_size_bytes;
+
+    BlockPoolConfig config;
+    config.block_num        = kBlockNum;
+    config.total_size_bytes = layout.total_size_bytes;
+    config.memory_layouts   = {layout};
+    return config;
+}
+
+BlockPoolPtr createHostBlockPool() {
+    auto block_pool = std::make_shared<BlockPool>(makeHostBlockPoolConfig(), AllocationType::HOST);
+    RTP_LLM_CHECK_WITH_INFO(block_pool->init(), "init host block pool failed");
+    return block_pool;
+}
+
+std::shared_ptr<MHAKVCacheSpec> makeMHASpec(int seq_size_per_block) {
+    auto spec                = std::make_shared<MHAKVCacheSpec>();
+    spec->seq_size_per_block = seq_size_per_block;
+    return spec;
+}
+
+}  // namespace
+
+TEST(SWAKVCacheGroupMallocRangeTest, EmptyBlockIdsKeepTailBlocksForSeqLenUpTo1M) {
+    constexpr int kSeqSizePerBlock = 256;
+    constexpr int kMaxSeqLen       = 1000000;
+
+    ScopedEnvVar    disable_pin_host_pool("RTP_LLM_PIN_HOST_BLOCK_POOL", "0");
+    auto            block_pool = createHostBlockPool();
+    SWAKVCacheGroup group({}, makeMHASpec(kSeqSizePerBlock), block_pool, 0);
+
+    auto check_seq_len = [&](int seq_len) {
+        BlockIds block_ids;
+        ASSERT_EQ(block_ids.blocksNum(), 0u) << "seq_len=" << seq_len;
+
+        ASSERT_TRUE(group.malloc(block_ids, seq_len, /*enable_reuse_cache=*/false, /*reserve_step=*/0))
+            << "seq_len=" << seq_len;
+
+        const auto& blocks = block_ids.blocks();
+        ASSERT_EQ(blocks.size(), static_cast<size_t>((seq_len + kSeqSizePerBlock - 1) / kSeqSizePerBlock))
+            << "seq_len=" << seq_len;
+        if (blocks.size() == 1) {
+            EXPECT_FALSE(isNullBlockIdx(blocks[0])) << "seq_len=" << seq_len;
+        } else {
+            EXPECT_FALSE(isNullBlockIdx(blocks[blocks.size() - 2])) << "seq_len=" << seq_len;
+            EXPECT_FALSE(isNullBlockIdx(blocks[blocks.size() - 1])) << "seq_len=" << seq_len;
+        }
+
+        group.free(blocks);
+    };
+
+    // SWA malloc depends on seq_slots=ceil(seq_len / block_size). The first
+    // and last seq_len in each slot cover all behavior classes from 1..1M.
+    const int max_seq_slots = (kMaxSeqLen + kSeqSizePerBlock - 1) / kSeqSizePerBlock;
+    for (int seq_slots = 1; seq_slots <= max_seq_slots; ++seq_slots) {
+        const int first_seq_len = (seq_slots - 1) * kSeqSizePerBlock + 1;
+        const int last_seq_len  = std::min(seq_slots * kSeqSizePerBlock, kMaxSeqLen);
+        check_seq_len(first_seq_len);
+        if (last_seq_len != first_seq_len) {
+            check_seq_len(last_seq_len);
+        }
+    }
+}
+
+}  // namespace test
+}  // namespace rtp_llm
diff --git a/rtp_llm/cpp/cache/test/SWAKVCacheGroupTest.cc b/rtp_llm/cpp/cache/test/SWAKVCacheGroupTest.cc
new file mode 100644
index 0000000000..6fc05678b4
--- /dev/null
+++ b/rtp_llm/cpp/cache/test/SWAKVCacheGroupTest.cc
@@ -0,0 +1,740 @@
+#include <gtest/gtest.h>
+#include <algorithm>
+#include <cstdlib>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "rtp_llm/cpp/cache/spec/OpaqueKVCacheSpec.h"
+#include "rtp_llm/cpp/cache/group/SWAKVCacheGroup.h"
+#include "rtp_llm/cpp/cache/SharedBlockCache.h"
+#include "rtp_llm/cpp/cache/test/BlockPoolTestHelper.h"
+#include "rtp_llm/cpp/config/StaticConfig.h"
+
+namespace rtp_llm {
+namespace test {
+
+namespace {
+
+class ScopedEnvVar {
+public:
+    ScopedEnvVar(const char* name, const char* value): name_(name) {
+        const char* old_value = std::getenv(name_);
+        if (old_value != nullptr) {
+            old_value_ = old_value;
+            had_value_ = true;
+        }
+        setenv(name_, value, 1);
+    }
+
+    ~ScopedEnvVar() {
+        if (had_value_) {
+            setenv(name_, old_value_.c_str(), 1);
+        } else {
+            unsetenv(name_);
+        }
+    }
+
+private:
+    const char* name_;
+    std::string old_value_;
+    bool        had_value_ = false;
+};
+
+std::shared_ptr<FixedStateCacheSpec> makeDsv4StateSpec(const std::string& tag, int seq_size_per_block) {
+    return std::make_shared<FixedStateCacheSpec>(tag,
+                                           /*state_elements=*/1024,
+                                           /*block_entries=*/128,
+                                           DataType::TYPE_FP32,
+                                           seq_size_per_block);
+}
+
+CacheGroupPolicy makePolicy(const KVCacheSpecPtr& spec) {
+    return CacheConfig::cacheGroupPolicyForSpec(spec, CacheGroupType::SWA);
+}
+
+size_t validBlockCount(const BlockIndicesType& blocks) {
+    return static_cast<size_t>(std::count_if(blocks.begin(), blocks.end(), [](BlockIdxType block) {
+        return !isNullBlockIdx(block);
+    }));
+}
+
+}  // namespace
+
+class SWAKVCacheGroupTest: public ::testing::Test {
+protected:
+    void SetUp() override {
+        old_core_dump_on_exception_                  = StaticConfig::user_ft_core_dump_on_exception;
+        StaticConfig::user_ft_core_dump_on_exception = false;
+        block_pool_ = createBlockPool();
+        block_pool_->init();
+        total_blocks_                         = block_pool_->freeBlocksNum();
+        shared_cache_                         = std::make_shared<SharedBlockCache>();
+        std::vector<BlockPoolPtr> group_pools = {block_pool_};
+        shared_cache_->init(1, group_pools);
+    }
+
+    void TearDown() override {
+        StaticConfig::user_ft_core_dump_on_exception = old_core_dump_on_exception_;
+    }
+
+    SWAKVCacheGroup makeGroup(int seq_size_per_block) {
+        auto spec                = std::make_shared<MHAKVCacheSpec>();
+        spec->seq_size_per_block = seq_size_per_block;
+        return SWAKVCacheGroup({}, spec, block_pool_, 0, 0, shared_cache_.get());
+    }
+
+    SWAKVCacheGroup makeGroupWithStep(int seq_size_per_block, int linear_step) {
+        auto spec                = std::make_shared<MHAKVCacheSpec>();
+        spec->seq_size_per_block = seq_size_per_block;
+        return SWAKVCacheGroup({}, spec, block_pool_, 0, linear_step, shared_cache_.get());
+    }
+
+    BlockPoolPtr        block_pool_;
+    SharedBlockCachePtr shared_cache_;
+    size_t              total_blocks_ = 0;
+    bool                old_core_dump_on_exception_{false};
+};
+
+TEST_F(SWAKVCacheGroupTest, DefaultPolicyDrivesBehaviorInterfaces) {
+    auto spec                = std::make_shared<MHAKVCacheSpec>();
+    spec->seq_size_per_block = 4;
+    SWAKVCacheGroup group({}, spec, block_pool_, 0, 0, shared_cache_.get());
+
+    EXPECT_FALSE(group.prefixReusable());
+    EXPECT_FALSE(group.isCpShardable());
+    EXPECT_TRUE(group.hasSparseSlots());
+    EXPECT_FALSE(group.hasKernelBlockSubdiv());
+    EXPECT_TRUE(group.transferTailBlocks());
+    EXPECT_TRUE(group.cpCompactTailBlocks());
+    EXPECT_TRUE(group.isReservable());
+    EXPECT_FALSE(group.usesPinnedCpuBacking());
+}
+
+// ==================== needBlocksNum ====================
+
+TEST_F(SWAKVCacheGroupTest, NeedBlocksNum_Basic) {
+    auto group = makeGroup(4);
+    EXPECT_EQ(group.needBlocksNum(1, 0), 1);
+    EXPECT_EQ(group.needBlocksNum(4, 0), 1);
+    EXPECT_EQ(group.needBlocksNum(5, 0), 2);
+    EXPECT_EQ(group.needBlocksNum(8, 0), 2);
+    EXPECT_EQ(group.needBlocksNum(9, 0), 3);
+}
+
+TEST_F(SWAKVCacheGroupTest, NeedBlocksNum_WithCurrentBlocks) {
+    auto group = makeGroup(4);
+    EXPECT_EQ(group.needBlocksNum(10, 1), 2);
+    EXPECT_EQ(group.needBlocksNum(10, 3), 0);
+    EXPECT_EQ(group.needBlocksNum(10, 5), 0);
+}
+
+TEST_F(SWAKVCacheGroupTest, NeedBlocksNum_WithReserveStep) {
+    auto group = makeGroup(4);
+    // reserve_step formula: ceil((seq_len + reserve_step) / block_size) - current
+    EXPECT_EQ(group.needBlocksNum(8, 0, 0), 2);  // ceil((8+0)/4) = 2
+    EXPECT_EQ(group.needBlocksNum(8, 0, 1), 3);  // ceil((8+1)/4) = 3
+    EXPECT_EQ(group.needBlocksNum(8, 0, 2), 3);  // ceil((8+2)/4) = 3
+    EXPECT_EQ(group.needBlocksNum(8, 0, 5), 4);  // ceil((8+5)/4) = 4
+}
+
+// ==================== getNeedBlocks ====================
+
+TEST_F(SWAKVCacheGroupTest, GetNeedBlocks_SeqLenZero) {
+    auto group = makeGroup(4);
+    auto need  = group.getNeedBlocks(0, 0, 0, 0, false);
+    EXPECT_EQ(need.common_blocks, 0);
+    EXPECT_EQ(need.extra_blocks, 0);
+}
+
+TEST_F(SWAKVCacheGroupTest, GetNeedBlocks_ReuseDisabledCountsActiveTail) {
+    auto group = makeGroupWithStep(4, 2);
+    // seq_len=12 => seq_slots=3, reuse disabled => last two active tail blocks.
+    auto need = group.getNeedBlocks(0, 12, 0, 0, false);
+    EXPECT_EQ(need.common_blocks, 0);
+    EXPECT_EQ(need.extra_blocks, 2);
+}
+
+TEST_F(SWAKVCacheGroupTest, GetNeedBlocks_ReuseEnabledUsesSparse) {
+    auto group = makeGroupWithStep(4, 2);
+    // seq_len=12 => seq_slots=3
+    // count_sparse(0,3): eligible=(3+1)/2-(0+1)/2=2-0=2, tail=(3+1)%2==0 => 0, total=2
+    auto need = group.getNeedBlocks(0, 12, 0, 0, true);
+    EXPECT_EQ(need.common_blocks, 0);
+    EXPECT_EQ(need.extra_blocks, 2);
+}
+
+TEST_F(SWAKVCacheGroupTest, GetNeedBlocks_HCAStateReuseEnabledCountsTailOnly) {
+    auto spec               = makeDsv4StateSpec("hca_state", 4);
+    spec->skip_prefix_reuse = true;
+    auto group = SWAKVCacheGroup({}, spec, block_pool_, 5, /*linear_step=*/3, shared_cache_.get(), nullptr, makePolicy(spec));
+
+    // seq_len=40 => seq_slots=10. If reuse sparse allocation were enabled, step hits
+    // would keep positions 2/5/8 plus tail position 9. HCA_STATE skips reuse and keeps only tail 9.
+    auto need = group.getNeedBlocks(0, 40, 0, 0, true);
+    EXPECT_EQ(need.common_blocks, 0);
+    EXPECT_EQ(need.extra_blocks, 1);
+}
+
+TEST_F(SWAKVCacheGroupTest, GetNeedBlocks_CSAStateReuseEnabledStillUsesSparse) {
+    auto spec  = makeDsv4StateSpec("csa_state", 4);
+    auto group = SWAKVCacheGroup({}, spec, block_pool_, 4, /*linear_step=*/3, shared_cache_.get(), nullptr, makePolicy(spec));
+
+    auto need = group.getNeedBlocks(0, 40, 0, 0, true);
+    EXPECT_EQ(need.common_blocks, 0);
+    EXPECT_EQ(need.extra_blocks, 4);
+}
+
+TEST_F(SWAKVCacheGroupTest, GetNeedBlocks_WithReserveStep) {
+    auto group = makeGroupWithStep(4, 2);
+    // seq_len=8 => two active tail blocks, plus one reserve block.
+    auto need = group.getNeedBlocks(0, 8, 2, 0, false);
+    EXPECT_EQ(need.extra_blocks, 3);
+}
+
+TEST_F(SWAKVCacheGroupTest, GetNeedBlocks_ReusePartialOverlap) {
+    auto group = makeGroupWithStep(4, 2);
+    // seq_len=12 => seq_slots=3
+    // reuse_blocks_len=2: count_sparse(2,3)
+    // eligible=(3+1)/2-(2+1)/2=2-1=1, tail=(3+1)%2==0 => 0, total=1
+    auto need = group.getNeedBlocks(0, 12, 0, 2, true);
+    EXPECT_EQ(need.extra_blocks, 1);
+}
+
+TEST_F(SWAKVCacheGroupTest, GetNeedBlocks_ReuseFullOverlap) {
+    auto group = makeGroupWithStep(4, 2);
+    // seq_len=12 => seq_slots=3
+    // reuse_blocks_len=3: count_sparse(3,3) = 0
+    auto need = group.getNeedBlocks(0, 12, 0, 3, true);
+    EXPECT_EQ(need.extra_blocks, 0);
+}
+
+TEST_F(SWAKVCacheGroupTest, GetNeedBlocks_CommonSeqLenIgnored) {
+    auto group = makeGroup(4);
+    auto need1 = group.getNeedBlocks(0, 20, 0, 0, false);
+    auto need2 = group.getNeedBlocks(20, 20, 0, 0, false);
+    auto need3 = group.getNeedBlocks(100, 20, 0, 0, false);
+    EXPECT_EQ(need1.extra_blocks, need2.extra_blocks);
+    EXPECT_EQ(need2.extra_blocks, need3.extra_blocks);
+    EXPECT_EQ(need1.common_blocks, 0);
+}
+
+// ==================== match ====================
+
+TEST_F(SWAKVCacheGroupTest, MatchAlwaysThrows) {
+    auto group = makeGroup(4);
+    EXPECT_THROW(group.match({101, 102, 103}), std::exception);
+}
+
+TEST_F(SWAKVCacheGroupTest, MatchSingleKey_NotFound) {
+    auto group  = makeGroup(4);
+    auto result = group.matchSingleKey(999);
+    EXPECT_TRUE(result.block_indices.empty());
+}
+
+TEST_F(SWAKVCacheGroupTest, MatchSingleKey_Found) {
+    auto                      group       = makeGroup(4);
+    std::vector<BlockIdxType> group_slots = {1};  // group_id=0, block_index=1
+    shared_cache_->put(101, group_slots, false);
+
+    auto result = group.matchSingleKey(101);
+    ASSERT_EQ(result.block_indices.size(), 1u);
+    EXPECT_EQ(result.block_indices[0], 1);
+}
+
+// ==================== malloc (default step=0, acts like step=1, tail-only) ====================
+
+TEST_F(SWAKVCacheGroupTest, Malloc_ShortSeq_OnlyOneBlock) {
+    auto     group = makeGroup(4);
+    BlockIds block_ids(1);
+    ASSERT_TRUE(group.malloc(block_ids, 3));
+    EXPECT_EQ(block_ids.blocksNum(), 1u);
+    EXPECT_FALSE(isNullBlockIdx(block_ids.blocks()[0]));
+    EXPECT_EQ(block_pool_->freeBlocksNum(), total_blocks_ - 1);
+}
+
+TEST_F(SWAKVCacheGroupTest, Malloc_ManyBlocks_LastTwoActiveBlocksReal) {
+    auto     group = makeGroup(4);
+    BlockIds block_ids(1);
+    ASSERT_TRUE(group.malloc(block_ids, 20));
+    // reuse_cache=false still keeps the last two active blocks.
+    ASSERT_EQ(block_ids.blocksNum(), 5u);
+    for (int i = 0; i < 3; ++i) {
+        EXPECT_TRUE(isNullBlockIdx(block_ids.blocks()[i])) << "position " << i << " should be NULL";
+    }
+    EXPECT_FALSE(isNullBlockIdx(block_ids.blocks()[3]));
+    EXPECT_FALSE(isNullBlockIdx(block_ids.blocks()[4]));
+    EXPECT_EQ(block_pool_->freeBlocksNum(), total_blocks_ - 2);
+}
+
+TEST_F(SWAKVCacheGroupTest, Malloc_DSV4PromptTailKeepsPenultimateBlock) {
+    auto     group = makeGroup(256);
+    BlockIds block_ids(1);
+
+    ASSERT_TRUE(group.malloc(block_ids, 5121, /*enable_reuse_cache=*/false, /*reserve_step=*/0));
+
+    ASSERT_EQ(block_ids.blocksNum(), 21u);
+    for (int i = 0; i < 19; ++i) {
+        EXPECT_TRUE(isNullBlockIdx(block_ids.blocks()[i])) << "position " << i << " should be NULL";
+    }
+    EXPECT_FALSE(isNullBlockIdx(block_ids.blocks()[19]));
+    EXPECT_FALSE(isNullBlockIdx(block_ids.blocks()[20]));
+    EXPECT_EQ(block_pool_->freeBlocksNum(), total_blocks_ - 2);
+}
+
+TEST_F(SWAKVCacheGroupTest, Malloc_NoOpWhenEnoughBlocks) {
+    auto     group = makeGroup(4);
+    BlockIds block_ids(1);
+    ASSERT_TRUE(group.malloc(block_ids, 8));
+    size_t free_after_first = block_pool_->freeBlocksNum();
+
+    ASSERT_TRUE(group.malloc(block_ids, 8));
+    EXPECT_EQ(block_ids.blocksNum(), 2u);
+    EXPECT_EQ(block_pool_->freeBlocksNum(), free_after_first);
+}
+
+TEST_F(SWAKVCacheGroupTest, Malloc_DSV4TrapSkipsHCAStateNullTail) {
+    ScopedEnvVar env("DSV4_TRAP_INVALID_KV_ACCESS", "1");
+    auto         spec  = makeDsv4StateSpec("hca_state", 4);
+    spec->skip_prefix_reuse = true;
+    auto group = SWAKVCacheGroup({}, spec, block_pool_, 5, 0, shared_cache_.get(), nullptr, makePolicy(spec));
+    BlockIds     block_ids(1);
+    block_ids.assign(BlockIndicesType{NULL_BLOCK_IDX, NULL_BLOCK_IDX, NULL_BLOCK_IDX});
+
+    EXPECT_NO_THROW((void)group.malloc(block_ids, 12));
+}
+
+TEST_F(SWAKVCacheGroupTest, Malloc_HCAStateReuseEnabledAllocatesTailOnly) {
+    auto     spec  = makeDsv4StateSpec("hca_state", 4);
+    spec->skip_prefix_reuse = true;
+    auto     group = SWAKVCacheGroup({}, spec, block_pool_, 5, /*linear_step=*/3, shared_cache_.get(), nullptr, makePolicy(spec));
+    BlockIds block_ids(1);
+
+    ASSERT_TRUE(group.malloc(block_ids, 40, /*enable_reuse_cache=*/true, /*reserve_step=*/0));
+
+    ASSERT_EQ(block_ids.blocksNum(), 10u);
+    EXPECT_EQ(validBlockCount(block_ids.blocks()), 1u);
+    EXPECT_TRUE(isNullBlockIdx(block_ids.blocks()[8]));
+    EXPECT_FALSE(isNullBlockIdx(block_ids.blocks()[9]));
+    EXPECT_EQ(block_pool_->freeBlocksNum(), total_blocks_ - 1);
+}
+
+TEST_F(SWAKVCacheGroupTest, Malloc_CSAStateReuseEnabledKeepsSparseBlocks) {
+    auto     spec  = makeDsv4StateSpec("csa_state", 4);
+    auto     group = SWAKVCacheGroup({}, spec, block_pool_, 4, /*linear_step=*/3, shared_cache_.get(), nullptr, makePolicy(spec));
+    BlockIds block_ids(1);
+
+    ASSERT_TRUE(group.malloc(block_ids, 40, /*enable_reuse_cache=*/true, /*reserve_step=*/0));
+
+    ASSERT_EQ(block_ids.blocksNum(), 10u);
+    EXPECT_EQ(validBlockCount(block_ids.blocks()), 4u);
+    EXPECT_FALSE(isNullBlockIdx(block_ids.blocks()[2]));
+    EXPECT_FALSE(isNullBlockIdx(block_ids.blocks()[5]));
+    EXPECT_FALSE(isNullBlockIdx(block_ids.blocks()[8]));
+    EXPECT_FALSE(isNullBlockIdx(block_ids.blocks()[9]));
+    EXPECT_EQ(block_pool_->freeBlocksNum(), total_blocks_ - 4);
+}
+
+TEST_F(SWAKVCacheGroupTest, Malloc_DSV4TrapChecksSWAKVNullTail) {
+    ScopedEnvVar env("DSV4_TRAP_INVALID_KV_ACCESS", "1");
+    auto         spec  = makeDsv4StateSpec("swa_kv", 4);
+    auto         group = SWAKVCacheGroup({}, spec, block_pool_, 6, 0, shared_cache_.get(), nullptr, makePolicy(spec));
+    BlockIds     block_ids(1);
+    block_ids.assign(BlockIndicesType{NULL_BLOCK_IDX, NULL_BLOCK_IDX, NULL_BLOCK_IDX});
+
+    EXPECT_THROW((void)group.malloc(block_ids, 12), std::exception);
+}
+
+TEST_F(SWAKVCacheGroupTest, Malloc_DSV4TrapChecksNonSkipStateNullTail) {
+    ScopedEnvVar env("DSV4_TRAP_INVALID_KV_ACCESS", "1");
+    auto         spec  = makeDsv4StateSpec("csa_state", 4);
+    auto         group = SWAKVCacheGroup({}, spec, block_pool_, 4, 0, shared_cache_.get(), nullptr, makePolicy(spec));
+    BlockIds     block_ids(1);
+    block_ids.assign(BlockIndicesType{NULL_BLOCK_IDX, NULL_BLOCK_IDX, NULL_BLOCK_IDX});
+
+    EXPECT_THROW((void)group.malloc(block_ids, 12), std::exception);
+}
+
+TEST_F(SWAKVCacheGroupTest, Malloc_WithReserveStep) {
+    auto     group = makeGroup(4);
+    BlockIds block_ids(1);
+    // seq_len=4 => seq_slots=1, reserve_step=2 => total=2 (1 + (2-1))
+    // index 0: seq_tail => REAL, index 1: reserve => REAL
+    ASSERT_TRUE(group.malloc(block_ids, 4, false, 2));
+    ASSERT_EQ(block_ids.blocksNum(), 2u);
+    EXPECT_FALSE(isNullBlockIdx(block_ids.blocks()[0]));
+    EXPECT_FALSE(isNullBlockIdx(block_ids.blocks()[1]));
+    EXPECT_EQ(block_pool_->freeBlocksNum(), total_blocks_ - 2);
+}
+
+TEST_F(SWAKVCacheGroupTest, Malloc_FailsWhenPoolExhausted) {
+    auto                  group = makeGroup(4);
+    std::vector<BlockIds> holders;
+    for (size_t i = 0; i < total_blocks_; ++i) {
+        holders.emplace_back(1);
+        if (!group.malloc(holders.back(), 4)) {
+            break;
+        }
+    }
+    EXPECT_EQ(block_pool_->freeBlocksNum(), 0u);
+
+    BlockIds block_ids(1);
+    EXPECT_FALSE(group.malloc(block_ids, 4));
+}
+
+// ==================== malloc with linear_step ====================
+
+TEST_F(SWAKVCacheGroupTest, Malloc_WithStep_ReuseEnabled) {
+    auto     group = makeGroupWithStep(4, 2);
+    BlockIds block_ids(1);
+    // seq_len=16 => 4 slots; keep step hits plus the last two active blocks.
+    ASSERT_TRUE(group.malloc(block_ids, 16, /*enable_reuse_cache=*/true));
+    ASSERT_EQ(block_ids.blocksNum(), 4u);
+    EXPECT_TRUE(isNullBlockIdx(block_ids.blocks()[0]));
+    EXPECT_FALSE(isNullBlockIdx(block_ids.blocks()[1]));
+    EXPECT_FALSE(isNullBlockIdx(block_ids.blocks()[2]));
+    EXPECT_FALSE(isNullBlockIdx(block_ids.blocks()[3]));
+    EXPECT_EQ(block_pool_->freeBlocksNum(), total_blocks_ - 3);
+}
+
+TEST_F(SWAKVCacheGroupTest, Malloc_WithStep_ReuseDisabled) {
+    auto     group = makeGroupWithStep(4, 2);
+    BlockIds block_ids(1);
+    // seq_len=16 => 4 slots, reuse_cache=false => active tail indices 2 and 3.
+    ASSERT_TRUE(group.malloc(block_ids, 16, /*enable_reuse_cache=*/false));
+    ASSERT_EQ(block_ids.blocksNum(), 4u);
+    EXPECT_TRUE(isNullBlockIdx(block_ids.blocks()[0]));
+    EXPECT_TRUE(isNullBlockIdx(block_ids.blocks()[1]));
+    EXPECT_FALSE(isNullBlockIdx(block_ids.blocks()[2]));
+    EXPECT_FALSE(isNullBlockIdx(block_ids.blocks()[3]));
+    EXPECT_EQ(block_pool_->freeBlocksNum(), total_blocks_ - 2);
+}
+
+TEST_F(SWAKVCacheGroupTest, Malloc_WithStep_ReserveAllocated) {
+    auto     group = makeGroupWithStep(4, 2);
+    BlockIds block_ids(1);
+    // seq_len=16 => seq_slots=4, reserve_step=2 => total_slots=5
+    // reuse disabled: active tail(2,3) and reserve(4) allocated
+    ASSERT_TRUE(group.malloc(block_ids, 16, /*enable_reuse_cache=*/false, /*reserve_step=*/2));
+    ASSERT_EQ(block_ids.blocksNum(), 5u);
+    EXPECT_TRUE(isNullBlockIdx(block_ids.blocks()[0]));
+    EXPECT_TRUE(isNullBlockIdx(block_ids.blocks()[1]));
+    EXPECT_FALSE(isNullBlockIdx(block_ids.blocks()[2]));
+    EXPECT_FALSE(isNullBlockIdx(block_ids.blocks()[3]));
+    EXPECT_FALSE(isNullBlockIdx(block_ids.blocks()[4]));
+    EXPECT_EQ(block_pool_->freeBlocksNum(), total_blocks_ - 3);
+}
+
+// ==================== removeSkippedBlocks ====================
+
+TEST_F(SWAKVCacheGroupTest, RemoveSkippedBlocks_TwoOrFewer_NoOp) {
+    auto     group = makeGroup(4);
+    BlockIds block_ids(1);
+    ASSERT_TRUE(group.malloc(block_ids, 5));
+    ASSERT_EQ(block_ids.blocksNum(), 2u);
+
+    group.removeSkippedBlocks(block_ids);
+    EXPECT_EQ(block_ids.blocksNum(), 2u);
+}
+
+TEST_F(SWAKVCacheGroupTest, RemoveSkippedBlocks_FreesNonTailReal) {
+    auto     group = makeGroupWithStep(4, 2);
+    BlockIds block_ids(1);
+    // First: 2 blocks with reuse
+    ASSERT_TRUE(group.malloc(block_ids, 5, true));
+    // Extend to 5 blocks with reuse
+    ASSERT_TRUE(group.malloc(block_ids, 20, true));
+    ASSERT_EQ(block_ids.blocksNum(), 5u);
+    size_t free_before = block_pool_->freeBlocksNum();
+
+    group.removeSkippedBlocks(block_ids, true);
+
+    // step=2: keep step_hit blocks + last 2
+    // step_hit: index 1 ((1+1)%2==0), index 3 ((3+1)%2==0)
+    // last 2: index 3, 4
+    // loop i from block_size-3=2 down to 0:
+    //   i=2: not null, not step_hit => free
+    //   i=1: not null, step_hit => continue
+    //   i=0: not null, not step_hit => free
+    // But wait, with reuse_cache=true for the first malloc (5 tokens), blocks at 0,1 are:
+    // active tail at 0,1 and step_hit at 1 => both REAL
+    // Then extending to 20 tokens with reuse: new blocks at 2,3,4
+    // step_hit at 3 and active tail at 3,4 => REAL. index 2: NULL
+    // So blocks are: [REAL, REAL, NULL, REAL, REAL]
+    // removeSkippedBlocks: loop from i=2 down:
+    //   i=2: NULL => break (stops on first null going backward)
+    // No blocks freed.
+    EXPECT_EQ(block_pool_->freeBlocksNum(), free_before);
+}
+
+TEST_F(SWAKVCacheGroupTest, RemoveSkippedBlocks_WithStep_FreesNonStepBlocks) {
+    auto block_pool = createBlockPool();
+    ASSERT_TRUE(block_pool->init());
+    ASSERT_EQ(block_pool->freeBlocksNum(), 9u);
+
+    auto spec                = std::make_shared<MHAKVCacheSpec>();
+    spec->seq_size_per_block = 4;
+    SWAKVCacheGroup group({}, spec, block_pool, 0, 2);
+
+    // Start with 6 allocated blocks (no NULLs).
+    auto allocated = block_pool->malloc(6);
+    ASSERT_EQ(allocated.size(), 6u);
+    BlockIds blocks;
+    blocks.assign(allocated);
+
+    const size_t free_before = block_pool->freeBlocksNum();
+    group.removeSkippedBlocks(blocks, true);
+
+    // step=2, size=6: keep step_hit + last 2
+    // step_hit: index 1 ((1+1)%2==0), 3 ((3+1)%2==0), 5 ((5+1)%2==0 but in last 2)
+    // last 2: index 4, 5
+    // loop from i=3 down: (block_size-3=3)
+    //   i=3: step_hit => continue
+    //   i=2: not step_hit => free
+    //   i=1: step_hit => continue
+    //   i=0: not step_hit => free
+    ASSERT_EQ(blocks.blocksNum(), 6u);
+    EXPECT_TRUE(isNullBlockIdx(blocks.blocks()[0]));
+    EXPECT_FALSE(isNullBlockIdx(blocks.blocks()[1]));
+    EXPECT_TRUE(isNullBlockIdx(blocks.blocks()[2]));
+    EXPECT_FALSE(isNullBlockIdx(blocks.blocks()[3]));
+    EXPECT_FALSE(isNullBlockIdx(blocks.blocks()[4]));
+    EXPECT_FALSE(isNullBlockIdx(blocks.blocks()[5]));
+
+    EXPECT_EQ(block_pool->freeBlocksNum(), free_before + 2);
+}
+
+TEST_F(SWAKVCacheGroupTest, RemoveSkippedBlocks_HCAStateReuseEnabledKeepsTailOnly) {
+    auto block_pool = createBlockPool();
+    ASSERT_TRUE(block_pool->init());
+    ASSERT_EQ(block_pool->freeBlocksNum(), 9u);
+
+    auto spec               = makeDsv4StateSpec("hca_state", 4);
+    spec->skip_prefix_reuse = true;
+    auto group = SWAKVCacheGroup({}, spec, block_pool, 5, /*linear_step=*/2, nullptr, nullptr, makePolicy(spec));
+
+    auto allocated = block_pool->malloc(6);
+    ASSERT_EQ(allocated.size(), 6u);
+    BlockIds blocks;
+    blocks.assign(allocated);
+
+    const size_t free_before = block_pool->freeBlocksNum();
+    group.removeSkippedBlocks(blocks, /*enable_reuse_cache=*/true);
+
+    ASSERT_EQ(blocks.blocksNum(), 6u);
+    for (int i = 0; i < 5; ++i) {
+        EXPECT_TRUE(isNullBlockIdx(blocks.blocks()[i])) << "position " << i << " should be freed";
+    }
+    EXPECT_FALSE(isNullBlockIdx(blocks.blocks()[5]));
+    EXPECT_EQ(block_pool->freeBlocksNum(), free_before + 5);
+}
+
+TEST_F(SWAKVCacheGroupTest, RemoveSkippedBlocks_WithReserveStep) {
+    auto block_pool = createBlockPool();
+    ASSERT_TRUE(block_pool->init());
+    ASSERT_EQ(block_pool->freeBlocksNum(), 9u);
+
+    auto spec                = std::make_shared<MHAKVCacheSpec>();
+    spec->seq_size_per_block = 4;
+    SWAKVCacheGroup group({}, spec, block_pool, 0, 2);
+
+    auto allocated = block_pool->malloc(6);
+    ASSERT_EQ(allocated.size(), 6u);
+    BlockIds blocks;
+    blocks.assign(allocated);
+
+    const size_t free_before = block_pool->freeBlocksNum();
+    // reserve_step=1: keep last 2 + 1 more (index 3)
+    group.removeSkippedBlocks(blocks, false, 1);
+
+    // reuse_cache=false so no step_hit check
+    // loop from i=block_size-3-1=2 down:
+    //   i=2: free, i=1: free, i=0: free
+    ASSERT_EQ(blocks.blocksNum(), 6u);
+    EXPECT_TRUE(isNullBlockIdx(blocks.blocks()[0]));
+    EXPECT_TRUE(isNullBlockIdx(blocks.blocks()[1]));
+    EXPECT_TRUE(isNullBlockIdx(blocks.blocks()[2]));
+    EXPECT_FALSE(isNullBlockIdx(blocks.blocks()[3]));
+    EXPECT_FALSE(isNullBlockIdx(blocks.blocks()[4]));
+    EXPECT_FALSE(isNullBlockIdx(blocks.blocks()[5]));
+
+    EXPECT_EQ(block_pool->freeBlocksNum(), free_before + 3);
+}
+
+// ==================== free ====================
+
+TEST_F(SWAKVCacheGroupTest, Free_ReleasesRealBlocks) {
+    auto     group = makeGroup(4);
+    BlockIds block_ids(1);
+    ASSERT_TRUE(group.malloc(block_ids, 20));
+    EXPECT_LT(block_pool_->freeBlocksNum(), total_blocks_);
+
+    group.free(block_ids.blocks());
+    EXPECT_EQ(block_pool_->freeBlocksNum(), total_blocks_);
+}
+
+TEST_F(SWAKVCacheGroupTest, Free_Empty) {
+    auto group = makeGroup(4);
+    group.free({});
+    EXPECT_EQ(block_pool_->freeBlocksNum(), total_blocks_);
+}
+
+TEST_F(SWAKVCacheGroupTest, Free_SkipsNullBlocks) {
+    auto     group = makeGroup(4);
+    BlockIds block_ids(1);
+    ASSERT_TRUE(group.malloc(block_ids, 20));
+    EXPECT_LT(block_pool_->freeBlocksNum(), total_blocks_);
+
+    group.free(block_ids.blocks());
+    EXPECT_EQ(block_pool_->freeBlocksNum(), total_blocks_);
+}
+
+// ==================== reference ====================
+
+TEST_F(SWAKVCacheGroupTest, Reference_AddsAndRefsBlocks) {
+    auto     group = makeGroup(4);
+    BlockIds block_ids(1);
+    ASSERT_TRUE(group.malloc(block_ids, 5));
+    auto original = block_ids.blocks();
+
+    BlockIds block_ids2(1);
+    group.reference(block_ids2, original);
+    EXPECT_EQ(block_ids2.blocksNum(), original.size());
+    EXPECT_EQ(block_ids2.blocks(), original);
+}
+
+TEST_F(SWAKVCacheGroupTest, Reference_NullBlocksNotReffed) {
+    auto     group = makeGroup(4);
+    BlockIds block_ids(1);
+    ASSERT_TRUE(group.malloc(block_ids, 20));
+    auto original = block_ids.blocks();
+
+    BlockIds block_ids2(1);
+    group.reference(block_ids2, original);
+    EXPECT_EQ(block_ids2.blocksNum(), original.size());
+}
+
+// ==================== put into cache (allocator-level) ====================
+
+TEST_F(SWAKVCacheGroupTest, PutIntoCache_SkipsNullBlocks) {
+    auto     group = makeGroup(4);
+    BlockIds block_ids(1);
+    ASSERT_TRUE(group.malloc(block_ids, 20));
+    CacheKeysType keys = {101, 102, 103, 104, 105};
+
+    // Simulate allocator-level insertIntoCache: only put non-NULL blocks
+    for (size_t i = 0; i < keys.size() && i < block_ids.blocksNum(); ++i) {
+        if (!isNullBlockIdx(block_ids.blocks()[i])) {
+            std::vector<BlockIdxType> slots = {block_ids.blocks()[i]};
+            shared_cache_->put(keys[i], slots, false);
+        }
+    }
+
+    auto result1 = group.matchSingleKey(101);
+    EXPECT_TRUE(result1.block_indices.empty());
+
+    // The last two active tail blocks are real.
+    auto result4 = group.matchSingleKey(104);
+    ASSERT_EQ(result4.block_indices.size(), 1u);
+    EXPECT_EQ(result4.block_indices[0], block_ids.blocks()[3]);
+
+    auto result5 = group.matchSingleKey(105);
+    ASSERT_EQ(result5.block_indices.size(), 1u);
+    EXPECT_EQ(result5.block_indices[0], block_ids.blocks()[4]);
+}
+
+// ==================== batch allocation atomicity (regression: mid-loop leak) ====================
+
+// Reproduces the historical bug where SWAKVCacheGroup::malloc called block_pool_->malloc(1)
+// repeatedly inside a loop. If a later iteration failed (e.g. concurrent allocators raced for
+// the last free blocks), the previously allocated blocks were leaked because they had only
+// been recorded in a stack-local vector and were never written back to block_ids; the upper
+// rollback in HybridKVCacheAllocator::initMallocForCommonLen could not see them.
+//
+// After the fix, SWAKVCacheGroup::malloc performs a single atomic batch malloc on the pool,
+// so a failed allocation must leave the pool's free counter unchanged.
+TEST_F(SWAKVCacheGroupTest, Malloc_FailsAtomicallyWithoutLeak) {
+    auto group = makeGroupWithStep(4, 2);
+
+    // Hold 7 blocks so that only 2 free blocks remain. shared_cache_ is empty here, so
+    // ensureFreeBlocks() cannot evict and refill the pool.
+    auto pre_alloc = block_pool_->malloc(7);
+    ASSERT_EQ(pre_alloc.size(), 7u);
+    const size_t free_before = block_pool_->freeBlocksNum();
+    ASSERT_EQ(free_before, total_blocks_ - 7);
+
+    // seq_len=16, step=2, reuse=true => seq_slots=4. The group needs 3 real blocks at
+    // positions {1, 2, 3}, which exceeds the 2 free blocks currently in the pool.
+    BlockIds block_ids(1);
+    EXPECT_FALSE(group.malloc(block_ids, 16, /*enable_reuse_cache=*/true));
+
+    // Free count must stay identical to the pre-call value (no stranded blocks).
+    EXPECT_EQ(block_pool_->freeBlocksNum(), free_before);
+    // No partial state should have leaked into block_ids either.
+    EXPECT_EQ(block_ids.blocksNum(), 0u);
+
+    // The pre-allocated blocks must still be releasable, proving that BlockPool ref
+    // counters were not corrupted by the failed malloc path.
+    block_pool_->requestFree(pre_alloc);
+    EXPECT_EQ(block_pool_->freeBlocksNum(), total_blocks_);
+}
+
+// Verifies the new behavior: SWAKVCacheGroup::malloc reserves all required physical blocks
+// via a single batch BlockPool::malloc(N) call instead of N individual malloc(1) calls.
+TEST_F(SWAKVCacheGroupTest, Malloc_AllocatesAtomicallyAsBatch) {
+    auto         group       = makeGroupWithStep(4, 2);
+    const size_t free_before = block_pool_->freeBlocksNum();
+
+    // seq_len=16, step=2, reuse=true => 4 slots. Real blocks expected at positions {1, 2, 3}.
+    BlockIds block_ids(1);
+    ASSERT_TRUE(group.malloc(block_ids, 16, /*enable_reuse_cache=*/true));
+    ASSERT_EQ(block_ids.blocksNum(), 4u);
+    EXPECT_TRUE(isNullBlockIdx(block_ids.blocks()[0]));
+    EXPECT_FALSE(isNullBlockIdx(block_ids.blocks()[1]));
+    EXPECT_FALSE(isNullBlockIdx(block_ids.blocks()[2]));
+    EXPECT_FALSE(isNullBlockIdx(block_ids.blocks()[3]));
+
+    // The pool's free count must drop by exactly the number of physical blocks (3).
+    EXPECT_EQ(block_pool_->freeBlocksNum(), free_before - 3);
+
+    group.free(block_ids.blocks());
+    EXPECT_EQ(block_pool_->freeBlocksNum(), total_blocks_);
+}
+
+// Larger sparse layout: with linear_step=2 and seq_len=24 (=> 6 slots) and reuse enabled,
+// the active-tail-2 plus step-hits set {1, 3, 4, 5} forms 4 physical blocks. Validates
+// that the batch path correctly distributes the 4 allocated indices across NULL/REAL slots.
+TEST_F(SWAKVCacheGroupTest, Malloc_BatchPlacementMatchesShouldAllocate) {
+    auto         group       = makeGroupWithStep(4, 2);
+    const size_t free_before = block_pool_->freeBlocksNum();
+
+    BlockIds block_ids(1);
+    ASSERT_TRUE(group.malloc(block_ids, 24, /*enable_reuse_cache=*/true));
+    ASSERT_EQ(block_ids.blocksNum(), 6u);
+    // Expected: idx0=NULL, idx1=REAL(step), idx2=NULL, idx3=REAL(step+tail), idx4=REAL(tail), idx5=REAL(tail).
+    EXPECT_TRUE(isNullBlockIdx(block_ids.blocks()[0]));
+    EXPECT_FALSE(isNullBlockIdx(block_ids.blocks()[1]));
+    EXPECT_TRUE(isNullBlockIdx(block_ids.blocks()[2]));
+    EXPECT_FALSE(isNullBlockIdx(block_ids.blocks()[3]));
+    EXPECT_FALSE(isNullBlockIdx(block_ids.blocks()[4]));
+    EXPECT_FALSE(isNullBlockIdx(block_ids.blocks()[5]));
+
+    // All 4 real blocks must be distinct (the batch BlockPool::malloc returns unique ids).
+    std::vector<BlockIdxType> reals = {
+        block_ids.blocks()[1], block_ids.blocks()[3], block_ids.blocks()[4], block_ids.blocks()[5]};
+    std::sort(reals.begin(), reals.end());
+    EXPECT_EQ(std::adjacent_find(reals.begin(), reals.end()), reals.end());
+
+    EXPECT_EQ(block_pool_->freeBlocksNum(), free_before - 4);
+
+    group.free(block_ids.blocks());
+    EXPECT_EQ(block_pool_->freeBlocksNum(), total_blocks_);
+}
+
+}  // namespace test
+}  // namespace rtp_llm
+
+int main(int argc, char** argv) {
+    ::testing::InitGoogleTest(&argc, argv);
+    return RUN_ALL_TESTS();
+}
diff --git a/rtp_llm/cpp/cache/test/SharedBlockCacheTest.cc b/rtp_llm/cpp/cache/test/SharedBlockCacheTest.cc
new file mode 100644
index 0000000000..8d56e31abd
--- /dev/null
+++ b/rtp_llm/cpp/cache/test/SharedBlockCacheTest.cc
@@ -0,0 +1,419 @@
+#include "gtest/gtest.h"
+
+#include "rtp_llm/cpp/cache/SharedBlockCache.h"
+
+namespace rtp_llm::test {
+namespace {
+
+BlockDependency rootDep(uint32_t ordinal = 0) {
+    BlockDependency dep;
+    dep.ordinal = ordinal;
+    return dep;
+}
+
+BlockDependency childDep(CacheKeyType parent, uint32_t ordinal) {
+    BlockDependency dep;
+    dep.has_parent = true;
+    dep.parent_key = parent;
+    dep.ordinal    = ordinal;
+    return dep;
+}
+
+void putOne(SharedBlockCache& cache,
+            CacheKeyType      key,
+            BlockIdxType      block,
+            const BlockDependency& dep,
+            SharedBlockCache::NamespaceId namespace_id = SharedBlockCache::kGpuLogicalNamespace,
+            bool resident = false) {
+    cache.put(key, std::vector<BlockIdxType>{block}, resident, namespace_id, dep);
+}
+
+}  // namespace
+
+TEST(SharedBlockCacheTest, PrefixTreeEvictsCollectedChainInParentFirstOrderWithDependencies) {
+    SharedBlockCache cache;
+    putOne(cache, 1, 101, rootDep(0));
+    putOne(cache, 2, 102, childDep(1, 1));
+    putOne(cache, 3, 103, childDep(2, 2));
+
+    auto evicted = cache.selectAndEvict(/*min_blocks=*/1);
+
+    ASSERT_EQ(evicted.evicted_keys, (CacheKeysType{1, 2, 3}));
+    ASSERT_EQ(evicted.evicted_slots.at(1), (std::vector<BlockIdxType>{101}));
+    ASSERT_FALSE(evicted.evicted_dependencies.at(1).has_parent);
+    ASSERT_TRUE(evicted.evicted_dependencies.at(2).has_parent);
+    ASSERT_EQ(evicted.evicted_dependencies.at(2).parent_key, 1);
+    ASSERT_TRUE(evicted.evicted_dependencies.at(3).has_parent);
+    ASSERT_EQ(evicted.evicted_dependencies.at(3).parent_key, 2);
+    EXPECT_TRUE(cache.empty());
+}
+
+TEST(SharedBlockCacheTest, PrefixTreeStopsAtBranchPoint) {
+    SharedBlockCache cache;
+    putOne(cache, 1, 101, rootDep(0));
+    putOne(cache, 2, 102, childDep(1, 1));
+    putOne(cache, 3, 103, childDep(1, 2));
+
+    auto evicted = cache.selectAndEvict(/*min_blocks=*/1);
+
+    ASSERT_EQ(evicted.evicted_keys, (CacheKeysType{2}));
+    EXPECT_FALSE(cache.contains(2));
+    EXPECT_TRUE(cache.contains(1));
+    EXPECT_TRUE(cache.contains(3));
+}
+
+TEST(SharedBlockCacheTest, PrefixTreeLinksChildInsertedBeforeParent) {
+    SharedBlockCache cache;
+    putOne(cache, 2, 102, childDep(1, 1));
+    putOne(cache, 1, 101, rootDep(0));
+
+    ASSERT_EQ(cache.matchGroup(2, 0), 102);
+
+    auto evicted = cache.selectAndEvict(/*min_blocks=*/1);
+
+    ASSERT_EQ(evicted.evicted_keys, (CacheKeysType{1, 2}));
+    EXPECT_TRUE(cache.empty());
+}
+
+TEST(SharedBlockCacheTest, PrefixTreeEvictsOrphanLeafWithMissingParentDependency) {
+    SharedBlockCache cache;
+    putOne(cache, 2, 102, childDep(1, 1));
+
+    auto evicted = cache.selectAndEvict(/*min_blocks=*/1);
+
+    ASSERT_EQ(evicted.evicted_keys, (CacheKeysType{2}));
+    ASSERT_TRUE(evicted.evicted_dependencies.count(2));
+    EXPECT_TRUE(evicted.evicted_dependencies.at(2).has_parent);
+    EXPECT_EQ(evicted.evicted_dependencies.at(2).parent_key, 1);
+    EXPECT_TRUE(cache.empty());
+}
+
+TEST(SharedBlockCacheTest, PrefixTreeAttachesMultiplePendingChildrenAndStopsAtBranch) {
+    SharedBlockCache cache;
+    putOne(cache, 2, 102, childDep(1, 1));
+    putOne(cache, 3, 103, childDep(1, 2));
+    putOne(cache, 1, 101, rootDep(0));
+
+    auto evicted = cache.selectAndEvict(/*min_blocks=*/1);
+
+    ASSERT_EQ(evicted.evicted_keys, (CacheKeysType{2}));
+    EXPECT_FALSE(cache.contains(2));
+    EXPECT_TRUE(cache.contains(1));
+    EXPECT_TRUE(cache.contains(3));
+
+    evicted = cache.selectAndEvict(/*min_blocks=*/1);
+    ASSERT_EQ(evicted.evicted_keys, (CacheKeysType{1, 3}));
+    EXPECT_TRUE(cache.empty());
+}
+
+TEST(SharedBlockCacheTest, PrefixTreeStopsAtResidentParent) {
+    SharedBlockCache cache;
+    putOne(cache, 1, 101, rootDep(0), SharedBlockCache::kGpuLogicalNamespace, /*resident=*/true);
+    putOne(cache, 2, 102, childDep(1, 1));
+
+    auto evicted = cache.selectAndEvict(/*min_blocks=*/1);
+
+    ASSERT_EQ(evicted.evicted_keys, (CacheKeysType{2}));
+    ASSERT_TRUE(evicted.evicted_dependencies.count(2));
+    EXPECT_TRUE(evicted.evicted_dependencies.at(2).has_parent);
+    EXPECT_EQ(evicted.evicted_dependencies.at(2).parent_key, 1);
+    EXPECT_TRUE(cache.contains(1));
+    EXPECT_FALSE(cache.contains(2));
+}
+
+TEST(SharedBlockCacheTest, MatchGroupTouchesPrefixTreeLeafLru) {
+    SharedBlockCache cache;
+    putOne(cache, 1, 101, rootDep(0));
+    putOne(cache, 2, 102, childDep(1, 1));
+    putOne(cache, 3, 103, rootDep(0));
+
+    ASSERT_EQ(cache.matchGroup(2, 0), 102);
+
+    auto evicted = cache.selectAndEvict(/*min_blocks=*/1);
+
+    ASSERT_EQ(evicted.evicted_keys, (CacheKeysType{3}));
+    EXPECT_TRUE(cache.contains(1));
+    EXPECT_TRUE(cache.contains(2));
+    EXPECT_FALSE(cache.contains(3));
+}
+
+TEST(SharedBlockCacheTest, ResidentIsStickyAcrossPuts) {
+    SharedBlockCache cache;
+    putOne(cache, 1, 101, rootDep(0), SharedBlockCache::kGpuLogicalNamespace, /*resident=*/false);
+    putOne(cache, 1, NULL_BLOCK_IDX, rootDep(0), SharedBlockCache::kGpuLogicalNamespace, /*resident=*/true);
+    putOne(cache, 1, NULL_BLOCK_IDX, rootDep(0), SharedBlockCache::kGpuLogicalNamespace, /*resident=*/false);
+
+    auto evicted = cache.selectAndEvict(/*min_blocks=*/1);
+
+    EXPECT_TRUE(evicted.evicted_keys.empty());
+    EXPECT_TRUE(cache.contains(1));
+}
+
+TEST(SharedBlockCacheTest, ResidentIsStickyAcrossNamespaceAliases) {
+    SharedBlockCache cache;
+    putOne(cache, 1, 101, rootDep(0), SharedBlockCache::kGpuLogicalNamespace, /*resident=*/false);
+    putOne(cache, 1, NULL_BLOCK_IDX, rootDep(0), SharedBlockCache::kGpuCpCanonicalNamespace, /*resident=*/true);
+
+    auto evicted = cache.selectAndEvict(/*min_blocks=*/1);
+
+    EXPECT_TRUE(evicted.evicted_keys.empty());
+    EXPECT_TRUE(cache.contains(1));
+}
+
+TEST(SharedBlockCacheTest, PrefixTreeEvictionReportsNamespace) {
+    SharedBlockCache cache;
+    putOne(cache, 1, 101, rootDep(0), SharedBlockCache::kGpuCpCanonicalNamespace);
+
+    auto evicted = cache.selectAndEvict(/*min_blocks=*/1);
+
+    ASSERT_EQ(evicted.evicted_keys, (CacheKeysType{1}));
+    ASSERT_TRUE(evicted.evicted_namespaces.count(1));
+    EXPECT_EQ(evicted.evicted_namespaces.at(1), SharedBlockCache::kGpuCpCanonicalNamespace);
+}
+
+TEST(SharedBlockCacheTest, PrefixTreeEvictionKeepsCanonicalDependencyWhenLogicalAliasUpdatesSameKey) {
+    SharedBlockCache cache;
+    putOne(cache, 8, 108, rootDep(0), SharedBlockCache::kGpuCpCanonicalNamespace);
+    putOne(cache, 8, NULL_BLOCK_IDX, childDep(7, 7), SharedBlockCache::kGpuLogicalNamespace);
+
+    auto evicted = cache.selectAndEvict(/*min_blocks=*/1);
+
+    ASSERT_EQ(evicted.evicted_keys, (CacheKeysType{8}));
+    ASSERT_TRUE(evicted.evicted_dependencies.count(8));
+    EXPECT_FALSE(evicted.evicted_dependencies.at(8).has_parent);
+    EXPECT_EQ(evicted.evicted_dependencies.at(8).ordinal, 0u);
+    ASSERT_TRUE(evicted.evicted_namespaces.count(8));
+    EXPECT_EQ(evicted.evicted_namespaces.at(8), SharedBlockCache::kGpuCpCanonicalNamespace);
+}
+
+TEST(SharedBlockCacheTest, CanonicalAliasOwnsEvictionWhenLogicalAliasIsOlder) {
+    SharedBlockCache cache;
+    putOne(cache, 100, 1000, rootDep(0), SharedBlockCache::kGpuLogicalNamespace);
+    putOne(cache, 101, 1010, childDep(100, 1), SharedBlockCache::kGpuLogicalNamespace);
+    putOne(cache, 102, 1020, childDep(101, 2), SharedBlockCache::kGpuLogicalNamespace);
+    putOne(cache, 103, 1030, childDep(102, 3), SharedBlockCache::kGpuLogicalNamespace);
+
+    putOne(cache, 101, NULL_BLOCK_IDX, rootDep(0), SharedBlockCache::kGpuCpCanonicalNamespace);
+    putOne(cache, 103, NULL_BLOCK_IDX, childDep(101, 1), SharedBlockCache::kGpuCpCanonicalNamespace);
+
+    auto evicted = cache.selectAndEvict(/*min_blocks=*/1);
+
+    ASSERT_EQ(evicted.evicted_keys, (CacheKeysType{101, 103}));
+    ASSERT_TRUE(evicted.evicted_dependencies.count(101));
+    EXPECT_FALSE(evicted.evicted_dependencies.at(101).has_parent);
+    ASSERT_TRUE(evicted.evicted_dependencies.count(103));
+    EXPECT_TRUE(evicted.evicted_dependencies.at(103).has_parent);
+    EXPECT_EQ(evicted.evicted_dependencies.at(103).parent_key, 101);
+    EXPECT_EQ(evicted.evicted_namespaces.at(101), SharedBlockCache::kGpuCpCanonicalNamespace);
+    EXPECT_EQ(evicted.evicted_namespaces.at(103), SharedBlockCache::kGpuCpCanonicalNamespace);
+    EXPECT_TRUE(cache.contains(100));
+    EXPECT_TRUE(cache.contains(102));
+}
+
+TEST(SharedBlockCacheTest, FlatFallbackKeepsCanonicalDependencyWhenLogicalAliasUpdatesSameKey) {
+    SharedBlockCache cache;
+    cache.setPrefixTreeEnabled(false);
+
+    putOne(cache, 8, 108, rootDep(0), SharedBlockCache::kGpuCpCanonicalNamespace);
+    putOne(cache, 8, NULL_BLOCK_IDX, childDep(7, 7), SharedBlockCache::kGpuLogicalNamespace);
+
+    auto evicted = cache.selectAndEvict(/*min_blocks=*/1);
+
+    ASSERT_EQ(evicted.evicted_keys, (CacheKeysType{8}));
+    ASSERT_TRUE(evicted.evicted_dependencies.count(8));
+    EXPECT_FALSE(evicted.evicted_dependencies.at(8).has_parent);
+    EXPECT_EQ(evicted.evicted_dependencies.at(8).ordinal, 0u);
+    ASSERT_TRUE(evicted.evicted_namespaces.count(8));
+    EXPECT_EQ(evicted.evicted_namespaces.at(8), SharedBlockCache::kGpuCpCanonicalNamespace);
+}
+
+TEST(SharedBlockCacheTest, NonMatchableSlotStillEvictsButDoesNotMatchGroup) {
+    SharedBlockCache cache;
+    cache.put(1,
+              std::vector<BlockIdxType>{101, 201},
+              /*is_resident=*/false,
+              SharedBlockCache::kGpuLogicalNamespace,
+              rootDep(0),
+              std::vector<bool>{true, false});
+
+    EXPECT_EQ(cache.matchGroup(1, 0), 101);
+    EXPECT_TRUE(isNullBlockIdx(cache.matchGroup(1, 1)));
+
+    auto evicted = cache.selectAndEvict(/*min_blocks=*/2);
+    ASSERT_EQ(evicted.evicted_keys, (CacheKeysType{1}));
+    ASSERT_EQ(evicted.evicted_slots.at(1), (std::vector<BlockIdxType>{101, 201}));
+}
+
+TEST(SharedBlockCacheTest, StateIndependentEvictionDropsDeepestNonLeafStateFirst) {
+    SharedBlockCache cache;
+    cache.setIndependentGroupEviction(/*enabled=*/true, {3});
+
+    cache.put(1, std::vector<BlockIdxType>{101, NULL_BLOCK_IDX, NULL_BLOCK_IDX, 301}, false,
+              SharedBlockCache::kGpuLogicalNamespace, rootDep(0));
+    cache.put(2, std::vector<BlockIdxType>{102, NULL_BLOCK_IDX, NULL_BLOCK_IDX, 302}, false,
+              SharedBlockCache::kGpuLogicalNamespace, childDep(1, 1));
+    cache.put(3, std::vector<BlockIdxType>{103, NULL_BLOCK_IDX, NULL_BLOCK_IDX, 303}, false,
+              SharedBlockCache::kGpuLogicalNamespace, childDep(2, 2));
+
+    auto evicted = cache.selectAndEvictForGroup(/*group_id=*/3, /*min_blocks=*/1);
+
+    ASSERT_EQ(evicted.evicted_keys, (CacheKeysType{2}));
+    ASSERT_EQ(evicted.evicted_slots.at(2),
+              (std::vector<BlockIdxType>{NULL_BLOCK_IDX, NULL_BLOCK_IDX, NULL_BLOCK_IDX, 302}));
+    ASSERT_TRUE(evicted.evicted_independent_group.count(2));
+    EXPECT_EQ(evicted.evicted_independent_group.at(2), 3);
+    EXPECT_EQ(cache.matchGroup(2, 0), 102);
+    EXPECT_TRUE(isNullBlockIdx(cache.matchGroup(2, 3)));
+    EXPECT_EQ(cache.matchGroup(3, 3), 303);
+}
+
+TEST(SharedBlockCacheTest, StateIndependentEvictionScansMultipleLeavesSafely) {
+    SharedBlockCache cache;
+    cache.setIndependentGroupEviction(/*enabled=*/true, {3});
+
+    cache.put(1, std::vector<BlockIdxType>{101, NULL_BLOCK_IDX, NULL_BLOCK_IDX, 301}, false,
+              SharedBlockCache::kGpuLogicalNamespace, rootDep(0));
+    cache.put(2, std::vector<BlockIdxType>{102, NULL_BLOCK_IDX, NULL_BLOCK_IDX, 302}, false,
+              SharedBlockCache::kGpuLogicalNamespace, childDep(1, 1));
+    cache.put(3, std::vector<BlockIdxType>{103, NULL_BLOCK_IDX, NULL_BLOCK_IDX, 303}, false,
+              SharedBlockCache::kGpuLogicalNamespace, childDep(2, 2));
+    cache.put(10, std::vector<BlockIdxType>{110, NULL_BLOCK_IDX, NULL_BLOCK_IDX, 310}, false,
+              SharedBlockCache::kGpuLogicalNamespace, rootDep(0));
+    cache.put(11, std::vector<BlockIdxType>{111, NULL_BLOCK_IDX, NULL_BLOCK_IDX, 311}, false,
+              SharedBlockCache::kGpuLogicalNamespace, childDep(10, 1));
+    cache.put(12, std::vector<BlockIdxType>{112, NULL_BLOCK_IDX, NULL_BLOCK_IDX, 312}, false,
+              SharedBlockCache::kGpuLogicalNamespace, childDep(11, 2));
+
+    auto evicted = cache.selectAndEvictForGroup(/*group_id=*/3, /*min_blocks=*/2);
+
+    ASSERT_EQ(evicted.evicted_keys, (CacheKeysType{2, 11}));
+    EXPECT_TRUE(isNullBlockIdx(cache.matchGroup(2, 3)));
+    EXPECT_TRUE(isNullBlockIdx(cache.matchGroup(11, 3)));
+    EXPECT_EQ(cache.matchGroup(3, 3), 303);
+    EXPECT_EQ(cache.matchGroup(12, 3), 312);
+}
+
+TEST(SharedBlockCacheTest, StateIndependentEvictionFallsBackToWholeChainWhenOnlyLeafStateRemains) {
+    SharedBlockCache cache;
+    cache.setIndependentGroupEviction(/*enabled=*/true, {3});
+
+    cache.put(1, std::vector<BlockIdxType>{101, NULL_BLOCK_IDX, NULL_BLOCK_IDX, NULL_BLOCK_IDX}, false,
+              SharedBlockCache::kGpuLogicalNamespace, rootDep(0));
+    cache.put(2, std::vector<BlockIdxType>{102, NULL_BLOCK_IDX, NULL_BLOCK_IDX, 302}, false,
+              SharedBlockCache::kGpuLogicalNamespace, childDep(1, 1));
+
+    auto evicted = cache.selectAndEvictForGroup(/*group_id=*/3, /*min_blocks=*/1);
+
+    ASSERT_EQ(evicted.evicted_keys, (CacheKeysType{1, 2}));
+    ASSERT_FALSE(evicted.evicted_independent_group.count(2));
+    EXPECT_TRUE(cache.empty());
+}
+
+TEST(SharedBlockCacheTest, SelectAndEvictForGroupSkipsChainsWithoutTargetSlot) {
+    SharedBlockCache cache;
+    cache.setIndependentGroupEviction(/*enabled=*/true, {3});
+
+    cache.put(1, std::vector<BlockIdxType>{101, NULL_BLOCK_IDX, NULL_BLOCK_IDX, NULL_BLOCK_IDX}, false,
+              SharedBlockCache::kGpuLogicalNamespace, rootDep(0));
+    cache.put(2, std::vector<BlockIdxType>{102, NULL_BLOCK_IDX, NULL_BLOCK_IDX, NULL_BLOCK_IDX}, false,
+              SharedBlockCache::kGpuLogicalNamespace, childDep(1, 1));
+    cache.put(10, std::vector<BlockIdxType>{110, NULL_BLOCK_IDX, NULL_BLOCK_IDX, NULL_BLOCK_IDX}, false,
+              SharedBlockCache::kGpuLogicalNamespace, rootDep(0));
+    cache.put(11, std::vector<BlockIdxType>{111, NULL_BLOCK_IDX, NULL_BLOCK_IDX, 311}, false,
+              SharedBlockCache::kGpuLogicalNamespace, childDep(10, 1));
+
+    auto evicted = cache.selectAndEvictForGroup(/*group_id=*/3, /*min_blocks=*/1);
+
+    ASSERT_EQ(evicted.evicted_keys, (CacheKeysType{10, 11}));
+    EXPECT_FALSE(cache.contains(10));
+    EXPECT_FALSE(cache.contains(11));
+    EXPECT_TRUE(cache.contains(1));
+    EXPECT_TRUE(cache.contains(2));
+}
+
+TEST(SharedBlockCacheTest, SelectAndEvictForGroupPrunesBranchUntilTargetAncestorIsEvictable) {
+    SharedBlockCache cache;
+    cache.put(1,
+              std::vector<BlockIdxType>{101, 201},
+              /*is_resident=*/false,
+              SharedBlockCache::kGpuLogicalNamespace,
+              rootDep(0));
+    cache.put(2,
+              std::vector<BlockIdxType>{102, NULL_BLOCK_IDX},
+              /*is_resident=*/false,
+              SharedBlockCache::kGpuLogicalNamespace,
+              childDep(1, 1));
+    cache.put(3,
+              std::vector<BlockIdxType>{103, NULL_BLOCK_IDX},
+              /*is_resident=*/false,
+              SharedBlockCache::kGpuLogicalNamespace,
+              childDep(1, 2));
+
+    auto evicted = cache.selectAndEvictForGroup(/*group_id=*/1, /*min_blocks=*/1);
+
+    ASSERT_EQ(evicted.evicted_keys, (CacheKeysType{2, 1, 3}));
+    ASSERT_EQ(evicted.evicted_slots.at(1), (std::vector<BlockIdxType>{101, 201}));
+    EXPECT_TRUE(isNullBlockIdx(evicted.evicted_slots.at(2)[1]));
+    EXPECT_TRUE(isNullBlockIdx(evicted.evicted_slots.at(3)[1]));
+    EXPECT_TRUE(cache.empty());
+}
+
+TEST(SharedBlockCacheTest, SelectAndEvictForGroupDoesNotPruneWhenTargetAncestorBlockedByResidentSibling) {
+    SharedBlockCache cache;
+    cache.put(1,
+              std::vector<BlockIdxType>{101, 201},
+              /*is_resident=*/false,
+              SharedBlockCache::kGpuLogicalNamespace,
+              rootDep(0));
+    cache.put(2,
+              std::vector<BlockIdxType>{102, NULL_BLOCK_IDX},
+              /*is_resident=*/false,
+              SharedBlockCache::kGpuLogicalNamespace,
+              childDep(1, 1));
+    cache.put(3,
+              std::vector<BlockIdxType>{103, NULL_BLOCK_IDX},
+              /*is_resident=*/true,
+              SharedBlockCache::kGpuLogicalNamespace,
+              childDep(1, 2));
+
+    auto evicted = cache.selectAndEvictForGroup(/*group_id=*/1, /*min_blocks=*/1);
+
+    EXPECT_TRUE(evicted.evicted_keys.empty());
+    EXPECT_TRUE(cache.contains(1));
+    EXPECT_TRUE(cache.contains(2));
+    EXPECT_TRUE(cache.contains(3));
+}
+
+TEST(SharedBlockCacheTest, SelectAndEvictForGroupDoesNotPruneWhenTargetAncestorBlockedByResidentDescendant) {
+    SharedBlockCache cache;
+    cache.put(1,
+              std::vector<BlockIdxType>{101, 201},
+              /*is_resident=*/false,
+              SharedBlockCache::kGpuLogicalNamespace,
+              rootDep(0));
+    cache.put(2,
+              std::vector<BlockIdxType>{102, NULL_BLOCK_IDX},
+              /*is_resident=*/false,
+              SharedBlockCache::kGpuLogicalNamespace,
+              childDep(1, 1));
+    cache.put(3,
+              std::vector<BlockIdxType>{103, NULL_BLOCK_IDX},
+              /*is_resident=*/false,
+              SharedBlockCache::kGpuLogicalNamespace,
+              childDep(1, 2));
+    cache.put(4,
+              std::vector<BlockIdxType>{104, NULL_BLOCK_IDX},
+              /*is_resident=*/true,
+              SharedBlockCache::kGpuLogicalNamespace,
+              childDep(3, 3));
+
+    auto evicted = cache.selectAndEvictForGroup(/*group_id=*/1, /*min_blocks=*/1);
+
+    EXPECT_TRUE(evicted.evicted_keys.empty());
+    EXPECT_TRUE(cache.contains(1));
+    EXPECT_TRUE(cache.contains(2));
+    EXPECT_TRUE(cache.contains(3));
+    EXPECT_TRUE(cache.contains(4));
+}
+
+}  // namespace rtp_llm::test
diff --git a/rtp_llm/cpp/cache/test/SingleTypeKVCacheAllocatorTest.cc b/rtp_llm/cpp/cache/test/SingleTypeKVCacheAllocatorTest.cc
index d9693afbd1..fc6fcf338f 100644
--- a/rtp_llm/cpp/cache/test/SingleTypeKVCacheAllocatorTest.cc
+++ b/rtp_llm/cpp/cache/test/SingleTypeKVCacheAllocatorTest.cc
@@ -5,13 +5,15 @@
 #include <optional>
 #include <torch/torch.h>
 #include "rtp_llm/cpp/utils/Logger.h"
-#include "rtp_llm/cpp/cache/SingleTypeKVCacheAllocator.h"
+#include "rtp_llm/cpp/cache/allocator/SingleTypeKVCacheAllocator.h"
 #include "rtp_llm/cpp/cache/CacheConfig.h"
-#include "rtp_llm/cpp/cache/CacheConfigCreator.h"
+#include "rtp_llm/cpp/cache/config_creator/CacheConfigCreator.h"
+#include "rtp_llm/cpp/cache/CPSlotMapper.h"
 #include "rtp_llm/models_py/bindings/core/ExecOps.h"
 #include "rtp_llm/cpp/cache/test/BlockPoolTestHelper.h"
 #include "rtp_llm/cpp/cache/test/CacheConfigTestUtils.h"
 #include "rtp_llm/cpp/cache/BatchKVCacheResource.h"
+#include "rtp_llm/cpp/cache/SharedBlockCache.h"
 #include "rtp_llm/cpp/engine_base/stream/CompleteTokenIds.h"
 
 namespace rtp_llm {
@@ -41,6 +43,7 @@ static rtp_llm::ModelConfig makeTestModelConfig(uint32_t num_layers) {
     m.attn_config.kv_lora_rank     = 0;
     m.attn_config.rope_head_dim    = 0;
     m.attn_config.head_num         = 2;
+    setDefaultKvCacheSpec(m);
     return m;
 }
 
@@ -94,8 +97,8 @@ BatchKVCacheResourcePtr createBatchKVCacheResource(int batch_size, int layer_num
     auto resource = std::make_shared<BatchKVCacheResource>();
     resource->resetBatchSize(batch_size);
     for (int i = 0; i < batch_size; ++i) {
-        std::vector<int> layer_to_group_id(layer_num, 0);
-        resource->initBatchGroups(i, 1, layer_num, layer_to_group_id);
+        std::vector<std::vector<int>> layer_group_ids(static_cast<size_t>(layer_num), std::vector<int>{0});
+        resource->initBatchGroups(i, 1, layer_num, layer_group_ids);
         resource->setBatchBlocks(i, 0, std::vector<int>(block_num_per_batch));
         resource->setBatchCacheKeys(i, CacheKeysType(block_num_per_batch, static_cast<CacheKeyType>(i * 100)));
     }
@@ -220,6 +223,7 @@ TEST_F(SingleTypeKVCacheAllocatorTest, ReserveBlocksOnlyAppliedToInitMalloc) {
 TEST_F(SingleTypeKVCacheAllocatorTest, ReserveBlocksCheckHappensAfterReuseReferenceInInitMallocForCommonLen) {
     auto config = createSingleTypeTestConfig(/*layer_num=*/4, /*block_num=*/10, /*seq_size_per_block=*/4);
     allocator_  = std::make_shared<SingleTypeKVCacheAllocator>(config);
+    allocator_->setSharedBlockCache(std::make_shared<SharedBlockCache>());
     ASSERT_TRUE(allocator_->init());
 
     allocator_->setReserveBlockNum(2);
@@ -477,7 +481,7 @@ TEST_F(SingleTypeKVCacheAllocatorTest, LayerCacheBase) {
     auto layout = allocator_->allLayerCacheBase();
     EXPECT_EQ(layout.layers_to_kv_buffer_ptrs.size(), config.layer_num);
     EXPECT_EQ(layout.layers_to_scale_buffer_ptrs.size(), config.layer_num);
-    EXPECT_EQ((std::vector<int>(4, 0)), layout.layer_to_groups);
+    EXPECT_EQ((std::vector<std::vector<int>>(4, std::vector<int>{0})), layout.layer_to_group_ids);
 
     for (size_t i = 0; i < layout.layers_to_kv_buffer_ptrs.size(); ++i) {
         EXPECT_TRUE(layout.layers_to_kv_buffer_ptrs[i].defined());
@@ -494,7 +498,7 @@ TEST_F(SingleTypeKVCacheAllocatorTest, BlockCopySingle) {
     int src_block = 0;
     int dst_block = 1;
 
-    auto&  spec         = config.cache_specs[0];
+    auto&  spec         = config.specForGroup(0);
     size_t k_block_size = spec->k_block_size();
     size_t v_block_size = spec->v_block_size();
 
@@ -550,7 +554,7 @@ TEST_F(SingleTypeKVCacheAllocatorTest, BlockBatchCopyVector) {
     copy_mapping.push_back({2, 3});
     copy_mapping.push_back({4, 5});
 
-    auto&  spec         = config.cache_specs[0];
+    auto&  spec         = config.specForGroup(0);
     size_t k_block_size = spec->k_block_size();
     size_t v_block_size = spec->v_block_size();
 
@@ -616,7 +620,7 @@ TEST_F(SingleTypeKVCacheAllocatorTest, BlockBatchCopyPointers) {
 
     BlockIdPair pairs[] = {{0, 1}, {2, 3}};
 
-    auto&  spec         = config.cache_specs[0];
+    auto&  spec         = config.specForGroup(0);
     size_t k_block_size = spec->k_block_size();
     size_t v_block_size = spec->v_block_size();
 
@@ -665,7 +669,7 @@ TEST_F(SingleTypeKVCacheAllocatorTest, BlockBatchCopyBuffer) {
     std::vector<int32_t> data   = {0, 1, 2, 3, 4, 5};  // 3 pairs: (0->1, 2->3, 4->5)
     auto                 tensor = torch::from_blob(data.data(), {3, 2}, torch::kInt32).clone();
 
-    auto&  spec         = config.cache_specs[0];
+    auto&  spec         = config.specForGroup(0);
     size_t k_block_size = spec->k_block_size();
     size_t v_block_size = spec->v_block_size();
 
@@ -740,7 +744,7 @@ TEST_F(SingleTypeKVCacheAllocatorTest, IncrKVCacheRefReferencesMatchedBlocksOnly
     EXPECT_EQ(allocator_->freeBlocksNum(), total_free_before - 4);
 
     KVCacheResource resource;
-    resource.initGroups(1, config.layer_all_num, config.layer_to_group_id);
+    resource.initGroups(1, config.layer_all_num, config.layerGroupIdsSnapshot());
 
     resource.cacheKeys() = CacheKeysType{100, 101, 102, 103};
     resource.mutableBlockIds(0).assign(BlockIndicesType{blocks[0], blocks[1], 0, blocks[2]});
@@ -760,6 +764,38 @@ TEST_F(SingleTypeKVCacheAllocatorTest, IncrKVCacheRefReferencesMatchedBlocksOnly
     EXPECT_EQ(allocator_->freeBlocksNum(), total_free_before);
 }
 
+TEST_F(SingleTypeKVCacheAllocatorTest, IncrKVCacheRefPreservesConnectorDummyTail) {
+    auto config = createSingleTypeTestConfig(/*layer_num=*/4, /*block_num=*/10, /*seq_size_per_block=*/8);
+    allocator_  = std::make_shared<SingleTypeKVCacheAllocator>(config, AllocationType::HOST);
+    ASSERT_TRUE(allocator_->init());
+
+    auto block_pool = allocator_->getBlockPool();
+    ASSERT_NE(block_pool, nullptr);
+
+    const size_t total_free_before = allocator_->freeBlocksNum();
+    auto         blocks            = block_pool->malloc(2);
+    ASSERT_EQ(blocks.size(), 2);
+
+    KVCacheResource resource;
+    resource.initGroups(1, config.layer_all_num, config.layerGroupIdsSnapshot());
+    resource.cacheKeys() = CacheKeysType{101, 103, 999};
+    resource.rebuildLinearBlockDependencies();
+    resource.setLastBlockAligned(false);
+    resource.mutableBlockIds(0).assign(BlockIndicesType{blocks[0], blocks[1]});
+
+    auto ref_resource = allocator_->incrKVCacheRef(resource, CacheKeysType{101, 103, 999}, /*is_connector=*/true);
+    ASSERT_NE(ref_resource, nullptr);
+    EXPECT_FALSE(ref_resource->lastBlockAligned());
+    EXPECT_EQ(ref_resource->cacheKeys(), (CacheKeysType{101, 103, 999}));
+    EXPECT_EQ(ref_resource->blocks(0), (BlockIndicesType{blocks[0], blocks[1], NULL_BLOCK_IDX}));
+
+    block_pool->requestFree(blocks);
+    EXPECT_EQ(allocator_->freeBlocksNum(), total_free_before - 2);
+
+    ref_resource.reset();
+    EXPECT_EQ(allocator_->freeBlocksNum(), total_free_before);
+}
+
 TEST_F(SingleTypeKVCacheAllocatorTest, IncrKVCacheRefEmptyInputNoEffect) {
     auto config = createSingleTypeTestConfig(/*layer_num=*/4, /*block_num=*/10, /*seq_size_per_block=*/8);
     allocator_  = std::make_shared<SingleTypeKVCacheAllocator>(config, AllocationType::HOST);
@@ -774,7 +810,7 @@ TEST_F(SingleTypeKVCacheAllocatorTest, IncrKVCacheRefEmptyInputNoEffect) {
     EXPECT_EQ(allocator_->freeBlocksNum(), total_free_before - 2);
 
     KVCacheResource resource;
-    resource.initGroups(1, config.layer_all_num, config.layer_to_group_id);
+    resource.initGroups(1, config.layer_all_num, config.layerGroupIdsSnapshot());
     resource.cacheKeys() = CacheKeysType{100, 101};
     resource.mutableBlockIds(0).assign(BlockIndicesType{blocks[0], blocks[1]});
 
@@ -801,6 +837,21 @@ TEST_F(SingleTypeKVCacheAllocatorTest, MaxSeqLen) {
     EXPECT_EQ(allocator_->maxAvailableTokensNum(), (10 - 1) * 8);  // block_num * seq_size_per_block
 }
 
+TEST_F(SingleTypeKVCacheAllocatorTest, CapacityAndNeedBlocksUseCPVirtualBlockSize) {
+    auto config = createSingleTypeTestConfig(/*layer_num=*/4, /*block_num=*/10, /*seq_size_per_block=*/8);
+    allocator_  = std::make_shared<SingleTypeKVCacheAllocator>(config);
+    ASSERT_TRUE(allocator_->init());
+
+    allocator_->setCPSlotMapper(
+        std::make_shared<CPSlotMapper>(/*cp_rank=*/0, /*cp_size=*/2, /*block_size=*/8));
+
+    EXPECT_EQ(allocator_->maxAvailableTokensNum(), (10u - 1u) * 16u);
+    EXPECT_EQ(allocator_->availableTokensNum(), (10u - 1u) * 16u);
+
+    auto batch_resource = createBatchKVCacheResource(/*batch_size=*/1, config.layer_num);
+    EXPECT_EQ(allocator_->singleBatchNeedBlocks(batch_resource, /*seq_len=*/65, /*reserve_step=*/0), 5);
+}
+
 // Test boundary conditions
 
 TEST_F(SingleTypeKVCacheAllocatorTest, MallocWithZeroSeqLength) {
@@ -832,6 +883,7 @@ TEST_F(SingleTypeKVCacheAllocatorTest, FreeEmptyBatchResource) {
 TEST_F(SingleTypeKVCacheAllocatorTest, InitMallocRollbackWhenInitMallocForCommonLenFails) {
     auto config = createSingleTypeTestConfig(/*layer_num=*/4, /*block_num=*/6, /*seq_size_per_block=*/4);
     allocator_  = std::make_shared<SingleTypeKVCacheAllocator>(config, AllocationType::HOST);
+    allocator_->setSharedBlockCache(std::make_shared<SharedBlockCache>());
     ASSERT_TRUE(allocator_->init());
 
     auto seed_resource = createBatchKVCacheResource(/*batch_size=*/1, config.layer_num);
diff --git a/rtp_llm/cpp/cache/test/mock/MockKVCacheAllocator.h b/rtp_llm/cpp/cache/test/mock/MockKVCacheAllocator.h
index 6b80aae4fa..1f86935664 100644
--- a/rtp_llm/cpp/cache/test/mock/MockKVCacheAllocator.h
+++ b/rtp_llm/cpp/cache/test/mock/MockKVCacheAllocator.h
@@ -2,7 +2,7 @@
 
 #include <gmock/gmock.h>
 
-#include "rtp_llm/cpp/cache/KVCacheAllocator.h"
+#include "rtp_llm/cpp/cache/allocator/KVCacheAllocator.h"
 
 namespace rtp_llm {
 
diff --git a/rtp_llm/cpp/config/BUILD b/rtp_llm/cpp/config/BUILD
index fcec06b5be..764c5c1d12 100644
--- a/rtp_llm/cpp/config/BUILD
+++ b/rtp_llm/cpp/config/BUILD
@@ -55,7 +55,8 @@ cc_library(
         ":config_modules",
         "//rtp_llm/cpp/model_utils:model_utils",
         "//rtp_llm/models_py/bindings/core:types",
-        "//rtp_llm/models_py/bindings/core:type_convert"
+        "//rtp_llm/models_py/bindings/core:type_convert",
+        "//rtp_llm/cpp/cache:kv_cache_spec_desc_types",
     ],
     visibility = ["//visibility:public"],
     copts = copts(),
diff --git a/rtp_llm/cpp/config/ConfigModules.h b/rtp_llm/cpp/config/ConfigModules.h
index 0f4bbf1deb..1a8c1bdce1 100644
--- a/rtp_llm/cpp/config/ConfigModules.h
+++ b/rtp_llm/cpp/config/ConfigModules.h
@@ -29,7 +29,9 @@ enum class CPRotateMethod {
 struct PrefillCPConfig {
     CPRotateMethod method           = CPRotateMethod::DISABLED;
     size_t         comm_buffer_size = 512 * 1024 * 1024;  // 512MB
-    bool           is_enabled() const {
+    bool    kv_cache_sharded = false;
+    int64_t prefill_cp_size  = 0;
+    bool    is_enabled() const {
         return method != CPRotateMethod::DISABLED && method != CPRotateMethod::UNKNOWN
                && method != CPRotateMethod::PREFILL_CP;
     }
@@ -69,6 +71,8 @@ struct ParallelismConfig {
     bool    enable_sp        = false;
     bool    use_ub_comm      = false;
 
+    RoleType role_type = RoleType::PDFUSION;
+
     FfnDisAggregateConfig ffn_disaggregate_config;  // FFN disaggregate configuration
 
     // Context Parallel configuration
@@ -165,7 +169,10 @@ struct KVCacheConfig {
     bool    enable_memory_cache_sm_copy  = false;
     bool    enable_remote_cache          = false;
     bool    write_cache_sync             = false;
-    bool    enable_tiered_memory_cache   = false;
+    bool    enable_tiered_memory_cache           = false;
+    bool    enable_gpu_prefix_tree               = false;
+    bool    enable_prefix_tree_memory_cache      = false;
+    bool    enable_independent_group_eviction    = false;
     int64_t device_cache_min_free_blocks = 0;
     int     load_cache_retry_times       = 1;  // Maximum retry attempts for load cache transfer failures
 
@@ -538,7 +545,8 @@ enum class HybridAttentionType {
 };
 
 struct HybridAttentionConfig {
-    bool                             enable_hybrid_attention = false;
+    bool                             enable_hybrid_attention           = false;
+    bool                             enable_independent_kv_cache_pools = false;
     std::vector<HybridAttentionType> hybrid_attention_types;
     std::string                      to_string() const;
 };
diff --git a/rtp_llm/cpp/config/ModelConfig.h b/rtp_llm/cpp/config/ModelConfig.h
index 15981932b2..5cb0b836cc 100644
--- a/rtp_llm/cpp/config/ModelConfig.h
+++ b/rtp_llm/cpp/config/ModelConfig.h
@@ -11,12 +11,15 @@
 #include "rtp_llm/cpp/config/EplbConfig.h"
 #include "rtp_llm/cpp/config/ConfigModules.h"
 #include "rtp_llm/cpp/config/SpecialTokens.h"
+#include "rtp_llm/cpp/cache/spec/KVCacheSpecDescTypes.h"
 #include <vector>
 #include <string>
 #include <map>
 
 namespace rtp_llm {
 
+using LayerKVCacheSpecDescs = std::vector<std::vector<KVCacheSpecDesc>>;
+
 enum TaskType {
     DENSE_EMBEDDING    = 0,
     ALL_EMBEDDING      = 1,
@@ -122,6 +125,9 @@ class ModelConfig {
     // Multimodal model configuration
     MMModelConfig mm_model_config;
 
+    // Declarative per-model KV cache layout
+    LayerKVCacheSpecDescs kv_cache_spec_descs;
+
     // Fields merged from PyModelConfig
     std::string extra_data_path       = "";
     std::string local_extra_data_path = "";
diff --git a/rtp_llm/cpp/distribute/BUILD b/rtp_llm/cpp/distribute/BUILD
new file mode 100644
index 0000000000..5c338e77b5
--- /dev/null
+++ b/rtp_llm/cpp/distribute/BUILD
@@ -0,0 +1,24 @@
+load("//:def.bzl", "copts")
+
+package(default_visibility = ["//visibility:public"])
+
+cc_library(
+    name = "rpc_cpu_tp_broadcaster_hdr",
+    hdrs = ["RpcCpuTpBroadcaster.h"],
+    deps = [
+        "//rtp_llm/cpp/model_rpc:broadcast_manager",
+        "//rtp_llm/cpp/model_rpc/proto:model_rpc_service_cc_proto",
+    ],
+    copts = copts(),
+)
+
+cc_library(
+    name = "rpc_cpu_tp_broadcaster",
+    srcs = ["RpcCpuTpBroadcaster.cc"],
+    hdrs = ["RpcCpuTpBroadcaster.h"],
+    deps = [
+        ":rpc_cpu_tp_broadcaster_hdr",
+        "//rtp_llm/cpp/utils:core_utils",
+    ],
+    copts = copts(),
+)
diff --git a/rtp_llm/cpp/distribute/RpcCpuTpBroadcaster.cc b/rtp_llm/cpp/distribute/RpcCpuTpBroadcaster.cc
new file mode 100644
index 0000000000..016b6acfaa
--- /dev/null
+++ b/rtp_llm/cpp/distribute/RpcCpuTpBroadcaster.cc
@@ -0,0 +1,288 @@
+#include "rtp_llm/cpp/distribute/RpcCpuTpBroadcaster.h"
+
+#include "rtp_llm/cpp/utils/AssertUtils.h"
+#include "rtp_llm/cpp/utils/Logger.h"
+
+#include <chrono>
+#include <cstring>
+#include <sstream>
+
+namespace rtp_llm {
+
+namespace {
+
+constexpr int kDefaultTimeoutMs = 30000;
+
+int normalizeTimeoutMs(int timeout_ms) {
+    return timeout_ms > 0 ? timeout_ms : kDefaultTimeoutMs;
+}
+
+}  // namespace
+
+RpcCpuTpBroadcaster& RpcCpuTpBroadcaster::instance() {
+    static RpcCpuTpBroadcaster i;
+    return i;
+}
+
+std::size_t RpcCpuTpBroadcaster::InboxKeyHash::operator()(const InboxKey& key) const {
+    std::size_t h = std::hash<std::string>{}(key.group_key);
+    h ^= std::hash<uint64_t>{}(key.seq) + 0x9e3779b97f4a7c15ULL + (h << 6) + (h >> 2);
+    h ^= std::hash<int>{}(key.dst_tp_rank) + 0x9e3779b97f4a7c15ULL + (h << 6) + (h >> 2);
+    return h;
+}
+
+std::string RpcCpuTpBroadcaster::makeGroupKey(int dp_rank, int tp_size, int world_size) const {
+    std::ostringstream oss;
+    oss << "tp_cpu_broadcast:dp=" << dp_rank << ":tp=" << tp_size << ":world=" << world_size;
+    return oss.str();
+}
+
+void RpcCpuTpBroadcaster::initialize(int                            tp_rank,
+                                     int                            tp_size,
+                                     int                            dp_rank,
+                                     int                            world_size,
+                                     const std::vector<std::string>& worker_grpc_addrs,
+                                     int                            timeout_ms) {
+    std::lock_guard<std::mutex> lock(mu_);
+    timeout_ms = normalizeTimeoutMs(timeout_ms);
+
+    if (initialized_.load(std::memory_order_acquire)) {
+        const std::string new_group_key = makeGroupKey(dp_rank, tp_size, world_size);
+        RTP_LLM_CHECK_WITH_INFO(tp_rank_ == tp_rank && tp_size_ == tp_size && dp_rank_ == dp_rank
+                                    && world_size_ == world_size && group_key_ == new_group_key,
+                                "RpcCpuTpBroadcaster re-init mismatch: was rank=%d size=%d dp=%d world=%d group=%s, "
+                                "now rank=%d size=%d dp=%d world=%d group=%s",
+                                tp_rank_,
+                                tp_size_,
+                                dp_rank_,
+                                world_size_,
+                                group_key_.c_str(),
+                                tp_rank,
+                                tp_size,
+                                dp_rank,
+                                world_size,
+                                new_group_key.c_str());
+        return;
+    }
+
+    if (tp_size <= 1) {
+        tp_rank_    = tp_rank;
+        tp_size_    = tp_size;
+        dp_rank_    = dp_rank;
+        world_size_ = world_size;
+        timeout_ms_ = timeout_ms;
+        group_key_  = makeGroupKey(dp_rank, tp_size, world_size);
+        initialized_.store(true, std::memory_order_release);
+        return;
+    }
+
+    RTP_LLM_CHECK_WITH_INFO(tp_rank >= 0 && tp_rank < tp_size,
+                            "RpcCpuTpBroadcaster bad tp_rank=%d tp_size=%d",
+                            tp_rank,
+                            tp_size);
+    RTP_LLM_CHECK_WITH_INFO(static_cast<int>(worker_grpc_addrs.size()) >= world_size,
+                            "RpcCpuTpBroadcaster worker_grpc_addrs too small: addrs=%zu world_size=%d",
+                            worker_grpc_addrs.size(),
+                            world_size);
+
+    tp_rank_    = tp_rank;
+    tp_size_    = tp_size;
+    dp_rank_    = dp_rank;
+    world_size_ = world_size;
+    timeout_ms_ = timeout_ms;
+    group_key_  = makeGroupKey(dp_rank, tp_size, world_size);
+    seq_.store(0, std::memory_order_release);
+    inbox_.clear();
+    peer_addrs_.clear();
+    peer_tp_ranks_.clear();
+    broadcast_manager_.reset();
+
+    if (tp_rank_ == 0) {
+        peer_addrs_.reserve(tp_size - 1);
+        peer_tp_ranks_.reserve(tp_size - 1);
+        for (int peer_tp_rank = 1; peer_tp_rank < tp_size; ++peer_tp_rank) {
+            const int world_rank = dp_rank * tp_size + peer_tp_rank;
+            RTP_LLM_CHECK_WITH_INFO(world_rank >= 0 && world_rank < static_cast<int>(worker_grpc_addrs.size()),
+                                    "RpcCpuTpBroadcaster bad peer world_rank=%d addrs=%zu",
+                                    world_rank,
+                                    worker_grpc_addrs.size());
+            peer_addrs_.push_back(worker_grpc_addrs[world_rank]);
+            peer_tp_ranks_.push_back(peer_tp_rank);
+        }
+        broadcast_manager_ = std::make_shared<BroadcastManager>(peer_addrs_);
+        RTP_LLM_CHECK_WITH_INFO(broadcast_manager_->init(),
+                                "RpcCpuTpBroadcaster BroadcastManager init failed for %zu peer(s)",
+                                peer_addrs_.size());
+    }
+
+    initialized_.store(true, std::memory_order_release);
+    cv_.notify_all();
+    RTP_LLM_LOG_INFO("Initialized RpcCpuTpBroadcaster rank=%d tp_size=%d dp_rank=%d world_size=%d peers=%zu timeout_ms=%d",
+                     tp_rank_,
+                     tp_size_,
+                     dp_rank_,
+                     world_size_,
+                     peer_addrs_.size(),
+                     timeout_ms_);
+}
+
+void RpcCpuTpBroadcaster::reset() {
+    {
+        std::lock_guard<std::mutex> lock(mu_);
+        inbox_.clear();
+        peer_addrs_.clear();
+        peer_tp_ranks_.clear();
+        broadcast_manager_.reset();
+        tp_rank_    = 0;
+        tp_size_    = 1;
+        dp_rank_    = 0;
+        world_size_ = 1;
+        timeout_ms_ = kDefaultTimeoutMs;
+        group_key_.clear();
+        seq_.store(0, std::memory_order_release);
+        initialized_.store(false, std::memory_order_release);
+    }
+    cv_.notify_all();
+}
+
+uint64_t RpcCpuTpBroadcaster::nextSeq() {
+    return seq_.fetch_add(1, std::memory_order_acq_rel);
+}
+
+void RpcCpuTpBroadcaster::broadcast(void* buf, std::size_t nbytes, int root) {
+    RTP_LLM_CHECK_WITH_INFO(initialized_.load(std::memory_order_acquire),
+                            "RpcCpuTpBroadcaster::broadcast called before initialize");
+    if (tp_size_ <= 1 || nbytes == 0) {
+        return;
+    }
+    RTP_LLM_CHECK_WITH_INFO(root == 0, "RpcCpuTpBroadcaster supports only root=0; got %d", root);
+
+    const uint64_t seq = nextSeq();
+    if (tp_rank_ == 0) {
+        std::shared_ptr<BroadcastManager> manager;
+        std::vector<int>                  peer_tp_ranks;
+        std::string                       group_key;
+        int                               timeout_ms = kDefaultTimeoutMs;
+        {
+            std::lock_guard<std::mutex> lock(mu_);
+            manager       = broadcast_manager_;
+            peer_tp_ranks = peer_tp_ranks_;
+            group_key     = group_key_;
+            timeout_ms    = timeout_ms_;
+        }
+        RTP_LLM_CHECK_WITH_INFO(manager != nullptr, "RpcCpuTpBroadcaster root has no BroadcastManager");
+
+        std::vector<CpuTpBroadcastRequestPB> requests;
+        requests.reserve(peer_tp_ranks.size());
+        for (int peer_tp_rank : peer_tp_ranks) {
+            CpuTpBroadcastRequestPB request;
+            request.set_group_key(group_key);
+            request.set_seq(seq);
+            request.set_root(root);
+            request.set_src_tp_rank(tp_rank_);
+            request.set_dst_tp_rank(peer_tp_rank);
+            request.set_nbytes(static_cast<uint64_t>(nbytes));
+            request.set_payload(buf, nbytes);
+            requests.push_back(std::move(request));
+        }
+
+        auto rpc_call = [](std::shared_ptr<RpcService::Stub>& stub,
+                           std::shared_ptr<grpc::ClientContext>& ctx,
+                           const CpuTpBroadcastRequestPB& request,
+                           grpc::CompletionQueue* cq) {
+            return stub->AsyncCpuTpBroadcast(ctx.get(), request, cq);
+        };
+
+        auto result = manager->broadcast<CpuTpBroadcastRequestPB, CpuTpBroadcastResponsePB>(
+            requests, timeout_ms, rpc_call);
+        RTP_LLM_CHECK_WITH_INFO(result != nullptr,
+                                "RpcCpuTpBroadcaster broadcast setup failed seq=%lu nbytes=%zu",
+                                seq,
+                                nbytes);
+        RTP_LLM_CHECK_WITH_INFO(result->waitDone(timeout_ms),
+                                "RpcCpuTpBroadcaster broadcast wait timeout seq=%lu timeout_ms=%d",
+                                seq,
+                                timeout_ms);
+        RTP_LLM_CHECK_WITH_INFO(result->success(), "RpcCpuTpBroadcaster broadcast RPC failed seq=%lu", seq);
+        for (const auto& response : result->responses()) {
+            RTP_LLM_CHECK_WITH_INFO(response.success(),
+                                    "RpcCpuTpBroadcaster peer rejected seq=%lu: %s",
+                                    seq,
+                                    response.error_message().c_str());
+        }
+        return;
+    }
+
+    InboxKey    key;
+    std::string payload;
+    int         timeout_ms = kDefaultTimeoutMs;
+    {
+        std::unique_lock<std::mutex> lock(mu_);
+        key        = InboxKey{group_key_, seq, tp_rank_};
+        timeout_ms = timeout_ms_;
+        const bool ready = cv_.wait_for(lock, std::chrono::milliseconds(timeout_ms), [&] {
+            return !initialized_.load(std::memory_order_acquire) || inbox_.find(key) != inbox_.end();
+        });
+        RTP_LLM_CHECK_WITH_INFO(ready && initialized_.load(std::memory_order_acquire),
+                                "RpcCpuTpBroadcaster receive timeout seq=%lu rank=%d timeout_ms=%d",
+                                seq,
+                                tp_rank_,
+                                timeout_ms);
+        auto it = inbox_.find(key);
+        RTP_LLM_CHECK_WITH_INFO(it != inbox_.end(), "RpcCpuTpBroadcaster missing inbox payload seq=%lu", seq);
+        payload = std::move(it->second);
+        inbox_.erase(it);
+    }
+
+    RTP_LLM_CHECK_WITH_INFO(payload.size() == nbytes,
+                            "RpcCpuTpBroadcaster size mismatch seq=%lu rank=%d expected=%zu actual=%zu",
+                            seq,
+                            tp_rank_,
+                            nbytes,
+                            payload.size());
+    std::memcpy(buf, payload.data(), nbytes);
+}
+
+bool RpcCpuTpBroadcaster::handleBroadcastRequest(const CpuTpBroadcastRequestPB& request,
+                                                 CpuTpBroadcastResponsePB*      response) {
+    auto fail = [&](const std::string& message) {
+        response->set_success(false);
+        response->set_error_message(message);
+        RTP_LLM_LOG_WARNING("RpcCpuTpBroadcaster rejected request: %s", message.c_str());
+        return false;
+    };
+
+    std::unique_lock<std::mutex> lock(mu_);
+    if (!initialized_.load(std::memory_order_acquire)) {
+        cv_.wait_for(lock, std::chrono::milliseconds(kDefaultTimeoutMs), [&] {
+            return initialized_.load(std::memory_order_acquire);
+        });
+    }
+    if (!initialized_.load(std::memory_order_acquire)) {
+        return fail("broadcaster is not initialized");
+    }
+    if (request.group_key() != group_key_) {
+        return fail("group_key mismatch: got " + request.group_key() + ", expected " + group_key_);
+    }
+    if (request.root() != 0 || request.src_tp_rank() != 0) {
+        return fail("only root tp_rank 0 is supported");
+    }
+    if (request.dst_tp_rank() != tp_rank_) {
+        return fail("dst_tp_rank mismatch");
+    }
+    if (request.nbytes() != request.payload().size()) {
+        return fail("payload size mismatch");
+    }
+
+    InboxKey key{request.group_key(), request.seq(), request.dst_tp_rank()};
+    if (inbox_.find(key) != inbox_.end()) {
+        return fail("duplicate payload");
+    }
+    inbox_.emplace(std::move(key), request.payload());
+    response->set_success(true);
+    response->clear_error_message();
+    cv_.notify_all();
+    return true;
+}
+
+}  // namespace rtp_llm
diff --git a/rtp_llm/cpp/distribute/RpcCpuTpBroadcaster.h b/rtp_llm/cpp/distribute/RpcCpuTpBroadcaster.h
new file mode 100644
index 0000000000..01537629d5
--- /dev/null
+++ b/rtp_llm/cpp/distribute/RpcCpuTpBroadcaster.h
@@ -0,0 +1,86 @@
+#pragma once
+
+#include <atomic>
+#include <condition_variable>
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+#include <mutex>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "rtp_llm/cpp/model_rpc/BroadcastManager.h"
+#include "rtp_llm/cpp/model_rpc/proto/model_rpc_service.pb.h"
+
+namespace rtp_llm {
+
+// Cross-node CPU TP broadcaster over RpcService. Root rank fanouts bytes to TP
+// peers; non-root ranks wait on a local inbox filled by the gRPC server thread.
+// The logical API intentionally matches CpuTpBroadcaster so execBroadcastCpu can
+// choose this path without changing tpSyncModelInputs' packing/unpacking logic.
+class RpcCpuTpBroadcaster {
+public:
+    static RpcCpuTpBroadcaster& instance();
+
+    void initialize(int                      tp_rank,
+                    int                      tp_size,
+                    int                      dp_rank,
+                    int                      world_size,
+                    const std::vector<std::string>& worker_grpc_addrs,
+                    int                      timeout_ms);
+
+    void reset();
+
+    bool isInitialized() const {
+        return initialized_.load(std::memory_order_acquire);
+    }
+
+    void broadcast(void* buf, std::size_t nbytes, int root);
+
+    bool handleBroadcastRequest(const CpuTpBroadcastRequestPB& request, CpuTpBroadcastResponsePB* response);
+
+private:
+    struct InboxKey {
+        std::string group_key;
+        uint64_t    seq = 0;
+        int         dst_tp_rank = 0;
+
+        bool operator==(const InboxKey& other) const {
+            return group_key == other.group_key && seq == other.seq && dst_tp_rank == other.dst_tp_rank;
+        }
+    };
+
+    struct InboxKeyHash {
+        std::size_t operator()(const InboxKey& key) const;
+    };
+
+    RpcCpuTpBroadcaster() = default;
+    ~RpcCpuTpBroadcaster() = default;
+    RpcCpuTpBroadcaster(const RpcCpuTpBroadcaster&)            = delete;
+    RpcCpuTpBroadcaster& operator=(const RpcCpuTpBroadcaster&) = delete;
+
+    uint64_t nextSeq();
+    std::string makeGroupKey(int dp_rank, int tp_size, int world_size) const;
+
+private:
+    mutable std::mutex mu_;
+    std::condition_variable cv_;
+    std::atomic<bool> initialized_{false};
+    std::atomic<uint64_t> seq_{0};
+
+    int         tp_rank_ = 0;
+    int         tp_size_ = 1;
+    int         dp_rank_ = 0;
+    int         world_size_ = 1;
+    int         timeout_ms_ = 3000;
+    std::string group_key_;
+
+    std::vector<std::string> peer_addrs_;
+    std::vector<int>         peer_tp_ranks_;
+    std::shared_ptr<BroadcastManager> broadcast_manager_;
+
+    std::unordered_map<InboxKey, std::string, InboxKeyHash> inbox_;
+};
+
+}  // namespace rtp_llm
diff --git a/rtp_llm/cpp/engine_base/stream/StreamCacheResource.cc b/rtp_llm/cpp/engine_base/stream/StreamCacheResource.cc
index 20a601552d..778a28bae8 100644
--- a/rtp_llm/cpp/engine_base/stream/StreamCacheResource.cc
+++ b/rtp_llm/cpp/engine_base/stream/StreamCacheResource.cc
@@ -196,18 +196,18 @@ static bool applyP2PSideChannelToStream(const std::shared_ptr<FusedAsyncReadCont
 
 void StreamCacheResource::init(int batch_size) {
     batch_kv_cache_resource_->resetBatchSize(batch_size);
-    int                         group_nums     = 1;
-    int                         layer_all_num  = 0;
-    std::vector<int>            layer_to_group = {};
-    std::vector<CacheGroupType> group_types    = {};
+    int                              group_nums     = 1;
+    int                              layer_all_num  = 0;
+    std::vector<std::vector<int>>    layer_to_group = {};
+    std::vector<CacheGroupType>      group_types    = {};
 
     size_t kernel_blocks_per_kv_block = 1;
     if (resource_context_.cache_manager) {  // cache manager is null when warmup
         const auto& cache_config = resource_context_.cache_manager->cacheConfig();
         group_nums               = cache_config.groupNums();
         layer_all_num            = static_cast<int>(cache_config.layer_all_num);
-        layer_to_group           = cache_config.layer_to_group_id;
-        group_types              = cache_config.group_types;
+        layer_to_group           = cache_config.layerGroupIdsSnapshot();
+        group_types              = cache_config.groupTypesSnapshot();
         if (cache_config.kernel_seq_size_per_block > 0 && cache_config.seq_size_per_block > 0) {
             kernel_blocks_per_kv_block = cache_config.seq_size_per_block / cache_config.kernel_seq_size_per_block;
         }
@@ -522,18 +522,18 @@ const CacheKeysType& StreamCacheResource::cacheKeys(int32_t batch_id) const {
 void StreamCacheResource::fakeInitKVBlock(size_t reserved_blocks) {
     fake_inited_ = true;
     batch_kv_cache_resource_->resetBatchSize(stream_->maxBatchSize());
-    int                         group_nums                 = 1;
-    int                         layer_all_num              = 0;
-    size_t                      kernel_blocks_per_kv_block = 1;
-    std::vector<int>            layer_to_group             = {};
-    std::vector<CacheGroupType> group_types                = {};
+    int                              group_nums                 = 1;
+    int                              layer_all_num              = 0;
+    size_t                           kernel_blocks_per_kv_block = 1;
+    std::vector<std::vector<int>>    layer_to_group             = {};
+    std::vector<CacheGroupType>      group_types                = {};
 
     if (resource_context_.cache_manager) {
         const auto& cache_config   = resource_context_.cache_manager->cacheConfig();
         group_nums                 = cache_config.groupNums();
         layer_all_num              = static_cast<int>(cache_config.layer_all_num);
-        layer_to_group             = cache_config.layer_to_group_id;
-        group_types                = cache_config.group_types;
+        layer_to_group             = cache_config.layerGroupIdsSnapshot();
+        group_types                = cache_config.groupTypesSnapshot();
         kernel_blocks_per_kv_block = cache_config.kernelBlocksPerKvBlock();
     }
     batch_kv_cache_resource_->initGroups(
@@ -705,7 +705,7 @@ void StreamCacheResource::swapLinearBlocks(int32_t batch_id, size_t rhs, size_t
         return;
     }
 
-    auto type_list = resource_context_.cache_manager->cacheConfig().group_types;
+    auto type_list = resource_context_.cache_manager->cacheConfig().groupTypesSnapshot();
 
     for (size_t i = 0; i < type_list.size(); i++) {
         if (type_list[i] == CacheGroupType::LINEAR) {
diff --git a/rtp_llm/cpp/engine_base/stream/test/PdSepKVCacheReleaseTest.cc b/rtp_llm/cpp/engine_base/stream/test/PdSepKVCacheReleaseTest.cc
index 5468f396a7..f99e63afa9 100644
--- a/rtp_llm/cpp/engine_base/stream/test/PdSepKVCacheReleaseTest.cc
+++ b/rtp_llm/cpp/engine_base/stream/test/PdSepKVCacheReleaseTest.cc
@@ -5,23 +5,281 @@
 #define protected public
 #include "rtp_llm/cpp/cache/KVCacheManager.h"
 #include "rtp_llm/cpp/cache/CacheConfig.h"
+#include "rtp_llm/cpp/cache/config_creator/CacheConfigCreator.h"
+#include "rtp_llm/cpp/cache/config_creator/HybridPoolConfigCreator.h"
+#include "rtp_llm/cpp/cache/KVCacheTransferPlanner.h"
 #include "rtp_llm/cpp/cache/KVCacheResource.h"
 #include "rtp_llm/cpp/cache/test/CacheConfigTestUtils.h"
+#include "rtp_llm/cpp/disaggregate/cache_store/RequestBlockBufferStore.h"
 #include "rtp_llm/cpp/engine_base/stream/GenerateStream.h"
 #include "rtp_llm/cpp/engine_base/stream/GenerateTypes.h"
 #include "rtp_llm/cpp/engine_base/stream/StreamCacheResource.h"
+#include "rtp_llm/cpp/model_rpc/DecodeRpcServer.h"
+#include "rtp_llm/cpp/model_rpc/PrefillGenerateContext.h"
 #include "rtp_llm/cpp/normal_engine/NormalGenerateStream.h"
 #include "rtp_llm/cpp/testing/TestBase.h"
 #include "rtp_llm/cpp/config/ConfigModules.h"
 #include "rtp_llm/cpp/config/RoleTypes.h"
+#include "rtp_llm/models_py/bindings/common/WriteCacheStoreOp.h"
+#include "rtp_llm/models_py/bindings/core/ExecOps.h"
 
 #include <atomic>
 #include <chrono>
+#include <cstring>
 #include <memory>
+#include <numeric>
 #include <thread>
+#include <unordered_map>
 
 namespace rtp_llm {
 
+using test::setDsv4KvCacheSpecs;
+
+namespace {
+
+constexpr int kDsv4PoolNum        = 7;
+constexpr int kDsv4TokensPerBlock = 256;
+
+class DummyMemoryUtil: public MemoryUtil {
+public:
+    bool regUserMr(void*, uint64_t, bool, uint64_t = 0) override {
+        return true;
+    }
+    bool deregUserMr(void*, bool) override {
+        return true;
+    }
+    bool isMemoryMr(void*, uint64_t, bool, bool) override {
+        return true;
+    }
+    bool findMemoryMr(void*, void*, uint64_t, bool, bool) override {
+        return true;
+    }
+    bool isRdmaMode() override {
+        return false;
+    }
+};
+
+class MemoryBackedCacheStore: public NormalCacheStore {
+public:
+    MemoryBackedCacheStore() {
+        memory_util_                = std::make_shared<DummyMemoryUtil>();
+        request_block_buffer_store_ = std::make_shared<RequestBlockBufferStore>(memory_util_);
+    }
+
+    void store(const std::shared_ptr<RequestBlockBuffer>& request_block_buffer,
+               CacheStoreStoreDoneCallback                callback) override {
+        runtimeSyncAndCheck();
+        for (const auto& [key, block] : request_block_buffer->getBlocks()) {
+            auto src_options = torch::TensorOptions(torch::kUInt8).device(block->gpu_mem ? torch::kCUDA : torch::kCPU);
+            auto src         = torch::from_blob(block->addr.get(), {(int64_t)block->len}, src_options);
+            auto host        = block->gpu_mem ? src.cpu().contiguous() : src.contiguous();
+            std::vector<uint8_t> bytes(static_cast<size_t>(block->len));
+            std::memcpy(bytes.data(), host.data_ptr<uint8_t>(), bytes.size());
+            stored_blocks_[key] = std::move(bytes);
+        }
+        store_request_keys_.push_back(request_block_buffer->getRequestKey());
+        store_buffer_requests_.push_back(request_block_buffer);
+        callback(true, CacheStoreErrorCode::None);
+    }
+
+    void load(const std::shared_ptr<RequestBlockBuffer>& request_block_buffer,
+              CacheStoreLoadDoneCallback                 callback,
+              const std::string&,
+              uint32_t,
+              uint32_t,
+              uint32_t = 1000,
+              int      = 1,
+              int      = 0) override {
+        bool ok = true;
+        for (const auto& [key, block] : request_block_buffer->getBlocks()) {
+            auto it = stored_blocks_.find(key);
+            if (it == stored_blocks_.end() || it->second.size() != block->len) {
+                ok = false;
+                continue;
+            }
+            auto host = torch::from_blob(const_cast<uint8_t*>(it->second.data()),
+                                         {(int64_t)it->second.size()},
+                                         torch::TensorOptions(torch::kUInt8).device(torch::kCPU))
+                            .clone();
+            auto dst_options = torch::TensorOptions(torch::kUInt8).device(block->gpu_mem ? torch::kCUDA : torch::kCPU);
+            auto dst         = torch::from_blob(block->addr.get(), {(int64_t)block->len}, dst_options);
+            dst.copy_(host);
+        }
+        runtimeSyncAndCheck();
+        load_request_keys_.push_back(request_block_buffer->getRequestKey());
+        callback(ok, ok ? CacheStoreErrorCode::None : CacheStoreErrorCode::LoadErrorUnknown);
+    }
+
+    std::shared_ptr<LoadContext>
+    loadBuffers(const std::vector<std::shared_ptr<RequestBlockBuffer>>& request_block_buffers,
+                const std::string&                                      ip,
+                uint32_t                                                port,
+                uint32_t                                                rdma_port,
+                int64_t                                                 timeout_ms,
+                LoadContext::CheckCancelFunc                            check_cancel_func,
+                int                                                     partition_count,
+                int                                                     partition_id) override {
+        load_buffer_requests_.insert(
+            load_buffer_requests_.end(), request_block_buffers.begin(), request_block_buffers.end());
+        auto context = std::make_shared<LoadContext>(shared_from_this(), false);
+        context->load(
+            request_block_buffers, ip, port, rdma_port, timeout_ms, check_cancel_func, partition_count, partition_id);
+        return context;
+    }
+
+    std::unordered_map<std::string, std::vector<uint8_t>> stored_blocks_;
+    std::vector<std::string>                              store_request_keys_;
+    std::vector<std::string>                              load_request_keys_;
+    std::vector<std::shared_ptr<RequestBlockBuffer>>      store_buffer_requests_;
+    std::vector<std::shared_ptr<RequestBlockBuffer>>      load_buffer_requests_;
+};
+
+class MinimalEngine: public EngineBase {
+public:
+    MinimalEngine(const EngineInitParams& params, std::shared_ptr<KVCacheManager> cache_manager): EngineBase(params) {
+        resource_context_.cache_manager = std::move(cache_manager);
+    }
+
+    std::shared_ptr<GenerateStream> enqueue(const std::shared_ptr<GenerateInput>&) override {
+        return nullptr;
+    }
+    void         enqueue(std::shared_ptr<GenerateStream>&) override {}
+    absl::Status stop() override {
+        return absl::OkStatus();
+    }
+    absl::StatusOr<GenerateStreamPtr> preRun(const std::shared_ptr<GenerateInput>&, preRunMode) override {
+        return absl::UnimplementedError("unused in test");
+    }
+    KVCacheInfo getCacheStatusInfo(int64_t, bool) override {
+        return KVCacheInfo();
+    }
+};
+
+void fillDsv4RegionBytes(const std::shared_ptr<KVCacheManager>& manager,
+                         int                                    block_id,
+                         int                                    layer_id,
+                         int                                    group_id,
+                         uint8_t                                value) {
+    auto parts = manager->convertIndexToBuffer(block_id, layer_id, group_id);
+    ASSERT_EQ(parts.size(), 1u);
+    auto device = torch::from_blob(
+        parts[0].addr, {(int64_t)parts[0].size_bytes}, torch::TensorOptions(torch::kUInt8).device(torch::kCUDA));
+    auto host =
+        torch::full({(int64_t)parts[0].size_bytes}, value, torch::TensorOptions(torch::kUInt8).device(torch::kCPU));
+    device.copy_(host);
+}
+
+void expectDsv4RegionBytes(const std::shared_ptr<KVCacheManager>& manager,
+                           int                                    block_id,
+                           int                                    layer_id,
+                           int                                    group_id,
+                           uint8_t                                value) {
+    auto parts = manager->convertIndexToBuffer(block_id, layer_id, group_id);
+    ASSERT_EQ(parts.size(), 1u);
+    auto device = torch::from_blob(
+        parts[0].addr, {(int64_t)parts[0].size_bytes}, torch::TensorOptions(torch::kUInt8).device(torch::kCUDA));
+    auto        host = device.cpu().contiguous();
+    const auto* ptr  = host.data_ptr<uint8_t>();
+    for (size_t i = 0; i < parts[0].size_bytes; ++i) {
+        ASSERT_EQ(ptr[i], value) << "byte=" << i << " layer=" << layer_id << " block=" << block_id
+                                 << " group=" << group_id;
+    }
+}
+
+uint8_t dsv4PdPattern(int layer_id, int gid, size_t block_pos) {
+    return static_cast<uint8_t>(17 + layer_id * 19 + gid * 11 + block_pos);
+}
+
+void setGroupBlockNumsForTest(CacheConfig& config, uint32_t block_num) {
+    const auto group_num = static_cast<size_t>(config.groupNums());
+    std::vector<uint32_t> block_nums(group_num, block_num);
+    std::vector<size_t>   kv_strides;
+    std::vector<size_t>   scale_strides;
+    kv_strides.reserve(group_num);
+    scale_strides.reserve(group_num);
+    for (size_t gid = 0; gid < group_num; ++gid) {
+        kv_strides.push_back(config.kvBlockStrideBytesForGroup(gid));
+        scale_strides.push_back(config.kvScaleStrideBytesForGroup(gid));
+    }
+    config.setGroupBlockLayout(block_nums, kv_strides, scale_strides);
+}
+
+std::vector<size_t> dsv4BlockPositionsForCacheTransfer(const CacheConfig& config,
+                                                       int                gid,
+                                                       size_t             block_num,
+                                                       size_t             reuse_block_size) {
+    const auto policy = config.policyForGroup(static_cast<size_t>(gid));
+    const size_t tail_block_count =
+        policy.active_tail_blocks > 0 ? static_cast<size_t>(policy.active_tail_blocks) : 0;
+    return blockPositionsForCacheTransfer(block_num,
+                                          reuse_block_size,
+                                          true,
+                                          tail_block_count > 0,
+                                          tail_block_count,
+                                          /*hybrid_full_from_begin=*/true);
+}
+
+size_t expectedDsv4StoredBlocks(const CacheConfig& config, int layer_num, int block_num, size_t reuse_block_size) {
+    size_t expected = 0;
+    const auto layer_group_ids = config.layerGroupIdsSnapshot();
+    for (int layer_id = 0; layer_id < layer_num; ++layer_id) {
+        for (int gid : layer_group_ids[layer_id]) {
+            expected += dsv4BlockPositionsForCacheTransfer(config, gid, block_num, reuse_block_size).size();
+        }
+    }
+    return expected;
+}
+
+torch::Tensor groupTypesTensorForConfig(const CacheConfig& config) {
+    std::vector<int32_t> group_types;
+    for (auto group_type : config.groupTypesSnapshot()) {
+        group_types.push_back(static_cast<int32_t>(group_type));
+    }
+    return torch::from_blob(group_types.data(),
+                            {static_cast<int64_t>(group_types.size())},
+                            torch::TensorOptions(torch::kInt32))
+        .clone();
+}
+
+torch::Tensor blockIdsTensor(const BatchKVCacheResourcePtr& resource, int gid) {
+    const auto& blocks = resource->blocks(0, gid);
+    return torch::from_blob(const_cast<int*>(blocks.data()), {1, static_cast<int64_t>(blocks.size())}, torch::kInt32)
+        .clone();
+}
+
+CacheStoreInputs makeSingleBlockWriteInputs(const std::string& cache_key_string,
+                                            int                request_id_val,
+                                            int                tokens_per_block,
+                                            int                kv_stride,
+                                            int                kv_scale_stride,
+                                            bool               use_opaque_kv_cache_store,
+                                            int                group_id,
+                                            const std::string& tag) {
+    CacheStoreInputs inputs;
+    inputs.input_lengths_host        = torch::tensor({tokens_per_block}, torch::kInt32);
+    inputs.prefix_lengths_host       = torch::tensor({0}, torch::kInt32);
+    inputs.host_kv_cache_offset      = torch::tensor({{1}}, torch::kInt32);
+    inputs.context_batch_size        = 1;
+    inputs.decoder_batch_size        = 0;
+    inputs.request_id                = torch::tensor({(int64_t)request_id_val}, torch::kInt64);
+    inputs.request_pd_separation     = torch::tensor({true}, torch::kBool);
+    inputs.cache_keys                = {cache_key_string};
+    inputs.tokens_per_block          = tokens_per_block;
+    inputs.kv_block_stride_bytes     = kv_stride;
+    inputs.kv_scale_stride_bytes     = kv_scale_stride;
+    inputs.pd_separation             = true;
+    inputs.model_id                  = 0;
+    inputs.decode_entrance           = false;
+    inputs.warmup                    = false;
+    inputs.use_opaque_kv_cache_store = use_opaque_kv_cache_store;
+    inputs.layer_id                  = 0;
+    inputs.group_id                  = group_id;
+    inputs.tag                       = tag;
+    return inputs;
+}
+
+}  // namespace
+
 // =============================================================================
 // Test fixture: PD sep KV cache release correctness
 // Validates that holdKVCacheForPDSep / releaseKVCacheForPDSep / releaseResource
@@ -44,10 +302,56 @@ class PdSepKVCacheReleaseTest: public DeviceTestBase {
                                               rtp_llm::DataType::TYPE_INT8);
     }
 
+    CacheConfig makeDsv4Config(uint32_t block_num               = 16,
+                               uint32_t seq_size_per_block      = kDsv4TokensPerBlock,
+                               uint32_t kernel_seq_size_per_blk = kDsv4TokensPerBlock) {
+        ModelConfig mc;
+        mc.num_layers                   = 43;
+        mc.hidden_size                  = 4096;
+        mc.attn_config.head_num         = 64;
+        mc.attn_config.kv_head_num      = 1;
+        mc.attn_config.size_per_head    = 512;
+        mc.attn_config.rope_head_dim    = 64;
+        mc.attn_config.sliding_window   = 128;
+        mc.attn_config.indexer_head_dim = 128;
+        mc.attn_config.indexer_head_num = 64;
+        mc.attn_config.indexer_topk     = 512;
+        mc.attn_config.o_groups         = 8;
+        mc.attn_config.o_lora_rank      = 1024;
+        std::vector<int> ratios         = {0, 0};
+        for (int i = 2; i < 43; ++i) {
+            ratios.push_back((i % 2 == 0) ? 4 : 128);
+        }
+        ratios.push_back(0);  // MTP tail marker.
+        mc.attn_config.layer_compress_ratios                       = ratios;
+        mc.hybrid_attention_config.enable_hybrid_attention           = true;
+        mc.hybrid_attention_config.enable_independent_kv_cache_pools = true;
+        setDsv4KvCacheSpecs(mc);
+
+        ParallelismConfig pc;
+        KVCacheConfig     kv_config;
+        kv_config.seq_size_per_block        = seq_size_per_block;
+        kv_config.kernel_seq_size_per_block = kernel_seq_size_per_blk;
+        auto config                         = CacheConfigCreator::createBasicConfig(mc, pc, kv_config, false, 0);
+        config.block_num                    = block_num;
+        setGroupBlockNumsForTest(config, block_num);
+        return config;
+    }
+
     // Build a PREFILL stream with reuse_cache enabled
     void prepareStream(const std::vector<int>& input_tokens) {
-        auto cache_config = makeConfig();
-        cache_manager_    = std::make_shared<KVCacheManager>(cache_config, /*warmup=*/false, nullptr);
+        prepareStreamWithConfig(input_tokens, makeConfig(), /*tokens_per_block=*/8, RoleType::PREFILL);
+    }
+
+    void prepareDsv4Stream(const std::vector<int>& input_tokens, RoleType role_type = RoleType::PREFILL) {
+        prepareStreamWithConfig(input_tokens, makeDsv4Config(), static_cast<int>(kDsv4TokensPerBlock), role_type);
+    }
+
+    void prepareStreamWithConfig(const std::vector<int>& input_tokens,
+                                 const CacheConfig&      cache_config,
+                                 int                     tokens_per_block,
+                                 RoleType                role_type) {
+        cache_manager_ = std::make_shared<KVCacheManager>(cache_config, /*warmup=*/false, nullptr);
         ASSERT_TRUE(cache_manager_->init());
         initial_free_blocks_ = cache_manager_->freeBlocksNum();
 
@@ -55,7 +359,7 @@ class PdSepKVCacheReleaseTest: public DeviceTestBase {
         resource_context.cache_manager       = cache_manager_;
         resource_context.reuse_cache         = true;
         resource_context.enable_device_cache = true;
-        resource_context.role_type           = RoleType::PREFILL;
+        resource_context.role_type           = role_type;
 
         auto generate_input                   = std::make_shared<GenerateInput>();
         auto generate_config                  = std::make_shared<GenerateConfig>();
@@ -67,8 +371,8 @@ class PdSepKVCacheReleaseTest: public DeviceTestBase {
         generate_input->generate_config = generate_config;
 
         ModelConfig model_config;
-        model_config.attn_config.tokens_per_block = 8;
-        model_config.max_seq_len                  = 2048;
+        model_config.attn_config.tokens_per_block = tokens_per_block;
+        model_config.max_seq_len                  = std::max<int64_t>(2048, input_tokens.size() + tokens_per_block);
         RuntimeConfig runtime_config;
 
         stream_ = std::make_shared<NormalGenerateStream>(
@@ -328,4 +632,842 @@ TEST_F(PdSepKVCacheReleaseTest, testHoldWithoutReleasePDSep_ResourceReleasedStil
         << "Blocks should be freed once pd_kvcache_ref_ is dropped (minus device cache refs)";
 }
 
+TEST_F(PdSepKVCacheReleaseTest, testPrefillContextStopStream_ReleasesPDSepHold) {
+    prepareStream({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14});
+    allocateAndFinish();
+
+    auto& resource = stream_->streamCacheResource();
+    resource.holdKVCacheForPDSep();
+    ASSERT_NE(resource.pd_kvcache_ref_, nullptr);
+    ASSERT_GT(cache_manager_->allocator_->connectorRefBlocksNum(), 0);
+
+    RemoteServerResource remote_resource;
+    remote_resource.workers     = {"local"};
+    remote_resource.cache_store = std::make_shared<MemoryBackedCacheStore>();
+
+    GenerateInputPB request;
+    request.set_request_id(1001);
+    RPCContext                   rpc_context{&request, nullptr};
+    grpc::ServerContext          server_context;
+    kmonitor::MetricsReporterPtr metrics_reporter;
+    auto                         meta = std::make_shared<RpcServerRuntimeMeta>();
+
+    {
+        PrefillGenerateContext prefill_context(
+            &remote_resource, rpc_context, /*timeout_ms=*/0, &server_context, metrics_reporter, meta);
+        prefill_context.setStream(stream_);
+    }
+
+    EXPECT_EQ(resource.pd_kvcache_ref_, nullptr);
+    EXPECT_EQ(cache_manager_->allocator_->connectorRefBlocksNum(), 0);
+}
+
+TEST_F(PdSepKVCacheReleaseTest, testDsv4PDSepPrefillReleaseInsertsSevenGroupDeviceCache) {
+    const int        spb = static_cast<int>(kDsv4TokensPerBlock);
+    std::vector<int> tokens(3 * spb + 17);
+    std::iota(tokens.begin(), tokens.end(), 1);
+
+    auto config        = makeDsv4Config();
+    config.linear_step = 4;
+    prepareStreamWithConfig(tokens, config, spb, RoleType::PREFILL);
+    allocateAndFinish();
+
+    auto& resource = stream_->streamCacheResource();
+    ASSERT_EQ(resource.kvCache().groupNums(), kDsv4PoolNum);
+    ASSERT_GT(resource.curBlocksNum(), 0);
+    for (int gid = 0; gid < kDsv4PoolNum; ++gid) {
+        ASSERT_EQ(resource.kvCache().blocksNum(0, gid), 4) << "group " << gid;
+        const auto& blocks = resource.kvCache().blocks(0, gid);
+        if (config.typeForGroup(static_cast<size_t>(gid)) == CacheGroupType::FULL) {
+            EXPECT_FALSE(isNullBlockIdx(blocks[0])) << "paged group " << gid;
+        } else {
+            const int active_tail_blocks = config.policyForGroup(static_cast<size_t>(gid)).active_tail_blocks;
+            const int tail_begin         = std::max<int>(0, static_cast<int>(blocks.size()) - active_tail_blocks);
+            for (int block_idx = 0; block_idx < static_cast<int>(blocks.size()); ++block_idx) {
+                const bool expect_tail = block_idx >= tail_begin;
+                EXPECT_EQ(isNullBlockIdx(blocks[block_idx]), !expect_tail)
+                    << "tail group " << gid << " block " << block_idx;
+            }
+        }
+    }
+
+    resource.holdKVCacheForPDSep();
+    ASSERT_NE(resource.pd_kvcache_ref_, nullptr);
+
+    stream_->releaseResource();
+    EXPECT_TRUE(resource.resource_released_);
+    resource.releaseKVCacheForPDSep();
+    EXPECT_EQ(resource.pd_kvcache_ref_, nullptr);
+
+    ResourceContext resource_context2;
+    resource_context2.cache_manager       = cache_manager_;
+    resource_context2.reuse_cache         = true;
+    resource_context2.enable_device_cache = true;
+    resource_context2.role_type           = RoleType::PREFILL;
+
+    auto generate_input2                   = std::make_shared<GenerateInput>();
+    auto generate_config2                  = std::make_shared<GenerateConfig>();
+    generate_config2->num_return_sequences = 1;
+    generate_config2->reuse_cache          = true;
+    generate_config2->enable_device_cache  = true;
+    generate_input2->input_ids       = torch::tensor(std::vector<int32_t>(tokens.begin(), tokens.end()), torch::kInt32);
+    generate_input2->generate_config = generate_config2;
+
+    ModelConfig model_config;
+    model_config.attn_config.tokens_per_block = spb;
+    model_config.max_seq_len                  = 4096;
+    RuntimeConfig runtime_config;
+
+    auto stream2 = std::make_shared<NormalGenerateStream>(
+        generate_input2, model_config, runtime_config, resource_context2, nullptr);
+    stream2->generate_status_->status = StreamState::RUNNING;
+
+    auto& resource2 = stream2->streamCacheResource();
+    ASSERT_TRUE(resource2.initKVBlock().ok());
+    EXPECT_GE(stream2->reuseLength(), spb) << "DSV4 prefill should reuse cached 7-group prefix blocks";
+    EXPECT_EQ(resource2.kvCache().groupNums(), kDsv4PoolNum);
+
+    stream2->generate_status_->status = StreamState::FINISHED;
+    stream2->fillSubGenerateStatus(StreamState::FINISHED);
+    stream2->releaseResource();
+}
+
+TEST_F(PdSepKVCacheReleaseTest, testDsv4DecodeFirstMallocBypassesLocalDeviceReuseInPDSep) {
+    const int        spb = static_cast<int>(kDsv4TokensPerBlock);
+    std::vector<int> tokens(3 * spb + 17);
+    std::iota(tokens.begin(), tokens.end(), 1);
+
+    prepareDsv4Stream(tokens, RoleType::PREFILL);
+    allocateAndFinish();
+    auto& prefill_resource = stream_->streamCacheResource();
+    prefill_resource.holdKVCacheForPDSep();
+    stream_->releaseResource();
+    prefill_resource.releaseKVCacheForPDSep();
+
+    ResourceContext decode_resource_context;
+    decode_resource_context.cache_manager       = cache_manager_;
+    decode_resource_context.reuse_cache         = true;
+    decode_resource_context.enable_device_cache = true;
+    decode_resource_context.role_type           = RoleType::DECODE;
+
+    auto decode_input                   = std::make_shared<GenerateInput>();
+    auto decode_config                  = std::make_shared<GenerateConfig>();
+    decode_config->num_return_sequences = 1;
+    decode_config->reuse_cache          = true;
+    decode_config->enable_device_cache  = true;
+    decode_input->input_ids       = torch::tensor(std::vector<int32_t>(tokens.begin(), tokens.end()), torch::kInt32);
+    decode_input->generate_config = decode_config;
+
+    ModelConfig model_config;
+    model_config.attn_config.tokens_per_block = spb;
+    model_config.max_seq_len                  = 4096;
+    RuntimeConfig runtime_config;
+
+    auto decode_stream = std::make_shared<NormalGenerateStream>(
+        decode_input, model_config, runtime_config, decode_resource_context, nullptr);
+    decode_stream->generate_status_->status = StreamState::RUNNING;
+
+    auto& decode_resource = decode_stream->streamCacheResource();
+    ASSERT_TRUE(decode_resource.initKVBlock().ok());
+
+    EXPECT_EQ(decode_stream->reuseLength(), 0)
+        << "Hybrid DSV4 decode first malloc must not consume local device-cache reuse; PD load owns reuse.";
+    EXPECT_EQ(decode_resource.kvCache().groupNums(), kDsv4PoolNum);
+    for (int gid = 0; gid < kDsv4PoolNum; ++gid) {
+        EXPECT_EQ(decode_resource.kvCache().blocksNum(0, gid), 4) << "group " << gid;
+    }
+
+    decode_stream->releaseResource();
+}
+
+TEST_F(PdSepKVCacheReleaseTest, testDsv4CacheStorePDSepTransfersAllLayerRegions) {
+    const int     spb        = static_cast<int>(kDsv4TokensPerBlock);
+    const int     block_num  = 4;
+    const int64_t request_id = 9017;
+    const size_t  model_id   = 77;
+
+    auto config = makeDsv4Config(/*block_num=*/24);
+
+    auto makeResource = [&config]() {
+        auto resource = std::make_shared<BatchKVCacheResource>();
+        resource->resetBatchSize(1);
+        resource->initGroups(config.groupNums(),
+                             static_cast<int>(config.layer_all_num),
+                             config.layerGroupIdsSnapshot(),
+                             config.kernelBlocksPerKvBlock(),
+                             config.groupTypesSnapshot());
+        return resource;
+    };
+    auto makeCompleteTokens = [spb, block_num](int max_seq_len) {
+        auto input              = std::make_shared<GenerateInput>();
+        input->input_ids        = torch::arange(max_seq_len, torch::kInt32);
+        input->generate_config  = std::make_shared<GenerateConfig>();
+        auto complete_token_ids = std::make_shared<CompleteTokenIds>(1, 1, max_seq_len + spb, spb);
+        complete_token_ids->init(input);
+        complete_token_ids->setSeqLength(block_num * spb);
+        return complete_token_ids;
+    };
+
+    auto prefill_manager = std::make_shared<KVCacheManager>(config, /*warmup=*/false, nullptr);
+    auto decode_manager  = std::make_shared<KVCacheManager>(config, /*warmup=*/false, nullptr);
+    ASSERT_TRUE(prefill_manager->init());
+    ASSERT_TRUE(decode_manager->init());
+
+    auto prefill_resource = makeResource();
+    auto decode_resource  = makeResource();
+    ASSERT_TRUE(
+        prefill_manager->malloc({prefill_resource, makeCompleteTokens(block_num * spb), request_id, true, false, false})
+            .success);
+    ASSERT_TRUE(
+        decode_manager->malloc({decode_resource, makeCompleteTokens(block_num * spb), request_id, true, false, false})
+            .success);
+
+    std::vector<CacheKeyType> cache_keys;
+    std::vector<std::string>  cache_key_strings;
+    for (int i = 0; i < block_num; ++i) {
+        cache_keys.push_back(10000 + i);
+        cache_key_strings.push_back(std::to_string(cache_keys.back()));
+    }
+
+    for (int layer_id = 0; layer_id < 4; ++layer_id) {
+        for (int gid : config.groupIdsForLayer(layer_id)) {
+            auto positions   = dsv4BlockPositionsForCacheTransfer(config, gid, block_num, /*reuse_block_size=*/0);
+            for (auto block_pos : positions) {
+                auto prefill_block_id = prefill_resource->blocks(0, gid)[block_pos];
+                auto decode_block_id  = decode_resource->blocks(0, gid)[block_pos];
+                ASSERT_FALSE(isNullBlockIdx(prefill_block_id)) << "prefill gid=" << gid << " pos=" << block_pos;
+                ASSERT_FALSE(isNullBlockIdx(decode_block_id)) << "decode gid=" << gid << " pos=" << block_pos;
+                fillDsv4RegionBytes(
+                    prefill_manager, prefill_block_id, layer_id, gid, dsv4PdPattern(layer_id, gid, block_pos));
+                fillDsv4RegionBytes(decode_manager, decode_block_id, layer_id, gid, 0xEE);
+            }
+        }
+    }
+    runtimeSyncAndCheck();
+
+    auto group_types_tensor = groupTypesTensorForConfig(config);
+
+    auto cache_store = std::make_shared<MemoryBackedCacheStore>();
+    auto layout      = prefill_manager->getMainModelCacheLayerLayout();
+    for (int layer_id = 0; layer_id < 4; ++layer_id) {
+        for (int gid : config.groupIdsForLayer(layer_id)) {
+            auto tag = config.tagForGroup(static_cast<size_t>(gid));
+            auto group_idx  = static_cast<size_t>(gid);
+            ASSERT_TRUE(layout.layers_to_kv_buffer_ptrs_by_group[layer_id][group_idx].defined())
+                << "layer=" << layer_id << " region=" << group_idx;
+
+            CacheStoreInputs inputs;
+            inputs.input_lengths_host                  = torch::tensor({block_num * spb}, torch::kInt32);
+            inputs.prefix_lengths_host                 = torch::tensor({0}, torch::kInt32);
+            inputs.host_kv_cache_offset                = blockIdsTensor(prefill_resource, gid);
+            inputs.kv_cache_group_types_host           = group_types_tensor;
+            inputs.context_batch_size                  = 1;
+            inputs.decoder_batch_size                  = 0;
+            inputs.request_id                          = torch::tensor({request_id}, torch::kInt64);
+            inputs.request_pd_separation               = torch::tensor({true}, torch::kBool);
+            inputs.cache_keys                          = cache_key_strings;
+            inputs.tokens_per_block                    = spb;
+            inputs.kv_block_stride_bytes               = config.kvBlockStrideBytesForGroup(static_cast<size_t>(gid));
+            inputs.kv_scale_stride_bytes               = 0;
+            inputs.pd_separation                       = true;
+            inputs.model_id                            = model_id;
+            inputs.decode_entrance                     = false;
+            inputs.warmup                              = false;
+            inputs.use_opaque_kv_cache_store           = config.use_opaque_kv_cache_store;
+            inputs.layer_id                            = layer_id;
+            inputs.group_id                            = gid;
+            inputs.tag                                 = tag;
+
+            KvCacheInfo kv_cache_info;
+            kv_cache_info.kv_cache_buffer = layout.layers_to_kv_buffer_ptrs_by_group[layer_id][group_idx];
+            runtimeWriteCacheStore(inputs, kv_cache_info, /*mla_kvcache=*/false, cache_store);
+        }
+    }
+    ASSERT_EQ(cache_store->store_request_keys_.size(), 10u);
+    ASSERT_EQ(cache_store->stored_blocks_.size(),
+              expectedDsv4StoredBlocks(config, /*layer_num=*/4, block_num, /*reuse_block_size=*/0));
+
+    EngineInitParams params;
+    params.model_id                 = model_id;
+    params.model_config_.num_layers = 4;
+    params.parallelism_config       = ParallelismConfig();
+
+    DecodeRpcServer server;
+    server.engine_                   = std::make_shared<MinimalEngine>(params, decode_manager);
+    server.maga_init_params_         = params;
+    server.propose_maga_init_params_ = nullptr;
+    server.resource_.cache_store     = cache_store;
+
+    std::vector<std::string>            peer_addrs = {"127.0.0.1:12345:12346"};
+    grpc::ServerContext                 server_context;
+    DecodeRpcServer::LoadKVCacheContext load_context(request_id,
+                                                     "dsv4-cache-store-pd",
+                                                     peer_addrs,
+                                                     cache_keys,
+                                                     decode_resource->groupBlocks(),
+                                                     /*reuse_block_size=*/0,
+                                                     /*timeout_ms=*/5000,
+                                                     /*partition_count=*/1,
+                                                     /*partition_id=*/0,
+                                                     &server_context);
+    auto                                status = server.loadCache(load_context);
+    ASSERT_TRUE(status.ok()) << status.ToString();
+
+    EXPECT_EQ(cache_store->load_buffer_requests_.size(), 10u);
+    EXPECT_EQ(cache_store->load_request_keys_.size(), 10u);
+    for (int layer_id = 0; layer_id < 4; ++layer_id) {
+        for (int gid : config.groupIdsForLayer(layer_id)) {
+            auto positions   = dsv4BlockPositionsForCacheTransfer(config, gid, block_num, /*reuse_block_size=*/0);
+            for (auto block_pos : positions) {
+                auto decode_block_id = decode_resource->blocks(0, gid)[block_pos];
+                ASSERT_FALSE(isNullBlockIdx(decode_block_id));
+                expectDsv4RegionBytes(
+                    decode_manager, decode_block_id, layer_id, gid, dsv4PdPattern(layer_id, gid, block_pos));
+            }
+        }
+    }
+}
+
+TEST_F(PdSepKVCacheReleaseTest, testDsv4DecoupledCacheStoreTransfersPhysicalBlocks) {
+    const int     spb        = 8192;
+    const int     kernel_spb = 128;
+    const int     block_num  = 2;
+    const int64_t request_id = 9020;
+    const size_t  model_id   = 80;
+
+    auto config = makeDsv4Config(/*block_num=*/8, spb, kernel_spb);
+
+    auto makeResource = [&config]() {
+        auto resource = std::make_shared<BatchKVCacheResource>();
+        resource->resetBatchSize(1);
+        resource->initGroups(config.groupNums(),
+                             static_cast<int>(config.layer_all_num),
+                             config.layerGroupIdsSnapshot(),
+                             config.kernelBlocksPerKvBlock(),
+                             config.groupTypesSnapshot());
+        return resource;
+    };
+    auto makeCompleteTokens = [spb, block_num](int max_seq_len) {
+        auto input              = std::make_shared<GenerateInput>();
+        input->input_ids        = torch::arange(max_seq_len, torch::kInt32);
+        input->generate_config  = std::make_shared<GenerateConfig>();
+        auto complete_token_ids = std::make_shared<CompleteTokenIds>(1, 1, max_seq_len + spb, spb);
+        complete_token_ids->init(input);
+        complete_token_ids->setSeqLength(block_num * spb);
+        return complete_token_ids;
+    };
+
+    auto prefill_manager = std::make_shared<KVCacheManager>(config, /*warmup=*/false, nullptr);
+    auto decode_manager  = std::make_shared<KVCacheManager>(config, /*warmup=*/false, nullptr);
+    ASSERT_TRUE(prefill_manager->init());
+    ASSERT_TRUE(decode_manager->init());
+
+    auto prefill_resource = makeResource();
+    auto decode_resource  = makeResource();
+    ASSERT_TRUE(
+        prefill_manager->malloc({prefill_resource, makeCompleteTokens(block_num * spb), request_id, true, false, false})
+            .success);
+    ASSERT_TRUE(
+        decode_manager->malloc({decode_resource, makeCompleteTokens(block_num * spb), request_id, true, false, false})
+            .success);
+
+    std::vector<CacheKeyType> cache_keys;
+    std::vector<std::string>  cache_key_strings;
+    for (int i = 0; i < block_num; ++i) {
+        cache_keys.push_back(20000 + i);
+        cache_key_strings.push_back(std::to_string(cache_keys.back()));
+    }
+
+    for (int layer_id = 0; layer_id < 4; ++layer_id) {
+        for (int gid : config.groupIdsForLayer(layer_id)) {
+            auto positions   = dsv4BlockPositionsForCacheTransfer(config, gid, block_num, /*reuse_block_size=*/0);
+            for (auto block_pos : positions) {
+                auto prefill_block_id = prefill_resource->blocks(0, gid)[block_pos];
+                auto decode_block_id  = decode_resource->blocks(0, gid)[block_pos];
+                ASSERT_FALSE(isNullBlockIdx(prefill_block_id)) << "prefill gid=" << gid << " pos=" << block_pos;
+                ASSERT_FALSE(isNullBlockIdx(decode_block_id)) << "decode gid=" << gid << " pos=" << block_pos;
+                fillDsv4RegionBytes(
+                    prefill_manager, prefill_block_id, layer_id, gid, dsv4PdPattern(layer_id, gid, block_pos));
+                fillDsv4RegionBytes(decode_manager, decode_block_id, layer_id, gid, 0xEE);
+            }
+        }
+    }
+    runtimeSyncAndCheck();
+
+    auto group_types_tensor = groupTypesTensorForConfig(config);
+
+    auto cache_store = std::make_shared<MemoryBackedCacheStore>();
+    auto layout      = prefill_manager->getMainModelCacheLayerLayout();
+    for (int layer_id = 0; layer_id < 4; ++layer_id) {
+        for (int gid : config.groupIdsForLayer(layer_id)) {
+            auto tag        = config.tagForGroup(static_cast<size_t>(gid));
+            auto group_idx  = static_cast<size_t>(gid);
+            ASSERT_TRUE(layout.layers_to_kv_buffer_ptrs_by_group[layer_id][group_idx].defined())
+                << "layer=" << layer_id << " group=" << group_idx;
+
+            torch_ext::PyCacheStoreInputs inputs;
+            inputs.context_batch_size             = 1;
+            inputs.decoder_batch_size             = 0;
+            inputs.request_id                     = torch::tensor({request_id}, torch::kInt64);
+            inputs.request_pd_separation          = torch::tensor({true}, torch::kBool);
+            inputs.kv_cache_group_types           = group_types_tensor;
+            inputs.cache_keys                     = cache_key_strings;
+            inputs.input_lengths_host             = torch::tensor({block_num * spb}, torch::kInt32);
+            inputs.prefix_lengths_host            = torch::tensor({0}, torch::kInt32);
+            inputs.tokens_per_block               = spb;
+            inputs.kv_block_stride_bytes          = config.kv_block_stride_bytes;
+            inputs.kv_scale_stride_bytes          = 0;
+            inputs.pd_separation                  = true;
+            inputs.model_id                       = model_id;
+            inputs.decode_entrance                = false;
+            inputs.warmup                         = false;
+            inputs.use_opaque_kv_cache_store      = config.use_opaque_kv_cache_store;
+            inputs.mla_kvcache                    = false;
+            inputs.cache_store                    = cache_store;
+
+            torch_ext::LayerKVCache layer_cache;
+            layer_cache.kv_cache_base      = layout.layers_to_kv_buffer_ptrs_by_group[layer_id][group_idx];
+            layer_cache.seq_size_per_block = config.typeForGroup(static_cast<size_t>(gid)) == CacheGroupType::FULL ? kernel_spb : spb;
+            layer_cache.layer_id           = layer_id;
+            layer_cache.group_id           = gid;
+            layer_cache.tag                = tag;
+
+            WriteCacheStoreOp(inputs.input_lengths_host,
+                              inputs.prefix_lengths_host,
+                              blockIdsTensor(prefill_resource, gid),
+                              inputs,
+                              layer_cache);
+        }
+    }
+
+    const auto first_csa_key = "kv_" + makeCacheKey(model_id, cache_key_strings[0], /*layer_id=*/2, "csa_kv");
+    ASSERT_NE(cache_store->stored_blocks_.find(first_csa_key), cache_store->stored_blocks_.end());
+    EXPECT_EQ(cache_store->stored_blocks_[first_csa_key].size(),
+              config.kvBlockStrideBytesForGroup(static_cast<size_t>(config.groupIdForTag("csa_kv"))));
+
+    EngineInitParams params;
+    params.model_id                 = model_id;
+    params.model_config_.num_layers = 4;
+    params.parallelism_config       = ParallelismConfig();
+
+    DecodeRpcServer server;
+    server.engine_                   = std::make_shared<MinimalEngine>(params, decode_manager);
+    server.maga_init_params_         = params;
+    server.propose_maga_init_params_ = nullptr;
+    server.resource_.cache_store     = cache_store;
+
+    std::vector<std::string>            peer_addrs = {"127.0.0.1:12345:12346"};
+    grpc::ServerContext                 server_context;
+    DecodeRpcServer::LoadKVCacheContext load_context(request_id,
+                                                     "dsv4-decoupled-cache-store-pd",
+                                                     peer_addrs,
+                                                     cache_keys,
+                                                     decode_resource->groupBlocks(),
+                                                     /*reuse_block_size=*/0,
+                                                     /*timeout_ms=*/5000,
+                                                     /*partition_count=*/1,
+                                                     /*partition_id=*/0,
+                                                     &server_context);
+    auto                                status = server.loadCache(load_context);
+    ASSERT_TRUE(status.ok()) << status.ToString();
+
+    for (int layer_id = 0; layer_id < 4; ++layer_id) {
+        for (int gid : config.groupIdsForLayer(layer_id)) {
+            auto positions   = dsv4BlockPositionsForCacheTransfer(config, gid, block_num, /*reuse_block_size=*/0);
+            for (auto block_pos : positions) {
+                auto decode_block_id = decode_resource->blocks(0, gid)[block_pos];
+                ASSERT_FALSE(isNullBlockIdx(decode_block_id));
+                expectDsv4RegionBytes(
+                    decode_manager, decode_block_id, layer_id, gid, dsv4PdPattern(layer_id, gid, block_pos));
+            }
+        }
+    }
+}
+
+TEST_F(PdSepKVCacheReleaseTest, testDsv4CacheStorePDSepTransfersAllLayerRegionsWithPrefixReuse) {
+    const int     spb        = static_cast<int>(kDsv4TokensPerBlock);
+    const int     block_num  = 4;
+    const int     reuse_num  = 1;
+    const int64_t request_id = 9018;
+    const size_t  model_id   = 78;
+
+    auto config = makeDsv4Config(/*block_num=*/24);
+
+    auto makeResource = [&config]() {
+        auto resource = std::make_shared<BatchKVCacheResource>();
+        resource->resetBatchSize(1);
+        resource->initGroups(config.groupNums(),
+                             static_cast<int>(config.layer_all_num),
+                             config.layerGroupIdsSnapshot(),
+                             config.kernelBlocksPerKvBlock(),
+                             config.groupTypesSnapshot());
+        return resource;
+    };
+    auto makeCompleteTokens = [spb, block_num](int max_seq_len) {
+        auto input              = std::make_shared<GenerateInput>();
+        input->input_ids        = torch::arange(max_seq_len, torch::kInt32);
+        input->generate_config  = std::make_shared<GenerateConfig>();
+        auto complete_token_ids = std::make_shared<CompleteTokenIds>(1, 1, max_seq_len + spb, spb);
+        complete_token_ids->init(input);
+        complete_token_ids->setSeqLength(block_num * spb);
+        return complete_token_ids;
+    };
+
+    auto prefill_manager = std::make_shared<KVCacheManager>(config, /*warmup=*/false, nullptr);
+    auto decode_manager  = std::make_shared<KVCacheManager>(config, /*warmup=*/false, nullptr);
+    ASSERT_TRUE(prefill_manager->init());
+    ASSERT_TRUE(decode_manager->init());
+
+    auto prefill_resource = makeResource();
+    auto decode_resource  = makeResource();
+    ASSERT_TRUE(
+        prefill_manager->malloc({prefill_resource, makeCompleteTokens(block_num * spb), request_id, true, false, false})
+            .success);
+    ASSERT_TRUE(
+        decode_manager->malloc({decode_resource, makeCompleteTokens(block_num * spb), request_id, true, false, false})
+            .success);
+
+    std::vector<CacheKeyType> cache_keys;
+    std::vector<std::string>  cache_key_strings;
+    for (int i = 0; i < block_num; ++i) {
+        cache_keys.push_back(11000 + i);
+        cache_key_strings.push_back(std::to_string(cache_keys.back()));
+    }
+
+    for (int layer_id = 0; layer_id < 4; ++layer_id) {
+        for (int gid : config.groupIdsForLayer(layer_id)) {
+            auto positions   = dsv4BlockPositionsForCacheTransfer(config, gid, block_num, reuse_num);
+            for (auto block_pos : positions) {
+                auto prefill_block_id = prefill_resource->blocks(0, gid)[block_pos];
+                auto decode_block_id  = decode_resource->blocks(0, gid)[block_pos];
+                ASSERT_FALSE(isNullBlockIdx(prefill_block_id)) << "prefill gid=" << gid << " pos=" << block_pos;
+                ASSERT_FALSE(isNullBlockIdx(decode_block_id)) << "decode gid=" << gid << " pos=" << block_pos;
+                fillDsv4RegionBytes(
+                    prefill_manager, prefill_block_id, layer_id, gid, dsv4PdPattern(layer_id, gid, block_pos));
+                fillDsv4RegionBytes(decode_manager, decode_block_id, layer_id, gid, 0xEE);
+            }
+        }
+    }
+    runtimeSyncAndCheck();
+
+    auto group_types_tensor = groupTypesTensorForConfig(config);
+
+    auto cache_store = std::make_shared<MemoryBackedCacheStore>();
+    auto layout      = prefill_manager->getMainModelCacheLayerLayout();
+    for (int layer_id = 0; layer_id < 4; ++layer_id) {
+        for (int gid : config.groupIdsForLayer(layer_id)) {
+            auto tag        = config.tagForGroup(static_cast<size_t>(gid));
+            auto group_idx  = static_cast<size_t>(gid);
+            ASSERT_TRUE(layout.layers_to_kv_buffer_ptrs_by_group[layer_id][group_idx].defined())
+                << "layer=" << layer_id << " group=" << group_idx;
+
+            CacheStoreInputs inputs;
+            inputs.input_lengths_host                  = torch::tensor({(block_num - reuse_num) * spb}, torch::kInt32);
+            inputs.prefix_lengths_host                 = torch::tensor({reuse_num * spb}, torch::kInt32);
+            inputs.host_kv_cache_offset                = blockIdsTensor(prefill_resource, gid);
+            inputs.kv_cache_group_types_host           = group_types_tensor;
+            inputs.context_batch_size                  = 1;
+            inputs.decoder_batch_size                  = 0;
+            inputs.request_id                          = torch::tensor({request_id}, torch::kInt64);
+            inputs.request_pd_separation               = torch::tensor({true}, torch::kBool);
+            inputs.cache_keys                          = cache_key_strings;
+            inputs.tokens_per_block                    = spb;
+            inputs.kv_block_stride_bytes               = config.kvBlockStrideBytesForGroup(static_cast<size_t>(gid));
+            inputs.kv_scale_stride_bytes               = 0;
+            inputs.pd_separation                       = true;
+            inputs.model_id                            = model_id;
+            inputs.decode_entrance                     = false;
+            inputs.warmup                              = false;
+            inputs.use_opaque_kv_cache_store           = config.use_opaque_kv_cache_store;
+            inputs.layer_id                            = layer_id;
+            inputs.group_id                            = gid;
+            inputs.tag                                 = tag;
+
+            KvCacheInfo kv_cache_info;
+            kv_cache_info.kv_cache_buffer = layout.layers_to_kv_buffer_ptrs_by_group[layer_id][group_idx];
+            runtimeWriteCacheStore(inputs, kv_cache_info, /*mla_kvcache=*/false, cache_store);
+        }
+    }
+    ASSERT_EQ(cache_store->store_request_keys_.size(), 10u);
+    ASSERT_EQ(cache_store->stored_blocks_.size(),
+              expectedDsv4StoredBlocks(config, /*layer_num=*/4, block_num, reuse_num));
+
+    EngineInitParams params;
+    params.model_id                 = model_id;
+    params.model_config_.num_layers = 4;
+    params.parallelism_config       = ParallelismConfig();
+
+    DecodeRpcServer server;
+    server.engine_                   = std::make_shared<MinimalEngine>(params, decode_manager);
+    server.maga_init_params_         = params;
+    server.propose_maga_init_params_ = nullptr;
+    server.resource_.cache_store     = cache_store;
+
+    std::vector<std::string>            peer_addrs = {"127.0.0.1:12345:12346"};
+    grpc::ServerContext                 server_context;
+    DecodeRpcServer::LoadKVCacheContext load_context(request_id,
+                                                     "dsv4-cache-store-pd-prefix-reuse",
+                                                     peer_addrs,
+                                                     cache_keys,
+                                                     decode_resource->groupBlocks(),
+                                                     reuse_num,
+                                                     /*timeout_ms=*/5000,
+                                                     /*partition_count=*/1,
+                                                     /*partition_id=*/0,
+                                                     &server_context);
+    auto                                status = server.loadCache(load_context);
+    ASSERT_TRUE(status.ok()) << status.ToString();
+
+    EXPECT_EQ(cache_store->load_buffer_requests_.size(), 10u);
+    EXPECT_EQ(cache_store->load_request_keys_.size(), 10u);
+    for (int layer_id = 0; layer_id < 4; ++layer_id) {
+        for (int gid : config.groupIdsForLayer(layer_id)) {
+            auto positions   = dsv4BlockPositionsForCacheTransfer(config, gid, block_num, reuse_num);
+            for (auto block_pos : positions) {
+                auto decode_block_id = decode_resource->blocks(0, gid)[block_pos];
+                ASSERT_FALSE(isNullBlockIdx(decode_block_id));
+                expectDsv4RegionBytes(
+                    decode_manager, decode_block_id, layer_id, gid, dsv4PdPattern(layer_id, gid, block_pos));
+            }
+        }
+    }
+}
+
+// =============================================================================
+// Test: runtimeWriteCacheStore with pinned-host metadata + event sync
+// Verifies that when metadata tensors (input_lengths, prefix_lengths) are
+// prepared on pinned host via async D2H and a pre_created_event is attached,
+// runtimeWriteCacheStore waits for the event and reads metadata correctly —
+// the same path used by the optimized WriteCacheStoreOp that avoids
+// synchronous .cpu() calls on background threads.
+// =============================================================================
+TEST_F(PdSepKVCacheReleaseTest, testWriteCacheStoreWithPinnedHostMetadataAndEvent) {
+    auto config  = makeConfig();  // 3 layers, 16 blocks, 8 tokens/block, INT8
+    auto manager = std::make_shared<KVCacheManager>(config, /*warmup=*/false, nullptr);
+    ASSERT_TRUE(manager->init());
+
+    const int spb            = 8;
+    const int block_num      = 2;
+    const int input_length   = block_num * spb;
+    const int request_id_val = 42;
+
+    // Allocate KV blocks.
+    auto resource = std::make_shared<BatchKVCacheResource>();
+    resource->resetBatchSize(1);
+    resource->initGroups(config.groupNums(),
+                         static_cast<int>(config.layer_all_num),
+                         config.layerGroupIdsSnapshot(),
+                         config.kernelBlocksPerKvBlock(),
+                         config.groupTypesSnapshot());
+
+    auto input              = std::make_shared<GenerateInput>();
+    input->input_ids        = torch::arange(input_length, torch::kInt32);
+    input->generate_config  = std::make_shared<GenerateConfig>();
+    auto complete_token_ids = std::make_shared<CompleteTokenIds>(1, 1, input_length + spb, spb);
+    complete_token_ids->init(input);
+    complete_token_ids->setSeqLength(input_length);
+
+    auto result = manager->malloc({resource, complete_token_ids, request_id_val, true, false, false});
+    ASSERT_TRUE(result.success);
+
+    // Fill KV cache blocks with a known pattern so MemoryBackedCacheStore can
+    // verify the transfer.
+    auto layout = manager->getMainModelCacheLayerLayout();
+    for (int layer_id = 0; layer_id < 3; ++layer_id) {
+        auto buf = layout.layers_to_kv_buffer_ptrs[layer_id];
+        ASSERT_TRUE(buf.defined());
+        for (int b = 0; b < block_num; ++b) {
+            auto bid       = resource->blocks(0, 0)[b];
+            auto kv_stride = config.kv_block_stride_bytes;
+            ASSERT_FALSE(isNullBlockIdx(bid));
+            auto device_slice = torch::from_blob((uint8_t*)buf.data_ptr() + bid * kv_stride,
+                                                 {(int64_t)kv_stride},
+                                                 torch::TensorOptions(torch::kUInt8).device(torch::kCUDA));
+            device_slice.fill_(static_cast<uint8_t>(layer_id * 10 + b));
+        }
+    }
+    runtimeSyncAndCheck();
+
+    // Prepare cache key strings (one per block).
+    std::vector<std::string> cache_key_strings;
+    for (int i = 0; i < block_num; ++i) {
+        cache_key_strings.push_back(std::to_string(10000 + i));
+    }
+
+    // --- Core of the test: async D2H to pinned host, then event ---
+    // Create device tensors (mimicking what buildPyAttentionInputs produces).
+    auto input_lengths_device  = torch::tensor({input_length}, torch::kInt32).cuda();
+    auto prefix_lengths_device = torch::tensor({0}, torch::kInt32).cuda();
+
+    // Async-copy to pinned host (mimicking prepareWriteCacheParams).
+    auto pinned_i32          = torch::TensorOptions(torch::kInt32).pinned_memory(true);
+    auto input_lengths_host  = torch::empty({1}, pinned_i32);
+    auto prefix_lengths_host = torch::empty({1}, pinned_i32);
+    input_lengths_host.copy_(input_lengths_device, /*non_blocking=*/true);
+    prefix_lengths_host.copy_(prefix_lengths_device, /*non_blocking=*/true);
+
+    // Record event AFTER async D2H on the current stream.
+    auto event = runtimeCreateEvent();
+
+    // --- Call runtimeWriteCacheStore (event->synchronize() inside) ---
+    auto cache_store = std::make_shared<MemoryBackedCacheStore>();
+    auto block_ids   = torch::from_blob(const_cast<int*>(resource->blocks(0, 0).data()),
+                                        {1, (int64_t)resource->blocks(0, 0).size()},
+                                      torch::kInt32)
+                         .clone();
+
+    for (int layer_id = 0; layer_id < 3; ++layer_id) {
+        CacheStoreInputs inputs;
+        inputs.input_lengths_host        = input_lengths_host;
+        inputs.prefix_lengths_host       = prefix_lengths_host;
+        inputs.host_kv_cache_offset      = block_ids;
+        inputs.context_batch_size        = 1;
+        inputs.decoder_batch_size        = 0;
+        inputs.request_id                = torch::tensor({(int64_t)request_id_val}, torch::kInt64);
+        inputs.request_pd_separation     = torch::tensor({true}, torch::kBool);
+        inputs.cache_keys                = cache_key_strings;
+        inputs.tokens_per_block          = spb;
+        inputs.kv_block_stride_bytes     = config.kv_block_stride_bytes;
+        inputs.kv_scale_stride_bytes     = 0;
+        inputs.pd_separation             = true;
+        inputs.model_id                  = 0;
+        inputs.decode_entrance           = false;
+        inputs.warmup                    = false;
+        inputs.use_opaque_kv_cache_store = false;
+        inputs.layer_id                  = layer_id;
+        inputs.group_id                  = 0;
+        inputs.tag                       = "";
+        inputs.pre_created_event         = event;
+
+        KvCacheInfo kv_cache_info;
+        kv_cache_info.kv_cache_buffer = layout.layers_to_kv_buffer_ptrs[layer_id];
+        runtimeWriteCacheStore(inputs, kv_cache_info, /*mla_kvcache=*/false, cache_store);
+    }
+
+    // Verify: cache store received correct request key for all 3 layers.
+    EXPECT_EQ(cache_store->store_request_keys_.size(), 3u);
+    // MHA (non-opaque, non-mla) splits each block into k + v → 2 entries per block.
+    EXPECT_EQ(cache_store->stored_blocks_.size(), 3u * block_num * 2u);
+
+    // Verify stored data matches the pattern we filled.
+    for (int layer_id = 0; layer_id < 3; ++layer_id) {
+        for (int b = 0; b < block_num; ++b) {
+            auto k_key = "k_" + makeCacheKey(0, cache_key_strings[b], layer_id);
+            auto it    = cache_store->stored_blocks_.find(k_key);
+            ASSERT_NE(it, cache_store->stored_blocks_.end()) << "missing key: " << k_key;
+            uint8_t expected = static_cast<uint8_t>(layer_id * 10 + b);
+            EXPECT_EQ(it->second[0], expected) << "layer=" << layer_id << " block=" << b << " first byte mismatch";
+        }
+    }
+}
+
+TEST_F(PdSepKVCacheReleaseTest, testWriteCacheStoreUsesTensorDeviceForCpuKvBuffer) {
+    const int         spb              = 8;
+    const int         kv_stride        = 64;
+    const int         request_id_val   = 4242;
+    const std::string cache_key_string = "10000";
+
+    auto kv_options = torch::TensorOptions(torch::kUInt8).device(torch::kCPU).pinned_memory(true);
+    auto kv_buffer  = torch::empty({2, kv_stride}, kv_options);
+    kv_buffer[1].fill_(static_cast<uint8_t>(123));
+
+    auto inputs =
+        makeSingleBlockWriteInputs(cache_key_string, request_id_val, spb, kv_stride, 0, true, 0, "csa_state");
+
+    KvCacheInfo kv_cache_info;
+    kv_cache_info.kv_cache_buffer = kv_buffer;
+
+    auto cache_store = std::make_shared<MemoryBackedCacheStore>();
+    runtimeWriteCacheStore(inputs, kv_cache_info, /*mla_kvcache=*/false, cache_store);
+
+    const auto key = "kv_" + makeCacheKey(0, cache_key_string, 0, "csa_state");
+    auto       it  = cache_store->stored_blocks_.find(key);
+    ASSERT_NE(it, cache_store->stored_blocks_.end());
+    ASSERT_EQ(it->second.size(), static_cast<size_t>(kv_stride));
+    EXPECT_EQ(it->second[0], static_cast<uint8_t>(123));
+
+    ASSERT_EQ(cache_store->store_buffer_requests_.size(), 1u);
+    auto blocks   = cache_store->store_buffer_requests_.front()->getBlocks();
+    auto block_it = blocks.find(key);
+    ASSERT_NE(block_it, blocks.end());
+    EXPECT_FALSE(block_it->second->gpu_mem);
+}
+
+TEST_F(PdSepKVCacheReleaseTest, testWriteCacheStoreUsesTensorDeviceForCpuSplitKvBuffer) {
+    const int         spb              = 8;
+    const int         kv_stride        = 64;
+    const int         kv_half          = kv_stride / 2;
+    const int         request_id_val   = 4243;
+    const std::string cache_key_string = "10001";
+
+    auto kv_options = torch::TensorOptions(torch::kUInt8).device(torch::kCPU).pinned_memory(true);
+    auto kv_buffer  = torch::empty({2, kv_stride}, kv_options);
+    auto block      = kv_buffer[1];
+    block.slice(0, 0, kv_half).fill_(static_cast<uint8_t>(17));
+    block.slice(0, kv_half, kv_stride).fill_(static_cast<uint8_t>(29));
+
+    auto inputs = makeSingleBlockWriteInputs(cache_key_string, request_id_val, spb, kv_stride, 0, false, 0, "");
+
+    KvCacheInfo kv_cache_info;
+    kv_cache_info.kv_cache_buffer = kv_buffer;
+
+    auto cache_store = std::make_shared<MemoryBackedCacheStore>();
+    runtimeWriteCacheStore(inputs, kv_cache_info, /*mla_kvcache=*/false, cache_store);
+
+    const auto cache_key = makeCacheKey(0, cache_key_string, 0);
+    const auto k_key     = "k_" + cache_key;
+    const auto v_key     = "v_" + cache_key;
+    auto       k_it      = cache_store->stored_blocks_.find(k_key);
+    auto       v_it      = cache_store->stored_blocks_.find(v_key);
+    ASSERT_NE(k_it, cache_store->stored_blocks_.end());
+    ASSERT_NE(v_it, cache_store->stored_blocks_.end());
+    ASSERT_EQ(k_it->second.size(), static_cast<size_t>(kv_half));
+    ASSERT_EQ(v_it->second.size(), static_cast<size_t>(kv_half));
+    EXPECT_EQ(k_it->second[0], static_cast<uint8_t>(17));
+    EXPECT_EQ(v_it->second[0], static_cast<uint8_t>(29));
+
+    ASSERT_EQ(cache_store->store_buffer_requests_.size(), 1u);
+    auto k_block = cache_store->store_buffer_requests_.front()->getBlock(k_key);
+    auto v_block = cache_store->store_buffer_requests_.front()->getBlock(v_key);
+    ASSERT_NE(k_block, nullptr);
+    ASSERT_NE(v_block, nullptr);
+    EXPECT_FALSE(k_block->gpu_mem);
+    EXPECT_FALSE(v_block->gpu_mem);
+}
+
+TEST_F(PdSepKVCacheReleaseTest, testWriteCacheStoreUsesTensorDeviceForCpuKvScaleBuffer) {
+    const int         spb              = 8;
+    const int         kv_stride        = 64;
+    const int         scale_stride     = 16;
+    const int         request_id_val   = 4244;
+    const std::string cache_key_string = "10002";
+
+    auto cpu_options     = torch::TensorOptions(torch::kUInt8).device(torch::kCPU).pinned_memory(true);
+    auto kv_buffer       = torch::empty({2, kv_stride}, cpu_options);
+    auto kv_scale_buffer = torch::empty({2, scale_stride}, cpu_options);
+    kv_buffer[1].fill_(static_cast<uint8_t>(41));
+    kv_scale_buffer[1].fill_(static_cast<uint8_t>(73));
+
+    auto inputs =
+        makeSingleBlockWriteInputs(cache_key_string, request_id_val, spb, kv_stride, scale_stride, true, 0, "csa_state");
+
+    KvCacheInfo kv_cache_info;
+    kv_cache_info.kv_cache_buffer = kv_buffer;
+    kv_cache_info.kv_scale_buffer = kv_scale_buffer;
+
+    auto cache_store = std::make_shared<MemoryBackedCacheStore>();
+    runtimeWriteCacheStore(inputs, kv_cache_info, /*mla_kvcache=*/false, cache_store);
+
+    const auto scale_key = "kv_scale_" + makeCacheKey(0, cache_key_string, 0, "csa_state");
+    auto       scale_it  = cache_store->stored_blocks_.find(scale_key);
+    ASSERT_NE(scale_it, cache_store->stored_blocks_.end());
+    ASSERT_EQ(scale_it->second.size(), static_cast<size_t>(scale_stride));
+    EXPECT_EQ(scale_it->second[0], static_cast<uint8_t>(73));
+
+    ASSERT_EQ(cache_store->store_buffer_requests_.size(), 1u);
+    auto scale_block = cache_store->store_buffer_requests_.front()->getBlock(scale_key);
+    ASSERT_NE(scale_block, nullptr);
+    EXPECT_FALSE(scale_block->gpu_mem);
+}
+
 }  // namespace rtp_llm
diff --git a/rtp_llm/cpp/metrics/RtpLLMMetrics.cc b/rtp_llm/cpp/metrics/RtpLLMMetrics.cc
index c097471529..e0727c9939 100644
--- a/rtp_llm/cpp/metrics/RtpLLMMetrics.cc
+++ b/rtp_llm/cpp/metrics/RtpLLMMetrics.cc
@@ -14,6 +14,8 @@ AUTIL_LOG_SETUP(rtp_llm, RtpEmbeddingGlobalMetrics);
 AUTIL_LOG_SETUP(rtp_llm, RtpEmbeddingStreamMetrics);
 AUTIL_LOG_SETUP(rtp_llm, RtpLLMSchedulerMetrics);
 AUTIL_LOG_SETUP(rtp_llm, RtpLLMCacheMetrics);
+AUTIL_LOG_SETUP(rtp_llm, RtpLLMCachePoolMetrics);
+AUTIL_LOG_SETUP(rtp_llm, RtpLLMCacheEvictionMetrics);
 AUTIL_LOG_SETUP(rtp_llm, RtpLLMCacheReuseMetrics);
 AUTIL_LOG_SETUP(rtp_llm, RtpLLMDeviceCacheReuseMetrics);
 AUTIL_LOG_SETUP(rtp_llm, RtpLLMExecutorMetrics);
@@ -352,6 +354,42 @@ void RtpLLMCacheMetrics::report(const kmonitor::MetricsTags* tags, RtpLLMCacheMe
     REPORT_MUTABLE_METRIC(mr_cost_time_ms_metric, collector->mr_cost_time_ms);
 }
 
+bool RtpLLMCachePoolMetrics::init(kmonitor::MetricsGroupManager* manager) {
+    REGISTER_GAUGE_MUTABLE_METRIC(free_blocks_metric, "rtp_llm_kv_cache_pool_free_blocks");
+    REGISTER_GAUGE_MUTABLE_METRIC(available_blocks_metric, "rtp_llm_kv_cache_pool_available_blocks");
+    REGISTER_GAUGE_MUTABLE_METRIC(request_ref_blocks_metric, "rtp_llm_kv_cache_pool_request_ref_blocks");
+    REGISTER_GAUGE_MUTABLE_METRIC(connector_ref_blocks_metric, "rtp_llm_kv_cache_pool_connector_ref_blocks");
+    REGISTER_GAUGE_MUTABLE_METRIC(total_blocks_metric, "rtp_llm_kv_cache_pool_total_blocks");
+    REGISTER_GAUGE_MUTABLE_METRIC(used_ratio_metric, "rtp_llm_kv_cache_pool_used_ratio");
+    return true;
+}
+
+void RtpLLMCachePoolMetrics::report(const kmonitor::MetricsTags* tags, RtpLLMCachePoolMetricsCollector* collector) {
+    REPORT_MUTABLE_METRIC(free_blocks_metric, collector->free_blocks);
+    REPORT_MUTABLE_METRIC(available_blocks_metric, collector->available_blocks);
+    REPORT_MUTABLE_METRIC(request_ref_blocks_metric, collector->request_ref_blocks);
+    REPORT_MUTABLE_METRIC(connector_ref_blocks_metric, collector->connector_ref_blocks);
+    REPORT_MUTABLE_METRIC(total_blocks_metric, collector->total_blocks);
+    REPORT_MUTABLE_METRIC(used_ratio_metric, collector->used_ratio);
+}
+
+bool RtpLLMCacheEvictionMetrics::init(kmonitor::MetricsGroupManager* manager) {
+    REGISTER_GAUGE_MUTABLE_METRIC(evicted_block_lifetime_ms_metric,
+                                  "rtp_llm_kv_cache_evicted_block_lifetime_ms");
+    REGISTER_GAUGE_MUTABLE_METRIC(evicted_block_count_metric, "rtp_llm_kv_cache_evicted_block_count");
+    return true;
+}
+
+void RtpLLMCacheEvictionMetrics::report(const kmonitor::MetricsTags*       tags,
+                                        RtpLLMCacheEvictionMetricsCollector* collector) {
+    if (collector->lifetime_ms >= 0) {
+        REPORT_MUTABLE_METRIC(evicted_block_lifetime_ms_metric, collector->lifetime_ms);
+    }
+    if (collector->evicted_block_count >= 0) {
+        REPORT_MUTABLE_METRIC(evicted_block_count_metric, collector->evicted_block_count);
+    }
+}
+
 bool RtpLLMRemoteCacheMatchMetrics::init(kmonitor::MetricsGroupManager* manager) {
     REGISTER_QPS_MUTABLE_METRIC(remote_match_qps_metric, "rtp_llm_remote_match_qps");
     REGISTER_QPS_MUTABLE_METRIC(remote_match_fail_qps_metric, "rtp_llm_remote_match_fail_qps");
diff --git a/rtp_llm/cpp/metrics/RtpLLMMetrics.h b/rtp_llm/cpp/metrics/RtpLLMMetrics.h
index 3c25b70bd8..b82c164924 100644
--- a/rtp_llm/cpp/metrics/RtpLLMMetrics.h
+++ b/rtp_llm/cpp/metrics/RtpLLMMetrics.h
@@ -434,6 +434,53 @@ class RtpLLMCacheMetrics: public kmonitor::MetricsGroup {
     AUTIL_LOG_DECLARE();
 };
 
+class RtpLLMCachePoolMetricsCollector final {
+public:
+    int64_t free_blocks          = 0;
+    int64_t available_blocks     = 0;
+    int64_t request_ref_blocks   = 0;
+    int64_t connector_ref_blocks = 0;
+    int64_t total_blocks         = 0;
+    int64_t reserve_blocks       = 0;
+    float   used_ratio           = 0;
+};
+
+class RtpLLMCachePoolMetrics: public kmonitor::MetricsGroup {
+public:
+    bool init(kmonitor::MetricsGroupManager* manager) override;
+    void report(const kmonitor::MetricsTags* tags, RtpLLMCachePoolMetricsCollector* collector);
+
+public:
+    kmonitor::MutableMetric* free_blocks_metric          = nullptr;
+    kmonitor::MutableMetric* available_blocks_metric     = nullptr;
+    kmonitor::MutableMetric* request_ref_blocks_metric   = nullptr;
+    kmonitor::MutableMetric* connector_ref_blocks_metric = nullptr;
+    kmonitor::MutableMetric* total_blocks_metric         = nullptr;
+    kmonitor::MutableMetric* used_ratio_metric           = nullptr;
+
+private:
+    AUTIL_LOG_DECLARE();
+};
+
+class RtpLLMCacheEvictionMetricsCollector final {
+public:
+    int64_t lifetime_ms          = -1;
+    int64_t evicted_block_count  = -1;
+};
+
+class RtpLLMCacheEvictionMetrics: public kmonitor::MetricsGroup {
+public:
+    bool init(kmonitor::MetricsGroupManager* manager) override;
+    void report(const kmonitor::MetricsTags* tags, RtpLLMCacheEvictionMetricsCollector* collector);
+
+public:
+    kmonitor::MutableMetric* evicted_block_lifetime_ms_metric = nullptr;
+    kmonitor::MutableMetric* evicted_block_count_metric       = nullptr;
+
+private:
+    AUTIL_LOG_DECLARE();
+};
+
 class RtpLLMCacheReuseMetricsCollector final {
 public:
     int64_t kv_cache_reuse_length = 0;
@@ -816,8 +863,10 @@ class RtpLLMMemoryCacheCopyMetricsCollector final {
 class RtpLLMMemoryCacheStatusMetricsCollector final {
 public:
     int64_t total_block_num     = 0;
-    int64_t allocated_block_num = 0;  // 在cache中的block数量
-    int64_t available_block_num = 0;  // 可用的block数量
+    int64_t allocated_block_num = 0;
+    int64_t available_block_num = 0;
+    int64_t item_num            = 0;
+    float   used_ratio          = 0;
 };
 
 class RtpLLMMemoryCacheMetrics: public kmonitor::MetricsGroup {
diff --git a/rtp_llm/cpp/model_rpc/DecodeRpcServer.cc b/rtp_llm/cpp/model_rpc/DecodeRpcServer.cc
index 6f2596a6f5..32bd9cde29 100644
--- a/rtp_llm/cpp/model_rpc/DecodeRpcServer.cc
+++ b/rtp_llm/cpp/model_rpc/DecodeRpcServer.cc
@@ -5,7 +5,7 @@
 #include <condition_variable>
 #include <c10/core/InferenceMode.h>
 
-#include "rtp_llm/cpp/cache/CacheGroupType.h"
+#include "rtp_llm/cpp/cache/spec/CacheGroupType.h"
 #include "rtp_llm/cpp/utils/KVCacheUtils.h"
 #include "rtp_llm/cpp/model_rpc/QueryConverter.h"
 #include "rtp_llm/cpp/model_rpc/DecodeRpcServer.h"
@@ -579,7 +579,7 @@ ErrorInfo DecodeRpcServer::loadCache(const LoadKVCacheContext& load_context) {
 
     const bool   use_mla       = cache_config.use_mla;
     const bool   use_hybrid    = cache_config.groupNums() > 1;
-    const auto&  spec          = cache_config.cache_specs[0];
+    const auto&  spec          = cache_config.specForGroup(0);
     const size_t k_total_bytes = spec->k_block_size_bytes();
     const size_t v_total_bytes = spec->v_block_size_bytes();
 
@@ -596,6 +596,8 @@ ErrorInfo DecodeRpcServer::loadCache(const LoadKVCacheContext& load_context) {
 
     auto cancel_check_func  = [&load_context]() -> bool { return load_context.server_context->IsCancelled(); };
     auto start_load_time_us = currentTimeUs();
+    const auto flat_layer_to_group = cache_config.flatLayerToGroupId();
+    const auto cache_group_types   = cache_config.groupTypesSnapshot();
     std::vector<std::shared_ptr<LoadContext>> load_contexts;
     for (int i = 0; i < load_context.peer_addrs.size(); i++) {
         auto&                                            peer_addr = load_context.peer_addrs[i];
@@ -607,8 +609,8 @@ ErrorInfo DecodeRpcServer::loadCache(const LoadKVCacheContext& load_context) {
             auto load_layer_cache =
                 std::make_shared<RequestBlockBuffer>(std::to_string(load_context.request_id), request_key);
             size_t gid = 0;
-            if (use_hybrid && layer_id < cache_config.layer_to_group_id.size()) {
-                const int mapped_gid = cache_config.layer_to_group_id[layer_id];
+            if (use_hybrid && layer_id < flat_layer_to_group.size()) {
+                const int mapped_gid = flat_layer_to_group[layer_id];
                 if (mapped_gid >= 0) {
                     gid = static_cast<size_t>(mapped_gid);
                 }
@@ -627,10 +629,10 @@ ErrorInfo DecodeRpcServer::loadCache(const LoadKVCacheContext& load_context) {
             block_pos_list.reserve(block_num);
             if (use_hybrid && block_num > 0) {
                 CacheGroupType group_type = CacheGroupType::FULL;
-                if (layer_id < cache_config.layer_to_group_id.size() && !cache_config.group_types.empty()) {
-                    const int gid = cache_config.layer_to_group_id[layer_id];
-                    if (gid >= 0 && static_cast<size_t>(gid) < cache_config.group_types.size()) {
-                        group_type = cache_config.group_types[static_cast<size_t>(gid)];
+                if (layer_id < flat_layer_to_group.size() && !cache_group_types.empty()) {
+                    const int gid = flat_layer_to_group[layer_id];
+                    if (gid >= 0 && static_cast<size_t>(gid) < cache_group_types.size()) {
+                        group_type = cache_group_types[static_cast<size_t>(gid)];
                     }
                 }
                 if (group_type == CacheGroupType::LINEAR) {
@@ -707,9 +709,12 @@ ErrorInfo DecodeRpcServer::loadCache(const LoadKVCacheContext& load_context) {
                                             "mtp layer_num mismatch: engine=" + std::to_string(layer_num)
                                                 + " cache_cfg=" + std::to_string(mtp_cache_cfg.layer_num)
                                                 + " (mtp_model_id=" + std::to_string(mtp_model_id) + ")");
+                    const auto mtp_global_layer_ids   = mtp_cache_cfg.globalLayerIdsSnapshot();
+                    const auto mtp_flat_layer_to_grp  = mtp_cache_cfg.flatLayerToGroupId();
+                    const auto mtp_group_types        = mtp_cache_cfg.groupTypesSnapshot();
                     RTP_LLM_CHECK_WITH_INFO(
-                        !mtp_cache_cfg.global_layer_ids.empty(),
-                        "mtp_cache_cfg.global_layer_ids is empty (mtp_model_id=" + std::to_string(mtp_model_id) + ")");
+                        !mtp_global_layer_ids.empty(),
+                        "mtp_cache_cfg.globalLayerIdsSnapshot() is empty (mtp_model_id=" + std::to_string(mtp_model_id) + ")");
 
                     for (size_t layer_id = 0; layer_id < layer_num; layer_id++) {
                         auto request_key = std::to_string(load_context.request_id) + "-" + std::to_string(layer_id);
@@ -717,8 +722,8 @@ ErrorInfo DecodeRpcServer::loadCache(const LoadKVCacheContext& load_context) {
                             std::make_shared<RequestBlockBuffer>(std::to_string(load_context.request_id), request_key);
                         size_t     gid            = 0;
                         const bool mtp_use_hybrid = mtp_cache_cfg.groupNums() > 1;
-                        if (mtp_use_hybrid && layer_id < mtp_cache_cfg.layer_to_group_id.size()) {
-                            const int mapped_gid = mtp_cache_cfg.layer_to_group_id[layer_id];
+                        if (mtp_use_hybrid && layer_id < mtp_flat_layer_to_grp.size()) {
+                            const int mapped_gid = mtp_flat_layer_to_grp[layer_id];
                             if (mapped_gid >= 0) {
                                 gid = static_cast<size_t>(mapped_gid);
                             }
@@ -734,18 +739,18 @@ ErrorInfo DecodeRpcServer::loadCache(const LoadKVCacheContext& load_context) {
                         size_t      model_id  = mtp_base_model_id;
 
                         // Use per-module global_layer_ids for address lookup.
-                        const int global_layer_id = mtp_cache_cfg.global_layer_ids[0][layer_id];
+                        const int global_layer_id = mtp_global_layer_ids[0][layer_id];
 
                         // Hybrid cache: Linear group only needs the last block; Full group needs all blocks.
                         std::vector<size_t> block_pos_list;
                         block_pos_list.reserve(block_num);
                         if (mtp_use_hybrid && block_num > 0) {
                             CacheGroupType group_type = CacheGroupType::FULL;
-                            if (layer_id < mtp_cache_cfg.layer_to_group_id.size()
-                                && !mtp_cache_cfg.group_types.empty()) {
-                                const int gid = mtp_cache_cfg.layer_to_group_id[layer_id];
-                                if (gid >= 0 && static_cast<size_t>(gid) < mtp_cache_cfg.group_types.size()) {
-                                    group_type = mtp_cache_cfg.group_types[static_cast<size_t>(gid)];
+                            if (layer_id < mtp_flat_layer_to_grp.size()
+                                && !mtp_group_types.empty()) {
+                                const int gid = mtp_flat_layer_to_grp[layer_id];
+                                if (gid >= 0 && static_cast<size_t>(gid) < mtp_group_types.size()) {
+                                    group_type = mtp_group_types[static_cast<size_t>(gid)];
                                 }
                             }
                             if (group_type == CacheGroupType::LINEAR) {
diff --git a/rtp_llm/cpp/model_rpc/proto/model_rpc_service.proto b/rtp_llm/cpp/model_rpc/proto/model_rpc_service.proto
index b006f8dc32..5812477a1b 100644
--- a/rtp_llm/cpp/model_rpc/proto/model_rpc_service.proto
+++ b/rtp_llm/cpp/model_rpc/proto/model_rpc_service.proto
@@ -451,11 +451,17 @@ message MemoryOperationRequestPB {
     message CopyItem {
         int32 mem_block = 1;
         repeated int32 gpu_blocks = 2;
+        bool is_complete = 3;
+        BackingType backing_type = 4;
     }
     enum CopyDirection {
         H2D = 0;
         D2H = 1;
     }
+    enum BackingType {
+        MEMORY = 0;
+        DISK = 1;
+    }
     repeated CopyItem copy_items = 1;
     CopyDirection copy_direction = 2;
 }
diff --git a/rtp_llm/cpp/model_utils/AttentionConfig.h b/rtp_llm/cpp/model_utils/AttentionConfig.h
index 5eb1f2aff6..18a2b798c7 100644
--- a/rtp_llm/cpp/model_utils/AttentionConfig.h
+++ b/rtp_llm/cpp/model_utils/AttentionConfig.h
@@ -2,6 +2,7 @@
 
 #include "rtp_llm/cpp/model_utils/RopeConfig.h"
 #include <c10/core/ScalarType.h>
+#include <vector>
 
 namespace rtp_llm {
 
@@ -58,6 +59,10 @@ struct AttentionConfigs {
     int  indexer_head_dim = 0;
     int  indexer_head_num = 0;
     int  indexer_topk     = 0;
+    int  sliding_window   = 0;
+    int  o_groups         = 0;
+    int  o_lora_rank      = 0;
+    std::vector<int> layer_compress_ratios;
 
     // data type for attention computation
     c10::ScalarType dtype = c10::ScalarType::Half;
diff --git a/rtp_llm/cpp/models/PyWrappedModel.h b/rtp_llm/cpp/models/PyWrappedModel.h
index 06c62a0c7e..7d42441d3f 100644
--- a/rtp_llm/cpp/models/PyWrappedModel.h
+++ b/rtp_llm/cpp/models/PyWrappedModel.h
@@ -169,7 +169,7 @@ inline PyWrappedModel::PyWrappedModel(const GptModelInitParams& params,
             kv_cache.kv_scale_base_by_layer.push_back(t);
         }
 
-        kv_cache.layer_attn_types = layout.layer_attn_types;
+        kv_cache.layer_group_types = layout.layer_group_types;
         init_resources.kv_cache   = kv_cache;
     }
 
diff --git a/rtp_llm/cpp/normal_engine/NormalBatchStreamProcessor.cc b/rtp_llm/cpp/normal_engine/NormalBatchStreamProcessor.cc
index f628d2910d..2d0f6ab71a 100644
--- a/rtp_llm/cpp/normal_engine/NormalBatchStreamProcessor.cc
+++ b/rtp_llm/cpp/normal_engine/NormalBatchStreamProcessor.cc
@@ -24,8 +24,8 @@ NormalBatchStreamProcessor::NormalBatchStreamProcessor(
     model_input_gatherer_config_.kernel_seq_size_per_block  = cache_config.kernel_seq_size_per_block;
     model_input_gatherer_config_.kernel_blocks_per_kv_block = cache_config.kernelBlocksPerKvBlock();
     model_input_gatherer_config_.kv_cache_group_nums        = cache_config.groupNums();
-    model_input_gatherer_config_.layer_to_kv_cache_group_id = cache_config.layer_to_group_id;
-    model_input_gatherer_config_.kv_cache_group_types       = cache_config.group_types;
+    model_input_gatherer_config_.layer_to_kv_cache_group_id = cache_config.flatLayerToGroupId();
+    model_input_gatherer_config_.kv_cache_group_types       = cache_config.groupTypesSnapshot();
     model_input_gatherer_config_.warm_up                    = warm_up;
     model_input_gatherer_config_.enable_detail_log          = profiling_debug_logging_config.enable_detail_log;
 
diff --git a/rtp_llm/cpp/normal_engine/NormalEngine.cc b/rtp_llm/cpp/normal_engine/NormalEngine.cc
index 9ff9caa273..083e8decdb 100644
--- a/rtp_llm/cpp/normal_engine/NormalEngine.cc
+++ b/rtp_llm/cpp/normal_engine/NormalEngine.cc
@@ -6,7 +6,7 @@
 #include "rtp_llm/cpp/utils/StatusUtil.h"
 #include "rtp_llm/cpp/engine_base/schedulers/FIFOScheduler.h"
 #include "rtp_llm/cpp/engine_base/schedulers/BatchDecodeScheduler.h"
-#include "rtp_llm/cpp/cache/CacheConfigCreator.h"
+#include "rtp_llm/cpp/cache/config_creator/CacheConfigCreator.h"
 #include "rtp_llm/cpp/engine_base/system_prompt/SystemPromptConstructor.h"
 #include "rtp_llm/cpp/utils/Logger.h"
 #include "rtp_llm/cpp/utils/AssertUtils.h"
@@ -250,7 +250,7 @@ WarmUpResult NormalEngine::decodeWarmUp(const EngineInitParams& params) {
     fake_input->generate_config->calculate_loss       = int(runtime_config.warm_up_with_loss);
     rtp_llm::setTraceMemory(true);
 
-    auto cache_config               = CacheConfigCreator::createBasicConfig(model_config_, parallelism_config);
+    auto cache_config               = CacheConfigCreator::createBasicConfig(model_config_, parallelism_config, KVCacheConfig{}, false, 0);
     cache_config.seq_size_per_block = model_config_.attn_config.tokens_per_block;
     cache_config.block_num          = 5;
     ParallelismConfig temp_parallelism_config;
@@ -322,7 +322,7 @@ void NormalEngine::initCacheManager(std::optional<WarmUpResult> warm_up_result)
 
         const auto& cache_cfg    = resource_context_.cache_manager->cacheConfig();
         kv_cache_group_num_      = cache_cfg.groupNums();
-        kv_cache_layer_to_group_ = cache_cfg.layer_to_group_id;
+        kv_cache_layer_to_group_ = cache_cfg.flatLayerToGroupId();
     } else {
         auto result = CacheConfigCreator::createConfig(
             model_config_, parallelism_config, runtime_config, kv_cache_config, warm_up_result);
@@ -339,7 +339,7 @@ void NormalEngine::initCacheManager(std::optional<WarmUpResult> warm_up_result)
         }
         const auto& cache_cfg    = resource_context_.cache_manager->cacheConfig();
         kv_cache_group_num_      = cache_cfg.groupNums();
-        kv_cache_layer_to_group_ = cache_cfg.layer_to_group_id;
+        kv_cache_layer_to_group_ = cache_cfg.flatLayerToGroupId();
     }
 }
 
diff --git a/rtp_llm/cpp/normal_engine/NormalModelInputGatherer.h b/rtp_llm/cpp/normal_engine/NormalModelInputGatherer.h
index ea638ff49a..1bb33c2032 100644
--- a/rtp_llm/cpp/normal_engine/NormalModelInputGatherer.h
+++ b/rtp_llm/cpp/normal_engine/NormalModelInputGatherer.h
@@ -4,7 +4,7 @@
 #include <vector>
 #include "absl/status/status.h"
 #include "absl/status/statusor.h"
-#include "rtp_llm/cpp/cache/CacheGroupType.h"
+#include "rtp_llm/cpp/cache/spec/CacheGroupType.h"
 #include "rtp_llm/cpp/cache/Types.h"
 #include "rtp_llm/cpp/config/ConfigModules.h"
 #include "rtp_llm/models_py/bindings/core/OpData.h"
diff --git a/rtp_llm/cpp/normal_engine/speculative/MtpExecutor.cc b/rtp_llm/cpp/normal_engine/speculative/MtpExecutor.cc
index 9d9a1fe79d..8ed4cd8b59 100644
--- a/rtp_llm/cpp/normal_engine/speculative/MtpExecutor.cc
+++ b/rtp_llm/cpp/normal_engine/speculative/MtpExecutor.cc
@@ -7,7 +7,7 @@
 #include "rtp_llm/cpp/utils/StatusUtil.h"
 #include "rtp_llm/cpp/engine_base/schedulers/FIFOScheduler.h"
 #include "rtp_llm/cpp/engine_base/schedulers/BatchDecodeScheduler.h"
-#include "rtp_llm/cpp/cache/CacheConfigCreator.h"
+#include "rtp_llm/cpp/cache/config_creator/CacheConfigCreator.h"
 #include "rtp_llm/cpp/engine_base/system_prompt/SystemPromptConstructor.h"
 #include "rtp_llm/cpp/utils/Logger.h"
 #include "rtp_llm/cpp/utils/AssertUtils.h"
@@ -22,6 +22,17 @@
 
 namespace rtp_llm {
 
+namespace {
+std::vector<int> flattenLayerToGroupIds(const std::vector<std::vector<int>>& ids) {
+    std::vector<int> flat;
+    flat.reserve(ids.size());
+    for (const auto& v : ids) {
+        flat.push_back(v.empty() ? -1 : v[0]);
+    }
+    return flat;
+}
+}  // namespace
+
 bool MtpExecutor::isTpRank0() const {
     return tp_rank_ == 0;
 }
@@ -197,7 +208,7 @@ MtpExecutor::MtpExecutor(const EngineInitParams&                        params,
     if (!params.py_model.is_none()) {
         RTP_LLM_LOG_INFO("init executor with python model");
         model_.reset(new PyWrappedModel(
-            model_init_params, params.py_model, false, true, target_cache_layer_layout.layer_to_groups));
+            model_init_params, params.py_model, false, true, flattenLayerToGroupIds(target_cache_layer_layout.layer_to_group_ids)));
     }
 
     // when warmup, cache manager maybe nullptr
@@ -239,7 +250,7 @@ MtpExecutor::MtpExecutor(const EngineInitParams&                        params,
         if (!params.py_sp_model.is_none()) {
             RTP_LLM_LOG_INFO("[speculative decoding] using py model");
             draft_model_.reset(new PyWrappedModel(
-                model_params, params.py_sp_model, false, false, draft_cache_layer_layout.layer_to_groups));
+                model_params, params.py_sp_model, false, false, flattenLayerToGroupIds(draft_cache_layer_layout.layer_to_group_ids)));
             // Create separate model for speculative prefill with CUDA graph if enabled (from params)
             const bool enable_cuda_graph = params.hw_kernel_config.enable_cuda_graph;
             RTP_LLM_LOG_INFO(
@@ -249,7 +260,7 @@ MtpExecutor::MtpExecutor(const EngineInitParams&                        params,
                 RTP_LLM_LOG_INFO(
                     "[speculative decoding] creating separate prefill draft model with CUDA graph support");
                 sp_prefill_draft_model_.reset(new PyWrappedModel(
-                    model_params, params.py_sp_model, true, false, draft_cache_layer_layout.layer_to_groups));
+                    model_params, params.py_sp_model, true, false, flattenLayerToGroupIds(draft_cache_layer_layout.layer_to_group_ids)));
             }
         }
         break;  // NOTE: only support one mtp model now
@@ -260,12 +271,14 @@ MtpExecutor::MtpExecutor(const EngineInitParams&                        params,
     draft_kv_cache_layer_to_group =
         torch::empty({(int64_t)draft_cache_layer_layout.layers_to_kv_buffer_ptrs.size()}, torch::kInt32);
 
+    auto target_flat_ids = flattenLayerToGroupIds(target_cache_layer_layout.layer_to_group_ids);
+    auto draft_flat_ids  = flattenLayerToGroupIds(draft_cache_layer_layout.layer_to_group_ids);
     memcpy(target_kv_cache_layer_to_group.data_ptr<int>(),
-           target_cache_layer_layout.layer_to_groups.data(),
-           target_cache_layer_layout.layer_to_groups.size() * sizeof(int));
+           target_flat_ids.data(),
+           target_flat_ids.size() * sizeof(int));
     memcpy(draft_kv_cache_layer_to_group.data_ptr<int>(),
-           draft_cache_layer_layout.layer_to_groups.data(),
-           draft_cache_layer_layout.layer_to_groups.size() * sizeof(int));
+           draft_flat_ids.data(),
+           draft_flat_ids.size() * sizeof(int));
 }
 
 /*
diff --git a/rtp_llm/cpp/normal_engine/speculative/test/MtpBatchStreamProcessorTest.cc b/rtp_llm/cpp/normal_engine/speculative/test/MtpBatchStreamProcessorTest.cc
index 63c4a7496e..3a13a802c3 100644
--- a/rtp_llm/cpp/normal_engine/speculative/test/MtpBatchStreamProcessorTest.cc
+++ b/rtp_llm/cpp/normal_engine/speculative/test/MtpBatchStreamProcessorTest.cc
@@ -26,6 +26,19 @@ std::vector<T> toVec(const torch::Tensor& t) {
 
 class MtpBatchStreamProcessorTest: public DeviceTestBase {
 public:
+    static void initSingleGroupCacheConfig(CacheConfig& config, int layer_num = 1) {
+        config.layer_num     = static_cast<uint32_t>(layer_num);
+        config.layer_all_num = static_cast<uint32_t>(layer_num);
+        auto spec                = std::make_shared<MHAKVCacheSpec>();
+        spec->type               = KVCacheSpecType::MultiHeadAttention;
+        spec->seq_size_per_block = 8;
+        spec->local_head_num_kv  = 1;
+        spec->size_per_head      = 1;
+        std::vector<int> layer_ids(layer_num);
+        std::iota(layer_ids.begin(), layer_ids.end(), 0);
+        config.fromGroupedSpecs({spec}, {layer_ids}, {CacheGroupType::FULL}, {"default"});
+    }
+
     GenerateStreamPtr createContextStream(const ModelConfig&     model_config,
                                           const RuntimeConfig&   runtime_config,
                                           const ResourceContext& resource_context,
@@ -39,7 +52,7 @@ class MtpBatchStreamProcessorTest: public DeviceTestBase {
         BatchKVCacheResource addr;
         // New (refactored) BatchKVCacheResource: [batch_id][group_id] -> block_indices
         addr.resetBatchSize(1);
-        addr.initGroups(1, 1, {0});
+        addr.initGroups(1, 1, {{0}});
         addr.setBatchBlocks(0, 0, {block_id});
         stream->setKVCache(addr);
 
@@ -84,7 +97,7 @@ TEST_F(MtpBatchStreamProcessorTest, testPrefillDispatch) {
     PDSepConfig                 pd_sep_config;
     ProfilingDebugLoggingConfig profiling_debug_logging_config;
     CacheConfig                 cache_config;
-    cache_config.group_types = {CacheGroupType::FULL};
+    initSingleGroupCacheConfig(cache_config);
 
     model_config.max_seq_len    = 2048;
     model_config.vocab_size     = 4;
@@ -167,7 +180,7 @@ TEST_F(MtpBatchStreamProcessorTest, testDispatchDecodeStream) {
     draft_prefill_output.sampler_output.all_probs =
         torch::tensor({0.2f, 0.1f, 0.3f, 0.5f, 0.3f, 0.1f, 0.4f, 0.2f}, torch::kFloat32).reshape({2, 4});
 
-    cache_config.group_types = {CacheGroupType::FULL};
+    initSingleGroupCacheConfig(cache_config);
     MtpBatchStreamProcessor processor(
         model_config, pd_sep_config, profiling_debug_logging_config, cache_config, sp_config, false);
 
@@ -216,7 +229,7 @@ TEST_F(MtpBatchStreamProcessorTest, testGatherDecodeModelInput) {
 
     auto stream_groups = StreamGroups({stream1, stream2});
 
-    cache_config.group_types = {CacheGroupType::FULL};
+    initSingleGroupCacheConfig(cache_config);
     auto processor           = MtpBatchStreamProcessor(
         model_config, pd_sep_config, profiling_debug_logging_config, cache_config, sp_config, false);
     auto model_input = processor.gatherDecodeModelInput(stream_groups);
@@ -293,7 +306,7 @@ TEST_F(MtpBatchStreamProcessorTest, testPrepareOneStepSpecDecodeModelInput) {
 
     auto stream_groups = StreamGroups({stream1, stream2});
 
-    cache_config.group_types = {CacheGroupType::FULL};
+    initSingleGroupCacheConfig(cache_config);
     auto processor           = MtpBatchStreamProcessor(
         model_config, pd_sep_config, profiling_debug_logging_config, cache_config, sp_config, false);
     auto model_input_status = processor.gatherDecodeModelInput(stream_groups);
@@ -391,7 +404,7 @@ TEST_F(MtpBatchStreamProcessorTest, testprepareDecodeDraftModelInput) {
 
     auto stream_groups = StreamGroups({stream1, stream2});
 
-    cache_config.group_types = {CacheGroupType::FULL};
+    initSingleGroupCacheConfig(cache_config);
     auto processor           = MtpBatchStreamProcessor(
         model_config, pd_sep_config, profiling_debug_logging_config, cache_config, sp_config, false);
     auto model_input_status = processor.gatherDecodeModelInput(stream_groups);
@@ -446,7 +459,7 @@ TEST_F(MtpBatchStreamProcessorTest, testUpdatePrefillPostDraftModelInput) {
 
     auto stream_groups = StreamGroups({stream1, stream2});
 
-    cache_config.group_types = {CacheGroupType::FULL};
+    initSingleGroupCacheConfig(cache_config);
     auto processor           = MtpBatchStreamProcessor(
         model_config, pd_sep_config, profiling_debug_logging_config, cache_config, sp_config, false);
     auto model_input_status = processor.gatherModelInput(stream_groups);
@@ -504,7 +517,7 @@ TEST_F(MtpBatchStreamProcessorTest, testUpdateDecodePostDraftModelInput) {
 
     auto stream_groups = StreamGroups({stream1, stream2});
 
-    cache_config.group_types = {CacheGroupType::FULL};
+    initSingleGroupCacheConfig(cache_config);
     auto processor           = MtpBatchStreamProcessor(
         model_config, pd_sep_config, profiling_debug_logging_config, cache_config, sp_config, false);
     auto model_input_status = processor.gatherModelInput(stream_groups);
diff --git a/rtp_llm/cpp/normal_engine/speculative/test/MtpExecutorTest.cc b/rtp_llm/cpp/normal_engine/speculative/test/MtpExecutorTest.cc
index c28eea6bbd..9c993f0700 100644
--- a/rtp_llm/cpp/normal_engine/speculative/test/MtpExecutorTest.cc
+++ b/rtp_llm/cpp/normal_engine/speculative/test/MtpExecutorTest.cc
@@ -342,7 +342,9 @@ class MtpExecutorTest: public DeviceTestBase {
                                                          rtp_llm::TYPE_INT8,
                                                          /*local_head_num_kv=*/128,
                                                          /*size_per_head=*/256);
-        cache_config.mtp_sub_configs.push_back(std::make_shared<CacheConfig>(mtp_config));
+        cache_config.layer_all_num = cache_config.layer_num + mtp_config.layer_num;
+        auto sub_cfg = cache_config.mergeMTPModule(mtp_config, /*module_index=*/0, /*main_layer_num=*/cache_config.layer_num);
+        cache_config.mtp_sub_configs.push_back(sub_cfg);
 
         EngineInitParams params = createEngineInitParams(config, model_config, runtime_config, kv_cache_config);
         params.sp_config        = sp_config;
diff --git a/rtp_llm/cpp/normal_engine/test/BUILD b/rtp_llm/cpp/normal_engine/test/BUILD
index 839060f0ec..c9b43594d0 100644
--- a/rtp_llm/cpp/normal_engine/test/BUILD
+++ b/rtp_llm/cpp/normal_engine/test/BUILD
@@ -44,6 +44,7 @@ cc_test(
         "//rtp_llm/models_py/bindings/cuda/ops:cuda_impl",
         "//rtp_llm/cpp/normal_engine:normal_engine",
         "//rtp_llm/cpp/models:models",
+        "//rtp_llm/cpp/cache/test:cache_config_test_utils",
         "@com_google_googletest//:gtest",
         "@com_google_googletest//:gtest_main",
         "@local_config_cuda//cuda:cuda_headers",
diff --git a/rtp_llm/cpp/normal_engine/test/MockEngine.h b/rtp_llm/cpp/normal_engine/test/MockEngine.h
index abf7b4959d..031a0ace3c 100644
--- a/rtp_llm/cpp/normal_engine/test/MockEngine.h
+++ b/rtp_llm/cpp/normal_engine/test/MockEngine.h
@@ -16,6 +16,7 @@
 #include "rtp_llm/cpp/testing/TestBase.h"
 #include "rtp_llm/cpp/models/models_weight/W.h"
 #include "rtp_llm/cpp/config/ConfigModules.h"
+#include "rtp_llm/cpp/cache/spec/KVCacheSpecDescTypes.h"
 
 using namespace std;
 namespace W = rtp_llm::W;
@@ -73,6 +74,19 @@ rtp_llm::EngineInitParams createEngineInitParams(const CustomConfig&     config,
     const size_t inter_size = 512;
     // inter_size is now calculated in ModelDeployWeightInfo, not in ModelConfig
     model_config.attn_config.tokens_per_block = 2;
+    kv_cache_config.seq_size_per_block        = model_config.attn_config.tokens_per_block;
+
+    DataType kv_dtype = config.kv_cache_data_type == DataType::TYPE_INT8 ? DataType::TYPE_INT8
+                      : config.kv_cache_data_type == DataType::TYPE_FP8_E4M3 ? DataType::TYPE_FP8_E4M3
+                      : DataType::TYPE_FP16;
+    KVCacheSpecDesc mha_desc;
+    mha_desc.tag                = "default";
+    mha_desc.cache_type         = CacheType::MHA;
+    mha_desc.num_kv_heads       = model_config.attn_config.kv_head_num;
+    mha_desc.seq_size_per_block = model_config.attn_config.tokens_per_block;
+    mha_desc.dtype              = kv_dtype;
+    mha_desc.size_per_head      = model_config.attn_config.size_per_head;
+    model_config.kv_cache_spec_descs.resize(model_config.num_layers, {mha_desc});
     runtime_config.reserve_runtime_mem_mb     = 1024;
     const size_t hidden_units                 = 128;
 
diff --git a/rtp_llm/cpp/normal_engine/test/NormalBatchStreamProcessorTest.cc b/rtp_llm/cpp/normal_engine/test/NormalBatchStreamProcessorTest.cc
index ed4a6107ee..8ef0c8bde6 100644
--- a/rtp_llm/cpp/normal_engine/test/NormalBatchStreamProcessorTest.cc
+++ b/rtp_llm/cpp/normal_engine/test/NormalBatchStreamProcessorTest.cc
@@ -2,6 +2,8 @@
 #include "torch/all.h"
 #include "gtest/gtest.h"
 
+#include "rtp_llm/cpp/cache/test/CacheConfigTestUtils.h"
+
 #define private public
 #include "rtp_llm/cpp/normal_engine/NormalBatchStreamProcessor.h"
 #include "rtp_llm/cpp/normal_engine/NormalGenerateStream.h"
@@ -24,7 +26,21 @@ static torch::Tensor hostIntBuffer(std::vector<int32_t> data) {
     return torch::tensor(data, torch::kInt32);
 }
 
-class NormalBatchStreamProcessorTest: public DeviceTestBase {};
+class NormalBatchStreamProcessorTest: public DeviceTestBase {
+public:
+    static void initSingleGroupCacheConfig(CacheConfig& config, int layer_num = 1) {
+        config.layer_num     = static_cast<uint32_t>(layer_num);
+        config.layer_all_num = static_cast<uint32_t>(layer_num);
+        auto spec                = std::make_shared<MHAKVCacheSpec>();
+        spec->type               = KVCacheSpecType::MultiHeadAttention;
+        spec->seq_size_per_block = 8;
+        spec->local_head_num_kv  = 1;
+        spec->size_per_head      = 1;
+        std::vector<int> layer_ids(layer_num);
+        std::iota(layer_ids.begin(), layer_ids.end(), 0);
+        config.fromGroupedSpecs({spec}, {layer_ids}, {CacheGroupType::FULL}, {"default"});
+    }
+};
 
 TEST_F(NormalBatchStreamProcessorTest, testSimpleAssemble) {
     ResourceContext resource_context;
@@ -36,7 +52,7 @@ TEST_F(NormalBatchStreamProcessorTest, testSimpleAssemble) {
     PDSepConfig                 pd_sep_config;
     ProfilingDebugLoggingConfig profiling_debug_logging_config;
     CacheConfig                 cache_config;
-    cache_config.group_types = {CacheGroupType::FULL};
+    initSingleGroupCacheConfig(cache_config, model_config.num_layers);
 
     RuntimeConfig              runtime_config;
     NormalBatchStreamProcessor processor(
@@ -50,7 +66,7 @@ TEST_F(NormalBatchStreamProcessorTest, testSimpleAssemble) {
     query1->input_ids = hostIntBuffer({1});
     BatchKVCacheResource addr1;
     addr1.resetBatchSize(1);
-    addr1.initGroups(1, 3, {0, 0, 0});
+    addr1.initGroups(1, 3, {{0}, {0}, {0}});
     addr1.setBatchBlocks(0, 0, {1, 2, 3, 4});
     stream1->setKVCache(addr1);
     stream1->setIsContextStream(false);
@@ -63,7 +79,7 @@ TEST_F(NormalBatchStreamProcessorTest, testSimpleAssemble) {
     query2->input_ids = hostIntBuffer({1, 2});
     BatchKVCacheResource addr2;
     addr2.resetBatchSize(1);
-    addr2.initGroups(1, 3, {0, 0, 0});
+    addr2.initGroups(1, 3, {{0}, {0}, {0}});
     addr2.setBatchBlocks(0, 0, {5, 6, 7, 8});
     stream2->setKVCache(addr2);
     stream2->setIsContextStream(false);
@@ -75,7 +91,7 @@ TEST_F(NormalBatchStreamProcessorTest, testSimpleAssemble) {
         make_shared<NormalGenerateStream>(query3, model_config, runtime_config, resource_context, nullptr);
     BatchKVCacheResource addr3;
     addr3.resetBatchSize(1);
-    addr3.initGroups(1, 3, {0, 0, 0});
+    addr3.initGroups(1, 3, {{0}, {0}, {0}});
     addr3.setBatchBlocks(0, 0, {9, 10});
     stream3->setKVCache(addr3);
 
@@ -86,7 +102,7 @@ TEST_F(NormalBatchStreamProcessorTest, testSimpleAssemble) {
         make_shared<NormalGenerateStream>(query4, model_config, runtime_config, resource_context, nullptr);
     BatchKVCacheResource addr4;
     addr4.resetBatchSize(1);
-    addr4.initGroups(1, 3, {0, 0, 0});
+    addr4.initGroups(1, 3, {{0}, {0}, {0}});
     addr4.setBatchBlocks(0, 0, {11, 12, 13, 14});
     stream4->setKVCache(addr4);
     stream4->setReuseLength(1);
@@ -152,7 +168,7 @@ TEST_F(NormalBatchStreamProcessorTest, testSoftmaxProbs) {
         make_shared<NormalGenerateStream>(query1, model_config, runtime_config, resource_context, nullptr);
     BatchKVCacheResource addr1;
     addr1.resetBatchSize(1);
-    addr1.initGroups(1, 3, {0, 0, 0});
+    addr1.initGroups(1, 3, {{0}, {0}, {0}});
     addr1.setBatchBlocks(0, 0, {1});
     stream1->setKVCache(addr1);
 
@@ -162,7 +178,7 @@ TEST_F(NormalBatchStreamProcessorTest, testSoftmaxProbs) {
     for (const auto& stream : streams) {
         stream->generate_status_->status = StreamState::RUNNING;
     }
-    cache_config.group_types = {CacheGroupType::FULL};
+    initSingleGroupCacheConfig(cache_config, model_config.num_layers);
     NormalBatchStreamProcessor processor(
         model_config, pd_sep_config, profiling_debug_logging_config, cache_config, false);
 
@@ -205,7 +221,7 @@ TEST_F(NormalBatchStreamProcessorTest, testLoss) {
         make_shared<NormalGenerateStream>(query1, model_config, runtime_config, resource_context, nullptr);
     BatchKVCacheResource addr1;
     addr1.resetBatchSize(1);
-    addr1.initGroups(1, 3, {0, 0, 0});
+    addr1.initGroups(1, 3, {{0}, {0}, {0}});
     addr1.setBatchBlocks(0, 0, {1});
     stream1->setKVCache(addr1);
 
@@ -217,7 +233,7 @@ TEST_F(NormalBatchStreamProcessorTest, testLoss) {
         make_shared<NormalGenerateStream>(query3, model_config, runtime_config, resource_context, nullptr);
     BatchKVCacheResource addr3;
     addr3.resetBatchSize(1);
-    addr3.initGroups(1, 3, {0, 0, 0});
+    addr3.initGroups(1, 3, {{0}, {0}, {0}});
     addr3.setBatchBlocks(0, 0, {9});
     stream3->setKVCache(addr3);
 
@@ -229,7 +245,7 @@ TEST_F(NormalBatchStreamProcessorTest, testLoss) {
         make_shared<NormalGenerateStream>(query4, model_config, runtime_config, resource_context, nullptr);
     BatchKVCacheResource addr4;
     addr4.resetBatchSize(1);
-    addr4.initGroups(1, 3, {0, 0, 0});
+    addr4.initGroups(1, 3, {{0}, {0}, {0}});
     addr4.setBatchBlocks(0, 0, {11, 12});
     stream4->setKVCache(addr4);
 
@@ -241,7 +257,7 @@ TEST_F(NormalBatchStreamProcessorTest, testLoss) {
     for (const auto& stream : streams) {
         stream->generate_status_->status = StreamState::RUNNING;
     }
-    cache_config.group_types = {CacheGroupType::FULL};
+    initSingleGroupCacheConfig(cache_config, model_config.num_layers);
     NormalBatchStreamProcessor processor(
         model_config, pd_sep_config, profiling_debug_logging_config, cache_config, false);
 
@@ -288,7 +304,7 @@ TEST_F(NormalBatchStreamProcessorTest, testMultimodalGatherBatch) {
     PDSepConfig                 pd_sep_config;
     ProfilingDebugLoggingConfig profiling_debug_logging_config;
     CacheConfig                 cache_config;
-    cache_config.group_types = {CacheGroupType::FULL};
+    initSingleGroupCacheConfig(cache_config, model_config.num_layers);
     RuntimeConfig              runtime_config;
     NormalBatchStreamProcessor processor(
         model_config, pd_sep_config, profiling_debug_logging_config, cache_config, false);
diff --git a/rtp_llm/cpp/pybind/ConfigInit.cc b/rtp_llm/cpp/pybind/ConfigInit.cc
index 66d31fb189..9540076456 100644
--- a/rtp_llm/cpp/pybind/ConfigInit.cc
+++ b/rtp_llm/cpp/pybind/ConfigInit.cc
@@ -722,11 +722,13 @@ PYBIND11_MODULE(libth_transformer_config, m) {
         .value("SLIDING_WINDOW", HybridAttentionType::SLIDING_WINDOW);
 
     pybind11::class_<HybridAttentionConfig>(m, "HybridAttentionConfig")
-        .def(pybind11::init<bool, std::vector<HybridAttentionType>>(),
-             pybind11::arg("enable_hybrid_attention") = false,
-             pybind11::arg("hybrid_attention_types")  = std::vector<HybridAttentionType>{})
+        .def(pybind11::init<bool, bool, std::vector<HybridAttentionType>>(),
+             pybind11::arg("enable_hybrid_attention")           = false,
+             pybind11::arg("enable_independent_kv_cache_pools") = false,
+             pybind11::arg("hybrid_attention_types")            = std::vector<HybridAttentionType>{})
         .def("to_string", &HybridAttentionConfig::to_string)
         .def_readwrite("enable_hybrid_attention", &HybridAttentionConfig::enable_hybrid_attention)
+        .def_readwrite("enable_independent_kv_cache_pools", &HybridAttentionConfig::enable_independent_kv_cache_pools)
         .def_readwrite("hybrid_attention_types", &HybridAttentionConfig::hybrid_attention_types);
 
     // Register SpeculativeType enum
diff --git a/rtp_llm/cpp/testing/TestBase.h b/rtp_llm/cpp/testing/TestBase.h
index 8663049f0b..24d834a8c6 100644
--- a/rtp_llm/cpp/testing/TestBase.h
+++ b/rtp_llm/cpp/testing/TestBase.h
@@ -203,7 +203,7 @@ class DeviceTestBase: public EngineBaseTest {
 
         auto batch_kv_cache = std::make_shared<rtp_llm::BatchKVCacheResource>();
         batch_kv_cache->resetBatchSize(batch_size);
-        batch_kv_cache->initGroups(1, cache_config.layer_all_num, cache_config.layer_to_group_id);
+        batch_kv_cache->initGroups(1, cache_config.layer_all_num, cache_config.layerGroupIdsSnapshot());
 
         auto complete_token_ids =
             std::make_shared<rtp_llm::CompleteTokenIds>(static_cast<int>(batch_size),
@@ -261,13 +261,13 @@ class DeviceTestBase: public EngineBaseTest {
                                              torch::indexing::Slice()})
                                      .reshape({2,
                                                static_cast<int64_t>(cache_config.seq_size_per_block),
-                                               static_cast<int64_t>(cache_config.cache_specs[0]->local_head_num_kv),
+                                               static_cast<int64_t>(cache_config.specForGroup(0)->local_head_num_kv),
                                                static_cast<int64_t>(
-                                                   static_cast<rtp_llm::MHAKVCacheSpec&>(*cache_config.cache_specs[0])
+                                                   static_cast<rtp_llm::MHAKVCacheSpec&>(*cache_config.specForGroup(0))
                                                        .size_per_head)})
                                      .transpose(2, 1)
                                      .contiguous();
-                        // vblock is not used in setKVBlockValue in this case
+                        // vblock is not used in writeKVBlockForTest in this case
                         vblock = kvCache
                                      .index({torch::indexing::Slice(),
                                              static_cast<int64_t>(i),
@@ -275,17 +275,17 @@ class DeviceTestBase: public EngineBaseTest {
                                              torch::indexing::Slice(block_start, block_end),
                                              torch::indexing::Slice()})
                                      .reshape({static_cast<int64_t>(cache_config.seq_size_per_block),
-                                               static_cast<int64_t>(cache_config.cache_specs[0]->local_head_num_kv),
+                                               static_cast<int64_t>(cache_config.specForGroup(0)->local_head_num_kv),
                                                static_cast<int64_t>(
-                                                   static_cast<rtp_llm::MHAKVCacheSpec&>(*cache_config.cache_specs[0])
+                                                   static_cast<rtp_llm::MHAKVCacheSpec&>(*cache_config.specForGroup(0))
                                                        .size_per_head)})
                                      .transpose(1, 0)
                                      .contiguous();
                     }
                     // std::cout << "index: " << k << " start: " << block_start << " end: " << block_end << std::endl;
                     // std::cout << "block index: " << k_indexs[k] << std::endl;
-                    if (!cache_manager_->setKVBlockValue(k_indexs[k], kblock, vblock)) {
-                        std::cout << "setKVBlockValue failed for block index: " << k_indexs[k] << std::endl;
+                    if (!cache_manager_->writeKVBlockForTest(k_indexs[k], kblock, vblock)) {
+                        std::cout << "writeKVBlockForTest failed for block index: " << k_indexs[k] << std::endl;
                         return torch::Tensor();
                     }
                 }
diff --git a/rtp_llm/models_py/bindings/NoBlockCopy.h b/rtp_llm/models_py/bindings/NoBlockCopy.h
index 978dc23739..ddd524b112 100644
--- a/rtp_llm/models_py/bindings/NoBlockCopy.h
+++ b/rtp_llm/models_py/bindings/NoBlockCopy.h
@@ -1,5 +1,6 @@
 #pragma once
 
+#include <cstddef>
 #include <torch/torch.h>
 #include <vector>
 
@@ -16,12 +17,73 @@ struct MultiCopyParams {
     size_t split_kv_scale_stride_bytes = 0;
 };
 
+struct BatchedMemoryCopyTile {
+    void*       dst   = nullptr;
+    const void* src   = nullptr;
+    size_t      bytes = 0;
+};
+
+struct BatchedMemoryCopyParams {
+    std::vector<BatchedMemoryCopyTile> tiles;
+    int                                device_index = -1;
+};
+
+enum class StagedMemoryCopyDirection {
+    H2D = 0,
+    D2H = 1,
+};
+
+struct StagedMemoryCopyTile {
+    void*  gpu         = nullptr;
+    size_t host_offset = 0;
+    size_t bytes       = 0;
+};
+
+struct StagedMemoryCopyHostSegment {
+    void*  host        = nullptr;
+    size_t host_offset = 0;
+    size_t bytes       = 0;
+};
+
+struct StagedMemoryCopyParams {
+    void*                                    host_base  = nullptr;
+    size_t                                   host_bytes = 0;
+    std::vector<StagedMemoryCopyHostSegment> host_segments;
+    std::vector<StagedMemoryCopyTile>        tiles;
+    int                                      device_index = -1;
+    StagedMemoryCopyDirection                direction    = StagedMemoryCopyDirection::H2D;
+};
+
+struct StagedMemoryCopyScratch {
+    void*  host_staging       = nullptr;
+    size_t host_capacity      = 0;
+    void*  device_staging     = nullptr;
+    size_t device_capacity    = 0;
+    void*  device_ptrs        = nullptr;
+    void*  device_offsets     = nullptr;
+    void*  device_sizes       = nullptr;
+    size_t meta_capacity      = 0;
+    int    device_index       = -1;
+};
+
 // Multi-tensor non-blocking copy with device-specific implementation.
 // CUDA: uses a dedicated stream + optional split-KV SM scatter path.
 // ROCm: plain tensor copy_.
 // Other devices: not supported (will abort).
 void execNoBlockCopy(const MultiCopyParams& params);
 
+// One CUDA runtime call copy executor for regular host/device pointers.
+// CUDA 12.8+ uses cudaMemcpyBatchAsync to avoid per-tile cudaMemcpyAsync launches.
+bool execBatchedMemoryCopy(const BatchedMemoryCopyParams& params);
+
+// Stages compact host payload in GPU memory, then uses one SM gather/scatter kernel.
+// host_segments may describe non-contiguous host blocks; they are packed/unpacked on CPU.
+// scratch is optional; passing one lets callers reuse pinned host staging and device metadata buffers.
+// H2D: compact host payload -> GPU staging -> tile.gpu by tile.host_offset.
+// D2H: tile.gpu -> GPU staging by tile.host_offset -> compact host payload.
+bool execStagedMemoryCopy(const StagedMemoryCopyParams& params, StagedMemoryCopyScratch* scratch = nullptr);
+void releaseStagedMemoryCopyScratch(StagedMemoryCopyScratch& scratch);
+
 // Warmup split-KV copy kernels. No-op on non-CUDA / PPU devices.
 // Must be called after cudaSetDevice + setCurrentCUDAStream.
 void warmupNoBlockCopy();
diff --git a/rtp_llm/models_py/bindings/NoBlockCopyDefault.cc b/rtp_llm/models_py/bindings/NoBlockCopyDefault.cc
index dae679dab0..307fc039eb 100644
--- a/rtp_llm/models_py/bindings/NoBlockCopyDefault.cc
+++ b/rtp_llm/models_py/bindings/NoBlockCopyDefault.cc
@@ -14,6 +14,16 @@ void execNoBlockCopy(const MultiCopyParams& params) {
     }
 }
 
+bool execBatchedMemoryCopy(const BatchedMemoryCopyParams& params) {
+    return params.tiles.empty();
+}
+
+bool execStagedMemoryCopy(const StagedMemoryCopyParams& params, StagedMemoryCopyScratch*) {
+    return params.tiles.empty();
+}
+
+void releaseStagedMemoryCopyScratch(StagedMemoryCopyScratch&) {}
+
 void warmupNoBlockCopy() {}
 
 }  // namespace rtp_llm
diff --git a/rtp_llm/models_py/bindings/OpDefs.cc b/rtp_llm/models_py/bindings/OpDefs.cc
index 67bfe051d6..e104bc3ac4 100644
--- a/rtp_llm/models_py/bindings/OpDefs.cc
+++ b/rtp_llm/models_py/bindings/OpDefs.cc
@@ -28,8 +28,8 @@ void registerPyOpDefs(pybind11::module& m) {
         .def_readwrite("use_mla", &KVCache::use_mla, "Whether MLA cache layout is used")
         .def_readwrite("kv_lora_rank", &KVCache::kv_lora_rank, "MLA KV LoRA rank")
         .def_readwrite("rope_head_dim", &KVCache::rope_head_dim, "MLA RoPE head dimension")
-        .def_readwrite("layer_attn_types",
-                       &KVCache::layer_attn_types,
+        .def_readwrite("layer_group_types",
+                       &KVCache::layer_group_types,
                        "Per-layer attention type (CacheGroupType::FULL or LINEAR). "
                        "Empty = all layers treated as FULL (backward compatibility).")
         .def("get_layer_cache",
diff --git a/rtp_llm/models_py/bindings/OpDefs.h b/rtp_llm/models_py/bindings/OpDefs.h
index adf0d41656..37fa931b05 100644
--- a/rtp_llm/models_py/bindings/OpDefs.h
+++ b/rtp_llm/models_py/bindings/OpDefs.h
@@ -4,7 +4,7 @@
 #include <pybind11/stl.h>
 #include <pybind11/embed.h>
 #include <torch/extension.h>
-#include "rtp_llm/cpp/cache/CacheGroupType.h"
+#include "rtp_llm/cpp/cache/spec/CacheGroupType.h"
 #include "rtp_llm/cpp/model_utils/AttentionConfig.h"
 #include "rtp_llm/models_py/bindings/ParamsBase.h"
 #include "rtp_llm/cpp/utils/Logger.h"
@@ -44,14 +44,14 @@ struct KVCache {
     int                        rope_head_dim             = 0;
 
     // Per-layer attention type (CacheGroupType::FULL or LINEAR).
-    std::vector<rtp_llm::CacheGroupType> layer_attn_types;
+    std::vector<rtp_llm::CacheGroupType> layer_group_types;
 
     LayerKVCache getLayerCache(int idx) {
         LayerKVCache layer_cache;
         layer_cache.layer_id = idx;
 
         // Determine whether this layer is a full-attention layer.
-        if (idx < 0 || static_cast<size_t>(idx) >= layer_attn_types.size())
+        if (idx < 0 || static_cast<size_t>(idx) >= layer_group_types.size())
             throw std::runtime_error("Invalid layer index: " + std::to_string(idx));
         auto          base = kv_cache_base_by_layer[idx];
         torch::Tensor scale;
@@ -59,7 +59,7 @@ struct KVCache {
             scale = kv_scale_base_by_layer[idx];
         }
 
-        const bool is_full = layer_attn_types[static_cast<size_t>(idx)] == rtp_llm::CacheGroupType::FULL;
+        const bool is_full = layer_group_types[static_cast<size_t>(idx)] == rtp_llm::CacheGroupType::FULL;
 
         if (!is_full) {
             // Linear/SSM attention layer: return the raw cache tensor unchanged.
diff --git a/rtp_llm/models_py/bindings/core/ExecOps.cc b/rtp_llm/models_py/bindings/core/ExecOps.cc
index 4ba2a154ce..ad98ca2713 100644
--- a/rtp_llm/models_py/bindings/core/ExecOps.cc
+++ b/rtp_llm/models_py/bindings/core/ExecOps.cc
@@ -2,7 +2,7 @@
 #include "rtp_llm/models_py/bindings/core/CommonDefines.h"
 #include "rtp_llm/cpp/disaggregate/cache_store/CacheStore.h"
 #include "rtp_llm/cpp/utils/Logger.h"
-#include "rtp_llm/cpp/cache/CacheGroupType.h"
+#include "rtp_llm/cpp/cache/spec/CacheGroupType.h"
 #include "rtp_llm/cpp/utils/KVCacheUtils.h"
 #include "rtp_llm/cpp/utils/ErrorCode.h"
 #include "rtp_llm/cpp/utils/StackTrace.h"
diff --git a/rtp_llm/models_py/bindings/core/TensorHolder.h b/rtp_llm/models_py/bindings/core/TensorHolder.h
new file mode 100644
index 0000000000..c0db0e7009
--- /dev/null
+++ b/rtp_llm/models_py/bindings/core/TensorHolder.h
@@ -0,0 +1,40 @@
+#pragma once
+
+#include <queue>
+#include <vector>
+
+#include <torch/extension.h>
+
+namespace rtp_llm {
+
+struct TensorHolder {
+    static constexpr size_t kReleasedHoldRounds = 2;
+
+    std::vector<torch::Tensor>              tensors;
+    std::queue<std::vector<torch::Tensor>> clear_tensors;
+
+    void hold_host(const torch::Tensor& tensor) {
+        if (tensor.defined() && tensor.device().is_cpu()) {
+            tensors.push_back(tensor);
+        }
+    }
+
+    void hold(const torch::Tensor& tensor) {
+        if (tensor.defined()) {
+            tensors.push_back(tensor);
+        }
+    }
+
+    void release() {
+        // Move the current hold set into clear_tensors. Keep two released
+        // rounds alive so tensors created for async H2D/D2H copies or CUDA
+        // kernels are not freed until the third release point.
+        clear_tensors.push(std::move(tensors));
+        tensors.clear();
+        while (clear_tensors.size() > kReleasedHoldRounds) {
+            clear_tensors.pop();
+        }
+    }
+};
+
+}  // namespace rtp_llm
diff --git a/rtp_llm/models_py/bindings/cuda/NoBlockCopy.cc b/rtp_llm/models_py/bindings/cuda/NoBlockCopy.cc
index 7b5fdcc70f..8e72ace7ed 100644
--- a/rtp_llm/models_py/bindings/cuda/NoBlockCopy.cc
+++ b/rtp_llm/models_py/bindings/cuda/NoBlockCopy.cc
@@ -1,7 +1,10 @@
 #include "rtp_llm/models_py/bindings/NoBlockCopy.h"
+#include "rtp_llm/models_py/bindings/common/kernels/sm_copy_kernel.h"
 #include "rtp_llm/models_py/bindings/cuda/SplitKvCacheCopy.h"
 #include "rtp_llm/models_py/bindings/cuda/cuda_host_utils.h"
 
+#include <algorithm>
+#include <cstring>
 #include <cuda_runtime.h>
 #include <ATen/cuda/CUDAContext.h>
 #include <c10/cuda/CUDAGuard.h>
@@ -15,8 +18,189 @@ at::cuda::CUDAStream& getNoBlockCopyStream() {
     return stream;
 }
 
+enum class HostCoverage {
+    Invalid,
+    Partial,
+    Full,
+};
+
+HostCoverage checkHostCoverage(const StagedMemoryCopyParams& params) {
+    std::vector<std::pair<size_t, size_t>> ranges;
+    ranges.reserve(params.tiles.size());
+    for (const auto& tile : params.tiles) {
+        if (tile.bytes == 0) {
+            continue;
+        }
+        if (tile.host_offset > params.host_bytes || tile.bytes > params.host_bytes - tile.host_offset) {
+            return HostCoverage::Invalid;
+        }
+        ranges.emplace_back(tile.host_offset, tile.bytes);
+    }
+    std::sort(ranges.begin(), ranges.end());
+
+    size_t covered = 0;
+    bool   has_gap = false;
+    for (const auto& [offset, bytes] : ranges) {
+        if (bytes == 0 || offset < covered) {
+            return HostCoverage::Invalid;
+        }
+        if (offset > covered) {
+            has_gap = true;
+        }
+        covered = offset + bytes;
+    }
+    if (covered > params.host_bytes) {
+        return HostCoverage::Invalid;
+    }
+    return (!has_gap && covered == params.host_bytes) ? HostCoverage::Full : HostCoverage::Partial;
+}
+
+bool checkHostSegments(const StagedMemoryCopyParams& params) {
+    if (params.host_segments.empty()) {
+        return params.host_base != nullptr && params.host_bytes > 0;
+    }
+
+    std::vector<std::pair<size_t, size_t>> ranges;
+    ranges.reserve(params.host_segments.size());
+    for (const auto& segment : params.host_segments) {
+        if (segment.host == nullptr || segment.bytes == 0) {
+            return false;
+        }
+        if (segment.host_offset > params.host_bytes || segment.bytes > params.host_bytes - segment.host_offset) {
+            return false;
+        }
+        ranges.emplace_back(segment.host_offset, segment.bytes);
+    }
+    std::sort(ranges.begin(), ranges.end());
+
+    size_t covered = 0;
+    for (const auto& [offset, bytes] : ranges) {
+        if (offset < covered) {
+            return false;
+        }
+        covered = offset + bytes;
+    }
+    return covered <= params.host_bytes;
+}
+
+void packHostSegments(const StagedMemoryCopyParams& params, void* host_staging) {
+    auto* base = static_cast<char*>(host_staging);
+    for (const auto& segment : params.host_segments) {
+        std::memcpy(base + segment.host_offset, segment.host, segment.bytes);
+    }
+}
+
+void unpackHostSegments(const StagedMemoryCopyParams& params, const void* host_staging) {
+    const auto* base = static_cast<const char*>(host_staging);
+    for (const auto& segment : params.host_segments) {
+        std::memcpy(segment.host, base + segment.host_offset, segment.bytes);
+    }
+}
+
+void copyHostToPinnedStaging(const StagedMemoryCopyParams& params, void* host_staging) {
+    if (params.host_segments.empty()) {
+        std::memcpy(host_staging, params.host_base, params.host_bytes);
+        return;
+    }
+    packHostSegments(params, host_staging);
+}
+
+void copyPinnedStagingToHost(const StagedMemoryCopyParams& params, const void* host_staging) {
+    if (params.host_segments.empty()) {
+        std::memcpy(params.host_base, host_staging, params.host_bytes);
+        return;
+    }
+    unpackHostSegments(params, host_staging);
+}
+
+void releaseDevicePointer(void*& ptr) {
+    if (ptr != nullptr) {
+        (void)cudaFree(ptr);
+        ptr = nullptr;
+    }
+}
+
+void releaseMetadataScratch(StagedMemoryCopyScratch& scratch) {
+    releaseDevicePointer(scratch.device_ptrs);
+    releaseDevicePointer(scratch.device_offsets);
+    releaseDevicePointer(scratch.device_sizes);
+    scratch.meta_capacity = 0;
+}
+
+bool ensureStagedMemoryCopyScratch(StagedMemoryCopyScratch& scratch,
+                                   int                      device_index,
+                                   size_t                   host_bytes,
+                                   size_t                   tile_num) {
+    if (scratch.device_index >= 0 && scratch.device_index != device_index) {
+        releaseStagedMemoryCopyScratch(scratch);
+    }
+    check_cuda_value(cudaSetDevice(device_index));
+    scratch.device_index = device_index;
+
+    if (scratch.host_capacity < host_bytes) {
+        if (scratch.host_staging != nullptr) {
+            (void)cudaFreeHost(scratch.host_staging);
+            scratch.host_staging = nullptr;
+            scratch.host_capacity = 0;
+        }
+        auto err = cudaHostAlloc(&scratch.host_staging, host_bytes, cudaHostAllocDefault);
+        if (err != cudaSuccess) {
+            RTP_LLM_LOG_WARNING("execStagedMemoryCopy failed to allocate pinned host staging: %s",
+                                cudaGetErrorString(err));
+            return false;
+        }
+        scratch.host_capacity = host_bytes;
+    }
+
+    if (scratch.device_capacity < host_bytes) {
+        releaseDevicePointer(scratch.device_staging);
+        auto err = cudaMalloc(&scratch.device_staging, host_bytes);
+        if (err != cudaSuccess) {
+            scratch.device_capacity = 0;
+            RTP_LLM_LOG_WARNING("execStagedMemoryCopy failed to allocate device staging: %s",
+                                cudaGetErrorString(err));
+            return false;
+        }
+        scratch.device_capacity = host_bytes;
+    }
+
+    if (scratch.meta_capacity < tile_num) {
+        releaseMetadataScratch(scratch);
+        auto err = cudaMalloc(&scratch.device_ptrs, tile_num * sizeof(void*));
+        if (err == cudaSuccess) {
+            err = cudaMalloc(&scratch.device_offsets, tile_num * sizeof(size_t));
+        }
+        if (err == cudaSuccess) {
+            err = cudaMalloc(&scratch.device_sizes, tile_num * sizeof(size_t));
+        }
+        if (err != cudaSuccess) {
+            releaseMetadataScratch(scratch);
+            RTP_LLM_LOG_WARNING("execStagedMemoryCopy failed to allocate device metadata: %s",
+                                cudaGetErrorString(err));
+            return false;
+        }
+        scratch.meta_capacity = tile_num;
+    }
+    return true;
+}
+
 }  // namespace
 
+void releaseStagedMemoryCopyScratch(StagedMemoryCopyScratch& scratch) {
+    if (scratch.device_index >= 0) {
+        (void)cudaSetDevice(scratch.device_index);
+    }
+    if (scratch.host_staging != nullptr) {
+        (void)cudaFreeHost(scratch.host_staging);
+    }
+    releaseDevicePointer(scratch.device_staging);
+    releaseMetadataScratch(scratch);
+    scratch.host_staging    = nullptr;
+    scratch.host_capacity   = 0;
+    scratch.device_capacity = 0;
+    scratch.device_index    = -1;
+}
+
 void execNoBlockCopy(const MultiCopyParams& params) {
     RTP_LLM_CHECK_WITH_INFO(params.multi_src.size() == params.multi_dst.size(),
                             "multi_src.size(%zu) != multi_dst.size(%zu)",
@@ -61,6 +245,203 @@ void execNoBlockCopy(const MultiCopyParams& params) {
     check_cuda_error();
 }
 
+bool execBatchedMemoryCopy(const BatchedMemoryCopyParams& params) {
+    if (params.tiles.empty()) {
+        return true;
+    }
+    if (params.device_index < 0) {
+        RTP_LLM_LOG_WARNING("execBatchedMemoryCopy failed: invalid device_index=%d", params.device_index);
+        return false;
+    }
+
+#if CUDART_VERSION >= 12080
+    check_cuda_value(cudaSetDevice(params.device_index));
+    auto stream = getNoBlockCopyStream().stream();
+
+    const size_t tile_num = params.tiles.size();
+    std::vector<void*>       dsts;
+    std::vector<const void*> srcs;
+    std::vector<size_t>      sizes;
+    dsts.reserve(tile_num);
+    srcs.reserve(tile_num);
+    sizes.reserve(tile_num);
+    for (const auto& tile : params.tiles) {
+        if (tile.dst == nullptr || tile.src == nullptr || tile.bytes == 0) {
+            continue;
+        }
+        dsts.push_back(tile.dst);
+        srcs.push_back(tile.src);
+        sizes.push_back(tile.bytes);
+    }
+    if (dsts.empty()) {
+        return true;
+    }
+
+    cudaMemcpyAttributes attr{};
+    attr.srcAccessOrder = cudaMemcpySrcAccessOrderStream;
+    size_t attr_idx     = 0;
+#if CUDART_VERSION >= 13000
+    auto err = cudaMemcpyBatchAsync(dsts.data(), srcs.data(), sizes.data(), dsts.size(), &attr, &attr_idx, 1, stream);
+#else
+    std::vector<void*> mutable_srcs;
+    mutable_srcs.reserve(srcs.size());
+    for (auto* src : srcs) {
+        mutable_srcs.push_back(const_cast<void*>(src));
+    }
+    size_t fail_idx = 0;
+    auto   err = cudaMemcpyBatchAsync(
+        dsts.data(), mutable_srcs.data(), sizes.data(), dsts.size(), &attr, &attr_idx, 1, &fail_idx, stream);
+#endif
+    if (err == cudaSuccess) {
+        err = cudaStreamSynchronize(stream);
+    }
+    if (err != cudaSuccess) {
+        RTP_LLM_LOG_WARNING("execBatchedMemoryCopy failed: tiles=%zu, error=%s", dsts.size(), cudaGetErrorString(err));
+        return false;
+    }
+    check_cuda_error();
+    return true;
+#else
+    RTP_LLM_LOG_DEBUG("execBatchedMemoryCopy unavailable: CUDART_VERSION=%d", CUDART_VERSION);
+    return false;
+#endif
+}
+
+bool execStagedMemoryCopy(const StagedMemoryCopyParams& params, StagedMemoryCopyScratch* scratch) {
+    if (params.tiles.empty()) {
+        return true;
+    }
+    if (params.device_index < 0 || params.host_bytes == 0 || !checkHostSegments(params)) {
+        RTP_LLM_LOG_WARNING("execStagedMemoryCopy failed: device=%d host_base=%p host_bytes=%zu host_segments=%zu",
+                            params.device_index,
+                            params.host_base,
+                            params.host_bytes,
+                            params.host_segments.size());
+        return false;
+    }
+    const auto host_coverage = checkHostCoverage(params);
+    if (host_coverage == HostCoverage::Invalid) {
+        RTP_LLM_LOG_WARNING("execStagedMemoryCopy failed: invalid/overlapping host coverage, tiles=%zu bytes=%zu",
+                            params.tiles.size(),
+                            params.host_bytes);
+        return false;
+    }
+
+    check_cuda_value(cudaSetDevice(params.device_index));
+    auto stream = getNoBlockCopyStream().stream();
+
+    std::vector<void*>  h_ptrs;
+    std::vector<size_t> h_offsets;
+    std::vector<size_t> h_sizes;
+    h_ptrs.reserve(params.tiles.size());
+    h_offsets.reserve(params.tiles.size());
+    h_sizes.reserve(params.tiles.size());
+    for (const auto& tile : params.tiles) {
+        if (tile.gpu == nullptr || tile.bytes == 0) {
+            continue;
+        }
+        if (tile.host_offset > params.host_bytes || tile.bytes > params.host_bytes - tile.host_offset) {
+            RTP_LLM_LOG_WARNING("execStagedMemoryCopy failed: tile out of host span, off=%zu bytes=%zu host=%zu",
+                                tile.host_offset,
+                                tile.bytes,
+                                params.host_bytes);
+            return false;
+        }
+        h_ptrs.push_back(tile.gpu);
+        h_offsets.push_back(tile.host_offset);
+        h_sizes.push_back(tile.bytes);
+    }
+    if (h_ptrs.empty()) {
+        return true;
+    }
+
+    StagedMemoryCopyScratch local_scratch;
+    auto*                   work_scratch = scratch != nullptr ? scratch : &local_scratch;
+    auto cleanup_local_scratch = [&]() {
+        if (scratch == nullptr) {
+            releaseStagedMemoryCopyScratch(local_scratch);
+        }
+    };
+
+    const size_t tile_num = h_ptrs.size();
+    if (!ensureStagedMemoryCopyScratch(*work_scratch, params.device_index, params.host_bytes, tile_num)) {
+        cleanup_local_scratch();
+        return false;
+    }
+
+    auto err = cudaMemcpyAsync(
+        work_scratch->device_ptrs, h_ptrs.data(), tile_num * sizeof(void*), cudaMemcpyHostToDevice, stream);
+    if (err == cudaSuccess) {
+        err = cudaMemcpyAsync(work_scratch->device_offsets,
+                              h_offsets.data(),
+                              tile_num * sizeof(size_t),
+                              cudaMemcpyHostToDevice,
+                              stream);
+    }
+    if (err == cudaSuccess) {
+        err = cudaMemcpyAsync(
+            work_scratch->device_sizes, h_sizes.data(), tile_num * sizeof(size_t), cudaMemcpyHostToDevice, stream);
+    }
+
+    if (err == cudaSuccess && params.direction == StagedMemoryCopyDirection::H2D) {
+        copyHostToPinnedStaging(params, work_scratch->host_staging);
+        err = cudaMemcpyAsync(work_scratch->device_staging,
+                              work_scratch->host_staging,
+                              params.host_bytes,
+                              cudaMemcpyHostToDevice,
+                              stream);
+        if (err == cudaSuccess) {
+            sDevMPS::launch_scatter_copy_var_nooffset(
+                work_scratch->device_staging,
+                reinterpret_cast<const size_t*>(work_scratch->device_offsets),
+                reinterpret_cast<const size_t*>(work_scratch->device_sizes),
+                reinterpret_cast<void**>(work_scratch->device_ptrs),
+                static_cast<int>(tile_num),
+                0,
+                stream);
+            err = cudaGetLastError();
+        }
+    } else if (err == cudaSuccess) {
+        sDevMPS::launch_gather_copy_var_nooffset(
+            reinterpret_cast<const void**>(work_scratch->device_ptrs),
+            reinterpret_cast<const size_t*>(work_scratch->device_sizes),
+            reinterpret_cast<const size_t*>(work_scratch->device_offsets),
+            work_scratch->device_staging,
+            static_cast<int>(tile_num),
+            0,
+            stream);
+        err = cudaGetLastError();
+        if (err == cudaSuccess) {
+            err = cudaMemcpyAsync(work_scratch->host_staging,
+                                  work_scratch->device_staging,
+                                  params.host_bytes,
+                                  cudaMemcpyDeviceToHost,
+                                  stream);
+        }
+    }
+
+    if (err == cudaSuccess) {
+        err = cudaStreamSynchronize(stream);
+    } else {
+        (void)cudaStreamSynchronize(stream);
+    }
+    if (err == cudaSuccess && params.direction == StagedMemoryCopyDirection::D2H) {
+        copyPinnedStagingToHost(params, work_scratch->host_staging);
+    }
+    if (err != cudaSuccess) {
+        RTP_LLM_LOG_WARNING("execStagedMemoryCopy failed: tiles=%zu bytes=%zu direction=%s error=%s",
+                            tile_num,
+                            params.host_bytes,
+                            params.direction == StagedMemoryCopyDirection::H2D ? "H2D" : "D2H",
+                            cudaGetErrorString(err));
+        cleanup_local_scratch();
+        return false;
+    }
+    cleanup_local_scratch();
+    check_cuda_error();
+    return true;
+}
+
 void warmupNoBlockCopy() {
     if (!warmupSplitKvCopyKernels(at::cuda::getCurrentCUDAStream().stream())) {
         RTP_LLM_LOG_WARNING("warmupSplitKvCopyKernels failed; split-KV copy may JIT on first use");
diff --git a/rtp_llm/models_py/bindings/cuda/kernels/attention_input_metadata.cu b/rtp_llm/models_py/bindings/cuda/kernels/attention_input_metadata.cu
new file mode 100644
index 0000000000..599eda7709
--- /dev/null
+++ b/rtp_llm/models_py/bindings/cuda/kernels/attention_input_metadata.cu
@@ -0,0 +1,107 @@
+#include "rtp_llm/models_py/bindings/cuda/kernels/attention_input_metadata.h"
+
+#include <c10/util/Exception.h>
+#include <cuda_runtime.h>
+#include <cstdint>
+
+namespace rtp_llm {
+
+namespace {
+
+__global__ void buildAttentionInputMetadataKernel(const int32_t* __restrict__ input_lengths,
+                                                  const int32_t* __restrict__ prefix_lengths,
+                                                  int32_t* __restrict__ cu_seqlens,
+                                                  int32_t* __restrict__ cu_kv_seqlens,
+                                                  int32_t* __restrict__ padding_offset,
+                                                  int32_t batch_size,
+                                                  int32_t total_tokens) {
+    if (blockIdx.x != 0 || threadIdx.x != 0) {
+        return;
+    }
+
+    int32_t max_input_len = 0;
+    int32_t q_acc         = 0;
+    int32_t kv_acc        = 0;
+    cu_seqlens[0]         = 0;
+    cu_kv_seqlens[0]      = 0;
+
+    for (int32_t b = 0; b < batch_size; ++b) {
+        const int32_t input_len  = input_lengths[b];
+        const int32_t prefix_len = prefix_lengths ? prefix_lengths[b] : 0;
+        max_input_len            = max_input_len > input_len ? max_input_len : input_len;
+        q_acc += input_len;
+        kv_acc += input_len + prefix_len;
+        cu_seqlens[b + 1]    = q_acc;
+        cu_kv_seqlens[b + 1] = kv_acc;
+    }
+
+    if (!padding_offset || total_tokens <= 0) {
+        return;
+    }
+
+    int32_t out_idx    = 0;
+    int32_t cum_offset = 0;
+    for (int32_t b = 0; b < batch_size; ++b) {
+        const int32_t input_len = input_lengths[b];
+        for (int32_t j = 0; j < input_len && out_idx < total_tokens; ++j) {
+            padding_offset[out_idx++] = cum_offset;
+        }
+        cum_offset += max_input_len - input_len;
+    }
+}
+
+}  // namespace
+
+void invokeBuildAttentionInputMetadata(const at::Tensor& input_lengths,
+                                       const at::Tensor& prefix_lengths,
+                                       at::Tensor&       cu_seqlens,
+                                       at::Tensor&       cu_kv_seqlens,
+                                       at::Tensor&       padding_offset,
+                                       cudaStream_t      stream) {
+    TORCH_CHECK(input_lengths.defined(), "input_lengths must be defined");
+    TORCH_CHECK(input_lengths.is_cuda(), "input_lengths must be a CUDA tensor");
+    TORCH_CHECK(input_lengths.scalar_type() == at::kInt, "input_lengths must be int32");
+    TORCH_CHECK(input_lengths.is_contiguous(), "input_lengths must be contiguous");
+    TORCH_CHECK(!prefix_lengths.defined() || prefix_lengths.numel() == 0 || prefix_lengths.is_cuda(),
+                "prefix_lengths must be CUDA or empty");
+    TORCH_CHECK(!prefix_lengths.defined() || prefix_lengths.numel() == 0 || prefix_lengths.scalar_type() == at::kInt,
+                "prefix_lengths must be int32");
+    TORCH_CHECK(cu_seqlens.is_cuda() && cu_seqlens.scalar_type() == at::kInt, "cu_seqlens must be CUDA int32");
+    TORCH_CHECK(cu_kv_seqlens.is_cuda() && cu_kv_seqlens.scalar_type() == at::kInt, "cu_kv_seqlens must be CUDA int32");
+    TORCH_CHECK(!padding_offset.defined() || padding_offset.is_cuda(), "padding_offset must be CUDA");
+
+    const auto batch_size   = static_cast<int32_t>(input_lengths.size(0));
+    const auto total_tokens = padding_offset.defined() ? static_cast<int32_t>(padding_offset.numel()) : 0;
+    if (batch_size == 0) {
+        if (cu_seqlens.numel() > 0) {
+            cu_seqlens.zero_();
+        }
+        if (cu_kv_seqlens.numel() > 0) {
+            cu_kv_seqlens.zero_();
+        }
+        if (padding_offset.defined() && padding_offset.numel() > 0) {
+            padding_offset.zero_();
+        }
+        return;
+    }
+
+    const int32_t* prefix_ptr = nullptr;
+    if (prefix_lengths.defined() && prefix_lengths.numel() > 0) {
+        TORCH_CHECK(prefix_lengths.is_contiguous(), "prefix_lengths must be contiguous");
+        TORCH_CHECK(prefix_lengths.size(0) >= batch_size, "prefix_lengths size must cover input_lengths");
+        prefix_ptr = prefix_lengths.data_ptr<int32_t>();
+    }
+
+    buildAttentionInputMetadataKernel<<<1, 1, 0, stream>>>(
+        input_lengths.data_ptr<int32_t>(),
+        prefix_ptr,
+        cu_seqlens.data_ptr<int32_t>(),
+        cu_kv_seqlens.data_ptr<int32_t>(),
+        padding_offset.defined() && padding_offset.numel() > 0 ? padding_offset.data_ptr<int32_t>() : nullptr,
+        batch_size,
+        total_tokens);
+    const auto result = cudaGetLastError();
+    TORCH_CHECK(result == cudaSuccess, "build attention input metadata kernel failed: ", cudaGetErrorString(result));
+}
+
+}  // namespace rtp_llm
diff --git a/rtp_llm/models_py/bindings/cuda/kernels/attention_input_metadata.h b/rtp_llm/models_py/bindings/cuda/kernels/attention_input_metadata.h
new file mode 100644
index 0000000000..866223e499
--- /dev/null
+++ b/rtp_llm/models_py/bindings/cuda/kernels/attention_input_metadata.h
@@ -0,0 +1,15 @@
+#pragma once
+
+#include <ATen/ATen.h>
+#include <cuda_runtime_api.h>
+
+namespace rtp_llm {
+
+void invokeBuildAttentionInputMetadata(const at::Tensor& input_lengths,
+                                       const at::Tensor& prefix_lengths,
+                                       at::Tensor&       cu_seqlens,
+                                       at::Tensor&       cu_kv_seqlens,
+                                       at::Tensor&       padding_offset,
+                                       cudaStream_t      stream);
+
+}  // namespace rtp_llm
diff --git a/rtp_llm/models_py/bindings/cuda/kernels/cuda_graph_prepare.cu b/rtp_llm/models_py/bindings/cuda/kernels/cuda_graph_prepare.cu
new file mode 100644
index 0000000000..db07d768a0
--- /dev/null
+++ b/rtp_llm/models_py/bindings/cuda/kernels/cuda_graph_prepare.cu
@@ -0,0 +1,373 @@
+#include "rtp_llm/models_py/bindings/cuda/kernels/cuda_graph_prepare.h"
+
+#include <algorithm>
+#include <c10/util/Exception.h>
+#include <cuda_runtime.h>
+
+namespace rtp_llm {
+
+namespace {
+
+__global__ void cudaGraphPrepareFillKernel(CudaGraphPrepareFillParams params) {
+    const int64_t tid    = static_cast<int64_t>(blockIdx.x) * blockDim.x + threadIdx.x;
+    const int64_t stride = static_cast<int64_t>(blockDim.x) * gridDim.x;
+
+    for (int32_t region_idx = 0; region_idx < params.region_count; ++region_idx) {
+        const auto region = params.regions[region_idx];
+        if (region.ptr == nullptr || region.count <= 0) {
+            continue;
+        }
+        for (int64_t i = tid; i < region.count; i += stride) {
+            region.ptr[i] = region.value;
+        }
+    }
+}
+
+__global__ void prepareFlashInferDecodeParamsKernel(const int32_t* sequence_lengths_plus_1,
+                                                    const int32_t* block_ids,
+                                                    int32_t*       batch_indice,
+                                                    int32_t*       page_indice,
+                                                    int32_t*       decode_page_indptr,
+                                                    int32_t*       paged_kv_last_page_len,
+                                                    int32_t*       qo_indptr,
+                                                    int32_t*       kvlen,
+                                                    int32_t*       positions,
+                                                    int64_t*       slot_mapping,
+                                                    int32_t        batch_size,
+                                                    int32_t        max_blocks_per_batch,
+                                                    int32_t        seq_size_per_block,
+                                                    int32_t        captured_batch_capacity) {
+    // Replay path is small-batch metadata; one CUDA block avoids any host prefix-sum.
+    if (threadIdx.x != 0 || blockIdx.x != 0) {
+        return;
+    }
+
+    int32_t page_offset        = 0;
+    decode_page_indptr[0]      = 0;
+    qo_indptr[0]               = 0;
+    const int32_t safe_page_sz = seq_size_per_block > 0 ? seq_size_per_block : 1;
+
+    for (int32_t batch = 0; batch < batch_size; ++batch) {
+        const int32_t seq_len = sequence_lengths_plus_1[batch] > 1 ? sequence_lengths_plus_1[batch] : 1;
+        const int32_t pages   = (seq_len + safe_page_sz - 1) / safe_page_sz;
+
+        batch_indice[batch]           = batch;
+        positions[batch]              = seq_len - 1;
+        kvlen[batch]                  = seq_len;
+        paged_kv_last_page_len[batch] = (seq_len - 1) % safe_page_sz + 1;
+        const int32_t block_index     = (seq_len - 1) / safe_page_sz;
+        const int32_t block_offset    = (seq_len - 1) % safe_page_sz;
+        const int32_t block_number =
+            block_index < max_blocks_per_batch ? block_ids[batch * max_blocks_per_batch + block_index] : 0;
+        slot_mapping[batch] = static_cast<int64_t>(block_number) * safe_page_sz + static_cast<int64_t>(block_offset);
+
+        const int32_t pages_to_copy = pages < max_blocks_per_batch ? pages : max_blocks_per_batch;
+        for (int32_t page = 0; page < pages_to_copy; ++page) {
+            page_indice[page_offset + page] = block_ids[batch * max_blocks_per_batch + page];
+        }
+        page_offset += pages_to_copy;
+        decode_page_indptr[batch + 1] = page_offset;
+        qo_indptr[batch + 1]          = batch + 1;
+    }
+
+    // Decode CUDA graph replay can use a graph captured for a larger batch
+    // than the current live batch. Clear stale entries so the captured kernels
+    // do not process phantom rows with old kvlen/page metadata and block_id=0.
+    for (int32_t batch = batch_size; batch < captured_batch_capacity; ++batch) {
+        batch_indice[batch]           = 0;
+        positions[batch]              = 0;
+        kvlen[batch]                  = 0;
+        paged_kv_last_page_len[batch] = 0;
+        slot_mapping[batch]           = -1;
+        decode_page_indptr[batch + 1] = page_offset;
+        qo_indptr[batch + 1]          = batch_size;
+    }
+}
+
+// Generic prefill cuda graph metadata kernel. Used by both:
+//   - target verify (SparseMla, with sparse-specific outputs)
+//   - draft prefill (FlashInfer, sparse-specific outputs as nullptr)
+// Pass nullptr for ks/ke/expanded_seq_lens/topk_indices_offset to skip those.
+__global__ void prepareSparseMlaTargetVerifyParamsKernel(const int32_t* input_lengths,
+                                                         const int32_t* prefix_lengths,
+                                                         const int32_t* block_ids,
+                                                         int32_t*       batch_indice,
+                                                         int32_t*       page_indice,
+                                                         int32_t*       decode_page_indptr,
+                                                         int32_t*       paged_kv_last_page_len,
+                                                         int32_t*       qo_indptr,
+                                                         int32_t*       prefill_ragged_kv_len_indptr,
+                                                         int32_t*       kvlen,
+                                                         int32_t*       positions,
+                                                         int64_t*       slot_mapping,
+                                                         int32_t*       expanded_seq_lens,
+                                                         int32_t*       topk_indices_offset,
+                                                         int32_t*       ks,
+                                                         int32_t*       ke,
+                                                         int32_t        batch_size,
+                                                         int32_t        max_blocks_per_batch,
+                                                         int32_t        seq_size_per_block,
+                                                         int32_t        captured_batch_capacity,
+                                                         int32_t        captured_total_tokens) {
+    if (threadIdx.x != 0 || blockIdx.x != 0) {
+        return;
+    }
+
+    const int32_t safe_page_sz = seq_size_per_block > 0 ? seq_size_per_block : 1;
+    int32_t       token_offset = 0;
+    int32_t       page_offset  = 0;
+    int32_t       accu_kv_len  = 0;
+    int32_t       k_offset     = 0;
+
+    decode_page_indptr[0]           = 0;
+    qo_indptr[0]                    = 0;
+    prefill_ragged_kv_len_indptr[0] = 0;
+
+    for (int32_t i = 0; i < batch_size; ++i) {
+        const int32_t input_len  = input_lengths[i];
+        const int32_t prefix_len = prefix_lengths[i];
+        const int32_t kv_len     = input_len + prefix_len;
+
+        for (int32_t j = 0; j < input_len; ++j) {
+            const int32_t position      = j + prefix_len;
+            batch_indice[token_offset]  = i;
+            positions[token_offset]     = position;
+            const int32_t seq_len_value = kv_len - input_len + 1 + j;
+            if (expanded_seq_lens != nullptr) {
+                expanded_seq_lens[token_offset] = seq_len_value;
+            }
+            if (topk_indices_offset != nullptr) {
+                topk_indices_offset[token_offset] = 0;
+            }
+            if (ks != nullptr) {
+                ks[token_offset] = k_offset;
+            }
+            if (ke != nullptr) {
+                ke[token_offset] = k_offset + seq_len_value;
+            }
+
+            // slot_mapping: physical KV cache slot for this token
+            const int32_t block_index  = position / safe_page_sz;
+            const int32_t block_offset = position % safe_page_sz;
+            const int32_t block_number =
+                block_index < max_blocks_per_batch ? block_ids[i * max_blocks_per_batch + block_index] : 0;
+            slot_mapping[token_offset] =
+                static_cast<int64_t>(block_number) * safe_page_sz + static_cast<int64_t>(block_offset);
+
+            token_offset++;
+        }
+        k_offset += kv_len;
+        accu_kv_len += kv_len;
+
+        kvlen[i]                    = kv_len;
+        paged_kv_last_page_len[i]   = (kv_len - 1) % safe_page_sz + 1;
+        const int32_t pages         = (kv_len + safe_page_sz - 1) / safe_page_sz;
+        const int32_t pages_to_copy = pages < max_blocks_per_batch ? pages : max_blocks_per_batch;
+        for (int32_t p = 0; p < pages_to_copy; ++p) {
+            page_indice[page_offset + p] = block_ids[i * max_blocks_per_batch + p];
+        }
+        page_offset += pages_to_copy;
+
+        decode_page_indptr[i + 1]           = page_offset;
+        qo_indptr[i + 1]                    = token_offset;
+        prefill_ragged_kv_len_indptr[i + 1] = accu_kv_len;
+    }
+
+    // Zero-fill stale entries beyond the active batch to prevent CUDA graph
+    // replay from processing phantom batch elements with stale metadata.
+    for (int32_t i = batch_size; i < captured_batch_capacity; ++i) {
+        kvlen[i]                            = 0;
+        paged_kv_last_page_len[i]           = 0;
+        decode_page_indptr[i + 1]           = page_offset;
+        qo_indptr[i + 1]                    = token_offset;
+        prefill_ragged_kv_len_indptr[i + 1] = accu_kv_len;
+    }
+    for (int32_t t = token_offset; t < captured_total_tokens; ++t) {
+        batch_indice[t] = 0;
+        positions[t]    = 0;
+        if (slot_mapping != nullptr)
+            slot_mapping[t] = -1;
+        if (expanded_seq_lens != nullptr)
+            expanded_seq_lens[t] = 0;
+        if (topk_indices_offset != nullptr)
+            topk_indices_offset[t] = 0;
+        if (ks != nullptr)
+            ks[t] = 0;
+        if (ke != nullptr)
+            ke[t] = 0;
+    }
+}
+
+}  // namespace
+
+void invokeCudaGraphPrepareFill(CudaGraphPrepareFillParams params, cudaStream_t stream) {
+    TORCH_CHECK(params.region_count >= 0 && params.region_count <= kMaxCudaGraphPrepareFillRegions,
+                "invalid cuda graph prepare fill region count: ",
+                params.region_count);
+
+    int64_t total_count = 0;
+    for (int32_t i = 0; i < params.region_count; ++i) {
+        total_count += params.regions[i].count > 0 ? params.regions[i].count : 0;
+    }
+    if (total_count <= 0) {
+        return;
+    }
+
+    constexpr int block_size = 256;
+    const int     blocks     = static_cast<int>(std::min<int64_t>((total_count + block_size - 1) / block_size, 1024));
+    cudaGraphPrepareFillKernel<<<blocks, block_size, 0, stream>>>(params);
+    const auto result = cudaGetLastError();
+    TORCH_CHECK(result == cudaSuccess, "cuda graph prepare fill kernel failed: ", cudaGetErrorString(result));
+}
+
+void invokePrepareFlashInferDecodeParams(const int32_t* sequence_lengths_plus_1,
+                                         const int32_t* block_ids,
+                                         int32_t*       batch_indice,
+                                         int32_t*       page_indice,
+                                         int32_t*       decode_page_indptr,
+                                         int32_t*       paged_kv_last_page_len,
+                                         int32_t*       qo_indptr,
+                                         int32_t*       kvlen,
+                                         int32_t*       positions,
+                                         int64_t*       slot_mapping,
+                                         int32_t        batch_size,
+                                         int32_t        max_blocks_per_batch,
+                                         int32_t        seq_size_per_block,
+                                         int32_t        captured_batch_capacity,
+                                         cudaStream_t   stream) {
+    TORCH_CHECK(sequence_lengths_plus_1 != nullptr, "sequence_lengths_plus_1 is null");
+    TORCH_CHECK(block_ids != nullptr, "block_ids is null");
+    TORCH_CHECK(batch_indice != nullptr && page_indice != nullptr && decode_page_indptr != nullptr
+                    && paged_kv_last_page_len != nullptr && qo_indptr != nullptr && kvlen != nullptr
+                    && positions != nullptr && slot_mapping != nullptr,
+                "FlashInfer decode metadata output buffer is null");
+    if (batch_size <= 0 || max_blocks_per_batch <= 0) {
+        return;
+    }
+    prepareFlashInferDecodeParamsKernel<<<1, 1, 0, stream>>>(sequence_lengths_plus_1,
+                                                             block_ids,
+                                                             batch_indice,
+                                                             page_indice,
+                                                             decode_page_indptr,
+                                                             paged_kv_last_page_len,
+                                                             qo_indptr,
+                                                             kvlen,
+                                                             positions,
+                                                             slot_mapping,
+                                                             batch_size,
+                                                             max_blocks_per_batch,
+                                                             seq_size_per_block,
+                                                             captured_batch_capacity);
+    const auto result = cudaGetLastError();
+    TORCH_CHECK(
+        result == cudaSuccess, "FlashInfer decode CUDA graph prepare kernel failed: ", cudaGetErrorString(result));
+}
+
+// Non-sparse prefill cuda graph kernel — sparse-specific outputs nullptr.
+void invokePrepareFlashInferPrefillParams(const int32_t* input_lengths,
+                                          const int32_t* prefix_lengths,
+                                          const int32_t* block_ids,
+                                          int32_t*       batch_indice,
+                                          int32_t*       page_indice,
+                                          int32_t*       decode_page_indptr,
+                                          int32_t*       paged_kv_last_page_len,
+                                          int32_t*       qo_indptr,
+                                          int32_t*       prefill_ragged_kv_len_indptr,
+                                          int32_t*       kvlen,
+                                          int32_t*       positions,
+                                          int64_t*       slot_mapping,
+                                          int32_t        batch_size,
+                                          int32_t        max_blocks_per_batch,
+                                          int32_t        seq_size_per_block,
+                                          int32_t        captured_total_tokens,
+                                          cudaStream_t   stream) {
+    TORCH_CHECK(input_lengths != nullptr, "input_lengths is null");
+    TORCH_CHECK(prefix_lengths != nullptr, "prefix_lengths is null");
+    TORCH_CHECK(block_ids != nullptr, "block_ids is null");
+    TORCH_CHECK(slot_mapping != nullptr, "slot_mapping is null");
+    if (batch_size <= 0 || max_blocks_per_batch <= 0) {
+        return;
+    }
+    prepareSparseMlaTargetVerifyParamsKernel<<<1, 1, 0, stream>>>(input_lengths,
+                                                                  prefix_lengths,
+                                                                  block_ids,
+                                                                  batch_indice,
+                                                                  page_indice,
+                                                                  decode_page_indptr,
+                                                                  paged_kv_last_page_len,
+                                                                  qo_indptr,
+                                                                  prefill_ragged_kv_len_indptr,
+                                                                  kvlen,
+                                                                  positions,
+                                                                  slot_mapping,
+                                                                  /*expanded_seq_lens=*/nullptr,
+                                                                  /*topk_indices_offset=*/nullptr,
+                                                                  /*ks=*/nullptr,
+                                                                  /*ke=*/nullptr,
+                                                                  batch_size,
+                                                                  max_blocks_per_batch,
+                                                                  seq_size_per_block,
+                                                                  batch_size,
+                                                                  captured_total_tokens);
+    const auto result = cudaGetLastError();
+    TORCH_CHECK(
+        result == cudaSuccess, "FlashInfer prefill CUDA graph prepare kernel failed: ", cudaGetErrorString(result));
+}
+
+void invokePrepareSparseMlaTargetVerifyParams(const int32_t* input_lengths,
+                                              const int32_t* prefix_lengths,
+                                              const int32_t* block_ids,
+                                              int32_t*       batch_indice,
+                                              int32_t*       page_indice,
+                                              int32_t*       decode_page_indptr,
+                                              int32_t*       paged_kv_last_page_len,
+                                              int32_t*       qo_indptr,
+                                              int32_t*       prefill_ragged_kv_len_indptr,
+                                              int32_t*       kvlen,
+                                              int32_t*       positions,
+                                              int64_t*       slot_mapping,
+                                              int32_t*       expanded_seq_lens,
+                                              int32_t*       topk_indices_offset,
+                                              int32_t*       ks,
+                                              int32_t*       ke,
+                                              int32_t        batch_size,
+                                              int32_t        max_blocks_per_batch,
+                                              int32_t        seq_size_per_block,
+                                              int32_t        captured_batch_capacity,
+                                              int32_t        captured_total_tokens,
+                                              cudaStream_t   stream) {
+    TORCH_CHECK(input_lengths != nullptr, "input_lengths is null");
+    TORCH_CHECK(prefix_lengths != nullptr, "prefix_lengths is null");
+    TORCH_CHECK(block_ids != nullptr, "block_ids is null");
+    TORCH_CHECK(slot_mapping != nullptr, "slot_mapping is null");
+    if (batch_size <= 0 || max_blocks_per_batch <= 0) {
+        return;
+    }
+    prepareSparseMlaTargetVerifyParamsKernel<<<1, 1, 0, stream>>>(input_lengths,
+                                                                  prefix_lengths,
+                                                                  block_ids,
+                                                                  batch_indice,
+                                                                  page_indice,
+                                                                  decode_page_indptr,
+                                                                  paged_kv_last_page_len,
+                                                                  qo_indptr,
+                                                                  prefill_ragged_kv_len_indptr,
+                                                                  kvlen,
+                                                                  positions,
+                                                                  slot_mapping,
+                                                                  expanded_seq_lens,
+                                                                  topk_indices_offset,
+                                                                  ks,
+                                                                  ke,
+                                                                  batch_size,
+                                                                  max_blocks_per_batch,
+                                                                  seq_size_per_block,
+                                                                  captured_batch_capacity,
+                                                                  captured_total_tokens);
+    const auto result = cudaGetLastError();
+    TORCH_CHECK(result == cudaSuccess,
+                "SparseMLA target verify CUDA graph prepare kernel failed: ",
+                cudaGetErrorString(result));
+}
+
+}  // namespace rtp_llm
diff --git a/rtp_llm/models_py/bindings/cuda/kernels/cuda_graph_prepare.h b/rtp_llm/models_py/bindings/cuda/kernels/cuda_graph_prepare.h
new file mode 100644
index 0000000000..50de2495d9
--- /dev/null
+++ b/rtp_llm/models_py/bindings/cuda/kernels/cuda_graph_prepare.h
@@ -0,0 +1,80 @@
+#pragma once
+
+#include <cstdint>
+#include <cuda_runtime_api.h>
+
+namespace rtp_llm {
+
+constexpr int kMaxCudaGraphPrepareFillRegions = 32;
+
+struct CudaGraphPrepareFillRegion {
+    int32_t* ptr   = nullptr;
+    int64_t  count = 0;
+    int32_t  value = 0;
+};
+
+struct CudaGraphPrepareFillParams {
+    int32_t                    region_count = 0;
+    CudaGraphPrepareFillRegion regions[kMaxCudaGraphPrepareFillRegions];
+};
+
+void invokeCudaGraphPrepareFill(CudaGraphPrepareFillParams params, cudaStream_t stream);
+
+void invokePrepareFlashInferDecodeParams(const int32_t* sequence_lengths_plus_1,
+                                         const int32_t* block_ids,
+                                         int32_t*       batch_indice,
+                                         int32_t*       page_indice,
+                                         int32_t*       decode_page_indptr,
+                                         int32_t*       paged_kv_last_page_len,
+                                         int32_t*       qo_indptr,
+                                         int32_t*       kvlen,
+                                         int32_t*       positions,
+                                         int64_t*       slot_mapping,
+                                         int32_t        batch_size,
+                                         int32_t        max_blocks_per_batch,
+                                         int32_t        seq_size_per_block,
+                                         int32_t        captured_batch_capacity,
+                                         cudaStream_t   stream);
+
+void invokePrepareFlashInferPrefillParams(const int32_t* input_lengths,
+                                          const int32_t* prefix_lengths,
+                                          const int32_t* block_ids,
+                                          int32_t*       batch_indice,
+                                          int32_t*       page_indice,
+                                          int32_t*       decode_page_indptr,
+                                          int32_t*       paged_kv_last_page_len,
+                                          int32_t*       qo_indptr,
+                                          int32_t*       prefill_ragged_kv_len_indptr,
+                                          int32_t*       kvlen,
+                                          int32_t*       positions,
+                                          int64_t*       slot_mapping,
+                                          int32_t        batch_size,
+                                          int32_t        max_blocks_per_batch,
+                                          int32_t        seq_size_per_block,
+                                          int32_t        captured_total_tokens,
+                                          cudaStream_t   stream);
+
+void invokePrepareSparseMlaTargetVerifyParams(const int32_t* input_lengths,
+                                              const int32_t* prefix_lengths,
+                                              const int32_t* block_ids,
+                                              int32_t*       batch_indice,
+                                              int32_t*       page_indice,
+                                              int32_t*       decode_page_indptr,
+                                              int32_t*       paged_kv_last_page_len,
+                                              int32_t*       qo_indptr,
+                                              int32_t*       prefill_ragged_kv_len_indptr,
+                                              int32_t*       kvlen,
+                                              int32_t*       positions,
+                                              int64_t*       slot_mapping,
+                                              int32_t*       expanded_seq_lens,
+                                              int32_t*       topk_indices_offset,
+                                              int32_t*       ks,
+                                              int32_t*       ke,
+                                              int32_t        batch_size,
+                                              int32_t        max_blocks_per_batch,
+                                              int32_t        seq_size_per_block,
+                                              int32_t        captured_batch_capacity,
+                                              int32_t        captured_total_tokens,
+                                              cudaStream_t   stream);
+
+}  // namespace rtp_llm
diff --git a/rtp_llm/models_py/bindings/cuda/kernels/mtp_target_verify_prepare.cu b/rtp_llm/models_py/bindings/cuda/kernels/mtp_target_verify_prepare.cu
new file mode 100644
index 0000000000..4e36819bc3
--- /dev/null
+++ b/rtp_llm/models_py/bindings/cuda/kernels/mtp_target_verify_prepare.cu
@@ -0,0 +1,333 @@
+#include "rtp_llm/models_py/bindings/cuda/kernels/mtp_target_verify_prepare.h"
+
+#include "rtp_llm/cpp/utils/AssertUtils.h"
+
+#include <algorithm>
+
+namespace rtp_llm {
+
+namespace {
+
+__global__ void mtpTargetVerifyPrepareKernel(const int32_t* __restrict__ sequence_lengths,
+                                             int32_t* __restrict__ input_lengths,
+                                             int32_t* __restrict__ prefix_lengths,
+                                             int32_t* __restrict__ sequence_lengths_plus_1,
+                                             int32_t* __restrict__ lm_output_indexes,
+                                             int32_t tokens_per_batch,
+                                             int32_t batch_size) {
+    const int32_t idx = static_cast<int32_t>(blockIdx.x * blockDim.x + threadIdx.x);
+    if (idx >= batch_size) {
+        return;
+    }
+    input_lengths[idx]           = tokens_per_batch;
+    prefix_lengths[idx]          = sequence_lengths[idx];
+    sequence_lengths_plus_1[idx] = sequence_lengths[idx] + 1;
+    lm_output_indexes[idx]       = idx * tokens_per_batch;
+}
+
+__global__ void mtpSpecDecodeMetadataPrepareKernel(int32_t* __restrict__ input_lengths,
+                                                   int32_t* __restrict__ lm_output_indexes,
+                                                   int32_t tokens_per_batch,
+                                                   int32_t batch_size) {
+    const int32_t total_tokens = batch_size * tokens_per_batch;
+    const int32_t idx          = static_cast<int32_t>(blockIdx.x * blockDim.x + threadIdx.x);
+    if (idx < batch_size) {
+        input_lengths[idx] = tokens_per_batch;
+    }
+    if (idx < total_tokens) {
+        lm_output_indexes[idx] = idx;
+    }
+}
+
+__global__ void mtpSpecDecodeTokensMetadataPrepareKernel(const int32_t* __restrict__ token0,
+                                                         const int32_t* __restrict__ token1,
+                                                         const int32_t* __restrict__ token2,
+                                                         const int32_t* __restrict__ token3,
+                                                         const int32_t* __restrict__ token4,
+                                                         const int32_t* __restrict__ token5,
+                                                         const int32_t* __restrict__ token6,
+                                                         const int32_t* __restrict__ token7,
+                                                         int32_t* __restrict__ spec_tokens,
+                                                         int32_t* __restrict__ input_lengths,
+                                                         int32_t* __restrict__ lm_output_indexes,
+                                                         int32_t tokens_per_batch,
+                                                         int32_t batch_size) {
+    const int32_t total_tokens = batch_size * tokens_per_batch;
+    const int32_t idx          = static_cast<int32_t>(blockIdx.x * blockDim.x + threadIdx.x);
+    if (idx >= total_tokens) {
+        return;
+    }
+
+    const int32_t  batch_idx = idx / tokens_per_batch;
+    const int32_t  token_idx = idx - batch_idx * tokens_per_batch;
+    const int32_t* src       = nullptr;
+    switch (token_idx) {
+        case 0:
+            src = token0;
+            break;
+        case 1:
+            src = token1;
+            break;
+        case 2:
+            src = token2;
+            break;
+        case 3:
+            src = token3;
+            break;
+        case 4:
+            src = token4;
+            break;
+        case 5:
+            src = token5;
+            break;
+        case 6:
+            src = token6;
+            break;
+        case 7:
+            src = token7;
+            break;
+    }
+
+    spec_tokens[idx]       = src[batch_idx];
+    lm_output_indexes[idx] = idx;
+    if (token_idx == 0) {
+        input_lengths[batch_idx] = tokens_per_batch;
+    }
+}
+
+__global__ void mtpPrefillShiftAppendKernel(const int32_t* __restrict__ combo_tokens_in,
+                                            const int32_t* __restrict__ input_lengths,
+                                            const int32_t* __restrict__ batch_offsets,
+                                            const int32_t* __restrict__ new_all_token_ids,
+                                            int32_t* __restrict__ combo_tokens_out,
+                                            int32_t token_stride,
+                                            int32_t batch_size,
+                                            int32_t total_tokens) {
+    const int32_t global_idx = static_cast<int32_t>(blockIdx.x * blockDim.x + threadIdx.x);
+    if (global_idx >= total_tokens) {
+        return;
+    }
+    // Binary search for the batch this token belongs to. batch_offsets[b] holds
+    // the exclusive end offset for batch b (i.e. cumulative input_lengths up to b+1).
+    int32_t lo = 0;
+    int32_t hi = batch_size - 1;
+    while (lo < hi) {
+        const int32_t mid = lo + ((hi - lo) >> 1);
+        if (batch_offsets[mid] <= global_idx) {
+            lo = mid + 1;
+        } else {
+            hi = mid;
+        }
+    }
+    const int32_t batch_idx         = lo;
+    const int32_t batch_start       = (batch_idx == 0) ? 0 : batch_offsets[batch_idx - 1];
+    const int32_t position_in_batch = global_idx - batch_start;
+    const int32_t input_length      = input_lengths[batch_idx];
+
+    if (position_in_batch == input_length - 1) {
+        // Last position: write the new accepted token (last column of new_all_token_ids).
+        combo_tokens_out[global_idx] = new_all_token_ids[batch_idx * token_stride + token_stride - 1];
+    } else if (position_in_batch < input_length - 1) {
+        // Shift left by 1: out[i] = in[i+1] within the batch.
+        combo_tokens_out[global_idx] = combo_tokens_in[global_idx + 1];
+    }
+}
+
+void checkCudaI32Vector(const torch::Tensor& tensor, const char* name, int64_t batch_size) {
+    RTP_LLM_CHECK_WITH_INFO(tensor.defined(), "%s must be defined", name);
+    RTP_LLM_CHECK_WITH_INFO(tensor.is_cuda(), "%s must be CUDA", name);
+    RTP_LLM_CHECK_WITH_INFO(tensor.scalar_type() == torch::kInt32, "%s must be int32", name);
+    RTP_LLM_CHECK_WITH_INFO(tensor.is_contiguous(), "%s must be contiguous", name);
+    RTP_LLM_CHECK_WITH_INFO(
+        tensor.numel() >= batch_size, "%s numel %ld is smaller than batch_size %ld", name, tensor.numel(), batch_size);
+}
+
+}  // namespace
+
+void invokeMtpTargetVerifyPrepare(const torch::Tensor& sequence_lengths,
+                                  torch::Tensor&       input_lengths,
+                                  torch::Tensor&       prefix_lengths,
+                                  torch::Tensor&       sequence_lengths_plus_1,
+                                  torch::Tensor&       lm_output_indexes,
+                                  int32_t              tokens_per_batch,
+                                  cudaStream_t         stream) {
+    const int64_t batch_size = input_lengths.numel();
+    if (batch_size <= 0) {
+        return;
+    }
+    checkCudaI32Vector(sequence_lengths, "sequence_lengths", batch_size);
+    checkCudaI32Vector(input_lengths, "input_lengths", batch_size);
+    checkCudaI32Vector(prefix_lengths, "prefix_lengths", batch_size);
+    checkCudaI32Vector(sequence_lengths_plus_1, "sequence_lengths_plus_1", batch_size);
+    checkCudaI32Vector(lm_output_indexes, "lm_output_indexes", batch_size);
+
+    constexpr int block_size = 256;
+    const int     grid_size  = static_cast<int>((batch_size + block_size - 1) / block_size);
+    mtpTargetVerifyPrepareKernel<<<grid_size, block_size, 0, stream>>>(sequence_lengths.data_ptr<int32_t>(),
+                                                                       input_lengths.data_ptr<int32_t>(),
+                                                                       prefix_lengths.data_ptr<int32_t>(),
+                                                                       sequence_lengths_plus_1.data_ptr<int32_t>(),
+                                                                       lm_output_indexes.data_ptr<int32_t>(),
+                                                                       tokens_per_batch,
+                                                                       static_cast<int32_t>(batch_size));
+}
+
+void invokeMtpSpecDecodeMetadataPrepare(torch::Tensor& input_lengths,
+                                        torch::Tensor& lm_output_indexes,
+                                        int32_t        tokens_per_batch,
+                                        cudaStream_t   stream) {
+    const int64_t batch_size = input_lengths.numel();
+    if (batch_size <= 0) {
+        return;
+    }
+    checkCudaI32Vector(input_lengths, "input_lengths", batch_size);
+    const int64_t total_tokens = batch_size * tokens_per_batch;
+    checkCudaI32Vector(lm_output_indexes, "lm_output_indexes", total_tokens);
+
+    constexpr int block_size = 256;
+    const int64_t work_items = std::max<int64_t>(batch_size, total_tokens);
+    const int     grid_size  = static_cast<int>((work_items + block_size - 1) / block_size);
+    mtpSpecDecodeMetadataPrepareKernel<<<grid_size, block_size, 0, stream>>>(input_lengths.data_ptr<int32_t>(),
+                                                                             lm_output_indexes.data_ptr<int32_t>(),
+                                                                             tokens_per_batch,
+                                                                             static_cast<int32_t>(batch_size));
+}
+
+void invokeMtpSpecDecodeTokensMetadataPrepare(const std::vector<torch::Tensor>& token_columns,
+                                              torch::Tensor&                    spec_tokens,
+                                              torch::Tensor&                    input_lengths,
+                                              torch::Tensor&                    lm_output_indexes,
+                                              int32_t                           tokens_per_batch,
+                                              cudaStream_t                      stream) {
+    RTP_LLM_CHECK_WITH_INFO(tokens_per_batch > 0, "tokens_per_batch must be positive");
+    RTP_LLM_CHECK_WITH_INFO(tokens_per_batch <= 8, "tokens_per_batch %d exceeds fused kernel max 8", tokens_per_batch);
+    RTP_LLM_CHECK_WITH_INFO(static_cast<int32_t>(token_columns.size()) == tokens_per_batch,
+                            "token_columns size %ld must equal tokens_per_batch %d",
+                            token_columns.size(),
+                            tokens_per_batch);
+
+    const int64_t batch_size = input_lengths.numel();
+    if (batch_size <= 0) {
+        return;
+    }
+    const int64_t total_tokens = batch_size * tokens_per_batch;
+    checkCudaI32Vector(spec_tokens, "spec_tokens", total_tokens);
+    checkCudaI32Vector(input_lengths, "input_lengths", batch_size);
+    checkCudaI32Vector(lm_output_indexes, "lm_output_indexes", total_tokens);
+    for (size_t i = 0; i < token_columns.size(); ++i) {
+        checkCudaI32Vector(token_columns[i], "token_columns", batch_size);
+    }
+
+    const int32_t* ptrs[8] = {};
+    for (size_t i = 0; i < token_columns.size(); ++i) {
+        ptrs[i] = token_columns[i].data_ptr<int32_t>();
+    }
+
+    constexpr int block_size = 256;
+    const int     grid_size  = static_cast<int>((total_tokens + block_size - 1) / block_size);
+    mtpSpecDecodeTokensMetadataPrepareKernel<<<grid_size, block_size, 0, stream>>>(
+        ptrs[0],
+        ptrs[1],
+        ptrs[2],
+        ptrs[3],
+        ptrs[4],
+        ptrs[5],
+        ptrs[6],
+        ptrs[7],
+        spec_tokens.data_ptr<int32_t>(),
+        input_lengths.data_ptr<int32_t>(),
+        lm_output_indexes.data_ptr<int32_t>(),
+        tokens_per_batch,
+        static_cast<int32_t>(batch_size));
+}
+
+// Fused kernel: next_seq_len[i] = prev_seq_len[i] + accept_len[i]
+//               hidden_idx[i]  = (int64_t)(accept_len[i] - 1)
+__global__ void mtpDispatchStatePrepareKernel(const int32_t* __restrict__ accept_len,
+                                              const int32_t* __restrict__ prev_seq_len,
+                                              int32_t* __restrict__ next_seq_len,
+                                              int64_t* __restrict__ hidden_idx,
+                                              int32_t batch_size) {
+    const int32_t idx = static_cast<int32_t>(blockIdx.x * blockDim.x + threadIdx.x);
+    if (idx >= batch_size) {
+        return;
+    }
+    const int32_t al  = accept_len[idx];
+    next_seq_len[idx] = prev_seq_len[idx] + al;
+    hidden_idx[idx]   = static_cast<int64_t>(al - 1);
+}
+
+void invokeMtpDispatchStatePrepare(const torch::Tensor& accept_len,
+                                   const torch::Tensor& prev_seq_len,
+                                   torch::Tensor&       next_seq_len,
+                                   torch::Tensor&       hidden_idx,
+                                   int64_t              batch_size,
+                                   cudaStream_t         stream) {
+    if (batch_size <= 0) {
+        return;
+    }
+    checkCudaI32Vector(accept_len, "accept_len", batch_size);
+    checkCudaI32Vector(prev_seq_len, "prev_seq_len", batch_size);
+    checkCudaI32Vector(next_seq_len, "next_seq_len", batch_size);
+    RTP_LLM_CHECK_WITH_INFO(hidden_idx.defined() && hidden_idx.is_cuda(), "hidden_idx must be CUDA");
+    RTP_LLM_CHECK_WITH_INFO(hidden_idx.scalar_type() == torch::kInt64, "hidden_idx must be int64");
+    RTP_LLM_CHECK_WITH_INFO(hidden_idx.is_contiguous(), "hidden_idx must be contiguous");
+    RTP_LLM_CHECK_WITH_INFO(
+        hidden_idx.numel() >= batch_size, "hidden_idx numel %ld < batch_size %ld", hidden_idx.numel(), batch_size);
+
+    constexpr int block_size = 256;
+    const int     grid_size  = static_cast<int>((batch_size + block_size - 1) / block_size);
+    mtpDispatchStatePrepareKernel<<<grid_size, block_size, 0, stream>>>(accept_len.data_ptr<int32_t>(),
+                                                                        prev_seq_len.data_ptr<int32_t>(),
+                                                                        next_seq_len.data_ptr<int32_t>(),
+                                                                        hidden_idx.data_ptr<int64_t>(),
+                                                                        static_cast<int32_t>(batch_size));
+}
+
+// REBASE CONFLICT CONTEXT(518707c73): source branch added this fused
+// shift/append launcher to eliminate sync-heavy CPU token manipulation. Keep it
+// with the new base dispatch-state prepare launcher above.
+void invokeMtpPrefillShiftAppend(const torch::Tensor& combo_tokens_in,
+                                 const torch::Tensor& input_lengths,
+                                 const torch::Tensor& batch_offsets,
+                                 const torch::Tensor& new_all_token_ids,
+                                 torch::Tensor&       combo_tokens_out,
+                                 int32_t              token_stride,
+                                 cudaStream_t         stream) {
+    const int64_t batch_size = input_lengths.numel();
+    if (batch_size <= 0) {
+        return;
+    }
+    const int64_t total_tokens = combo_tokens_in.numel();
+    if (total_tokens <= 0) {
+        return;
+    }
+    checkCudaI32Vector(combo_tokens_in, "combo_tokens_in", total_tokens);
+    checkCudaI32Vector(combo_tokens_out, "combo_tokens_out", total_tokens);
+    checkCudaI32Vector(input_lengths, "input_lengths", batch_size);
+    checkCudaI32Vector(batch_offsets, "batch_offsets", batch_size);
+    RTP_LLM_CHECK_WITH_INFO(new_all_token_ids.defined() && new_all_token_ids.is_cuda(),
+                            "new_all_token_ids must be CUDA");
+    RTP_LLM_CHECK_WITH_INFO(new_all_token_ids.scalar_type() == torch::kInt32,
+                            "new_all_token_ids must be int32 (got %s)",
+                            c10::toString(new_all_token_ids.scalar_type()));
+    RTP_LLM_CHECK_WITH_INFO(new_all_token_ids.is_contiguous(), "new_all_token_ids must be contiguous");
+    RTP_LLM_CHECK_WITH_INFO(new_all_token_ids.numel() >= batch_size * token_stride,
+                            "new_all_token_ids numel %ld < batch_size %ld * token_stride %d",
+                            new_all_token_ids.numel(),
+                            batch_size,
+                            token_stride);
+
+    constexpr int block_size = 256;
+    const int     grid_size  = static_cast<int>((total_tokens + block_size - 1) / block_size);
+    mtpPrefillShiftAppendKernel<<<grid_size, block_size, 0, stream>>>(combo_tokens_in.data_ptr<int32_t>(),
+                                                                      input_lengths.data_ptr<int32_t>(),
+                                                                      batch_offsets.data_ptr<int32_t>(),
+                                                                      new_all_token_ids.data_ptr<int32_t>(),
+                                                                      combo_tokens_out.data_ptr<int32_t>(),
+                                                                      token_stride,
+                                                                      static_cast<int32_t>(batch_size),
+                                                                      static_cast<int32_t>(total_tokens));
+}
+
+}  // namespace rtp_llm
diff --git a/rtp_llm/models_py/bindings/cuda/kernels/mtp_target_verify_prepare.h b/rtp_llm/models_py/bindings/cuda/kernels/mtp_target_verify_prepare.h
new file mode 100644
index 0000000000..0b66be8efd
--- /dev/null
+++ b/rtp_llm/models_py/bindings/cuda/kernels/mtp_target_verify_prepare.h
@@ -0,0 +1,58 @@
+#pragma once
+
+#include <torch/extension.h>
+#include <cuda_runtime.h>
+#include <vector>
+
+namespace rtp_llm {
+
+void invokeMtpTargetVerifyPrepare(const torch::Tensor& sequence_lengths,
+                                  torch::Tensor&       input_lengths,
+                                  torch::Tensor&       prefix_lengths,
+                                  torch::Tensor&       sequence_lengths_plus_1,
+                                  torch::Tensor&       lm_output_indexes,
+                                  int32_t              tokens_per_batch,
+                                  cudaStream_t         stream);
+
+void invokeMtpSpecDecodeMetadataPrepare(torch::Tensor& input_lengths,
+                                        torch::Tensor& lm_output_indexes,
+                                        int32_t        tokens_per_batch,
+                                        cudaStream_t   stream);
+
+void invokeMtpSpecDecodeTokensMetadataPrepare(const std::vector<torch::Tensor>& token_columns,
+                                              torch::Tensor&                    spec_tokens,
+                                              torch::Tensor&                    input_lengths,
+                                              torch::Tensor&                    lm_output_indexes,
+                                              int32_t                           tokens_per_batch,
+                                              cudaStream_t                      stream);
+
+// Fused kernel for dispatchDecodeAsync per-stream state publishing.
+// Computes: next_seq_len[i] = prev_seq_len[i] + accept_len[i]  (int32)
+//           hidden_idx[i]   = accept_len[i] - 1                 (int64)
+// All inputs/outputs must be contiguous CUDA tensors with numel >= batch_size.
+void invokeMtpDispatchStatePrepare(const torch::Tensor& accept_len,
+                                   const torch::Tensor& prev_seq_len,
+                                   torch::Tensor&       next_seq_len,
+                                   torch::Tensor&       hidden_idx,
+                                   int64_t              batch_size,
+                                   cudaStream_t         stream);
+
+// REBASE CONFLICT CONTEXT(518707c73): keep new base dispatch-state publishing
+// kernel and add source branch prefill shift/append kernel to avoid sync-heavy
+// CPU token manipulation.
+// For each batch b with input_lengths_d[b] tokens packed at offset cumsum(input_lengths_d)[b-1]
+// in combo_tokens_in:
+//   * shift combo_tokens_in[offset .. offset+input_length-1] left by 1 (drop first token)
+//   * write new_all_token_ids[b, token_stride-1] at combo_tokens_out[offset+input_length-1]
+// All inputs/outputs are int32 CUDA tensors. combo_tokens_out may alias combo_tokens_in;
+// the kernel writes each position from a single thread per (batch, position) pair so
+// in-place shift is safe.
+void invokeMtpPrefillShiftAppend(const torch::Tensor& combo_tokens_in,
+                                 const torch::Tensor& input_lengths,
+                                 const torch::Tensor& batch_offsets,
+                                 const torch::Tensor& new_all_token_ids,
+                                 torch::Tensor&       combo_tokens_out,
+                                 int32_t              token_stride,
+                                 cudaStream_t         stream);
+
+}  // namespace rtp_llm