diff --git a/BUILD b/BUILD index ab77eddc83..3ac2c23fba 100755 --- a/BUILD +++ b/BUILD @@ -45,6 +45,11 @@ config_setting( values = {"define": "using_cuda12_x86=true"}, ) +config_setting( + name = "using_cuda13_x86", + values = {"define": "using_cuda13_x86=true"}, +) + config_setting( name = "using_rocm", values = {"define": "using_rocm=true"}, diff --git a/rtp_llm/cpp/cache/BUILD b/rtp_llm/cpp/cache/BUILD index 3294d0b918..4b8a946ed9 100644 --- a/rtp_llm/cpp/cache/BUILD +++ b/rtp_llm/cpp/cache/BUILD @@ -1,13 +1,34 @@ load("//:def.bzl", "copts") load("@arch_config//:arch_select.bzl", "torch_deps") +cc_library( + name = "cp_slot_mapper", + srcs = ["CPSlotMapper.cc"], + hdrs = ["CPSlotMapper.h"], + copts = copts(), + visibility = ["//visibility:public"], +) + cc_library( name = "cache_group_type", hdrs = [ - "CacheGroupType.h", + "spec/CacheGroupType.h", + ], + copts = copts(), + visibility = ["//visibility:public"], +) + +cc_library( + name = "kv_cache_spec_desc_types", + hdrs = [ + "spec/KVCacheSpecDescTypes.h", ], copts = copts(), visibility = ["//visibility:public"], + deps = [ + ":cache_group_type", + "//rtp_llm/models_py/bindings/core:types", + ], ) cc_library( @@ -19,17 +40,49 @@ cc_library( visibility = ["//visibility:public"], ) +cc_library( + name = "kv_cache_specs", + hdrs = [ + "spec/KVCacheSpec.h", + "spec/KVCacheSpecBase.h", + "spec/KVCacheSpecDesc.h", + "spec/KVCacheSpecDescTypes.h", + "spec/MHAKVCacheSpec.h", + "spec/LinearKVCacheSpec.h", + "spec/MLAKVCacheSpec.h", + "spec/OpaqueKVCacheSpec.h", + "Types.h", + ], + copts = copts(), + visibility = ["//visibility:public"], + deps = [ + ":block_info", + ":batch_kv_cache_resource", + ":cache_group_type", + ":cp_slot_mapper", + "//rtp_llm/cpp/config:config_modules", + "//rtp_llm/cpp/model_utils:model_utils", + "//rtp_llm/models_py/bindings/core:types", + ], +) + cc_library( name = "cache_types", + srcs = [ + "Types.cc", + ], hdrs = [ "BufferTypes.h", "CacheConfig.h", - "CacheGroupType.h", - "KVCacheSpec.h", 
- "KVCacheSpecBase.h", - "MHAKVCacheSpec.h", - "LinearKVCacheSpec.h", - "MLAKVCacheSpec.h", + "spec/CacheGroupType.h", + "spec/KVCacheSpec.h", + "spec/KVCacheSpecBase.h", + "spec/KVCacheSpecDesc.h", + "spec/KVCacheSpecDescTypes.h", + "spec/MHAKVCacheSpec.h", + "spec/LinearKVCacheSpec.h", + "spec/MLAKVCacheSpec.h", + "spec/OpaqueKVCacheSpec.h", "Types.h", "WarmUpResult.h", ], @@ -39,8 +92,11 @@ cc_library( ":block_info", ":batch_kv_cache_resource", ":cache_group_type", + ":cp_slot_mapper", "//:rtp_compute_ops", "//rtp_llm/cpp/config:config_modules", + "//rtp_llm/cpp/engine_base/stream:complete_token_ids", + "//rtp_llm/cpp/model_utils:model_utils", "//rtp_llm/models_py/bindings/core:types", ] + torch_deps(), ) @@ -51,6 +107,7 @@ cc_library( "BlockCache.cc", "BlockPool.cc", "MemoryLayoutStrategy.cc", + "SharedBlockCache.cc", ], hdrs = [ "BlockCache.h", @@ -60,55 +117,100 @@ cc_library( "BlockRefCounter.h", "MemoryLayoutStrategy.h", "MemoryLayoutConfig.h", + "SharedBlockCache.h", ], copts = copts(), visibility = ["//visibility:public"], deps = [ ":cache_types", - "//rtp_llm/models_py/bindings/core:exec_ops_hdr", - "//rtp_llm/cpp/disaggregate/cache_store", - "//rtp_llm/cpp/engine_base/stream:complete_token_ids", + "//rtp_llm/cpp/disaggregate/cache_store:cache_store_interface", "//rtp_llm/cpp/utils:kv_cache_utils", "//rtp_llm/cpp/utils:lru_cache", "//rtp_llm/cpp/utils:profiling_scope", + "//rtp_llm/models_py/bindings/core:type_convert", + ] + select({ + "@//:using_cuda": [ + "//rtp_llm/models_py/bindings/cuda:cuda_host_utils", + "@local_config_cuda//cuda:cuda_headers", + "@local_config_cuda//cuda:cudart", + ], + "//conditions:default": [], + }), +) + +cc_library( + name = "kv_cache_transfer_planner", + srcs = [ + "KVCacheTransferPlanner.cc", + ], + hdrs = [ + "KVCacheTransferPlanner.h", + ], + copts = copts(), + visibility = ["//visibility:public"], + deps = [ + ":cache_group_type", ], ) cc_library( name = "kv_cache_group", srcs = [ - "FullKVCacheGroup.cc", - 
"KVCacheGroup.cc", - "LinearKVCacheGroup.cc", + "group/FullKVCacheGroup.cc", + "group/KVCacheGroup.cc", + "group/LinearKVCacheGroup.cc", + "group/SWAKVCacheGroup.cc", ], hdrs = [ - "FullKVCacheGroup.h", - "KVCacheGroup.h", - "LinearKVCacheGroup.h", + "group/FullKVCacheGroup.h", + "group/KVCacheGroup.h", + "group/LinearKVCacheGroup.h", + "group/SWAKVCacheGroup.h", ], copts = copts(), visibility = ["//visibility:public"], deps = [ ":block_pool", + ":cache_types", ], ) +cc_library( + name = "kv_cache_allocator_hdr", + hdrs = [ + "allocator/KVCacheAllocator.h", + ], + copts = copts(), + visibility = ["//visibility:public"], + deps = [ + ":block_pool", + ":cache_types", + "//rtp_llm/cpp/metrics:metrics", + ] + torch_deps(), +) + cc_library( name = "kv_cache_allocator", srcs = [ - "HybridTypeKVCacheAllocator.cc", - "KVCacheAllocator.cc", - "SingleTypeKVCacheAllocator.cc", + "allocator/HybridKVCacheAllocator.cc", + "allocator/HybridPoolKVCacheAllocator.cc", + "allocator/HybridTypeKVCacheAllocator.cc", + "allocator/KVCacheAllocator.cc", + "allocator/SingleTypeKVCacheAllocator.cc", ], hdrs = [ - "HybridTypeKVCacheAllocator.h", - "KVCacheAllocator.h", - "SingleTypeKVCacheAllocator.h", + "allocator/HybridKVCacheAllocator.h", + "allocator/HybridPoolKVCacheAllocator.h", + "allocator/HybridTypeKVCacheAllocator.h", + "allocator/KVCacheAllocator.h", + "allocator/SingleTypeKVCacheAllocator.h", ], copts = copts(), visibility = ["//visibility:public"], deps = [ + ":kv_cache_allocator_hdr", ":kv_cache_group", + "//rtp_llm/cpp/engine_base/stream:complete_token_ids", "//rtp_llm/models_py/bindings/core:exec_ops_hdr", ], ) @@ -124,25 +226,27 @@ cc_library( visibility = ["//visibility:public"], deps = [ "//rtp_llm/cpp/utils:core_utils", - "//rtp_llm/cpp/cache:cache_group_type", + ":cache_group_type", ], ) cc_library( name = "cache_core", srcs = [ - "CacheConfigCreator.cc", - "HybridConfigCreator.cc", + "config_creator/CacheConfigCreator.cc", + "config_creator/HybridConfigCreator.cc", + 
"config_creator/HybridPoolConfigCreator.cc", "KVCacheHashUtil.cc", - "MemoryEvaluationHelper.cc", - "SingleConfigCreator.cc", + "config_creator/MemoryEvaluationHelper.cc", + "config_creator/SingleConfigCreator.cc", ], hdrs = [ - "CacheConfigCreator.h", - "HybridConfigCreator.h", + "config_creator/CacheConfigCreator.h", + "config_creator/HybridConfigCreator.h", + "config_creator/HybridPoolConfigCreator.h", "KVCacheHashUtil.h", - "MemoryEvaluationHelper.h", - "SingleConfigCreator.h", + "config_creator/MemoryEvaluationHelper.h", + "config_creator/SingleConfigCreator.h", ], copts = copts(), visibility = ["//visibility:public"], @@ -151,6 +255,7 @@ cc_library( ":cache_types", ":kv_cache_allocator", "//rtp_llm/cpp/config:model_config", + "//rtp_llm/cpp/engine_base/stream:complete_token_ids", "//rtp_llm/models_py/bindings/core:exec_ops_hdr", "//rtp_llm/models_py/bindings/core:type_convert", "//rtp_llm/cpp/disaggregate/cache_store", diff --git a/rtp_llm/cpp/cache/BatchKVCacheResource.h b/rtp_llm/cpp/cache/BatchKVCacheResource.h index 2f51f8f377..0435185d7e 100644 --- a/rtp_llm/cpp/cache/BatchKVCacheResource.h +++ b/rtp_llm/cpp/cache/BatchKVCacheResource.h @@ -21,13 +21,13 @@ class BatchKVCacheResource { batch_resource.resize(batch_size); } - void initGroups(int group_nums, - int layer_num, - const std::vector& layer_to_group_id = {}, - size_t kernel_blocks_per_kv_block = 1, - const std::vector& group_types = {}) { + void initGroups(int group_nums, + int layer_num, + const std::vector>& layer_group_ids = {}, + size_t kernel_blocks_per_kv_block = 1, + const std::vector& group_types = {}) { for (auto& batch : batch_resource) { - batch.initGroups(group_nums, layer_num, layer_to_group_id, kernel_blocks_per_kv_block, group_types); + batch.initGroups(group_nums, layer_num, layer_group_ids, kernel_blocks_per_kv_block, group_types); } } @@ -67,16 +67,36 @@ class BatchKVCacheResource { return batch_resource[batch_id].blocks(group_id); } + const BlockIndicesType& blocks(int batch_id, 
int layer_id, int group_id) const { + RTP_LLM_CHECK(batch_id >= 0 && static_cast(batch_id) < batch_resource.size()); + return batch_resource[batch_id].blocks(layer_id, group_id); + } + const BlockIndicesType& kernelBlocks(int batch_id, int group_id = 0) const { RTP_LLM_CHECK(batch_id >= 0 && static_cast(batch_id) < batch_resource.size()); return batch_resource[batch_id].kernelBlocks(group_id); } + const BlockIndicesType& kernelBlocks(int batch_id, int layer_id, int group_id) const { + RTP_LLM_CHECK(batch_id >= 0 && static_cast(batch_id) < batch_resource.size()); + return batch_resource[batch_id].kernelBlocks(layer_id, group_id); + } + + int groupId(int batch_id, int layer_id, int group_id) const { + RTP_LLM_CHECK(batch_id >= 0 && static_cast(batch_id) < batch_resource.size()); + return batch_resource[batch_id].groupId(layer_id, group_id); + } + BlockIds& mutableBlockIds(int batch_id, int group_id = 0) { RTP_LLM_CHECK(batch_id >= 0 && static_cast(batch_id) < batch_resource.size()); return batch_resource[batch_id].mutableBlockIds(group_id); } + int groupId(int layer_id, int group_id) const { + RTP_LLM_CHECK(!batch_resource.empty()); + return batch_resource[0].groupId(layer_id, group_id); + } + const GroupBlockIds& groupBlocks(int batch_id = 0) const { RTP_LLM_CHECK(batch_id >= 0 && static_cast(batch_id) < batch_resource.size()); return batch_resource[batch_id].groupBlocks(); @@ -106,6 +126,7 @@ class BatchKVCacheResource { auto& keys = batch_resource[batch_id].cacheKeys(); if (!keys.empty()) { keys.pop_back(); + batch_resource[batch_id].rebuildLinearBlockDependencies(); } } @@ -114,6 +135,7 @@ class BatchKVCacheResource { auto& keys = resource.cacheKeys(); if (!keys.empty()) { keys.pop_back(); + resource.rebuildLinearBlockDependencies(); } } } @@ -121,22 +143,36 @@ class BatchKVCacheResource { void clearCacheKeys(int batch_id = 0) { RTP_LLM_CHECK(batch_id >= 0 && static_cast(batch_id) < batch_resource.size()); batch_resource[batch_id].cacheKeys().clear(); + 
batch_resource[batch_id].blockDependencies().clear(); } void pushBackCacheKey(int batch_id, CacheKeyType key) { RTP_LLM_CHECK(batch_id >= 0 && static_cast(batch_id) < batch_resource.size()); - batch_resource[batch_id].cacheKeys().push_back(key); + auto& resource = batch_resource[batch_id]; + auto& keys = resource.cacheKeys(); + auto& deps = resource.blockDependencies(); + BlockDependency dependency; + dependency.ordinal = static_cast(keys.size()); + if (!keys.empty()) { + dependency.has_parent = true; + dependency.parent_key = keys.back(); + } + keys.push_back(key); + deps.push_back(dependency); } - void initBatchGroups(int batch_id, - int group_nums, - int layer_num, - const std::vector& layer_to_group_id = {}, - size_t kernel_blocks_per_kv_block = 1, - const std::vector& group_types = {}) { + void initBatchGroups(int batch_id, + int group_nums, + int layer_num, + const std::vector>& layer_group_ids = {}, + size_t kernel_blocks_per_kv_block = 1, + const std::vector& group_types = {}) { RTP_LLM_CHECK(batch_id >= 0 && static_cast(batch_id) < batch_resource.size()); - batch_resource[batch_id].initGroups( - group_nums, layer_num, layer_to_group_id, kernel_blocks_per_kv_block, group_types); + batch_resource[batch_id].initGroups(group_nums, + layer_num, + layer_group_ids, + kernel_blocks_per_kv_block, + group_types); } void setBatchBlocks(int batch_id, int group_id, const BlockIndicesType& blocks) { @@ -146,7 +182,7 @@ class BatchKVCacheResource { void setBatchCacheKeys(int batch_id, const CacheKeysType& keys) { RTP_LLM_CHECK(batch_id >= 0 && static_cast(batch_id) < batch_resource.size()); - batch_resource[batch_id].cacheKeys() = keys; + batch_resource[batch_id].setCacheKeys(keys); } void check() const { diff --git a/rtp_llm/cpp/cache/BlockPool.cc b/rtp_llm/cpp/cache/BlockPool.cc index c7a94322ea..267e6d1441 100644 --- a/rtp_llm/cpp/cache/BlockPool.cc +++ b/rtp_llm/cpp/cache/BlockPool.cc @@ -1,16 +1,127 @@ #include "rtp_llm/cpp/cache/BlockPool.h" -#include 
"rtp_llm/models_py/bindings/core/ExecOps.h" #include "rtp_llm/cpp/cache/MemoryLayoutStrategy.h" +#include "rtp_llm/cpp/utils/Logger.h" #include "rtp_llm/cpp/utils/TimeUtil.h" #include "rtp_llm/cpp/utils/KVCacheUtils.h" +#include "rtp_llm/cpp/disaggregate/cache_store/CacheStore.h" #include "rtp_llm/cpp/disaggregate/cache_store/MemoryUtil.h" -#include "rtp_llm/cpp/disaggregate/cache_store/NormalCacheStore.h" #include "rtp_llm/cpp/utils/ProfilingScope.h" +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#if USING_CUDA +#include +#endif + namespace rtp_llm { -BlockPool::BlockPool(const BlockPoolConfig& config, AllocationType allocation_type): - config_(config), allocation_type_(allocation_type) {} +namespace { + +bool shouldPinHostBlockPool(); + +const char* allocationTypeName(AllocationType allocation_type) { + switch (allocation_type) { + case AllocationType::HOST: + return "HOST"; + case AllocationType::DEVICE: + return "DEVICE"; + } + return "UNKNOWN"; +} + +const char* memoryTypeName(MemoryType memory_type) { + switch (memory_type) { + case MemoryType::MEMORY_CPU: + return "CPU"; + case MemoryType::MEMORY_CPU_PINNED: + return "CPU_PINNED"; + case MemoryType::MEMORY_GPU: + return "GPU"; + } + return "UNKNOWN"; +} + +const char* +requestedBackingName(AllocationType allocation_type, bool use_pinned_cpu_backing, bool use_cuda_malloc_backing) { + if (allocation_type == AllocationType::HOST) { + return shouldPinHostBlockPool() ? "CPU_PINNED_OR_CPU_FALLBACK" : "CPU"; + } + if (use_cuda_malloc_backing) { + return "GPU_CUDA_MALLOC"; + } + return use_pinned_cpu_backing ? 
"CPU_PINNED" : "GPU"; +} + +bool shouldPinHostBlockPool() { + const char* value = std::getenv("RTP_LLM_PIN_HOST_BLOCK_POOL"); + if (value == nullptr) { + return true; + } + const std::string flag(value); + return flag != "0" && flag != "false" && flag != "FALSE" && flag != "off" && flag != "OFF"; +} + +void markHostBlockPoolDontDump(const char* pool_name, void* ptr, size_t size) { +#ifdef MADV_DONTDUMP + if (ptr == nullptr || size == 0) { + return; + } + + long page_size = sysconf(_SC_PAGESIZE); + if (page_size <= 0) { + page_size = 4096; + } + + const auto begin = reinterpret_cast(ptr); + const auto page_mask = static_cast(page_size - 1); + const auto aligned_begin = begin & ~page_mask; + const auto aligned_end = (begin + size + page_mask) & ~page_mask; + const auto aligned_size = static_cast(aligned_end - aligned_begin); + + if (madvise(reinterpret_cast(aligned_begin), aligned_size, MADV_DONTDUMP) != 0) { + RTP_LLM_LOG_WARNING("madvise MADV_DONTDUMP failed for host block pool, pool_name=%s ptr=%p, size=%zu, " + "error=%s", + pool_name, + ptr, + size, + std::strerror(errno)); + } else { + RTP_LLM_LOG_INFO("madvise MADV_DONTDUMP success for host block pool, pool_name=%s ptr=%p, size=%zu, " + "aligned_ptr=%p, aligned_size=%zu", + pool_name, + ptr, + size, + reinterpret_cast(aligned_begin), + aligned_size); + } +#else + RTP_LLM_LOG_WARNING( + "MADV_DONTDUMP is not defined, host block pool may be included in coredump, pool_name=%s ptr=%p, size=%zu", + pool_name, + ptr, + size); +#endif +} + +} // namespace + +BlockPool::BlockPool(const BlockPoolConfig& config, + AllocationType allocation_type, + bool use_pinned_cpu_backing, + bool use_cuda_malloc_backing): + config_(config), + allocation_type_(allocation_type), + use_pinned_cpu_backing_(use_pinned_cpu_backing), + use_cuda_malloc_backing_(use_cuda_malloc_backing) {} BlockPool::~BlockPool() { cache_aligned_buffer_ = torch::Tensor(); @@ -38,15 +149,123 @@ void BlockPool::validateConfig() const { void 
BlockPool::initializeCacheBuffer() { if (allocation_type_ == AllocationType::HOST) { - cache_aligned_buffer_ = torch::empty({static_cast(config_.total_size_bytes)}, - torch::TensorOptions().dtype(torch::kUInt8).device(torch::kCPU)) - .pin_memory(); + auto cpu_buffer = torch::empty({static_cast(config_.total_size_bytes)}, + torch::TensorOptions().dtype(torch::kUInt8).device(torch::kCPU)); + if (shouldPinHostBlockPool()) { + try { + cache_aligned_buffer_ = cpu_buffer.pin_memory(); + } catch (const std::exception& e) { + RTP_LLM_LOG_WARNING("pin host block pool failed, fallback to pageable CPU memory, pool_name=%s " + "total_size=%zu bytes, error=%s", + config_.pool_name.c_str(), + config_.total_size_bytes, + e.what()); + cache_aligned_buffer_ = std::move(cpu_buffer); + } + } else { + RTP_LLM_LOG_INFO("host block pool uses pageable CPU memory, pool_name=%s total_size=%zu bytes", + config_.pool_name.c_str(), + config_.total_size_bytes); + cache_aligned_buffer_ = std::move(cpu_buffer); + } + RTP_LLM_LOG_INFO("mark host block pool dont dump, pool_name=%s ptr=%p, size=%zu", + config_.pool_name.c_str(), + cache_aligned_buffer_.data_ptr(), + config_.total_size_bytes); + markHostBlockPoolDontDump( + config_.pool_name.c_str(), cache_aligned_buffer_.data_ptr(), config_.total_size_bytes); + } else if (use_pinned_cpu_backing_) { + initializePinnedCpuBuffer("device block pool pinned CPU backing"); + } else if (use_cuda_malloc_backing_) { + initializeCudaMallocBuffer(); } else { cache_aligned_buffer_ = torch::empty({static_cast(config_.total_size_bytes)}, torch::TensorOptions().dtype(torch::kUInt8).device(torch::kCUDA)); } cache_base_ptr_ = cache_aligned_buffer_.data_ptr(); RTP_LLM_CHECK_WITH_INFO(cache_base_ptr_ != nullptr, "block pool allocate cache aligned buffer is null"); + const bool is_cuda = cache_aligned_buffer_.is_cuda(); + const bool is_pinned = !is_cuda && cache_aligned_buffer_.is_pinned(); + static constexpr double kBytesPerMB = 1024.0 * 1024.0; + 
RTP_LLM_LOG_INFO("BlockPool backing selected: pool_name=%s allocation_type=%s requested_backing=%s " + "actual_backing=%s is_cuda=%d is_pinned=%d ptr=%p total_size=%zu bytes total_size_mb=%.2f " + "block_num=%u memory_layouts=%zu", + config_.pool_name.c_str(), + allocationTypeName(allocation_type_), + requestedBackingName(allocation_type_, use_pinned_cpu_backing_, use_cuda_malloc_backing_), + memoryTypeName(where()), + is_cuda, + is_pinned, + cache_base_ptr_, + config_.total_size_bytes, + static_cast(config_.total_size_bytes) / kBytesPerMB, + config_.block_num, + config_.memory_layouts.size()); +} + +void BlockPool::initializePinnedCpuBuffer(const char* log_context) { + RTP_LLM_LOG_WARNING( + "%s, pool_name=%s, total_size=%zu bytes", log_context, config_.pool_name.c_str(), config_.total_size_bytes); + auto cpu_buffer = torch::empty({static_cast(config_.total_size_bytes)}, + torch::TensorOptions().dtype(torch::kUInt8).device(torch::kCPU)); + try { + cache_aligned_buffer_ = cpu_buffer.pin_memory(); + } catch (const std::exception& e) { + RTP_LLM_FAIL("%s pin failed, pool_name=%s total_size=%zu bytes, error=%s", + log_context, + config_.pool_name.c_str(), + config_.total_size_bytes, + e.what()); + } +} + +void BlockPool::initializeCudaMallocBuffer() { +#if USING_CUDA + RTP_LLM_CHECK_WITH_INFO(allocation_type_ == AllocationType::DEVICE, + "cudaMalloc block pool backing requires DEVICE allocation"); + RTP_LLM_CHECK_WITH_INFO(config_.total_size_bytes > 0, "cudaMalloc block pool total_size_bytes must be > 0"); + + int device_id = -1; + auto device_err = cudaGetDevice(&device_id); + RTP_LLM_CHECK_WITH_INFO(device_err == cudaSuccess, + "cudaGetDevice failed before cudaMalloc block pool allocation, error=%s", + cudaGetErrorString(device_err)); + + void* ptr = nullptr; + const auto err = cudaMalloc(&ptr, config_.total_size_bytes); + RTP_LLM_CHECK_WITH_INFO(err == cudaSuccess, + "cudaMalloc block pool failed, pool_name=%s, total_size=%zu bytes, error=%s", + 
config_.pool_name.c_str(), + config_.total_size_bytes, + cudaGetErrorString(err)); + + auto deleter = [device_id](void* p) { + if (p == nullptr) { + return; + } + int current_device = -1; + if (cudaGetDevice(&current_device) == cudaSuccess && current_device != device_id) { + (void)cudaSetDevice(device_id); + (void)cudaFree(p); + (void)cudaSetDevice(current_device); + return; + } + (void)cudaFree(p); + }; + cache_aligned_buffer_ = + torch::from_blob(ptr, + {static_cast(config_.total_size_bytes)}, + std::move(deleter), + torch::TensorOptions().dtype(torch::kUInt8).device(torch::Device(torch::kCUDA, device_id))); + RTP_LLM_LOG_INFO("cudaMalloc block pool backing allocated, pool_name=%s, ptr=%p, total_size=%zu bytes, device=%d", + config_.pool_name.c_str(), + ptr, + config_.total_size_bytes, + device_id); +#else + RTP_LLM_FAIL("cudaMalloc block pool backing requested but this binary was not built with CUDA, pool_name=%s", + config_.pool_name.c_str()); +#endif } void BlockPool::initializeLayerMappings() { @@ -98,15 +317,16 @@ void BlockPool::processMemoryLayout(size_t layout_idx, const torch::Tensor& full processLayerTensors(layout_idx, layout_cfg, global_layer_begin); // 记录初始化信息 - RTP_LLM_LOG_INFO( - "MemoryLayout[%zu] initialized: layer_num=%u block_num=%u kv_off=%zu kv_bytes=%zu scale_off=%zu scale_bytes=%zu", - layout_idx, - layout_cfg.layer_num, - layout_cfg.block_num, - layout_cfg.kv_cache_offset_bytes, - layout_cfg.kv_block_pool_size_bytes, - layout_cfg.kv_scale_offset_bytes, - layout_cfg.kv_scale_pool_size_bytes); + RTP_LLM_LOG_INFO("MemoryLayout[%zu] initialized: pool_name=%s layer_num=%u block_num=%u kv_off=%zu kv_bytes=%zu " + "scale_off=%zu scale_bytes=%zu", + layout_idx, + config_.pool_name.c_str(), + layout_cfg.layer_num, + layout_cfg.block_num, + layout_cfg.kv_cache_offset_bytes, + layout_cfg.kv_block_pool_size_bytes, + layout_cfg.kv_scale_offset_bytes, + layout_cfg.kv_scale_pool_size_bytes); } torch::Tensor BlockPool::createTensor( @@ -180,17 +400,14 @@ bool
BlockPool::init() { initializeLayoutStrategies(); initFreeBlocks(); - RTP_LLM_LOG_INFO("BlockPool init success: memory_layouts=%zu, total_layers=%zu, total_size=%zu bytes", + RTP_LLM_LOG_INFO("BlockPool init success: pool_name=%s memory_layouts=%zu, total_layers=%zu, total_size=%zu bytes", + config_.pool_name.c_str(), config_.memory_layouts.size(), global_layer_to_local_.size(), config_.total_size_bytes); return true; } -BlockCachePtr BlockPool::blockCache() { - return block_cache_; -} - void BlockPool::initFreeBlocks() { // block 0 is reserved for (BlockIdxType i = 1; i < static_cast(config_.block_num); ++i) { @@ -201,7 +418,6 @@ void BlockPool::initFreeBlocks() { req_con_ref_counter_.init(config_.block_num); block_cache_ref_counter_.init(config_.block_num); req_cache_ref_counter_.init(config_.block_num); - block_cache_ = std::make_shared(); } std::vector BlockPool::allLayerCacheBase() const { @@ -223,8 +439,10 @@ BlockIndicesType BlockPool::malloc(int num_blocks) { { std::scoped_lock lock(ref_mu_, free_mu_); if (free_block_ids_.size() < static_cast(num_blocks)) { - RTP_LLM_LOG_WARNING( - "Block pool only has %zu free blocks, cannot allocate %d blocks", free_block_ids_.size(), num_blocks); + RTP_LLM_LOG_WARNING("Block pool only has %zu free blocks, cannot allocate %d blocks, pool_name=%s", + free_block_ids_.size(), + num_blocks, + config_.pool_name.c_str()); return {}; } auto first = free_block_ids_.begin(); @@ -341,8 +559,9 @@ void BlockPool::regUserMr(size_t model_id, std::shared_ptr cache_sto cache_store_ = std::move(cache_store); } if (cache_store_ && !kvcache_reg_mr_) { - RTP_LLM_LOG_INFO("start to register user mr"); - auto memory_util = std::static_pointer_cast(cache_store_)->getMemoryUtil(); + RTP_LLM_LOG_INFO("start to register user mr, pool_name=%s", config_.pool_name.c_str()); + auto memory_util = cache_store_->getMemoryUtil(); + const bool gpu = where() == MemoryType::MEMORY_GPU; for (size_t layout_idx = 0; layout_idx < config_.memory_layouts.size(); 
++layout_idx) { const auto& layout_cfg = config_.memory_layouts[layout_idx]; @@ -353,6 +572,7 @@ void BlockPool::regUserMr(size_t model_id, std::shared_ptr cache_sto layout_cfg.kv_cache_offset_bytes, layout_cfg.kv_block_pool_size_bytes, layout_cfg.kv_block_stride_bytes, + gpu, "kv"); // Register scale buffer if present @@ -362,6 +582,7 @@ void BlockPool::regUserMr(size_t model_id, std::shared_ptr cache_sto layout_cfg.kv_scale_offset_bytes, layout_cfg.kv_scale_pool_size_bytes, layout_cfg.kv_scale_stride_bytes, + gpu, "scale"); } } @@ -372,22 +593,23 @@ void BlockPool::regUserMr(size_t model_id, std::shared_ptr cache_sto void BlockPool::deregUserMr() { if (kvcache_reg_mr_ && cache_store_) { - RTP_LLM_LOG_INFO("start to deregister user mr"); - auto memory_util = std::static_pointer_cast(cache_store_)->getMemoryUtil(); + RTP_LLM_LOG_INFO("start to deregister user mr, pool_name=%s", config_.pool_name.c_str()); + auto memory_util = cache_store_->getMemoryUtil(); + const bool gpu = where() == MemoryType::MEMORY_GPU; for (size_t layout_idx = 0; layout_idx < config_.memory_layouts.size(); ++layout_idx) { const auto& layout_cfg = config_.memory_layouts[layout_idx]; // Deregister KV buffer - deregisterUserMrForBuffer(memory_util, layout_idx, layout_cfg.kv_cache_offset_bytes, "kv"); + deregisterUserMrForBuffer(memory_util, layout_idx, layout_cfg.kv_cache_offset_bytes, gpu, "kv"); // Deregister scale buffer if present if (layout_cfg.hasScale()) { - deregisterUserMrForBuffer(memory_util, layout_idx, layout_cfg.kv_scale_offset_bytes, "scale"); + deregisterUserMrForBuffer(memory_util, layout_idx, layout_cfg.kv_scale_offset_bytes, gpu, "scale"); } } - RTP_LLM_LOG_INFO("deregister user mr for block pool success"); + RTP_LLM_LOG_INFO("deregister user mr for block pool success, pool_name=%s", config_.pool_name.c_str()); kvcache_reg_mr_ = false; } } @@ -397,18 +619,23 @@ void BlockPool::registerUserMrForBuffer(std::shared_ptr mem size_t offset_bytes, size_t bytes, size_t stride_bytes, 
+ bool gpu, const std::string& buffer_type) { void* base_ptr = static_cast(static_cast(cache_base_ptr_) + static_cast(offset_bytes)); auto start_us = currentTimeUs(); - if (!memory_util->regUserMr(base_ptr, bytes, true, stride_bytes)) { - RTP_LLM_FAIL("register user mr for block pool layout[%zu] %s buffer failed", layout_idx, buffer_type.c_str()); + if (!memory_util->regUserMr(base_ptr, bytes, gpu, stride_bytes)) { + RTP_LLM_FAIL("register user mr for block pool layout[%zu] %s buffer failed, pool_name=%s", + layout_idx, + buffer_type.c_str(), + config_.pool_name.c_str()); } auto cost_ms = (currentTimeUs() - start_us) / 1000; mr_cost_time_ms_ += cost_ms; - RTP_LLM_LOG_INFO("register user mr success: layout[%zu] %s base=%p len=%zu aligned=%zu cost=%ld ms", + RTP_LLM_LOG_INFO("register user mr success: pool_name=%s layout[%zu] %s base=%p len=%zu aligned=%zu cost=%ld ms", + config_.pool_name.c_str(), layout_idx, buffer_type.c_str(), base_ptr, @@ -420,11 +647,15 @@ void BlockPool::registerUserMrForBuffer(std::shared_ptr mem void BlockPool::deregisterUserMrForBuffer(std::shared_ptr memory_util, size_t layout_idx, size_t offset_bytes, + bool gpu, const std::string& buffer_type) { void* base_ptr = static_cast(static_cast(cache_base_ptr_) + static_cast(offset_bytes)); - if (!memory_util->deregUserMr(base_ptr, true)) { - RTP_LLM_FAIL("deregister user mr for block pool layout[%zu] %s buffer failed", layout_idx, buffer_type.c_str()); + if (!memory_util->deregUserMr(base_ptr, gpu)) { + RTP_LLM_FAIL("deregister user mr for block pool layout[%zu] %s buffer failed, pool_name=%s", + layout_idx, + buffer_type.c_str(), + config_.pool_name.c_str()); } } @@ -470,8 +701,10 @@ size_t BlockPool::notInUseBlocksNum() const { // Returns {layout_index, local_layer_id}. layout_index is the index in BlockPoolConfig.memory_layouts. 
std::pair BlockPool::mapGlobalLayerIdToLocal(int global_layer_id) const { if (global_layer_id < 0 || static_cast(global_layer_id) >= global_layer_to_local_.size()) { - RTP_LLM_LOG_ERROR( - "Global layer_id %d out of range (total layers: %zu)", global_layer_id, global_layer_to_local_.size()); + RTP_LLM_LOG_ERROR("Global layer_id %d out of range (total layers: %zu), pool_name=%s", + global_layer_id, + global_layer_to_local_.size(), + config_.pool_name.c_str()); return {-1, -1}; } @@ -500,7 +733,10 @@ BlockPool::convertIndexToBuffer(int layer_id, int block_id, int partition_count, } MemoryType BlockPool::where() const { - return cache_aligned_buffer_.is_cuda() ? MemoryType::MEMORY_GPU : MemoryType::MEMORY_CPU; + if (cache_aligned_buffer_.is_cuda()) { + return MemoryType::MEMORY_GPU; + } + return cache_aligned_buffer_.is_pinned() ? MemoryType::MEMORY_CPU_PINNED : MemoryType::MEMORY_CPU; } void BlockPool::checkLayoutValidity(int layout_id) const { diff --git a/rtp_llm/cpp/cache/BlockPool.h b/rtp_llm/cpp/cache/BlockPool.h index 30a4ce1e85..3e8b1291a4 100644 --- a/rtp_llm/cpp/cache/BlockPool.h +++ b/rtp_llm/cpp/cache/BlockPool.h @@ -12,7 +12,6 @@ #include "rtp_llm/cpp/cache/BlockRefCounter.h" #include "rtp_llm/cpp/cache/Types.h" #include "rtp_llm/cpp/cache/BufferTypes.h" -#include "rtp_llm/cpp/cache/BlockCache.h" #include "rtp_llm/cpp/cache/MemoryLayoutStrategy.h" #include "rtp_llm/cpp/cache/BlockPoolConfig.h" #include "rtp_llm/cpp/disaggregate/cache_store/MemoryUtil.h" @@ -23,13 +22,14 @@ class CacheStore; class BlockPool { public: - BlockPool(const BlockPoolConfig& config, AllocationType allocation_type = AllocationType::DEVICE); + BlockPool(const BlockPoolConfig& config, + AllocationType allocation_type = AllocationType::DEVICE, + bool use_pinned_cpu_backing = false, + bool use_cuda_malloc_backing = false); ~BlockPool(); bool init(); - BlockCachePtr blockCache(); - MemoryType where() const; std::vector allLayerCacheBase() const; std::vector allLayerScaleCacheBase() 
const; @@ -74,6 +74,9 @@ class BlockPool { size_t getTotalSizeBytes() const { return config_.total_size_bytes; } + const std::string& poolName() const { + return config_.pool_name; + } private: void initFreeBlocks(); @@ -85,6 +88,8 @@ class BlockPool { // Helper functions for init() void validateConfig() const; void initializeCacheBuffer(); + void initializePinnedCpuBuffer(const char* log_context); + void initializeCudaMallocBuffer(); void initializeLayerMappings(); void initializeLayoutStrategies(); @@ -107,10 +112,12 @@ class BlockPool { size_t offset_bytes, size_t bytes, size_t stride_bytes, + bool gpu, const std::string& buffer_type); void deregisterUserMrForBuffer(std::shared_ptr memory_util, size_t layout_idx, size_t offset_bytes, + bool gpu, const std::string& buffer_type); private: @@ -126,8 +133,8 @@ class BlockPool { BlockRefCounter req_cache_ref_counter_; AllocationType allocation_type_; - - BlockCachePtr block_cache_; + bool use_pinned_cpu_backing_; + bool use_cuda_malloc_backing_; torch::Tensor cache_aligned_buffer_; void* cache_base_ptr_ = nullptr; diff --git a/rtp_llm/cpp/cache/BlockPoolConfig.h b/rtp_llm/cpp/cache/BlockPoolConfig.h index 09ff401cb8..67bc62fc87 100644 --- a/rtp_llm/cpp/cache/BlockPoolConfig.h +++ b/rtp_llm/cpp/cache/BlockPoolConfig.h @@ -1,5 +1,6 @@ #pragma once +#include #include #include "rtp_llm/cpp/cache/MemoryLayoutConfig.h" @@ -7,6 +8,8 @@ namespace rtp_llm { struct BlockPoolConfig { + std::string pool_name = "unnamed"; + // all memory layouts share the same block id space uint32_t block_num = 0; @@ -15,4 +18,4 @@ struct BlockPoolConfig { std::vector memory_layouts; }; -} // namespace rtp_llm \ No newline at end of file +} // namespace rtp_llm diff --git a/rtp_llm/cpp/cache/BlockPoolConfigHelper.h b/rtp_llm/cpp/cache/BlockPoolConfigHelper.h index deac75c410..bab8f95ee7 100644 --- a/rtp_llm/cpp/cache/BlockPoolConfigHelper.h +++ b/rtp_llm/cpp/cache/BlockPoolConfigHelper.h @@ -3,6 +3,8 @@ #include "rtp_llm/cpp/cache/CacheConfig.h" 
#include "rtp_llm/cpp/cache/BlockPoolConfig.h" +#include + namespace rtp_llm { class BlockPoolConfigHelper { @@ -16,12 +18,13 @@ class BlockPoolConfigHelper { * @param cache_config The CacheConfig containing main model and optional MTP modules */ static BlockPoolConfig createConfig(const CacheConfig& cache_config) { - RTP_LLM_CHECK_WITH_INFO(!cache_config.cache_specs.empty(), "cache_specs must not be empty"); + RTP_LLM_CHECK_WITH_INFO(cache_config.groupNums() > 0, "cache groups must not be empty"); BlockPoolConfig config; + config.pool_name = "default"; config.block_num = cache_config.block_num; const bool is_hybrid = cache_config.groupNums() > 1; auto layer_num = is_hybrid ? cache_config.group_layer_num : cache_config.layer_num; - const auto& main_spec = cache_config.cache_specs[0]; + const auto& main_spec = cache_config.specForGroup(0); // linear block size is same with full block block size MemoryLayoutConfig main_layout = createMemoryLayoutConfig(is_hybrid, layer_num, @@ -42,13 +45,23 @@ class BlockPoolConfigHelper { for (size_t i = 0; i < cache_config.mtp_sub_configs.size(); ++i) { const auto& mtp_sub_config = cache_config.mtp_sub_configs[i]; RTP_LLM_CHECK_WITH_INFO(mtp_sub_config != nullptr, "mtp_sub_configs[%zu] is null", i); - RTP_LLM_CHECK_WITH_INFO( - !mtp_sub_config->cache_specs.empty(), "MTP module %zu cache_specs must not be empty", i); + RTP_LLM_CHECK_WITH_INFO(mtp_sub_config->groupNums() > 0, + "MTP module %zu cache groups must not be empty", + i); const auto mtp_layer_num = mtp_sub_config->layer_num; - const auto& mtp_spec = mtp_sub_config->cache_specs[0]; - // mtp block size is not same with main model block size + size_t real_mtp_gid = 0; + for (size_t gid = 0; gid < static_cast(mtp_sub_config->groupNums()); ++gid) { + if (!mtp_sub_config->layerIdsForGroup(gid).empty()) { + real_mtp_gid = gid; + break; + } + } + const auto& mtp_spec = mtp_sub_config->specForGroup(real_mtp_gid); + // mtp block size is not same with main model block size. 
MTP + // sub-configs may keep target-aligned placeholder groups, so use + // the first group that owns a real MTP layer instead of gid 0. MemoryLayoutConfig mtp_layout = createMemoryLayoutConfig(false, mtp_layer_num, mtp_spec->block_size_bytes(), @@ -79,10 +92,61 @@ class BlockPoolConfigHelper { return config; } + static BlockPoolConfig createConfigForGroup(const CacheConfig& cache_config, size_t group_id) { + RTP_LLM_CHECK_WITH_INFO(group_id < static_cast(cache_config.groupNums()), + "group_id %zu out of range, groupNums=%d", + group_id, + cache_config.groupNums()); + const auto& spec = cache_config.specForGroup(group_id); + RTP_LLM_CHECK_WITH_INFO(spec != nullptr, "cache_specs[%zu] is null", group_id); + + BlockPoolConfig config; + config.pool_name = "group_" + std::to_string(group_id); + const auto& tag = cache_config.tagForGroup(group_id); + if (!tag.empty()) { + config.pool_name = tag; + } + config.block_num = cache_config.blockNumForGroup(group_id); + const bool has_group_blocks = config.block_num != cache_config.block_num; + RTP_LLM_LOG_INFO("createConfigForGroup: pool_name=%s gid=%zu block_num=%d (has_group_blocks=%d, " + "groupNums=%d, global_block_num=%d)", + config.pool_name.c_str(), + group_id, + config.block_num, + has_group_blocks, + cache_config.groupNums(), + cache_config.block_num); + + const uint32_t layer_num = static_cast(cache_config.layerIdsForGroup(group_id).size()); + RTP_LLM_CHECK_WITH_INFO(layer_num > 0, "group %zu has no layers", group_id); + + const size_t kv_stride = cache_config.kvBlockStrideBytesForGroup(group_id); + const size_t scale_stride = cache_config.kvScaleStrideBytesForGroup(group_id); + + CacheConfig group_cache_config = cache_config; + group_cache_config.block_num = config.block_num; + if (group_id < cache_config.group_seq_size_per_block.size() + && cache_config.group_seq_size_per_block[group_id] > 0) { + group_cache_config.seq_size_per_block = cache_config.group_seq_size_per_block[group_id]; + } + + MemoryLayoutConfig 
layout = + createMemoryLayoutConfig(false, layer_num, kv_stride, scale_stride, spec, group_cache_config); + const bool is_full_group = cache_config.typeForGroup(group_id) == CacheGroupType::FULL; + layout.kernel_blocks_per_kv_block = is_full_group ? cache_config.kernelBlocksPerKvBlock() : 1; + layout.kv_cache_offset_bytes = 0; + layout.kv_scale_offset_bytes = layout.kv_cache_offset_bytes + layout.kv_block_pool_size_bytes; + + config.memory_layouts.push_back(layout); + config.total_size_bytes = layout.kv_block_pool_size_bytes + layout.kv_scale_pool_size_bytes; + return config; + } + // for memory connector static BlockPoolConfig createConfig(uint32_t layer_num, uint32_t block_num, size_t block_stride_bytes, rtp_llm::DataType dtype) { BlockPoolConfig config; + config.pool_name = "memory_connector"; config.block_num = block_num; MemoryLayoutConfig layout_cfg; @@ -122,7 +186,7 @@ class BlockPoolConfigHelper { cfg.v_scale_stride_bytes = spec->v_scale_block_size_bytes(); cfg.enable_kv_scale = cfg.kv_scale_stride_bytes > 0; - cfg.dtype = cache_config.dtype; + cfg.dtype = spec->dtype; cfg.local_head_num_kv = spec->local_head_num_kv; cfg.enable_hybrid_attention = enable_hybrid_attention; // Scale 3D layout for MLA and indexer; KV 3D only for MLA (concat_and_cache_mla) diff --git a/rtp_llm/cpp/cache/BufferTypes.h b/rtp_llm/cpp/cache/BufferTypes.h index 743f6182f8..3e0da5279f 100644 --- a/rtp_llm/cpp/cache/BufferTypes.h +++ b/rtp_llm/cpp/cache/BufferTypes.h @@ -1,9 +1,11 @@ #pragma once +#include +#include #include #include -#include "rtp_llm/cpp/cache/CacheGroupType.h" +#include "rtp_llm/cpp/cache/spec/CacheGroupType.h" namespace rtp_llm { @@ -13,11 +15,16 @@ struct BlockBufferPtrInfo { }; struct CacheLayerLayout { - std::vector layer_to_groups; - std::vector group_types; - std::vector layer_attn_types; - std::vector layers_to_kv_buffer_ptrs; - std::vector layers_to_scale_buffer_ptrs; + std::vector> layer_to_group_ids; + std::vector group_types; + std::vector group_tags; + 
std::vector> layer_tag_to_group_id; + std::vector group_seq_size_per_block; + std::vector layer_group_types; + std::vector layers_to_kv_buffer_ptrs; + std::vector layers_to_scale_buffer_ptrs; + std::vector> layers_to_kv_buffer_ptrs_by_group; + std::vector> layers_to_scale_buffer_ptrs_by_group; }; struct KVCacheBuffer { diff --git a/rtp_llm/cpp/cache/CPSlotMapper.cc b/rtp_llm/cpp/cache/CPSlotMapper.cc new file mode 100644 index 0000000000..382f34ced9 --- /dev/null +++ b/rtp_llm/cpp/cache/CPSlotMapper.cc @@ -0,0 +1,40 @@ +#include "rtp_llm/cpp/cache/CPSlotMapper.h" + +#include + +namespace rtp_llm { + +CPSlotMapper::CPSlotMapper(): cp_rank_(0), cp_size_(1), block_size_(1), virtual_block_size_(1) {} + +CPSlotMapper::CPSlotMapper(int cp_rank, int cp_size, int block_size): + cp_rank_(cp_rank), cp_size_(cp_size), block_size_(block_size), virtual_block_size_(block_size * cp_size) { + if (cp_size <= 0) { + throw std::invalid_argument("CPSlotMapper cp_size must be positive"); + } + if (block_size <= 0) { + throw std::invalid_argument("CPSlotMapper block_size must be positive"); + } + if (cp_rank < 0 || cp_rank >= cp_size) { + throw std::invalid_argument("CPSlotMapper cp_rank out of range"); + } +} + +int CPSlotMapper::localBlockCount(int seq_len) const { + if (seq_len <= 0) { + return 0; + } + // All CP ranks keep the same block count = ceil(total_blocks / cp_size). + // rank0 is the controller: it allocates blocks and broadcasts block_ids + // to all ranks. Using a uniform count simplifies KV cache management — + // ranks with fewer "real" data blocks simply have unused trailing blocks. + int total_blocks = (seq_len + block_size_ - 1) / block_size_; + return (total_blocks + cp_size_ - 1) / cp_size_; +} + +int CPSlotMapper::effectiveSeqLenForAlloc(int actual_seq_len) const { + // Translate to a seq_len that, when the allocator divides by block_size, + // yields localBlockCount(actual_seq_len). 
+ return localBlockCount(actual_seq_len) * block_size_; +} + +} // namespace rtp_llm diff --git a/rtp_llm/cpp/cache/CPSlotMapper.h b/rtp_llm/cpp/cache/CPSlotMapper.h new file mode 100644 index 0000000000..804bb31a4a --- /dev/null +++ b/rtp_llm/cpp/cache/CPSlotMapper.h @@ -0,0 +1,48 @@ +#pragma once + +namespace rtp_llm { + +/// Page-level virtual block sharding for Context Parallelism. +/// +/// Entire blocks are assigned to ranks round-robin: block_idx % cp_size == cp_rank. +/// Virtual block size is block_size * cp_size (used for cache key grouping). +/// +/// Sharded when cp_size > 1. The default constructor (cp_size=1) gives +/// passthrough behaviour identical to "no CP". +class CPSlotMapper { +public: + CPSlotMapper(); + CPSlotMapper(int cp_rank, int cp_size, int block_size); + + bool isSharded() const { + return cp_size_ > 1; + } + + int cpRank() const { + return cp_rank_; + } + int cpSize() const { + return cp_size_; + } + int blockSize() const { + return block_size_; + } + int virtualBlockSize() const { + return virtual_block_size_; + } + + int localBlockCount(int seq_len) const; + + // Translate actual seq_len to an effective value that, when divided by + // block_size, yields localBlockCount(actual_seq_len). Use this when + // feeding seq_len into an allocator that divides by block_size internally. 
+ int effectiveSeqLenForAlloc(int actual_seq_len) const; + +private: + int cp_rank_ = 0; + int cp_size_ = 1; + int block_size_ = 1; + int virtual_block_size_ = 1; +}; + +} // namespace rtp_llm diff --git a/rtp_llm/cpp/cache/CacheConfig.h b/rtp_llm/cpp/cache/CacheConfig.h index 75f80b6d32..0875ce0277 100644 --- a/rtp_llm/cpp/cache/CacheConfig.h +++ b/rtp_llm/cpp/cache/CacheConfig.h @@ -1,28 +1,50 @@ #pragma once +#include +#include +#include #include #include #include +#include #include -#include "rtp_llm/cpp/cache/CacheGroupType.h" -#include "rtp_llm/cpp/cache/KVCacheSpec.h" +#include "rtp_llm/cpp/cache/spec/CacheGroupType.h" +#include "rtp_llm/cpp/cache/spec/KVCacheSpec.h" +#include "rtp_llm/cpp/config/ConfigModules.h" +#include "rtp_llm/cpp/utils/AssertUtils.h" #include "rtp_llm/models_py/bindings/core/Types.h" #include "rtp_llm/cpp/utils/StringUtil.h" namespace rtp_llm { +struct GroupBase { + KVCacheSpecPtr spec; + CacheGroupPolicy policy; + std::vector layer_ids; + uint32_t block_num = 0; + size_t kv_block_stride_bytes = 0; + size_t kv_scale_stride_bytes = 0; +}; + +struct LayerBase { + std::vector group_ids; + std::map tag_to_gid; +}; + struct CacheConfig { - // Cache specification and layer mapping - std::vector cache_specs; - std::vector> global_layer_ids; // including mtp module layers - std::vector> layer_ids; - std::vector> linear_groups; // for hybrid attention - std::vector> full_groups; // for hybrid attention - std::vector group_types; // for hybrid attention - std::vector layer_attn_types; - std::vector layer_to_group_id; - std::vector layer_to_block_stride_bytes; + std::vector groups; + std::vector layers; + std::unordered_map tag_to_gid; + + // Cache specification and layer mapping are owned by groups/layers above. 
+ std::vector layer_to_block_stride_bytes; + std::vector group_seq_size_per_block; + bool group_block_layout_initialized = false; + bool use_independent_block_pools = false; + bool use_typed_cache_regions = false; + bool use_opaque_kv_cache_store = false; + bool disable_decode_first_malloc_device_reuse = false; // Model configuration rtp_llm::DataType dtype; @@ -46,19 +68,22 @@ struct CacheConfig { // Block sizing information // ---- Per-block sizes (all layers) ---- - size_t kv_block_size_bytes = 0; - size_t kv_scale_size_bytes = 0; - size_t block_size_bytes = 0; // (kv + scales together) + size_t kv_block_size_bytes = 0; + size_t kv_scale_size_bytes = 0; + size_t block_size_bytes = 0; // (kv + scales together) // ---- Per-block strides (one layer) ---- size_t kv_block_stride_bytes = 0; size_t kv_scale_stride_bytes = 0; + // Bytes pre-reserved for explicitly-sized pools. + // CacheConfigCreator deducts this from kv_cache_mem_size before computing the + // paged block_num, so paged pools don't overcommit HBM. 0 means no reservation. 
+ size_t explicitly_sized_pool_reserve_bytes = 0; + // Attention-specific configuration - int linear_step = 1; // For Linear attention: keep one cache block every `linear_step` blocks + int linear_step = 1; // For Linear attention: keep one cache block every `linear_step` blocks int group_layer_num = 1; // Number of layers per group for hybrid attention - int linear_group_num = 0; // Number of linear attention groups - int full_group_num = 0; // Number of full attention groups // mtp-model configurations std::vector> mtp_sub_configs; @@ -66,7 +91,601 @@ struct CacheConfig { CacheConfig() {} int groupNums() const { - return std::max(1, static_cast(cache_specs.size())); + return static_cast(groups.size()); + } + + const KVCacheSpecPtr& specForGroup(size_t gid) const { + RTP_LLM_CHECK_WITH_INFO(gid < groups.size(), "CacheConfig::specForGroup invalid gid=%zu size=%zu", gid, groups.size()); + RTP_LLM_CHECK_WITH_INFO(groups[gid].spec != nullptr, "CacheConfig::specForGroup null spec gid=%zu", gid); + return groups[gid].spec; + } + + CacheGroupType typeForGroup(size_t gid) const { + RTP_LLM_CHECK_WITH_INFO(gid < groups.size(), "CacheConfig::typeForGroup invalid gid=%zu size=%zu", gid, groups.size()); + return groups[gid].policy.group_type; + } + + const std::string& tagForGroup(size_t gid) const { + return specForGroup(gid)->tag; + } + + int groupIdForTag(const std::string& tag) const { + const auto it = tag_to_gid.find(tag); + RTP_LLM_CHECK_WITH_INFO(it != tag_to_gid.end(), "CacheConfig::groupIdForTag missing tag=%s", tag.c_str()); + return it->second; + } + + const std::vector& layerIdsForGroup(size_t gid) const { + RTP_LLM_CHECK_WITH_INFO(gid < groups.size(), "CacheConfig::layerIdsForGroup invalid gid=%zu size=%zu", gid, groups.size()); + return groups[gid].layer_ids; + } + + std::vector groupTypesSnapshot() const { + std::vector types; + types.reserve(groups.size()); + for (const auto& group : groups) { + types.push_back(group.policy.group_type); + } + return types; + 
} + + std::vector groupTagsSnapshot() const { + std::vector tags; + tags.reserve(groups.size()); + for (const auto& group : groups) { + RTP_LLM_CHECK_WITH_INFO(group.spec != nullptr, "CacheConfig::groupTagsSnapshot null spec"); + tags.push_back(group.spec->tag); + } + return tags; + } + + std::vector groupPoliciesSnapshot() const { + std::vector policies; + policies.reserve(groups.size()); + for (const auto& group : groups) { + policies.push_back(group.policy); + } + return policies; + } + + std::vector groupBlockNumsSnapshot() const { + if (!group_block_layout_initialized) { + return {}; + } + std::vector block_nums; + block_nums.reserve(groups.size()); + for (const auto& group : groups) { + block_nums.push_back(group.block_num); + } + return block_nums; + } + + std::vector groupBlockSizeBytesSnapshot() const { + std::vector result; + result.reserve(static_cast(groupNums())); + for (size_t gid = 0; gid < static_cast(groupNums()); ++gid) { + result.push_back(blockSizeBytesForGroup(gid)); + } + return result; + } + + std::vector groupKvBlockStrideBytesSnapshot() const { + if (!group_block_layout_initialized) { + return {}; + } + std::vector strides; + strides.reserve(groups.size()); + for (const auto& group : groups) { + strides.push_back(group.kv_block_stride_bytes); + } + return strides; + } + + std::vector groupKvScaleStrideBytesSnapshot() const { + if (!group_block_layout_initialized) { + return {}; + } + std::vector strides; + strides.reserve(groups.size()); + for (const auto& group : groups) { + strides.push_back(group.kv_scale_stride_bytes); + } + return strides; + } + + std::vector> layerGroupIdsSnapshot() const { + std::vector> result; + result.reserve(layers.size()); + for (const auto& layer : layers) { + result.push_back(layer.group_ids); + } + return result; + } + + // Compatibility: flat layer→first-group-id mapping (old: layer_to_group_id). 
+ std::vector flatLayerToGroupId() const { + std::vector result; + result.reserve(layers.size()); + for (const auto& layer : layers) { + result.push_back(layer.group_ids.empty() ? -1 : layer.group_ids[0]); + } + return result; + } + + // Compatibility: per-group global layer IDs (old: global_layer_ids). + std::vector> globalLayerIdsSnapshot() const { + std::vector> result; + result.reserve(groups.size()); + for (const auto& group : groups) { + result.push_back(group.layer_ids); + } + return result; + } + + std::vector> layerTagToGroupIdSnapshot() const { + std::vector> result; + result.reserve(layers.size()); + for (const auto& layer : layers) { + result.push_back(layer.tag_to_gid); + } + return result; + } + + uint32_t blockNumForGroup(size_t gid) const { + RTP_LLM_CHECK_WITH_INFO(gid < groups.size(), "CacheConfig::blockNumForGroup invalid gid=%zu size=%zu", gid, groups.size()); + if (group_block_layout_initialized && groups[gid].block_num > 0) { + return groups[gid].block_num; + } + return block_num; + } + + size_t kvBlockStrideBytesForGroup(size_t gid) const { + RTP_LLM_CHECK_WITH_INFO(gid < groups.size(), "CacheConfig::kvBlockStrideBytesForGroup invalid gid=%zu size=%zu", gid, groups.size()); + if (group_block_layout_initialized) { + return groups[gid].kv_block_stride_bytes; + } + return specForGroup(gid)->block_size_bytes(); + } + + size_t kvScaleStrideBytesForGroup(size_t gid) const { + RTP_LLM_CHECK_WITH_INFO(gid < groups.size(), "CacheConfig::kvScaleStrideBytesForGroup invalid gid=%zu size=%zu", gid, groups.size()); + if (group_block_layout_initialized) { + return groups[gid].kv_scale_stride_bytes; + } + return specForGroup(gid)->scale_block_size_bytes(); + } + + size_t blockSizeBytesForGroup(size_t gid) const { + return layerIdsForGroup(gid).size() * (kvBlockStrideBytesForGroup(gid) + kvScaleStrideBytesForGroup(gid)); + } + + void setGroupPolicies(const std::vector& policies) { + RTP_LLM_CHECK_WITH_INFO(policies.size() == groups.size(), + 
"CacheConfig::setGroupPolicies size %zu != group size %zu", + policies.size(), + groups.size()); + for (size_t gid = 0; gid < policies.size(); ++gid) { + groups[gid].policy = policies[gid]; + } + } + + void setGroupBlockLayout(const std::vector& block_nums, + const std::vector& kv_block_stride_bytes, + const std::vector& kv_scale_stride_bytes) { + const size_t group_num = static_cast(groupNums()); + RTP_LLM_CHECK_WITH_INFO(block_nums.size() == group_num, + "CacheConfig::setGroupBlockLayout block_nums size %zu != group size %zu", + block_nums.size(), + group_num); + RTP_LLM_CHECK_WITH_INFO(kv_block_stride_bytes.size() == group_num, + "CacheConfig::setGroupBlockLayout kv stride size %zu != group size %zu", + kv_block_stride_bytes.size(), + group_num); + RTP_LLM_CHECK_WITH_INFO(kv_scale_stride_bytes.size() == group_num, + "CacheConfig::setGroupBlockLayout scale stride size %zu != group size %zu", + kv_scale_stride_bytes.size(), + group_num); + for (size_t gid = 0; gid < group_num; ++gid) { + groups[gid].block_num = block_nums[gid]; + groups[gid].kv_block_stride_bytes = kv_block_stride_bytes[gid]; + groups[gid].kv_scale_stride_bytes = kv_scale_stride_bytes[gid]; + } + group_block_layout_initialized = true; + } + + void resizeLayerRoutes(size_t layer_count) { + layers.resize(layer_count); + } + + void setLayerIdsForGroup(size_t gid, const std::vector& layer_ids) { + RTP_LLM_CHECK_WITH_INFO(gid < groups.size(), + "CacheConfig::setLayerIdsForGroup invalid gid=%zu size=%zu", + gid, + groups.size()); + groups[gid].layer_ids = layer_ids; + if (groups[gid].spec != nullptr) { + groups[gid].spec->layers = layer_ids; + } + } + + void appendLayerToGroup(size_t gid, int layer_id, const std::string& tag) { + RTP_LLM_CHECK_WITH_INFO(gid < groups.size(), + "CacheConfig::appendLayerToGroup invalid gid=%zu size=%zu", + gid, + groups.size()); + RTP_LLM_CHECK_WITH_INFO(layer_id >= 0, "CacheConfig::appendLayerToGroup invalid layer_id=%d", layer_id); + const auto layer = 
static_cast(layer_id); + if (layer >= layers.size()) { + layers.resize(layer + 1); + } + groups[gid].layer_ids.push_back(layer_id); + if (groups[gid].spec != nullptr) { + groups[gid].spec->layers = groups[gid].layer_ids; + } + layers[layer].group_ids.push_back(static_cast(gid)); + if (!tag.empty()) { + layers[layer].tag_to_gid[tag] = static_cast(gid); + } + } + + size_t fullGroupId() const { + for (size_t gid = 0; gid < static_cast(groupNums()); ++gid) { + if (typeForGroup(gid) == CacheGroupType::FULL) { + return gid; + } + } + return 0; + } + + std::shared_ptr mergeMTPModule(const CacheConfig& propose_config, + int module_index, + uint32_t main_layer_num) { + RTP_LLM_CHECK_WITH_INFO(!groups.empty(), "CacheConfig::mergeMTPModule requires destination topology views"); + RTP_LLM_CHECK_WITH_INFO(!propose_config.groups.empty(), + "CacheConfig::mergeMTPModule requires propose topology views"); + RTP_LLM_CHECK_WITH_INFO(module_index >= 0, "CacheConfig::mergeMTPModule invalid module_index=%d", module_index); + + auto sub_cfg = std::make_shared(propose_config); + sub_cfg->block_num = block_num; + sub_cfg->layer_all_num = sub_cfg->layer_num; + + const auto mtp_layer_num = propose_config.layer_num; + const auto total_layers = + static_cast(main_layer_num) + static_cast(module_index + 1) * mtp_layer_num; + resizeLayerRoutes(total_layers); + if (layer_to_block_stride_bytes.size() < total_layers) { + layer_to_block_stride_bytes.resize(total_layers, 0); + } + + // MTP currently relies on target and draft models sharing the same group-index + // namespace: model inputs, CUDA graph metadata, and Python attention inputs pass + // block tables by gid without a draft-local remap. Therefore the sub-config + // keeps every target group in first-seen order. Groups not used by the propose + // model are placeholders with an empty layer list. 
+ const auto target_group_num = static_cast(groupNums()); + std::unordered_map propose_gid_by_tag; + for (size_t gid = 0; gid < static_cast(propose_config.groupNums()); ++gid) { + propose_gid_by_tag.emplace(propose_config.tagForGroup(gid), gid); + } + + std::vector sub_groups; + std::vector sub_layers(static_cast(mtp_layer_num)); + std::vector sub_group_seq_size_per_block; + sub_groups.reserve(target_group_num); + sub_group_seq_size_per_block.reserve(target_group_num); + + for (size_t target_gid = 0; target_gid < target_group_num; ++target_gid) { + const auto& tag = tagForGroup(target_gid); + const auto propose_it = propose_gid_by_tag.find(tag); + const bool has_propose_group = propose_it != propose_gid_by_tag.end(); + const size_t source_gid = has_propose_group ? propose_it->second : target_gid; + const auto& source_config = has_propose_group ? propose_config : *this; + const auto& source_group = source_config.groups[source_gid]; + + GroupBase sub_group; + sub_group.spec = source_group.spec->clone(); + sub_group.policy = source_group.policy; + sub_group.block_num = source_group.block_num; + sub_group.kv_block_stride_bytes = source_group.kv_block_stride_bytes; + sub_group.kv_scale_stride_bytes = source_group.kv_scale_stride_bytes; + + if (has_propose_group) { + for (int local_layer_id : propose_config.layerIdsForGroup(source_gid)) { + if (local_layer_id < 0 || local_layer_id >= static_cast(mtp_layer_num)) { + continue; + } + const auto global_layer_id = static_cast(main_layer_num) + + module_index * static_cast(mtp_layer_num) + local_layer_id; + const auto global_layer = static_cast(global_layer_id); + + sub_group.layer_ids.push_back(global_layer_id); + auto& sub_layer = sub_layers[static_cast(local_layer_id)]; + sub_layer.group_ids.push_back(static_cast(target_gid)); + sub_layer.tag_to_gid[tag] = static_cast(target_gid); + + appendLayerToGroup(target_gid, global_layer_id, tag); + + RTP_LLM_CHECK_WITH_INFO(static_cast(local_layer_id) + < 
sub_cfg->layer_to_block_stride_bytes.size(), + "CacheConfig::mergeMTPModule local layer stride missing layer=%d size=%zu", + local_layer_id, + sub_cfg->layer_to_block_stride_bytes.size()); + layer_to_block_stride_bytes[global_layer] = + sub_cfg->layer_to_block_stride_bytes[static_cast(local_layer_id)]; + } + } + sub_group.spec->layers = sub_group.layer_ids; + sub_groups.push_back(std::move(sub_group)); + + const auto& source_seq = source_config.group_seq_size_per_block; + sub_group_seq_size_per_block.push_back(source_gid < source_seq.size() ? source_seq[source_gid] : 0); + } + + for (size_t layer_id = 0; layer_id < sub_layers.size(); ++layer_id) { + RTP_LLM_CHECK_WITH_INFO(!sub_layers[layer_id].group_ids.empty(), + "CacheConfig::mergeMTPModule missing group mapping for sub layer %zu", + layer_id); + } + + sub_cfg->groups = std::move(sub_groups); + sub_cfg->layers = std::move(sub_layers); + sub_cfg->tag_to_gid.clear(); + for (size_t gid = 0; gid < sub_cfg->groups.size(); ++gid) { + sub_cfg->tag_to_gid.emplace(sub_cfg->groups[gid].spec->tag, static_cast(gid)); + } + sub_cfg->group_seq_size_per_block = std::move(sub_group_seq_size_per_block); + sub_cfg->group_block_layout_initialized = group_block_layout_initialized; + return sub_cfg; + } + + uint32_t explicitIndependentBlocks(size_t gid) const { + return policyForGroup(gid).explicit_block_num; + } + + bool usesExplicitIndependentBlocks(size_t gid) const { + return explicitIndependentBlocks(gid) > 0; + } + + CacheGroupPolicy policyForGroup(size_t gid) const { + RTP_LLM_CHECK_WITH_INFO(gid < groups.size(), "CacheConfig::policyForGroup invalid gid=%zu size=%zu", gid, groups.size()); + return groups[gid].policy; + } + + int groupIdForLayerTag(int layer_id, const std::string& tag) const { + RTP_LLM_CHECK_WITH_INFO(layer_id >= 0 && static_cast(layer_id) < layers.size(), + "CacheConfig::groupIdForLayerTag invalid layer_id=%d size=%zu", + layer_id, + layers.size()); + const auto& tag_to_group = 
layers[static_cast(layer_id)].tag_to_gid; + const auto it = tag_to_group.find(tag); + RTP_LLM_CHECK_WITH_INFO(it != tag_to_group.end(), + "CacheConfig::groupIdForLayerTag missing tag=%s for layer_id=%d", + tag.c_str(), + layer_id); + return it->second; + } + + int groupIdFor(int layer_id) const { + RTP_LLM_CHECK_WITH_INFO(layer_id >= 0 && static_cast(layer_id) < layers.size(), + "CacheConfig::groupIdFor invalid layer_id=%d size=%zu", + layer_id, + layers.size()); + const auto& gids = layers[static_cast(layer_id)].group_ids; + RTP_LLM_CHECK_WITH_INFO(gids.size() == 1, + "CacheConfig::groupIdFor requires exactly one cache tag for layer_id=%d, got %zu", + layer_id, + gids.size()); + return gids.front(); + } + + const std::vector& groupIdsForLayer(int layer_id) const { + RTP_LLM_CHECK_WITH_INFO(layer_id >= 0 && static_cast(layer_id) < layers.size(), + "CacheConfig::groupIdsForLayer invalid layer_id=%d size=%zu", + layer_id, + layers.size()); + const auto& gids = layers[static_cast(layer_id)].group_ids; + RTP_LLM_CHECK_WITH_INFO(!gids.empty(), "CacheConfig::groupIdsForLayer missing layer_id=%d", layer_id); + return gids; + } + + static CacheGroupPolicy cacheGroupPolicyForSpec(const KVCacheSpecPtr& spec, CacheGroupType group_type) { + CacheGroupPolicy policy = defaultCacheGroupPolicy(group_type); + if (spec && spec->is_state_cache) { + policy.evict_policy = CacheEvictPolicy::INDEPENDENT; + } + if (spec && spec->skip_prefix_reuse) { + policy.reuse_policy = CacheReusePolicy::NON_REUSABLE; + policy.active_tail_blocks = 1; + policy.validate_tail_blocks = false; + } + return policy; + } + + static bool samePolicy(const CacheGroupPolicy& lhs, const CacheGroupPolicy& rhs) { + return lhs.reuse_policy == rhs.reuse_policy && lhs.evict_policy == rhs.evict_policy + && lhs.active_tail_blocks == rhs.active_tail_blocks + && lhs.validate_tail_blocks == rhs.validate_tail_blocks + && lhs.explicit_block_num == rhs.explicit_block_num + && lhs.reserve_from_paged_budget == 
rhs.reserve_from_paged_budget + && lhs.prefix_reusable == rhs.prefix_reusable + && lhs.uses_pinned_cpu_backing == rhs.uses_pinned_cpu_backing + && lhs.is_cp_shardable == rhs.is_cp_shardable + && lhs.has_sparse_slots == rhs.has_sparse_slots + && lhs.has_kernel_block_subdiv == rhs.has_kernel_block_subdiv + && lhs.cp_compact_tail_blocks == rhs.cp_compact_tail_blocks + && lhs.is_reservable == rhs.is_reservable + && lhs.group_type == rhs.group_type; + } + + void setTopology(std::vector new_groups, std::vector new_layers) { + RTP_LLM_CHECK_WITH_INFO(!new_groups.empty(), "CacheConfig::setTopology requires at least one cache group"); + RTP_LLM_CHECK_WITH_INFO(layer_num > 0, "CacheConfig::setTopology requires positive layer_num"); + RTP_LLM_CHECK_WITH_INFO(new_layers.size() == static_cast(layer_num), + "CacheConfig::setTopology layer count %zu != layer_num %u", + new_layers.size(), + layer_num); + + std::unordered_map new_tag_to_gid; + for (size_t gid = 0; gid < new_groups.size(); ++gid) { + auto& group = new_groups[gid]; + RTP_LLM_CHECK_WITH_INFO(group.spec != nullptr, "CacheConfig::setTopology got null spec at group %zu", gid); + RTP_LLM_CHECK_WITH_INFO(!group.spec->tag.empty(), + "CacheConfig::setTopology requires non-empty tag for group %zu", + gid); + new_tag_to_gid.emplace(group.spec->tag, static_cast(gid)); + group.spec = group.spec->clone(); + group.spec->layers = group.layer_ids; + } + + std::vector> group_has_layer( + new_groups.size(), std::vector(static_cast(layer_num), false)); + for (size_t gid = 0; gid < new_groups.size(); ++gid) { + for (int layer_id : new_groups[gid].layer_ids) { + RTP_LLM_CHECK_WITH_INFO(layer_id >= 0 && static_cast(layer_id) < new_layers.size(), + "CacheConfig::setTopology tag=%s has invalid layer id %d for layer_num=%u", + new_groups[gid].spec->tag.c_str(), + layer_id, + layer_num); + const auto layer_index = static_cast(layer_id); + RTP_LLM_CHECK_WITH_INFO(!group_has_layer[gid][layer_index], + "CacheConfig::setTopology tag=%s has 
duplicate layer id %d", + new_groups[gid].spec->tag.c_str(), + layer_id); + group_has_layer[gid][layer_index] = true; + } + } + + for (size_t layer_id = 0; layer_id < new_layers.size(); ++layer_id) { + auto& layer = new_layers[layer_id]; + RTP_LLM_CHECK_WITH_INFO(!layer.group_ids.empty(), + "CacheConfig::setTopology missing group mapping for layer %zu", + layer_id); + std::map seen_gids; + for (int gid : layer.group_ids) { + RTP_LLM_CHECK_WITH_INFO(gid >= 0 && static_cast(gid) < new_groups.size(), + "CacheConfig::setTopology layer %zu has invalid gid %d", + layer_id, + gid); + RTP_LLM_CHECK_WITH_INFO(seen_gids.emplace(gid, true).second, + "CacheConfig::setTopology layer %zu has duplicate gid %d", + layer_id, + gid); + RTP_LLM_CHECK_WITH_INFO(group_has_layer[static_cast(gid)][layer_id], + "CacheConfig::setTopology layer %zu gid %d is missing reverse group layer id", + layer_id, + gid); + } + + for (const auto& [tag, gid] : layer.tag_to_gid) { + RTP_LLM_CHECK_WITH_INFO(gid >= 0 && static_cast(gid) < new_groups.size(), + "CacheConfig::setTopology layer %zu tag=%s has invalid gid %d", + layer_id, + tag.c_str(), + gid); + RTP_LLM_CHECK_WITH_INFO(tag == new_groups[static_cast(gid)].spec->tag, + "CacheConfig::setTopology layer %zu tag=%s does not match gid %d tag=%s", + layer_id, + tag.c_str(), + gid, + new_groups[static_cast(gid)].spec->tag.c_str()); + RTP_LLM_CHECK_WITH_INFO(std::find(layer.group_ids.begin(), layer.group_ids.end(), gid) + != layer.group_ids.end(), + "CacheConfig::setTopology layer %zu tag=%s gid %d is not in layer groups", + layer_id, + tag.c_str(), + gid); + } + } + + groups = std::move(new_groups); + layers = std::move(new_layers); + tag_to_gid = std::move(new_tag_to_gid); + group_block_layout_initialized = false; + } + + void fromGroupedSpecs(const std::vector& specs, + const std::vector>& layers_by_group, + const std::vector& types, + const std::vector& tags = {}) { + const size_t group_num = specs.size(); + RTP_LLM_CHECK_WITH_INFO(group_num > 0, 
"CacheConfig::fromGroupedSpecs requires at least one cache spec"); + RTP_LLM_CHECK_WITH_INFO(layers_by_group.size() == group_num, + "CacheConfig::fromGroupedSpecs layer group count %zu != spec count %zu", + layers_by_group.size(), + group_num); + RTP_LLM_CHECK_WITH_INFO(types.size() == group_num, + "CacheConfig::fromGroupedSpecs group type count %zu != spec count %zu", + types.size(), + group_num); + RTP_LLM_CHECK_WITH_INFO(tags.empty() || tags.size() == group_num, + "CacheConfig::fromGroupedSpecs tag count %zu != spec count %zu", + tags.size(), + group_num); + RTP_LLM_CHECK_WITH_INFO(layer_num > 0, "CacheConfig::fromGroupedSpecs requires positive layer_num"); + + std::vector new_groups; + std::vector new_layers(static_cast(layer_num)); + new_groups.reserve(group_num); + + for (size_t gid = 0; gid < group_num; ++gid) { + const auto& spec = specs[gid]; + RTP_LLM_CHECK_WITH_INFO(spec != nullptr, "CacheConfig::fromGroupedSpecs got null spec at group %zu", gid); + std::string tag = tags.empty() ? 
spec->tag : tags[gid]; + if (tag.empty() && group_num == 1) { + tag = "default"; + } + RTP_LLM_CHECK_WITH_INFO(!tag.empty(), + "CacheConfig::fromGroupedSpecs requires non-empty tag for cache spec %zu", + gid); + auto stored_spec = spec->clone(); + stored_spec->tag = tag; + + GroupBase group; + group.spec = stored_spec; + group.policy = cacheGroupPolicyForSpec(stored_spec, types[gid]); + group.layer_ids = layers_by_group[gid]; + new_groups.push_back(group); + + for (int layer_id : layers_by_group[gid]) { + RTP_LLM_CHECK_WITH_INFO(layer_id >= 0 && static_cast(layer_id) < new_layers.size(), + "CacheConfig::fromGroupedSpecs tag=%s has invalid layer id %d for layer_num=%u", + tag.c_str(), + layer_id, + layer_num); + auto& layer = new_layers[static_cast(layer_id)]; + layer.group_ids.push_back(static_cast(gid)); + const auto [it, inserted] = layer.tag_to_gid.emplace(tag, static_cast(gid)); + RTP_LLM_CHECK_WITH_INFO(inserted || it->second == static_cast(gid), + "CacheConfig::fromGroupedSpecs layer %d tag %s maps to both group %d and %zu", + layer_id, + tag.c_str(), + inserted ? static_cast(gid) : it->second, + gid); + } + } + + setTopology(std::move(new_groups), std::move(new_layers)); + } + + void finalizeBlockNums(uint32_t global_block_num, const RuntimeConfig& runtime_config) { + (void)runtime_config; + if (!use_independent_block_pools || !group_block_layout_initialized || groups.empty()) { + explicitly_sized_pool_reserve_bytes = 0; + return; + } + + size_t reserve = 0; + for (size_t gid = 0; gid < groups.size(); ++gid) { + const auto explicit_independent_blocks = explicitIndependentBlocks(gid); + const auto rule_blocks = explicit_independent_blocks > 0 ? explicit_independent_blocks : global_block_num; + groups[gid].block_num = rule_blocks; + + // Explicit independent pools are allocated outside the paged pool budget. 
+ if (explicit_independent_blocks > 0) { + reserve += static_cast(rule_blocks) * blockSizeBytesForGroup(gid); + } + } + explicitly_sized_pool_reserve_bytes = reserve; } std::string debugString(size_t indent = 0) const { @@ -107,19 +726,41 @@ struct CacheConfig { OUTPUT_FIELD(kv_scale_stride_bytes); os << "\n"; + const auto group_policies = groupPoliciesSnapshot(); + const auto group_block_nums = groupBlockNumsSnapshot(); + const auto group_layer_ids = layerGroupIdsSnapshot(); + const auto group_tags = groupTagsSnapshot(); + std::vector> layers_by_group; + layers_by_group.reserve(groups.size()); + for (const auto& group : groups) { + layers_by_group.push_back(group.layer_ids); + } + // Attention-specific configuration section os << indent1 << "# Attention Configuration:\n"; OUTPUT_FIELD(linear_step); OUTPUT_FIELD(group_layer_num); - OUTPUT_FIELD(linear_group_num); - OUTPUT_FIELD(full_group_num); + OUTPUT_FIELD_EXPR("full_group_num", + std::count_if(group_policies.begin(), group_policies.end(), + [](const CacheGroupPolicy& p) { return p.group_type == CacheGroupType::FULL; })); + OUTPUT_FIELD_EXPR("linear_group_num", + std::count_if(group_policies.begin(), group_policies.end(), + [](const CacheGroupPolicy& p) { return p.group_type == CacheGroupType::LINEAR; })); + OUTPUT_FIELD_EXPR("swa_group_num", + std::count_if(group_policies.begin(), group_policies.end(), + [](const CacheGroupPolicy& p) { return p.group_type == CacheGroupType::SWA; })); + OUTPUT_FIELD(use_independent_block_pools); + OUTPUT_FIELD(use_typed_cache_regions); + OUTPUT_FIELD(use_opaque_kv_cache_store); + OUTPUT_FIELD(disable_decode_first_malloc_device_reuse); + os << indent1 << "group_block_nums=" << rtp_llm::vectorToString(group_block_nums) << "\n"; os << "\n"; // Cache specification section os << indent1 << "# Cache Specifications:\n"; - OUTPUT_FIELD_EXPR("cache_specs.size()", cache_specs.size()); - for (size_t i = 0; i < cache_specs.size(); ++i) { - const auto& spec = cache_specs[i]; + 
OUTPUT_FIELD_EXPR("cache_specs.size()", groups.size()); + for (size_t i = 0; i < groups.size(); ++i) { + const auto& spec = groups[i].spec; if (!spec) { os << indent1 << "cache_specs[" << i << "]=null\n"; continue; @@ -133,28 +774,28 @@ struct CacheConfig { // Layer mapping section os << indent1 << "# Layer Mapping:\n"; - OUTPUT_FIELD_EXPR("global_layer_ids.size()", global_layer_ids.size()); - os << indent1 << "global_layer_ids=" << rtp_llm::vectorsToString(global_layer_ids) << "\n"; - OUTPUT_FIELD_EXPR("layer_ids.size()", layer_ids.size()); - os << indent1 << "layer_ids=" << rtp_llm::vectorsToString(layer_ids) << "\n"; - OUTPUT_FIELD_EXPR("group_types.size()", group_types.size()); + OUTPUT_FIELD_EXPR("layer_ids.size()", layers_by_group.size()); + os << indent1 << "layer_ids=" << rtp_llm::vectorsToString(layers_by_group) << "\n"; + OUTPUT_FIELD_EXPR("group_types.size()", group_policies.size()); os << indent1 << "group_types=["; - for (size_t i = 0; i < group_types.size(); ++i) { - os << static_cast(group_types[i]); - if (i + 1 < group_types.size()) { + for (size_t i = 0; i < group_policies.size(); ++i) { + os << static_cast(group_policies[i].group_type); + if (i + 1 < group_policies.size()) { os << ","; } } os << "]\n"; - OUTPUT_FIELD_EXPR("layer_attn_types.size()", layer_attn_types.size()); - os << indent1 << "layer_attn_types=["; - for (size_t i = 0; i < layer_attn_types.size(); ++i) { - os << static_cast(layer_attn_types[i]); - if (i + 1 < layer_attn_types.size()) { + OUTPUT_FIELD_EXPR("group_tags.size()", group_tags.size()); + os << indent1 << "group_tags=["; + for (size_t i = 0; i < group_tags.size(); ++i) { + os << group_tags[i]; + if (i + 1 < group_tags.size()) { os << ","; } } os << "]\n"; + OUTPUT_FIELD_EXPR("layer_to_group_ids.size()", group_layer_ids.size()); + os << indent1 << "layer_to_group_ids=" << rtp_llm::vectorsToString(group_layer_ids) << "\n"; os << "\n"; // mtp configurations section diff --git a/rtp_llm/cpp/cache/CacheConfigCreator.cc 
b/rtp_llm/cpp/cache/CacheConfigCreator.cc deleted file mode 100644 index bd3985dbfb..0000000000 --- a/rtp_llm/cpp/cache/CacheConfigCreator.cc +++ /dev/null @@ -1,225 +0,0 @@ -#include "rtp_llm/cpp/cache/CacheConfigCreator.h" - -#include - -#include "rtp_llm/cpp/cache/HybridConfigCreator.h" -#include "rtp_llm/cpp/cache/MemoryEvaluationHelper.h" -#include "rtp_llm/cpp/cache/SingleConfigCreator.h" -#include "rtp_llm/cpp/utils/Logger.h" -#include "rtp_llm/cpp/utils/AssertUtils.h" - -namespace rtp_llm { - -CacheConfig CacheConfigCreator::createBasicConfig(const ModelConfig& model_config, - const ParallelismConfig& parallelism_config, - bool is_mtp) { - if (model_config.hybrid_attention_config.enable_hybrid_attention) { - return HybridConfigCreator::createHybridConfig(model_config, parallelism_config, is_mtp); - } else { - return SingleConfigCreator::createSingleConfig(model_config, parallelism_config, is_mtp); - } -} - -CacheConfig CacheConfigCreator::createConfig(const ModelConfig& model_config, - const ParallelismConfig& parallelism_config, - const RuntimeConfig& runtime_config, - const KVCacheConfig& kv_cache_config, - const std::optional& warm_up_result, - const std::optional& sp_config) { - CacheConfig config = CacheConfigCreator::createBasicConfig(model_config, parallelism_config); - uint32_t block_num = 0; - - config.linear_step = kv_cache_config.linear_step; - if (kv_cache_config.kernel_seq_size_per_block > 0) { - RTP_LLM_CHECK_WITH_INFO(kv_cache_config.seq_size_per_block % kv_cache_config.kernel_seq_size_per_block == 0, - "seq_size_per_block(%d) must be divisible by kernel_seq_size_per_block(%d)", - kv_cache_config.seq_size_per_block, - kv_cache_config.kernel_seq_size_per_block); - config.kernel_seq_size_per_block = static_cast(kv_cache_config.kernel_seq_size_per_block); - } else { - // Default: kernel block size == physical block size (no split). 
- config.kernel_seq_size_per_block = config.seq_size_per_block; - } - - if (kv_cache_config.test_block_num > 0) { - RTP_LLM_LOG_INFO("KVCacheConfig explicitly specified kv cache block num %d", kv_cache_config.test_block_num); - block_num = kv_cache_config.test_block_num; - } else { - const auto kv_cache_mem_size = MemoryEvaluationHelper::getKVCacheMemorySize( - runtime_config, kv_cache_config, model_config, parallelism_config, warm_up_result, sp_config); - block_num = kv_cache_mem_size / config.block_size_bytes; - } - RTP_LLM_CHECK_WITH_INFO(block_num > 0, - "kv cache needs at least 1 block but %ld, each block needs %ld MiB memory", - block_num, - static_cast(config.block_size_bytes / 1024 / 1024)); - - const auto kv_cache_seq_len = static_cast(block_num) * config.seq_size_per_block; - config.block_num = static_cast(block_num); - RTP_LLM_LOG_INFO("kv cache block nums is %u, allows storing %ld tokens", block_num, kv_cache_seq_len); - if (kv_cache_seq_len < model_config.max_seq_len) { - RTP_LLM_LOG_WARNING("kv cache block nums %u can only store %ld tokens, less than max_seq_len %ld, " - "this is dangerous, consider decrease max_seq_len", - block_num, - kv_cache_seq_len, - model_config.max_seq_len); - } - return config; -} - -CacheConfig CacheConfigCreator::createSpConfig(const ModelConfig& score_model_config, - const ModelConfig& propose_model_config, - const ParallelismConfig& parallelism_config, - const RuntimeConfig& runtime_config, - const KVCacheConfig& kv_cache_config, - const SpeculativeExecutionConfig& sp_config, - const std::optional& warm_up_result, - bool is_mtp, - bool is_eagle) { - CacheConfig score_config = CacheConfigCreator::createBasicConfig(score_model_config, parallelism_config, false); - CacheConfig propose_config = - CacheConfigCreator::createBasicConfig(propose_model_config, parallelism_config, is_mtp); - - if (kv_cache_config.kernel_seq_size_per_block > 0) { - const size_t kernel_seq_size_per_block = 
static_cast(kv_cache_config.kernel_seq_size_per_block); - RTP_LLM_CHECK_WITH_INFO(score_config.seq_size_per_block % kernel_seq_size_per_block == 0, - "score seq_size_per_block(%zu) must be divisible by kernel_seq_size_per_block(%zu)", - score_config.seq_size_per_block, - kernel_seq_size_per_block); - RTP_LLM_CHECK_WITH_INFO(propose_config.seq_size_per_block % kernel_seq_size_per_block == 0, - "propose seq_size_per_block(%zu) must be divisible by kernel_seq_size_per_block(%zu)", - propose_config.seq_size_per_block, - kernel_seq_size_per_block); - score_config.kernel_seq_size_per_block = kernel_seq_size_per_block; - propose_config.kernel_seq_size_per_block = kernel_seq_size_per_block; - } else { - // Default: kernel block size == physical block size (no split). - score_config.kernel_seq_size_per_block = score_config.seq_size_per_block; - propose_config.kernel_seq_size_per_block = propose_config.seq_size_per_block; - } - - int num_mtp_modules = 1; - if (is_mtp) { - num_mtp_modules = sp_config.gen_num_per_cycle; - if (is_eagle) { - num_mtp_modules = 1; - } - } - - uint32_t total_layer_num = score_config.layer_num; - for (int i = 0; i < num_mtp_modules; ++i) { - total_layer_num += propose_config.layer_num; - } - - size_t total_block_size_bytes = score_config.block_size_bytes; - for (int i = 0; i < num_mtp_modules; ++i) { - total_block_size_bytes += propose_config.block_size_bytes; - } - - size_t block_num = 0; - if (kv_cache_config.test_block_num > 0) { - block_num = kv_cache_config.test_block_num; - } else { - const auto kv_cache_mem_size = MemoryEvaluationHelper::getKVCacheMemorySize( - runtime_config, kv_cache_config, score_model_config, parallelism_config, warm_up_result, sp_config); - - block_num = kv_cache_mem_size - / (static_cast(score_config.block_size_bytes) - + static_cast(propose_config.block_size_bytes) * static_cast(num_mtp_modules)); - } - - RTP_LLM_CHECK_WITH_INFO(block_num > 0, "kv cache needs at least 1 block but %zu", block_num); - - CacheConfig 
config = score_config; - config.linear_step = std::max(1, kv_cache_config.linear_step); - config.layer_all_num = total_layer_num; - config.block_size_bytes = total_block_size_bytes; - // config.block_size = config.block_size_bytes / rtp_llm::getTypeSize(config.dtype); - config.block_num = block_num; - - const uint32_t main_layer_num = score_config.layer_num; - const uint32_t mtp_layer_num = propose_config.layer_num; - - size_t full_gid = 0; - if (config.group_types.size() > 1) { - for (size_t gid = 0; gid < config.group_types.size(); ++gid) { - if (config.group_types[gid] == CacheGroupType::FULL) { - full_gid = gid; - break; - } - } - } - - // Each sub-model needs an independent CacheConfig because global_layer_ids differs per module. - config.mtp_sub_configs.clear(); - config.mtp_sub_configs.reserve(num_mtp_modules); - config.layer_to_group_id.resize(total_layer_num, 0); - config.layer_attn_types.resize(total_layer_num, CacheGroupType::FULL); - config.layer_to_block_stride_bytes.assign(static_cast(total_layer_num), 0); - - // Main(score) model per-layer stride (kv + scale). - // This is expected to be fully populated by createBasicConfig() (Single/Hybrid creators). 
- const size_t score_layers = static_cast(main_layer_num); - RTP_LLM_CHECK_WITH_INFO(score_config.layer_to_block_stride_bytes.size() == score_layers, - "score_config.layer_to_block_stride_bytes size mismatch, got=%zu need=%zu", - score_config.layer_to_block_stride_bytes.size(), - score_layers); - for (size_t l = 0; l < score_layers; ++l) { - config.layer_to_block_stride_bytes[l] = score_config.layer_to_block_stride_bytes[l]; - if (l < score_config.layer_attn_types.size()) { - config.layer_attn_types[l] = score_config.layer_attn_types[l]; - } - } - - for (int m = 0; m < num_mtp_modules; ++m) { - auto sub_cfg = std::make_shared(propose_config); - sub_cfg->block_num = block_num; - sub_cfg->layer_all_num = sub_cfg->layer_num; - - sub_cfg->global_layer_ids.clear(); - sub_cfg->global_layer_ids.resize(1); - sub_cfg->global_layer_ids[0].resize(mtp_layer_num); - RTP_LLM_CHECK_WITH_INFO(sub_cfg->layer_to_block_stride_bytes.size() == static_cast(mtp_layer_num), - "sub_cfg.layer_to_block_stride_bytes size mismatch, got=%zu need=%u", - sub_cfg->layer_to_block_stride_bytes.size(), - mtp_layer_num); - for (size_t l = 0; l < mtp_layer_num; ++l) { - int global_layer_id = main_layer_num + m * mtp_layer_num + l; - sub_cfg->global_layer_ids[0][l] = global_layer_id; - config.layer_to_group_id[global_layer_id] = static_cast(full_gid); - config.global_layer_ids[full_gid].push_back(global_layer_id); - - const int stride_bytes = sub_cfg->layer_to_block_stride_bytes[static_cast(l)]; - config.layer_to_block_stride_bytes[static_cast(global_layer_id)] = stride_bytes; - if (l < sub_cfg->layer_attn_types.size()) { - config.layer_attn_types[static_cast(global_layer_id)] = sub_cfg->layer_attn_types[l]; - } - } - - sub_cfg->layer_to_group_id.assign(static_cast(sub_cfg->layer_num), static_cast(full_gid)); - config.mtp_sub_configs.push_back(sub_cfg); - } - - const auto kv_cache_seq_len = static_cast(block_num) * config.seq_size_per_block; - RTP_LLM_LOG_INFO("CacheConfig created: is_mtp=%d, 
total_layers=%u, num_mtp_modules=%d, block_num=%zu, " - "allows storing %zu tokens, total_block_size=%zu bytes (main=%zu + %d*propose=%zu)", - is_mtp, - total_layer_num, - num_mtp_modules, - block_num, - kv_cache_seq_len, - total_block_size_bytes, - score_config.block_size_bytes, - num_mtp_modules, - propose_config.block_size_bytes); - - RTP_LLM_LOG_INFO("CacheConfig debugString(main_score_model):\n%s", score_config.debugString().c_str()); - for (size_t i = 0; i < config.mtp_sub_configs.size(); ++i) { - const auto& sub = config.mtp_sub_configs[i]; - RTP_LLM_LOG_INFO("CacheConfig debugString(sub_propose_model[%zu]):\n%s", i, sub->debugString().c_str()); - } - - return config; -} - -} // namespace rtp_llm diff --git a/rtp_llm/cpp/cache/CacheGroupType.h b/rtp_llm/cpp/cache/CacheGroupType.h deleted file mode 100644 index aae75d2b5d..0000000000 --- a/rtp_llm/cpp/cache/CacheGroupType.h +++ /dev/null @@ -1,15 +0,0 @@ -#pragma once - -#include - -namespace rtp_llm { - -// Cache group type for hybrid KV-cache: -// - LINEAR: linear attention group (only last block is needed for cache-store transfer) -// - FULL: full attention group (all blocks are needed for cache-store transfer) -enum class CacheGroupType : int8_t { - LINEAR = 0, - FULL = 1, -}; - -} // namespace rtp_llm diff --git a/rtp_llm/cpp/cache/HybridConfigCreator.cc b/rtp_llm/cpp/cache/HybridConfigCreator.cc deleted file mode 100644 index c9a3306fc5..0000000000 --- a/rtp_llm/cpp/cache/HybridConfigCreator.cc +++ /dev/null @@ -1,238 +0,0 @@ -#include "rtp_llm/cpp/cache/HybridConfigCreator.h" - -#include - -#include "rtp_llm/cpp/cache/KVCacheSpec.h" -#include "rtp_llm/cpp/cache/MemoryEvaluationHelper.h" - -namespace rtp_llm { - -std::vector> HybridConfigCreator::splitIntoGroups(const std::vector& ids, int group_layer_num) { - std::vector> groups; - if (ids.empty()) { - return groups; - } - const int n = static_cast(ids.size()); - const int s = std::max(group_layer_num, 1); - groups.reserve((n + s - 1) / s); - for (int 
i = 0; i < n; i += s) { - const int end = std::min(i + s, n); - groups.emplace_back(ids.begin() + i, ids.begin() + end); - } - return groups; -} - -int HybridConfigCreator::calculateGroupLayerNum(int linear_layer_count, int full_layer_count) { - // All full attention layers must reside in one cache group (full_group_num <= 1). - // prepare_fmha_impl binds the block table of group 0 once; it is not re-bound per layer. - // group_layer_num must be >= full_layer_count to satisfy this. - // When gcd is already sufficient it works directly; the fallback handles all other cases - // (coprime gcd==1, or gcd>1 but still smaller than full_layer_count). - int group_layer_num = 0; - if (linear_layer_count > 0 && full_layer_count > 0) { - group_layer_num = std::gcd(linear_layer_count, full_layer_count); - // Fallback: when gcd < full_layer_count, force group_layer_num = full_layer_count - // to guarantee all full layers fit in one group. - // e.g. Kimi Linear 20:7 -> gcd=1 < 7 -> group_layer_num=7, linear groups=[7,7,6], - // last group wastes 1 layer slot per block, negligible. 
- if (group_layer_num < full_layer_count) { - group_layer_num = full_layer_count; - } - } else { - group_layer_num = std::max(linear_layer_count, full_layer_count); - } - group_layer_num = std::max(group_layer_num, 1); - return group_layer_num; -} - -std::pair, std::vector> -HybridConfigCreator::splitLayersByAttentionType(const ModelConfig& model_config) { - int64_t layer_num = model_config.num_layers; - RTP_LLM_CHECK_WITH_INFO(layer_num > 0, "invalid model_config.num_layers=%ld", layer_num); - - std::vector linear_layers; - std::vector full_layers; - linear_layers.reserve(layer_num); - full_layers.reserve(layer_num); - - const auto& types = model_config.hybrid_attention_config.hybrid_attention_types; - for (int i = 0; i < static_cast(layer_num); ++i) { - if (types[static_cast(i)] == HybridAttentionType::LINEAR) { - linear_layers.push_back(i); - } else { - full_layers.push_back(i); - } - } - - return std::make_pair(std::move(linear_layers), std::move(full_layers)); -} - -CacheConfig HybridConfigCreator::initializeConfig(const ModelConfig& model_config, - const std::vector& linear_layers, - const std::vector& full_layers, - rtp_llm::DataType dtype) { - int64_t layer_num = model_config.num_layers; - - CacheConfig config; - config.layer_num = static_cast(layer_num); - config.layer_all_num = static_cast(layer_num); - config.block_num = 0; - config.seq_size_per_block = static_cast(model_config.attn_config.tokens_per_block); - config.use_mla = model_config.attn_config.use_mla; - config.dtype = dtype; - config.linear_step = 1; - - config.global_layer_ids.push_back(linear_layers); - config.global_layer_ids.push_back(full_layers); - config.layer_ids.push_back(linear_layers); - config.layer_ids.push_back(full_layers); - - return config; -} - -KVCacheSpecPtr HybridConfigCreator::createFullAttentionSpec(const ModelConfig& model_config, - const ParallelismConfig& parallelism_config, - rtp_llm::DataType dtype) { - KVCacheSpecPtr full_spec; - if (model_config.attn_config.use_mla 
&& model_config.mla_ops_type != rtp_llm::MlaOpsType::MHA) { - full_spec = std::make_shared(model_config.attn_config, parallelism_config); - } else { - full_spec = std::make_shared(model_config.attn_config, parallelism_config); - } - full_spec->dtype = dtype; - return full_spec; -} - -KVCacheSpecPtr HybridConfigCreator::createLinearAttentionSpec(const ModelConfig& model_config, - const ParallelismConfig& parallelism_config, - rtp_llm::DataType dtype) { - auto linear_spec = std::make_shared( - model_config.attn_config, parallelism_config, model_config.linear_attention_config); - linear_spec->dtype = dtype; - return linear_spec; -} - -std::pair>, std::vector>> HybridConfigCreator::createLayerGroups( - const std::vector& linear_layers, const std::vector& full_layers, int& group_layer_num) { - const int linear_cnt = static_cast(linear_layers.size()); - const int full_cnt = static_cast(full_layers.size()); - group_layer_num = HybridConfigCreator::calculateGroupLayerNum(linear_cnt, full_cnt); - - const auto linear_groups = HybridConfigCreator::splitIntoGroups(linear_layers, group_layer_num); - const auto full_groups = HybridConfigCreator::splitIntoGroups(full_layers, group_layer_num); - - return std::make_pair(std::move(linear_groups), std::move(full_groups)); -} - -void HybridConfigCreator::setupCacheConfigSpecs(CacheConfig& config, - const std::vector>& linear_groups, - const std::vector>& full_groups, - const KVCacheSpecPtr& linear_spec, - const KVCacheSpecPtr& full_spec) { - config.global_layer_ids.clear(); - config.layer_ids.clear(); - config.cache_specs.clear(); - config.group_types.clear(); - - // Keep order: all full groups first, then linear groups. 
- for (const auto& g : full_groups) { - config.global_layer_ids.push_back(g); - config.layer_ids.push_back(g); - config.cache_specs.push_back(full_spec); - config.group_types.push_back(CacheGroupType::FULL); - } - for (const auto& g : linear_groups) { - config.global_layer_ids.push_back(g); - config.layer_ids.push_back(g); - config.cache_specs.push_back(linear_spec); - config.group_types.push_back(CacheGroupType::LINEAR); - } - config.linear_group_num = static_cast(linear_groups.size()); - config.full_group_num = static_cast(full_groups.size()); -} - -void HybridConfigCreator::setupPhysicalSizes(CacheConfig& config, - const KVCacheSpecPtr& full_spec, - const KVCacheSpecPtr& linear_spec) { - // Decide the physical KV block/scale sizes by taking max between full and linear specs. - const size_t full_kv_block_stride_bytes = full_spec->block_size_bytes(); - const size_t linear_kv_block_stride_bytes = linear_spec->block_size_bytes(); - - // now we only support that linear attention block have padding - RTP_LLM_CHECK_WITH_INFO(full_kv_block_stride_bytes >= linear_kv_block_stride_bytes, - "not support full attention with padding now"); - - config.kv_block_stride_bytes = full_kv_block_stride_bytes; - config.kv_block_size_bytes = static_cast(config.group_layer_num) * config.kv_block_stride_bytes; - config.kv_scale_stride_bytes = full_spec->scale_block_size_bytes(); - config.kv_scale_size_bytes = static_cast(config.group_layer_num) * config.kv_scale_stride_bytes; - config.block_size_bytes = config.kv_block_size_bytes + config.kv_scale_size_bytes; -} - -void HybridConfigCreator::setupLayerToGroupMapping(CacheConfig& config) { - config.layer_to_group_id.assign(config.layer_num, 0); - for (size_t gid = 0; gid < config.layer_ids.size(); ++gid) { - for (int layer_id : config.layer_ids[gid]) { - if (layer_id >= 0 && static_cast(layer_id) < config.layer_num) { - config.layer_to_group_id[static_cast(layer_id)] = static_cast(gid); - } - } - } -} - -CacheConfig 
HybridConfigCreator::createHybridConfig(const ModelConfig& model_config, - const ParallelismConfig& parallelism_config, - bool is_mtp) { - auto dtype = MemoryEvaluationHelper::getDataTypeForCache(model_config); - - // Split layers by attention type - auto [linear_layers, full_layers] = HybridConfigCreator::splitLayersByAttentionType(model_config); - - // Initialize config - CacheConfig config = HybridConfigCreator::initializeConfig(model_config, linear_layers, full_layers, dtype); - - // Create attention specs - auto full_spec = HybridConfigCreator::createFullAttentionSpec(model_config, parallelism_config, dtype); - auto linear_spec = HybridConfigCreator::createLinearAttentionSpec(model_config, parallelism_config, dtype); - - // Create layer groups and calculate group layer number - int group_layer_num = 0; - auto [linear_groups, full_groups] = - HybridConfigCreator::createLayerGroups(linear_layers, full_layers, group_layer_num); - config.group_layer_num = group_layer_num; - - // Setup cache config specs - HybridConfigCreator::setupCacheConfigSpecs(config, linear_groups, full_groups, linear_spec, full_spec); - - // Hard check: current only supports a single full attention group. - RTP_LLM_CHECK_WITH_INFO( - config.full_group_num <= 1, - "Multiple full attention groups (%d) are not supported in hybrid mode. " - "prepare_fmha_impl is called once before the layer loop, binding the block table from group 0. 
" - "To support multiple full groups, implement per-group fmha preparation.", - config.full_group_num); - - // Setup physical sizes - HybridConfigCreator::setupPhysicalSizes(config, full_spec, linear_spec); - - // Setup layer to group mapping - HybridConfigCreator::setupLayerToGroupMapping(config); - - config.layer_attn_types.assign(config.layer_num, CacheGroupType::FULL); - for (size_t layer_id = 0; layer_id < config.layer_to_group_id.size(); ++layer_id) { - const int gid = config.layer_to_group_id[layer_id]; - if (gid >= 0 && static_cast(gid) < config.group_types.size()) { - config.layer_attn_types[layer_id] = config.group_types[static_cast(gid)]; - } - } - - // Per-layer block stride (kv + scale). - // For hybrid attention, the physical per-layer stride follows the selected physical layout stride. - const size_t per_layer_stride_bytes = config.kv_block_stride_bytes + config.kv_scale_stride_bytes; - config.layer_to_block_stride_bytes.assign(static_cast(config.layer_all_num), - static_cast(per_layer_stride_bytes)); - - return config; -} - -} // namespace rtp_llm \ No newline at end of file diff --git a/rtp_llm/cpp/cache/HybridTypeKVCacheAllocator.cc b/rtp_llm/cpp/cache/HybridTypeKVCacheAllocator.cc deleted file mode 100644 index 3423a3ca64..0000000000 --- a/rtp_llm/cpp/cache/HybridTypeKVCacheAllocator.cc +++ /dev/null @@ -1,536 +0,0 @@ -#include "rtp_llm/cpp/cache/HybridTypeKVCacheAllocator.h" - -#include -#include -#include -#include -#include - -#include "rtp_llm/cpp/cache/BlockPoolConfigHelper.h" -#include "rtp_llm/cpp/utils/Logger.h" -#include "rtp_llm/cpp/utils/TimeUtil.h" -#include "rtp_llm/cpp/engine_base/stream/CompleteTokenIds.h" - -namespace rtp_llm { -HybridTypeKVCacheAllocator::HybridTypeKVCacheAllocator(const CacheConfig& config, - AllocationType allocation_type, - const kmonitor::MetricsReporterPtr metrics_reporter, - int64_t reserve_block_ratio): - KVCacheAllocator(config, allocation_type, metrics_reporter, reserve_block_ratio) {} - -bool 
HybridTypeKVCacheAllocator::doInit() { - RTP_LLM_CHECK_WITH_INFO(!config_.cache_specs.empty(), "no cache_specs found in CacheConfig"); - - auto pool_config = BlockPoolConfigHelper::createConfig(config_); - block_pool_ = std::make_shared(pool_config, allocation_type_); - RTP_LLM_CHECK_WITH_INFO(block_pool_->init(), "Failed to initialize block pool for HybridTypeKVCacheAllocator"); - - const auto& layer_groups = config_.global_layer_ids; - const int group_nums = static_cast(layer_groups.size()); - kv_cache_groups_.reserve(group_nums); - - // global layer id -> group id mapping (for address lookup APIs) - layer_to_group_id_ = config_.layer_to_group_id; - - for (int gid = 0; gid < group_nums; ++gid) { - KVCacheSpecPtr spec = config_.cache_specs[static_cast(gid)]; - const auto& ids = layer_groups[static_cast(gid)]; - - KVCacheGroupPtr group; - if (spec && spec->type == KVCacheSpecType::LinearAttention) { - group = std::make_shared(ids, spec, block_pool_, gid, config_.linear_step); - linear_group_ids_.push_back(gid); - } else { - group = std::make_shared(ids, spec, block_pool_, gid); - full_group_ids_.push_back(gid); - } - - RTP_LLM_CHECK_WITH_INFO(group->init(), "Failed to initialize KVCacheGroup gid %d", gid); - kv_cache_groups_.push_back(group); - } - - global_layer_to_local_id_.assign(static_cast(config_.layer_all_num), -1); - for (const auto& cur_group_layers : layer_groups) { - for (size_t local_layer_idx = 0; local_layer_idx < cur_group_layers.size(); ++local_layer_idx) { - const int global_layer_idx = cur_group_layers[local_layer_idx]; - if (global_layer_idx >= 0 && static_cast(global_layer_idx) < global_layer_to_local_id_.size()) { - global_layer_to_local_id_[static_cast(global_layer_idx)] = static_cast(local_layer_idx); - } - } - } - - RTP_LLM_LOG_INFO("HybridTypeKVCacheAllocator init success"); - return true; -} - -void HybridTypeKVCacheAllocator::referenceValidBlocks(const BlockIndicesType& blocks) const { - BlockIndicesType valid; - 
valid.reserve(blocks.size()); - for (auto b : blocks) { - if (!isNullBlockIdx(b)) { - valid.push_back(b); - } - } - if (!valid.empty()) { - block_pool_->requestReference(valid); - } -} - -int HybridTypeKVCacheAllocator::reuseCache(const CacheKeysType& cache_keys, BatchKVCacheResource& kv_resource) { - // 1) Prefix match on all full-attn groups, take the shortest prefix. - int min_full_reuse_blocks = static_cast(cache_keys.size()); - std::vector full_matched_blocks(kv_cache_groups_.size()); - - for (int gid : full_group_ids_) { - auto match_result = kv_cache_groups_[static_cast(gid)]->match(cache_keys); - min_full_reuse_blocks = std::min(min_full_reuse_blocks, static_cast(match_result.reuse_blocks)); - full_matched_blocks[static_cast(gid)] = std::move(match_result.block_indices); - } - - // 2) Right-to-left joint check for all linear groups (single-key match). - int pos = min_full_reuse_blocks - 1; - std::vector linear_tail_blocks; // per linear group - linear_tail_blocks.resize(linear_group_ids_.size(), NULL_BLOCK_IDX); - - for (; pos >= 0; --pos) { - bool all_linear_matched = true; - for (size_t i = 0; i < linear_group_ids_.size(); ++i) { - const int gid = linear_group_ids_[i]; - auto* linear_group = dynamic_cast(kv_cache_groups_[static_cast(gid)].get()); - auto result = linear_group->matchSingleKey(cache_keys[static_cast(pos)]); - if (result.block_indices.empty()) { - all_linear_matched = false; - break; - } - linear_tail_blocks[i] = result.block_indices[0]; - } - if (all_linear_matched) { - break; - } - } - - const int reuse_blocks_len = std::max(pos + 1, 0); - if (reuse_blocks_len <= 0) { - return 0; - } - - // Write matched blocks into batch 0 blocks, per group. - // NOTE: for linear groups we only reuse the tail block; other slots are set to NULL_BLOCK_IDX. 
- for (int gid = 0; gid < static_cast(kv_cache_groups_.size()); ++gid) { - kv_resource.mutableBlockIds(0, gid).assign( - BlockIndicesType(static_cast(reuse_blocks_len), NULL_BLOCK_IDX)); - } - - for (int gid : full_group_ids_) { - BlockIndicesType full_blocks = full_matched_blocks[static_cast(gid)]; - if (static_cast(full_blocks.size()) > reuse_blocks_len) { - full_blocks.resize(static_cast(reuse_blocks_len)); - } - kv_resource.mutableBlockIds(0, gid).assign(std::move(full_blocks)); - } - - for (size_t i = 0; i < linear_group_ids_.size(); ++i) { - const int gid = linear_group_ids_[i]; - kv_resource.mutableBlockIds(0, gid).setAt(static_cast(reuse_blocks_len - 1), linear_tail_blocks[i]); - } - - return reuse_blocks_len; -} - -MallocResult HybridTypeKVCacheAllocator::incrMalloc(const MallocInfo& malloc_info) { - auto& kv_resource = malloc_info.batch_kv_cache_resource; - const int batch_size = kv_resource->batchSize(); - const int seq_len = malloc_info.complete_token_ids->seqLength(); - const int reserve_step = malloc_info.complete_token_ids->getReserveStep(); - - // Record original sizes for rollback in case any subsequent allocation fails - std::vector> original_sizes(batch_size); - for (int b = 0; b < batch_size; ++b) { - original_sizes[b].resize(static_cast(kv_resource->groupNums())); - for (int gid = 0; gid < kv_resource->groupNums(); ++gid) { - original_sizes[b][static_cast(gid)] = kv_resource->blocksNum(b, gid); - } - } - - bool all_success = true; - int failed_batch = -1; - int failed_group = -1; - - for (int b = 0; b < batch_size; ++b) { - for (int gid = 0; gid < kv_resource->groupNums(); ++gid) { - auto& block_ids = kv_resource->mutableBlockIds(b, gid); - - if (!kv_cache_groups_[static_cast(gid)]->malloc( - block_ids, seq_len, malloc_info.reuse_cache, reserve_step)) { - all_success = false; - failed_batch = b; - failed_group = gid; - break; - } - } - if (!all_success) { - break; - } - } - - if (all_success) { - // Sparse cleanup is only safe for incremental 
allocation. Prefill init keeps - // reused prefix slots intact because causal_conv1d still reads them by prefix_length. - if (!malloc_info.enable_remove_skipped_blocks) { - return {true, 0}; - } - // Decode-time memory saving for linear groups (apply after we know allocations succeeded). - for (int b = 0; b < batch_size; ++b) { - for (int gid = 0; gid < kv_resource->groupNums(); ++gid) { - kv_cache_groups_[static_cast(gid)]->removeSkippedBlocks( - kv_resource->mutableBlockIds(b, gid), malloc_info.reuse_cache, reserve_step); - } - } - return {true, 0}; - } - - // rollback kvcache blocks - BlockIndicesType blocks_to_free; - - for (int b = 0; b < batch_size; ++b) { - for (int gid = 0; gid < kv_resource->groupNums(); ++gid) { - auto& block_ids = kv_resource->mutableBlockIds(b, gid); - size_t original_num = original_sizes[b][static_cast(gid)]; - if (block_ids.blocksNum() > original_num) { - const auto& blk = block_ids.blocks(); - for (size_t i = original_num; i < blk.size(); ++i) { - if (!isNullBlockIdx(blk[i])) { - blocks_to_free.push_back(blk[i]); - } - } - block_ids.resize(original_num); - } - } - if (b > failed_batch) { - break; - } - } - if (!blocks_to_free.empty()) { - // All groups share the same block pool; free directly. 
- block_pool_->requestFree(blocks_to_free); - } - RTP_LLM_LOG_WARNING("Hybrid incrMalloc failed at batch=%d group=%d", failed_batch, failed_group); - return {false, 0}; -} - -MallocResult HybridTypeKVCacheAllocator::initMallocForCommonLen(const MallocInfo& malloc_info) { - auto& kv_resource = malloc_info.batch_kv_cache_resource; - const int batch_size = kv_resource->batchSize(); - - const int seq_len = malloc_info.complete_token_ids->seqLength(); - const int common_seq_len = std::min(malloc_info.complete_token_ids->commonSeqLength(), seq_len); - - const auto& cache_keys = kv_resource->cacheKeys(0); - int64_t match_cost_time_us = 0; - const size_t reserve_blocks = reserveBlockNum(); - int reuse_blocks = 0; - - if (malloc_info.enable_device_cache) { - // Drop last key of partial block (same rationale as SingleType). - CacheKeysType match_keys(cache_keys.begin(), cache_keys.empty() ? cache_keys.end() : cache_keys.end() - 1); - auto begin_us = currentTimeUs(); - reuse_blocks = reuseCache(match_keys, *kv_resource); - match_cost_time_us = currentTimeUs() - begin_us; - - // Reference reused blocks in batch 0 (filter NULL_BLOCK_IDX). - for (int gid = 0; gid < kv_resource->groupNums(); ++gid) { - const auto& blocks = kv_resource->blocks(0, gid); - referenceValidBlocks(blocks); - } - kv_resource->cacheResource(0).setDeviceReuseBlockNum(reuse_blocks); - } - - const int need_blocks = (reserve_blocks > 0) ? getNeedBlocks(malloc_info) : 0; - // Reserve blocks check (best-effort, similar to SingleType). 
- if (reserve_blocks > 0 && need_blocks > 0) { - const size_t available_blocks = availableBlocksNum(); - if (available_blocks < static_cast(need_blocks) + reserve_blocks) { - if (malloc_info.verbose) { - RTP_LLM_LOG_INFO("Hybrid initMalloc rejected by reserve blocks: request_id=%ld " - "need_blocks=%d available_blocks=%zu " - "reserve_blocks=%zu", - malloc_info.request_id, - need_blocks, - available_blocks, - reserve_blocks); - } - return {false, 0}; - } - } - - // Allocate common blocks on batch 0. - for (int gid = 0; gid < kv_resource->groupNums(); ++gid) { - auto& block_ids_0 = kv_resource->mutableBlockIds(0, gid); - - // Common blocks are shared across batches; reserve_step is per-batch extra and will be handled in incrMalloc. - if (!kv_cache_groups_[static_cast(gid)]->malloc( - block_ids_0, common_seq_len, malloc_info.reuse_cache, 0)) { - return {false, 0}; - } - } - - // Other batches reference batch 0's common blocks. - for (int b = 1; b < batch_size; ++b) { - for (int gid = 0; gid < kv_resource->groupNums(); ++gid) { - kv_cache_groups_[static_cast(gid)]->reference(kv_resource->mutableBlockIds(b, gid), - kv_resource->blocks(0, gid)); - } - } - - return {true, reuse_blocks * seqSizePerBlock(), match_cost_time_us}; -} - -void HybridTypeKVCacheAllocator::free(const FreeInfo& free_info) { - auto& kv_cache_resource = free_info.batch_kv_cache_resource; - - if (kv_cache_resource->curBlocksNum() == 0) { - return; - } - - for (int batch_id = 0; batch_id < kv_cache_resource->batchSize(); ++batch_id) { - for (int gid = 0; gid < kv_cache_resource->groupNums(); ++gid) { - kv_cache_groups_[static_cast(gid)]->free(kv_cache_resource->blocks(batch_id, gid)); - } - } - kv_cache_resource->clearBlocks(); -} - -void HybridTypeKVCacheAllocator::insertIntoCache(const InsertInfo& insert_info) { - auto& kv_cache_resource = insert_info.batch_kv_cache_resource; - RTP_LLM_CHECK(kv_cache_resource != nullptr); - - int batch_size = kv_cache_resource->batchSize(); - int seq_size_per_block 
= seqSizePerBlock(); - - for (int batch_id = 0; batch_id < batch_size; ++batch_id) { - const auto& cache_keys = kv_cache_resource->cacheKeys(batch_id); - - auto token_ids = insert_info.complete_token_ids->completeTokenIdsVec(batch_id); - if (token_ids.size() <= 1 || cache_keys.empty()) { - continue; - } - - // Only insert full blocks. - const size_t token_len = token_ids.size() - 1; - const size_t full_blocks_num = token_len / static_cast(seq_size_per_block); - const size_t n = std::min(cache_keys.size(), full_blocks_num); - if (n == 0) { - continue; - } - - CacheKeysType put_cache_keys(cache_keys.begin(), cache_keys.begin() + n); - for (int gid = 0; gid < kv_cache_resource->groupNums(); ++gid) { - const auto& blocks = kv_cache_resource->blocks(batch_id, gid); - BlockIndicesType put_blocks; - put_blocks.reserve(n); - for (size_t i = 0; i < n && i < blocks.size(); ++i) { - put_blocks.push_back(blocks[i]); - } - kv_cache_groups_[static_cast(gid)]->insertIntoCache( - put_cache_keys, put_blocks, insert_info.is_resident); - } - } -} - -CacheLayerLayout HybridTypeKVCacheAllocator::allLayerCacheBase() const { - CacheLayerLayout layout; - const auto layer_tensors = block_pool_->allLayerCacheBase(); - const auto scale_tensors = block_pool_->allLayerScaleCacheBase(); - - layout.layer_to_groups = layer_to_group_id_; - layout.layers_to_kv_buffer_ptrs.resize(config_.layer_all_num); - layout.layers_to_scale_buffer_ptrs.resize(config_.layer_all_num); - - for (size_t layer_id = 0; layer_id < static_cast(config_.layer_all_num); ++layer_id) { - int32_t local = global_layer_to_local_id_[layer_id]; - const size_t local_idx = static_cast(local); - - if (local_idx < layer_tensors.size() && layer_tensors[local_idx].defined() - && layer_tensors[local_idx].numel() > 0) { - layout.layers_to_kv_buffer_ptrs[layer_id] = layer_tensors[local_idx]; - } - - if (!scale_tensors.empty() && local_idx < scale_tensors.size() && scale_tensors[local_idx].defined() - && scale_tensors[local_idx].numel() > 
0) { - layout.layers_to_scale_buffer_ptrs[layer_id] = scale_tensors[local_idx]; - } - } - return layout; -} - -BlockAddrInfo HybridTypeKVCacheAllocator::convertIndexToAddr(int layer_id, int block_id) const { - if (layer_id < 0 || layer_id >= static_cast(layer_to_group_id_.size())) { - RTP_LLM_FAIL("convertIndexToAddr invalid layer_id=%d", layer_id); - } - const int gid = layer_to_group_id_[static_cast(layer_id)]; - RTP_LLM_CHECK_WITH_INFO(gid >= 0 && gid < static_cast(kv_cache_groups_.size()), "invalid group id mapping"); - return kv_cache_groups_[static_cast(gid)]->convertIndexToAddr(layer_id, block_id); -} - -std::vector HybridTypeKVCacheAllocator::convertIndexToBuffer(int layer_id, int block_id) const { - if (layer_id < 0 || layer_id >= static_cast(layer_to_group_id_.size())) { - RTP_LLM_FAIL("convertIndexToBuffer invalid layer_id=%d", layer_id); - } - const int gid = layer_to_group_id_[static_cast(layer_id)]; - RTP_LLM_CHECK_WITH_INFO(gid >= 0 && gid < static_cast(kv_cache_groups_.size()), "invalid group id mapping"); - return kv_cache_groups_[static_cast(gid)]->convertIndexToBuffer(layer_id, block_id); -} - -std::vector HybridTypeKVCacheAllocator::convertIndexToBuffer(int layer_id, - int block_id, - int partition_count, - int partition_id) const { - if (layer_id < 0 || layer_id >= static_cast(layer_to_group_id_.size())) { - RTP_LLM_FAIL("convertIndexToBuffer(partition) invalid layer_id=%d", layer_id); - } - const int gid = layer_to_group_id_[static_cast(layer_id)]; - RTP_LLM_CHECK_WITH_INFO(gid >= 0 && gid < static_cast(kv_cache_groups_.size()), "invalid group id mapping"); - return kv_cache_groups_[static_cast(gid)]->convertIndexToBuffer( - layer_id, block_id, partition_count, partition_id); -} - -std::shared_ptr HybridTypeKVCacheAllocator::incrKVCacheRef(const KVCacheResource& kvcache_resource, - const CacheKeysType& cache_keys, - bool is_connector) { - if (cache_keys.empty()) { - return nullptr; - } - - const int group_nums = kvcache_resource.groupNums(); - 
if (group_nums <= 0) { - return nullptr; - } - - std::unordered_map key_to_pos; - const auto& resource_keys = kvcache_resource.cacheKeys(); - key_to_pos.reserve(resource_keys.size()); - for (size_t i = 0; i < resource_keys.size(); ++i) { - key_to_pos.emplace(resource_keys[i], i); - } - - auto selected_resource_ptr = new KVCacheResource(kvcache_resource); - auto deleter = [self = shared_from_this(), is_connector](KVCacheResource* resource) { - self->decrKVCacheRef(*resource, is_connector); - delete resource; - }; - std::shared_ptr selected_resource(selected_resource_ptr, deleter); - selected_resource->initGroups(group_nums, - static_cast(config_.layer_all_num), - config_.layer_to_group_id, - config_.kernelBlocksPerKvBlock(), - config_.group_types); - - CacheKeysType& selected_keys = selected_resource->cacheKeys(); - std::vector selected_blocks(static_cast(group_nums)); - - BlockIndicesType blocks_to_reference; - blocks_to_reference.reserve(cache_keys.size()); - - for (auto key : cache_keys) { - auto it = key_to_pos.find(key); - if (it == key_to_pos.end()) { - continue; - } - const size_t pos = it->second; - for (int gid = 0; gid < group_nums; ++gid) { - auto& src_blocks = kvcache_resource.blocks(gid); - if (pos >= src_blocks.size()) { - continue; - } - const auto block = src_blocks[pos]; - selected_blocks[static_cast(gid)].push_back(block); - if (!isNullBlockIdx(block) && block > 0) { - blocks_to_reference.push_back(block); - } - } - } - - selected_keys.assign(cache_keys.begin(), cache_keys.end()); - if (is_connector) { - block_pool_->connectorReference(blocks_to_reference); - } else { - block_pool_->requestReference(blocks_to_reference); - } - - for (int gid = 0; gid < group_nums; ++gid) { - selected_resource->mutableBlockIds(gid).assign(std::move(selected_blocks[static_cast(gid)])); - } - - return selected_resource; -} - -void HybridTypeKVCacheAllocator::decrKVCacheRef(const KVCacheResource& kvcache_resource, bool is_connector) { - const int group_nums = 
kvcache_resource.groupNums(); - std::vector blocks_to_free; - for (int gid = 0; gid < group_nums; ++gid) { - const auto& blocks = kvcache_resource.blocks(gid); - for (auto b : blocks) { - if (!isNullBlockIdx(b) && b > 0) { - blocks_to_free.push_back(b); - } - } - } - if (is_connector) { - block_pool_->connectorFree(blocks_to_free); - } else { - block_pool_->requestFree(blocks_to_free); - } -} - -int HybridTypeKVCacheAllocator::seqSizePerBlock() const { - return static_cast(config_.seq_size_per_block); -} - -bool HybridTypeKVCacheAllocator::updateKVBlock(const BatchKVCacheResourcePtr& batch_kv_cache_resource, - const std::vector& block_src_batch, - bool copy_last_block, - std::vector& block_update_mapping) { - // TODO(chanyin): may be implemented in Base class in future - return true; -} - -int HybridTypeKVCacheAllocator::getNeedBlocks(const MallocInfo& malloc_info) const { - if (!malloc_info.batch_kv_cache_resource || !malloc_info.complete_token_ids) { - return 0; - } - const int batch_size = malloc_info.batch_kv_cache_resource->batchSize(); - const int total_seq_len = malloc_info.complete_token_ids->totalSeqLength(); - const int common_seq_len = std::min(malloc_info.complete_token_ids->commonSeqLength(), total_seq_len); - - const int seq_len = malloc_info.complete_token_ids->seqLength(); - const int reserve_step = malloc_info.complete_token_ids->getReserveStep(); - - const bool reuse_enabled = malloc_info.reuse_cache; - const int reuse_blocks_len = reuse_enabled ? 
malloc_info.batch_kv_cache_resource->curBlocksNum() : 0; - - int common_blocks_total = 0; - int extra_blocks_total = 0; - - for (int gid = 0; gid < static_cast(kv_cache_groups_.size()); ++gid) { - const auto need = kv_cache_groups_[static_cast(gid)]->getNeedBlocks( - common_seq_len, seq_len, reserve_step, reuse_blocks_len, reuse_enabled); - common_blocks_total += need.common_blocks; - extra_blocks_total += need.extra_blocks; - } - - return common_blocks_total + batch_size * extra_blocks_total; -} - -int HybridTypeKVCacheAllocator::singleBatchNeedBlocks(const BatchKVCacheResourcePtr& batch_kv_cache_resource, - int seq_len, - int reserve_step) const { - int need_blocks = 0; - for (int gid = 0; gid < batch_kv_cache_resource->groupNums(); ++gid) { - const int cur_blocks = batch_kv_cache_resource->blocksNum(0, gid); - need_blocks += kv_cache_groups_[static_cast(gid)]->needBlocksNum(seq_len, cur_blocks, reserve_step); - } - - return need_blocks; -} - -} // namespace rtp_llm diff --git a/rtp_llm/cpp/cache/HybridTypeKVCacheAllocator.h b/rtp_llm/cpp/cache/HybridTypeKVCacheAllocator.h deleted file mode 100644 index 1db81d9e81..0000000000 --- a/rtp_llm/cpp/cache/HybridTypeKVCacheAllocator.h +++ /dev/null @@ -1,68 +0,0 @@ -#pragma once - -#include -#include -#include - -#include "rtp_llm/cpp/cache/KVCacheAllocator.h" -#include "rtp_llm/cpp/cache/FullKVCacheGroup.h" -#include "rtp_llm/cpp/cache/LinearKVCacheGroup.h" - -namespace rtp_llm { - -class HybridTypeKVCacheAllocator: - public KVCacheAllocator, - public std::enable_shared_from_this { -public: - HybridTypeKVCacheAllocator(const CacheConfig& config, - AllocationType allocation_type = AllocationType::DEVICE, - const kmonitor::MetricsReporterPtr metrics_reporter = nullptr, - int64_t reserve_block_ratio = 0); - - void free(const FreeInfo& free_info) override; - void insertIntoCache(const InsertInfo& insert_info) override; - BlockAddrInfo convertIndexToAddr(int layer_id, int block_id) const override; - std::vector 
convertIndexToBuffer(int layer_id, int block_id) const override; - std::vector - convertIndexToBuffer(int layer_id, int block_id, int partition_count, int partition_id) const override; - std::shared_ptr incrKVCacheRef(const KVCacheResource& kvcache_resource, - const CacheKeysType& cache_keys, - bool is_connector = false) override; - CacheLayerLayout allLayerCacheBase() const override; - - bool updateKVBlock(const BatchKVCacheResourcePtr& batch_kv_cache_resource, - const std::vector& block_src_batch, - bool copy_last_block, - std::vector& block_update_mapping) override; - - int seqSizePerBlock() const override; - int singleBatchNeedBlocks(const BatchKVCacheResourcePtr& batch_kv_cache_resource, - int seq_len, - int reserve_step) const override; - -private: - bool doInit() override; - MallocResult incrMalloc(const MallocInfo& malloc_info) override; - MallocResult initMallocForCommonLen(const MallocInfo& malloc_info) override; - int getNeedBlocks(const MallocInfo& malloc_info) const override; - void decrKVCacheRef(const KVCacheResource& kvcache_resource, bool is_connector = false) override; - - // Joint match across groups. Returns reuse_blocks decided by full groups + linear groups. 
- int reuseCache(const CacheKeysType& cache_keys, BatchKVCacheResource& kv_resource); - void referenceValidBlocks(const BlockIndicesType& blocks) const; - -private: - std::vector kv_cache_groups_; - - std::vector full_group_ids_; - std::vector linear_group_ids_; - - // global layer id -> group id - std::vector layer_to_group_id_; - // global layer id -> local layer id - std::vector global_layer_to_local_id_; -}; - -using HybridTypeKVCacheAllocatorPtr = std::shared_ptr; - -} // namespace rtp_llm diff --git a/rtp_llm/cpp/cache/KVCacheHashUtil.cc b/rtp_llm/cpp/cache/KVCacheHashUtil.cc index fdc30f85ab..924adcfae3 100644 --- a/rtp_llm/cpp/cache/KVCacheHashUtil.cc +++ b/rtp_llm/cpp/cache/KVCacheHashUtil.cc @@ -29,6 +29,9 @@ void initCacheKeys(BatchKVCacheResourcePtr batch_kv_cache_resource, } batch_kv_cache_resource->setLastBlockAligned(seq_len % seq_size_per_block == 0); + for (int i = 0; i < batch_size; ++i) { + batch_kv_cache_resource->cacheResource(i).ensureLinearBlockDependencies(); + } } void updateCacheKeys(BatchKVCacheResourcePtr batch_kv_cache_resource, @@ -60,6 +63,9 @@ void updateCacheKeys(BatchKVCacheResourcePtr batch_kv_cache_resource, // After incremental update we guarantee all existing keys are for full blocks. 
batch_kv_cache_resource->setLastBlockAligned(true); + for (int i = 0; i < batch_size; ++i) { + batch_kv_cache_resource->cacheResource(i).ensureLinearBlockDependencies(); + } } void dropLastPartialBlock(BatchKVCacheResourcePtr batch_kv_cache_resource) { diff --git a/rtp_llm/cpp/cache/KVCacheManager.cc b/rtp_llm/cpp/cache/KVCacheManager.cc index ce9b4840c6..04383fa612 100644 --- a/rtp_llm/cpp/cache/KVCacheManager.cc +++ b/rtp_llm/cpp/cache/KVCacheManager.cc @@ -4,9 +4,12 @@ #include #include -#include "rtp_llm/cpp/cache/SingleTypeKVCacheAllocator.h" -#include "rtp_llm/cpp/cache/HybridTypeKVCacheAllocator.h" #include "rtp_llm/cpp/cache/BatchKVCacheResource.h" +#include "rtp_llm/cpp/cache/spec/CacheGroupType.h" +#include "rtp_llm/cpp/cache/allocator/HybridPoolKVCacheAllocator.h" +#include "rtp_llm/cpp/cache/allocator/HybridTypeKVCacheAllocator.h" +#include "rtp_llm/cpp/cache/allocator/SingleTypeKVCacheAllocator.h" +#include "rtp_llm/cpp/cache/SharedBlockCache.h" #include "rtp_llm/cpp/cache/connector/KVCacheConnectorCoordinator.h" #include "rtp_llm/cpp/cache/KVCacheHashUtil.h" #include "rtp_llm/cpp/metrics/RtpLLMMetrics.h" @@ -17,6 +20,84 @@ namespace rtp_llm { +namespace { + +struct GlobalCacheMetricsSnapshot { + RtpLLMCacheMetricsCollector collector; + size_t total_blocks = 0; + size_t available_blocks = 0; + size_t request_ref_blocks = 0; + size_t connector_ref_blocks = 0; +}; + +GlobalCacheMetricsSnapshot collectGlobalCacheMetrics(const KVCacheAllocatorPtr& allocator) { + GlobalCacheMetricsSnapshot snapshot; + auto shared_cache = allocator->sharedBlockCache(); + + snapshot.total_blocks = allocator->totalBlocksNum(); + snapshot.available_blocks = allocator->availableBlocksNum(); + snapshot.request_ref_blocks = allocator->requestRefBlocksNum(); + snapshot.connector_ref_blocks = allocator->connectorRefBlocksNum(); + + auto& collector = snapshot.collector; + collector.kv_cache_item_num = shared_cache ? 
static_cast(shared_cache->size()) : 0; + collector.kv_cache_left_seq = static_cast(allocator->availableTokensNum()); + collector.kv_cache_available_blocks = static_cast(snapshot.available_blocks); + collector.kv_cache_request_ref_blocks = static_cast(snapshot.request_ref_blocks); + collector.kv_cache_connector_ref_blocks = static_cast(snapshot.connector_ref_blocks); + collector.kv_cache_free_blocks = static_cast(allocator->freeBlocksNum()); + collector.kv_cache_used_ratio = + (snapshot.total_blocks == 0) ? + 0.0f : + static_cast(100.0 * (snapshot.total_blocks - snapshot.available_blocks) + / static_cast(snapshot.total_blocks)); + collector.mr_cost_time_ms = allocator->getMrCostTimeMs(); + + return snapshot; +} + +void logGlobalCacheMetrics(const GlobalCacheMetricsSnapshot& snapshot) { + RTP_LLM_LOG_INFO("kvc raw global: total=%zu avail=%zu req_ref=%zu con_ref=%zu free=%zu items=%ld ratio=%.4f%%", + snapshot.total_blocks, + snapshot.available_blocks, + snapshot.request_ref_blocks, + snapshot.connector_ref_blocks, + static_cast(snapshot.collector.kv_cache_free_blocks), + static_cast(snapshot.collector.kv_cache_item_num), + snapshot.collector.kv_cache_used_ratio); +} + +void reportPoolCacheMetrics(const kmonitor::MetricsReporterPtr& metrics_reporter, + const KVCachePoolMetricsSnapshot& pool_snapshot, + bool should_log) { + if (should_log) { + RTP_LLM_LOG_INFO("kvc raw pool[%s]: total=%zu avail=%zu req_ref=%zu con_ref=%zu free=%zu reserve=%zu " + "ratio=%.4f%%", + pool_snapshot.pool_name.c_str(), + pool_snapshot.total_blocks, + pool_snapshot.available_blocks, + pool_snapshot.request_ref_blocks, + pool_snapshot.connector_ref_blocks, + pool_snapshot.free_blocks, + pool_snapshot.reserve_blocks, + pool_snapshot.used_ratio); + } + + RtpLLMCachePoolMetricsCollector pool_collector; + pool_collector.free_blocks = static_cast(pool_snapshot.free_blocks); + pool_collector.available_blocks = static_cast(pool_snapshot.available_blocks); + pool_collector.request_ref_blocks = 
static_cast(pool_snapshot.request_ref_blocks); + pool_collector.connector_ref_blocks = static_cast(pool_snapshot.connector_ref_blocks); + pool_collector.total_blocks = static_cast(pool_snapshot.total_blocks); + pool_collector.reserve_blocks = static_cast(pool_snapshot.reserve_blocks); + pool_collector.used_ratio = pool_snapshot.used_ratio; + + kmonitor::MetricsTags pool_tags("pool_name", pool_snapshot.pool_name); + metrics_reporter->report(&pool_tags, &pool_collector); +} + +} // namespace + KVCacheManager::KVCacheManager(const CacheConfig& config, bool warmup, const kmonitor::MetricsReporterPtr metrics_reporter, @@ -25,7 +106,8 @@ KVCacheManager::KVCacheManager(const CacheConfig& config, const RuntimeConfig& runtime_config, const SpeculativeExecutionConfig& sp_config, const PDSepConfig& pd_sep_config, - const CacheStoreConfig& cache_store_config): + const CacheStoreConfig& cache_store_config, + bool use_cuda_malloc_block_pool): config_(config), metrics_reporter_(metrics_reporter), kv_cache_config_(kv_cache_config), @@ -33,14 +115,32 @@ KVCacheManager::KVCacheManager(const CacheConfig& config, runtime_config_(runtime_config), sp_config_(sp_config), pd_sep_config_(pd_sep_config), - cache_store_config_(cache_store_config) { + cache_store_config_(cache_store_config), + use_cuda_malloc_block_pool_(use_cuda_malloc_block_pool) { if (warmup) { config_.block_num = 1; } else { allocateAndSync(); } - RTP_LLM_LOG_INFO("cache config: layer_num=%d, block_num=%d, block_size=%dB, seq_size_per_block=%zu", + // Page-level RR sharding context: one CPSlotMapper for the lifetime of the + // manager and allocator. When kv_cache_sharded=false (or tp_size==1), + // cp_slot_mapper_ stays nullptr and every call site stays bit-equal to the + // pre-RR behaviour. 
+ const auto& cp_cfg = parallelism_config_.prefill_cp_config; + if (cp_cfg.kv_cache_sharded && parallelism_config_.tp_size > 1) { + cp_slot_mapper_ = std::make_shared(static_cast(parallelism_config_.tp_rank), + static_cast(parallelism_config_.tp_size), + static_cast(config_.seq_size_per_block)); + RTP_LLM_LOG_INFO("CP sharded KV cache enabled: tp_rank=%d, tp_size=%d, block_size=%zu, " + "virtual_block_size=%d", + (int)parallelism_config_.tp_rank, + (int)parallelism_config_.tp_size, + config_.seq_size_per_block, + cp_slot_mapper_->virtualBlockSize()); + } + + RTP_LLM_LOG_INFO("cache config: layer_num=%d, block_num=%d, block_size=%zuB, seq_size_per_block=%zu", config_.layer_num, config_.block_num, config_.block_size_bytes, @@ -59,19 +159,42 @@ KVCacheManager::~KVCacheManager() { // 初始化和配置相关 bool KVCacheManager::init() { - RTP_LLM_CHECK_WITH_INFO(!config_.cache_specs.empty(), "cache specs must not be empty"); + RTP_LLM_CHECK_WITH_INFO(!allocator_ && !coordinator_ && !metrics_reporter_thread_.joinable(), + "KVCacheManager::init called more than once"); + RTP_LLM_CHECK_WITH_INFO(config_.groupNums() > 0, "cache specs must not be empty"); + + auto shared_cache = std::make_shared(); + shared_cache->setPrefixTreeEnabled(kv_cache_config_.enable_gpu_prefix_tree); + const bool enable_independent_group_eviction = kv_cache_config_.enable_memory_cache + && kv_cache_config_.enable_prefix_tree_memory_cache + && kv_cache_config_.enable_independent_group_eviction; const bool is_hybrid = config_.groupNums() > 1; - if (is_hybrid) { + if (config_.use_independent_block_pools) { + allocator_ = std::make_shared(config_, + AllocationType::DEVICE, + metrics_reporter_, + kv_cache_config_.reserve_block_ratio, + pd_sep_config_.role_type); + } else if (is_hybrid) { allocator_ = std::make_shared( config_, AllocationType::DEVICE, metrics_reporter_, kv_cache_config_.reserve_block_ratio); - RTP_LLM_CHECK_WITH_INFO(allocator_->init(), "HybridTypeKVCacheAllocator init failed"); } else { allocator_ = 
std::make_shared( config_, AllocationType::DEVICE, metrics_reporter_, kv_cache_config_.reserve_block_ratio); - RTP_LLM_CHECK_WITH_INFO(allocator_->init(), "SingleTypeKVCacheAllocator init failed"); } + if (use_cuda_malloc_block_pool_) { + RTP_LLM_LOG_INFO("RDMA cache store enabled for PD role, use cudaMalloc KV cache block-pool backing"); + allocator_->setUseCudaMallocBlockPool(true); + } + + allocator_->setCPSlotMapper(cp_slot_mapper_); + allocator_->setSharedBlockCache(shared_cache); + RTP_LLM_CHECK_WITH_INFO(allocator_->init(), "KVCacheAllocator init failed"); + shared_cache->setIndependentGroupEviction(enable_independent_group_eviction, + allocator_->independentEvictionGroupIds()); + if (metrics_reporter_) { stop_.store(false, std::memory_order_relaxed); metrics_reporter_thread_ = std::thread(&KVCacheManager::reportMetricsLoop, this); @@ -86,6 +209,13 @@ const CacheConfig& KVCacheManager::cacheConfig() const { } const CacheConfig& KVCacheManager::getMTPModuleCacheConfig(int mtp_module_id) const { + RTP_LLM_CHECK_WITH_INFO(mtp_module_id >= 0 && static_cast(mtp_module_id) < config_.mtp_sub_configs.size(), + "Invalid mtp_module_id: %d, must be in range [0, %zu)", + mtp_module_id, + config_.mtp_sub_configs.size()); + RTP_LLM_CHECK_WITH_INFO(config_.mtp_sub_configs[mtp_module_id] != nullptr, + "mtp_sub_configs[%d] is null", + mtp_module_id); return *config_.mtp_sub_configs[mtp_module_id]; } @@ -95,6 +225,8 @@ MallocResult KVCacheManager::malloc(const MallocInfo& malloc_info) { RTP_LLM_PROFILE_FUNCTION(); RTP_LLM_CHECK(malloc_info.batch_kv_cache_resource && malloc_info.complete_token_ids); + // Cache-key computation is identical for CP and non-CP — we always have + // the full sequence's token ids; rolling hash is at block_size granularity. 
const int seq_size_per_block = config_.seq_size_per_block; if (!malloc_info.batch_kv_cache_resource->curBlocksNum()) { initCacheKeys(malloc_info.batch_kv_cache_resource, malloc_info.complete_token_ids, seq_size_per_block); @@ -120,6 +252,7 @@ void KVCacheManager::insertIntoCache(const InsertInfo& insert_info) { int KVCacheManager::singleBatchNeedBlocks(const BatchKVCacheResourcePtr& batch_kv_cache_resource, int seq_len, int reserve_step) const { + RTP_LLM_CHECK_WITH_INFO(allocator_ != nullptr, "singleBatchNeedBlocks called before KVCacheManager initialized"); return allocator_->singleBatchNeedBlocks(batch_kv_cache_resource, seq_len, reserve_step); } @@ -149,83 +282,6 @@ bool KVCacheManager::updateKVBlock(const BatchKVCacheResourcePtr& batch_kv_cache return allocator_->updateKVBlock(batch_kv_cache_resource, block_src_batch, copy_last_block, block_update_mapping); } -// Write one KV block (optionally per-layer) from host/device tensors for test -bool KVCacheManager::setKVBlockValue(int block_index, - int layer_id, - const torch::Tensor& k_buffer, - const torch::Tensor& v_buffer) { - // Basic size/type validation to prevent out-of-bounds copy - auto& spec = config_.cache_specs[0]; - size_t expected_k_bytes = spec->k_block_size_bytes(); - size_t expected_v_bytes = spec->v_block_size_bytes(); - size_t src_k_bytes = k_buffer.nbytes(); - size_t src_v_bytes = v_buffer.nbytes(); - if (src_k_bytes < expected_k_bytes || src_v_bytes < expected_v_bytes) { - RTP_LLM_LOG_ERROR("setKVBlockValue src bytes too small: k[%zu]<[%zu] or v[%zu]<[%zu]", - src_k_bytes, - expected_k_bytes, - src_v_bytes, - expected_v_bytes); - return false; - } - - auto dst = allocator_->convertIndexToBuffer(layer_id, block_index); - RTP_LLM_CHECK_WITH_INFO( - !dst.empty(), "convertIndexToBuffer returned empty for layer %d, block %d", layer_id, block_index); - if (!dst[0].addr) { - RTP_LLM_LOG_ERROR("convertIndexToBuffer returned null for layer %d, block %d", layer_id, block_index); - return false; - } - - 
auto copyFunc = [&](const torch::Tensor& src_tensor, const BlockInfo& dst_block, size_t dst_byte_offset) -> bool { - const size_t dst_bytes = dst_block.size_bytes; - const size_t src_bytes = src_tensor.nbytes(); - if (dst_bytes < dst_byte_offset + src_bytes) { - RTP_LLM_LOG_ERROR("dst block bytes[%zu] < dst_offset[%zu] + src bytes[%zu] in setKVBlockValue(layer=%d)", - dst_bytes, - dst_byte_offset, - src_bytes, - layer_id); - return false; - } - - auto* dst_ptr = static_cast(dst_block.addr) + dst_byte_offset; - auto dst_device = dst_block.is_cuda ? torch::kCUDA : torch::kCPU; - auto src_device = src_tensor.is_cuda() ? torch::kCUDA : torch::kCPU; - auto dst_t = torch::from_blob( - dst_ptr, {(int64_t)src_bytes}, torch::TensorOptions().dtype(torch::kUInt8).device(dst_device)); - auto src_t = torch::from_blob(src_tensor.data_ptr(), - {(int64_t)src_bytes}, - torch::TensorOptions().dtype(torch::kUInt8).device(src_device)); - dst_t.copy_(src_t); - return true; - }; - - if (!copyFunc(k_buffer, dst[0], 0)) { - return false; - } - - if (!copyFunc(v_buffer, dst[0], expected_k_bytes)) { - return false; - } - - cudaSyncAndCheck(); - return true; -} - -bool KVCacheManager::setKVBlockValue(int block_index, const torch::Tensor& k_buffer, const torch::Tensor& v_buffer) { - if (block_index < 0 || block_index >= config_.block_num) { - RTP_LLM_LOG_WARNING("Invalid block_index: %d, valid range: [0, %d)", block_index, config_.block_num); - return false; - } - - bool all_success = true; - for (int layer_id = 0; layer_id < config_.layer_num; ++layer_id) { - all_success = setKVBlockValue(block_index, layer_id, k_buffer, v_buffer) && all_success; - } - return all_success; -} - // 地址转换和缓冲区访问 BlockAddrInfo KVCacheManager::convertIndexToAddr(int block_index, int layer_id) const { @@ -241,6 +297,34 @@ KVCacheManager::convertIndexToBuffer(int block_index, int layer_id, int partitio return allocator_->convertIndexToBuffer(layer_id, block_index, partition_count, partition_id); } +BlockAddrInfo 
KVCacheManager::convertIndexToAddr(int block_index, int layer_id, int group_id) const { + return allocator_->convertIndexToAddr(layer_id, group_id, block_index); +} + +std::vector +KVCacheManager::convertIndexToBuffer(int block_index, int layer_id, int group_id) const { + return allocator_->convertIndexToBuffer(layer_id, group_id, block_index); +} + +std::vector KVCacheManager::convertIndexToBuffer( + int block_index, int layer_id, int group_id, int partition_count, int partition_id) const { + return allocator_->convertIndexToBuffer(layer_id, group_id, block_index, partition_count, partition_id); +} + +BlockAddrInfo KVCacheManager::convertIndexToAddrByTag(int block_index, int layer_id, const std::string& tag) const { + return allocator_->convertIndexToAddrByTag(layer_id, tag, block_index); +} + +std::vector +KVCacheManager::convertIndexToBufferByTag(int block_index, int layer_id, const std::string& tag) const { + return allocator_->convertIndexToBufferByTag(layer_id, tag, block_index); +} + +std::vector KVCacheManager::convertIndexToBufferByTag( + int block_index, int layer_id, const std::string& tag, int partition_count, int partition_id) const { + return allocator_->convertIndexToBufferByTag(layer_id, tag, block_index, partition_count, partition_id); +} + CacheLayerLayout KVCacheManager::allLayerCacheBase() const { return allocator_->allLayerCacheBase(); } @@ -252,15 +336,23 @@ CacheLayerLayout KVCacheManager::getMainModelCacheLayerLayout() const { auto& all_layer_tensors = all_layout.layers_to_kv_buffer_ptrs; auto& all_scale_tensors = all_layout.layers_to_scale_buffer_ptrs; + layout.layer_to_group_ids.resize(config_.layer_num); layout.layers_to_kv_buffer_ptrs.resize(config_.layer_num); if (!all_scale_tensors.empty()) { layout.layers_to_scale_buffer_ptrs.resize(config_.layer_num); } - layout.layer_to_groups = config_.layer_to_group_id; - layout.group_types = config_.group_types; - layout.layer_to_groups.resize(config_.layer_num); - 
layout.layer_attn_types.resize(config_.layer_num, CacheGroupType::FULL); + const auto layer_group_ids = config_.layerGroupIdsSnapshot(); + const auto layer_tag_to_gid = config_.layerTagToGroupIdSnapshot(); + layout.group_types = config_.groupTypesSnapshot(); + layout.group_tags = config_.groupTagsSnapshot(); + layout.layer_tag_to_group_id.resize(config_.layer_num); + layout.group_seq_size_per_block = config_.group_seq_size_per_block; + layout.layer_group_types.resize(config_.layer_num, CacheGroupType::FULL); + layout.layers_to_kv_buffer_ptrs_by_group.resize(config_.layer_num); + if (!all_layout.layers_to_scale_buffer_ptrs_by_group.empty()) { + layout.layers_to_scale_buffer_ptrs_by_group.resize(config_.layer_num); + } RTP_LLM_CHECK_WITH_INFO(config_.layer_num <= all_layer_tensors.size(), "config_.layer_num[%d] > all_layer_tensors.size()[%ld]", @@ -269,7 +361,6 @@ CacheLayerLayout KVCacheManager::getMainModelCacheLayerLayout() const { for (int layer_id = 0; layer_id < static_cast(config_.layer_num); ++layer_id) { if (static_cast(layer_id) < all_layer_tensors.size()) { - layout.layer_to_groups[layer_id] = all_layout.layer_to_groups[layer_id]; layout.layers_to_kv_buffer_ptrs[layer_id] = all_layer_tensors[layer_id]; } else { RTP_LLM_CHECK(false); @@ -282,8 +373,23 @@ CacheLayerLayout KVCacheManager::getMainModelCacheLayerLayout() const { RTP_LLM_CHECK(false); } } - if (static_cast(layer_id) < config_.layer_attn_types.size()) { - layout.layer_attn_types[layer_id] = config_.layer_attn_types[static_cast(layer_id)]; + if (static_cast(layer_id) < layer_group_ids.size()) { + layout.layer_to_group_ids[layer_id] = layer_group_ids[static_cast(layer_id)]; + if (!layout.layer_to_group_ids[layer_id].empty()) { + layout.layer_group_types[layer_id] = + config_.typeForGroup(static_cast(layout.layer_to_group_ids[layer_id].front())); + } + } + if (static_cast(layer_id) < layer_tag_to_gid.size()) { + layout.layer_tag_to_group_id[layer_id] = layer_tag_to_gid[static_cast(layer_id)]; + } + 
if (static_cast(layer_id) < all_layout.layers_to_kv_buffer_ptrs_by_group.size()) { + layout.layers_to_kv_buffer_ptrs_by_group[layer_id] = + all_layout.layers_to_kv_buffer_ptrs_by_group[static_cast(layer_id)]; + } + if (static_cast(layer_id) < all_layout.layers_to_scale_buffer_ptrs_by_group.size()) { + layout.layers_to_scale_buffer_ptrs_by_group[layer_id] = + all_layout.layers_to_scale_buffer_ptrs_by_group[static_cast(layer_id)]; } } @@ -300,49 +406,92 @@ CacheLayerLayout KVCacheManager::getMTPModuleCacheLayerLayout(int mtp_module_id) const auto& mtp_sub_config = config_.mtp_sub_configs[mtp_module_id]; RTP_LLM_CHECK_WITH_INFO(mtp_sub_config != nullptr, "mtp_sub_configs[%d] is null", mtp_module_id); - RTP_LLM_CHECK_WITH_INFO( - !mtp_sub_config->global_layer_ids.empty(), "mtp_sub_configs[%d]->global_layer_ids is empty", mtp_module_id); - RTP_LLM_CHECK_WITH_INFO(!mtp_sub_config->global_layer_ids[0].empty(), - "mtp_sub_configs[%d]->global_layer_ids[0] is empty", - mtp_module_id); - - const auto& mtp_global_layer_ids = mtp_sub_config->global_layer_ids[0]; - const uint32_t mtp_layer_num = mtp_sub_config->layer_num; + const uint32_t mtp_layer_num = mtp_sub_config->layer_num; + const int mtp_global_layer_base = static_cast(config_.layer_num) + + mtp_module_id * static_cast(mtp_layer_num); + std::vector global_layer_for_local(mtp_layer_num, -1); + for (size_t local_gid = 0; local_gid < static_cast(mtp_sub_config->groupNums()); ++local_gid) { + for (int global_layer_id : mtp_sub_config->layerIdsForGroup(local_gid)) { + const int local_layer_id = global_layer_id - mtp_global_layer_base; + RTP_LLM_CHECK_WITH_INFO(local_layer_id >= 0 && local_layer_id < static_cast(mtp_layer_num), + "mtp_sub_configs[%d] global layer %d is outside local range [%d, %d)", + mtp_module_id, + global_layer_id, + mtp_global_layer_base, + mtp_global_layer_base + static_cast(mtp_layer_num)); + global_layer_for_local[static_cast(local_layer_id)] = global_layer_id; + } + } + for (uint32_t local_layer_id = 
0; local_layer_id < mtp_layer_num; ++local_layer_id) { + RTP_LLM_CHECK_WITH_INFO(global_layer_for_local[local_layer_id] >= 0, + "mtp_sub_configs[%d] has no global layer for local layer %u", + mtp_module_id, + local_layer_id); + } auto all_layout = allocator_->allLayerCacheBase(); auto& all_layer_tensors = all_layout.layers_to_kv_buffer_ptrs; auto& all_scale_tensors = all_layout.layers_to_scale_buffer_ptrs; - layout.layer_to_groups.resize(mtp_layer_num); layout.layers_to_kv_buffer_ptrs.resize(mtp_layer_num); if (!all_scale_tensors.empty()) { layout.layers_to_scale_buffer_ptrs.resize(mtp_layer_num); } - layout.layer_attn_types.resize(mtp_layer_num, CacheGroupType::FULL); + layout.layer_group_types.resize(mtp_layer_num, CacheGroupType::FULL); + layout.group_tags = mtp_sub_config->groupTagsSnapshot(); + layout.group_types = mtp_sub_config->groupTypesSnapshot(); + layout.group_seq_size_per_block = mtp_sub_config->group_seq_size_per_block; + + const size_t group_count = layout.group_tags.size(); + layout.layers_to_kv_buffer_ptrs_by_group.assign(mtp_layer_num, std::vector(group_count)); + layout.layers_to_scale_buffer_ptrs_by_group.assign(mtp_layer_num, std::vector(group_count)); + layout.layer_to_group_ids.resize(mtp_layer_num); + layout.layer_tag_to_group_id.resize(mtp_layer_num); for (uint32_t local_layer_id = 0; local_layer_id < mtp_layer_num; ++local_layer_id) { - if (local_layer_id < mtp_global_layer_ids.size()) { - const int global_layer_id = mtp_global_layer_ids[local_layer_id]; + const int global_layer_id = global_layer_for_local[local_layer_id]; + + if (global_layer_id >= 0 && static_cast(global_layer_id) < all_layer_tensors.size()) { + layout.layers_to_kv_buffer_ptrs[local_layer_id] = all_layer_tensors[global_layer_id]; + } else { + RTP_LLM_CHECK(false); + } - if (global_layer_id >= 0 && static_cast(global_layer_id) < all_layer_tensors.size()) { - layout.layer_to_groups[local_layer_id] = all_layout.layer_to_groups[global_layer_id]; - 
layout.layers_to_kv_buffer_ptrs[local_layer_id] = all_layer_tensors[global_layer_id]; + if (!all_scale_tensors.empty()) { + if (global_layer_id >= 0 && static_cast(global_layer_id) < all_scale_tensors.size()) { + layout.layers_to_scale_buffer_ptrs[local_layer_id] = all_scale_tensors[global_layer_id]; } else { RTP_LLM_CHECK(false); } + } + + for (size_t local_gid = 0; local_gid < group_count; ++local_gid) { + const auto& tag = mtp_sub_config->tagForGroup(local_gid); + const int global_gid = config_.groupIdForTag(tag); + const auto& group_layers = mtp_sub_config->layerIdsForGroup(local_gid); + if (std::find(group_layers.begin(), group_layers.end(), global_layer_id) == group_layers.end()) { + continue; + } - if (!all_scale_tensors.empty()) { - if (global_layer_id >= 0 && static_cast(global_layer_id) < all_scale_tensors.size()) { - layout.layers_to_scale_buffer_ptrs[local_layer_id] = all_scale_tensors[global_layer_id]; - } else { - RTP_LLM_CHECK(false); + layout.layer_to_group_ids[local_layer_id].push_back(static_cast(local_gid)); + layout.layer_tag_to_group_id[local_layer_id][tag] = static_cast(local_gid); + layout.layer_group_types[local_layer_id] = mtp_sub_config->typeForGroup(local_gid); + + if (static_cast(global_layer_id) < all_layout.layers_to_kv_buffer_ptrs_by_group.size()) { + const auto& src_kv = all_layout.layers_to_kv_buffer_ptrs_by_group[static_cast(global_layer_id)]; + if (global_gid >= 0 && static_cast(global_gid) < src_kv.size()) { + layout.layers_to_kv_buffer_ptrs_by_group[local_layer_id][local_gid] = + src_kv[static_cast(global_gid)]; } } - if (local_layer_id < mtp_sub_config->layer_attn_types.size()) { - layout.layer_attn_types[local_layer_id] = mtp_sub_config->layer_attn_types[local_layer_id]; + if (static_cast(global_layer_id) < all_layout.layers_to_scale_buffer_ptrs_by_group.size()) { + const auto& src_scale = + all_layout.layers_to_scale_buffer_ptrs_by_group[static_cast(global_layer_id)]; + if (global_gid >= 0 && static_cast(global_gid) < 
src_scale.size()) { + layout.layers_to_scale_buffer_ptrs_by_group[local_layer_id][local_gid] = + src_scale[static_cast(global_gid)]; + } } - } else { - RTP_LLM_CHECK(false); } } @@ -385,37 +534,40 @@ size_t KVCacheManager::maxAvailableTokensNum() const { KVCacheInfo KVCacheManager::getKVCacheInfo(int64_t latest_version, bool need_cache_keys) const { KVCacheInfo info; + info.version = latest_version; if (!allocator_) { RTP_LLM_LOG_ERROR("getKVCacheInfo called before KVCacheManager initialized"); - info.version = latest_version; return info; } if (need_cache_keys) { std::unordered_set all_keys; // device cache keys - auto block_cache = allocator_->getBlockPool()->blockCache(); - auto snapshot = block_cache->cacheSnapshot(latest_version); - for (const auto& cacheItem : snapshot.values) { - all_keys.insert(cacheItem.cache_key); + std::vector device_cache_keys; + auto shared_cache = allocator_->sharedBlockCache(); + if (shared_cache) { + device_cache_keys = shared_cache->allCacheKeys(); + all_keys.insert(device_cache_keys.begin(), device_cache_keys.end()); + info.version = shared_cache->version(); } // memory cache keys + RTP_LLM_CHECK_WITH_INFO(coordinator_ != nullptr, + "getKVCacheInfo called before KVCacheManager coordinator initialized"); const auto mem_cache_keys = coordinator_->memoryCacheKeys(); all_keys.insert(mem_cache_keys.begin(), mem_cache_keys.end()); info.cached_keys.assign(all_keys.begin(), all_keys.end()); - info.version = snapshot.version; } - const size_t block_size_tokens = config_.seq_size_per_block; - const size_t total_blocks = allocator_->totalBlocksNum(); - const size_t available_blocks = allocator_->availableBlocksNum(); + const size_t block_size_tokens = cp_slot_mapper_ && cp_slot_mapper_->isSharded() ? 
+ cp_slot_mapper_->virtualBlockSize() : + config_.seq_size_per_block; + const auto capacity = allocator_->tokenCapacity(block_size_tokens); info.block_size = block_size_tokens; - info.total_kv_cache = total_blocks * block_size_tokens; - info.available_kv_cache = available_blocks * block_size_tokens; - // cached_keys left empty for now; can be populated when distributed cache is wired up. + info.total_kv_cache = capacity.total_tokens; + info.available_kv_cache = capacity.available_tokens; return info; } @@ -455,16 +607,19 @@ bool KVCacheManager::hasP2PConnector() const { std::shared_ptr KVCacheManager::asyncLoadCache(const std::shared_ptr& connector_context) { RTP_LLM_PROFILE_FUNCTION(); + RTP_LLM_CHECK_WITH_INFO(coordinator_ != nullptr, "asyncLoadCache called before KVCacheManager initialized"); return coordinator_->asyncRead(connector_context); } std::shared_ptr KVCacheManager::asyncStoreCache(const std::shared_ptr& connector_context) { RTP_LLM_PROFILE_FUNCTION(); + RTP_LLM_CHECK_WITH_INFO(coordinator_ != nullptr, "asyncStoreCache called before KVCacheManager initialized"); return coordinator_->asyncWrite(connector_context); } bool KVCacheManager::executeFunction(const FunctionRequestPB& request, FunctionResponsePB& response) { + RTP_LLM_CHECK_WITH_INFO(coordinator_ != nullptr, "executeFunction called before KVCacheManager initialized"); return coordinator_->executeFunction(request, response); } @@ -489,6 +644,7 @@ void KVCacheManager::initConnectorCoordinator() { } void KVCacheManager::allocateAndSync() { + RTP_LLM_LOG_INFO("allocateAndSync start, block_num=%d", config_.block_num); size_t world_size = parallelism_config_.tp_size * parallelism_config_.dp_size; if (world_size > 1) { size_t local_rank = parallelism_config_.tp_size * parallelism_config_.dp_rank + parallelism_config_.tp_rank; @@ -505,41 +661,37 @@ void KVCacheManager::allocateAndSync() { config_.block_num = *std::min_element(block_num_ptr, block_num_ptr + world_size); } } + if 
(config_.use_independent_block_pools) { + config_.finalizeBlockNums(static_cast(config_.block_num), runtime_config_); + } RTP_LLM_LOG_INFO("block_num is %d after tp sync", config_.block_num); } void KVCacheManager::reportMetricsLoop() { RTP_LLM_PROFILE_FUNCTION(); kmonitor::MetricsTags tags; + constexpr auto kLogInterval = std::chrono::minutes(1); + auto last_log_time = std::chrono::steady_clock::now() - kLogInterval; while (!stop_.load(std::memory_order_relaxed)) { if (!metrics_reporter_ || !allocator_) { std::this_thread::sleep_for(std::chrono::seconds(1)); continue; } - RtpLLMCacheMetricsCollector collector; - - auto block_pool = allocator_->getBlockPool(); - auto block_cache = block_pool ? block_pool->blockCache() : nullptr; - - const auto total_blocks = allocator_->totalBlocksNum(); - const auto available_blocks = allocator_->availableBlocksNum(); - const auto request_ref_blocks = allocator_->requestRefBlocksNum(); - const auto connector_ref_blocks = allocator_->connectorRefBlocksNum(); - - collector.kv_cache_item_num = block_cache ? static_cast(block_cache->size()) : 0; - collector.kv_cache_left_seq = static_cast(available_blocks * config_.seq_size_per_block); - collector.kv_cache_available_blocks = static_cast(available_blocks); - collector.kv_cache_request_ref_blocks = static_cast(request_ref_blocks); - collector.kv_cache_connector_ref_blocks = static_cast(connector_ref_blocks); - collector.kv_cache_free_blocks = static_cast(allocator_->freeBlocksNum()); - collector.kv_cache_used_ratio = - (total_blocks == 0) ? 
- 0.0f : - static_cast(100.0 * (total_blocks - available_blocks) / static_cast(total_blocks)); - collector.mr_cost_time_ms = allocator_->getMrCostTimeMs(); - - metrics_reporter_->report(&tags, &collector); + auto global_metrics = collectGlobalCacheMetrics(allocator_); + metrics_reporter_->report(&tags, &global_metrics.collector); + + const auto now = std::chrono::steady_clock::now(); + const bool should_log = (now - last_log_time) >= kLogInterval; + if (should_log) { + last_log_time = now; + logGlobalCacheMetrics(global_metrics); + } + + for (const auto& pool_snapshot : allocator_->poolMetricsSnapshots()) { + reportPoolCacheMetrics(metrics_reporter_, pool_snapshot, should_log); + } + std::this_thread::sleep_for(std::chrono::seconds(1)); // 1s } } @@ -547,9 +699,87 @@ void KVCacheManager::reportMetricsLoop() { void KVCacheManager::handleRead(const P2PConnectorStartLoadRequestPB& request, P2PConnectorStartLoadResponsePB& response, std::function is_cancelled) { - if (coordinator_) { - coordinator_->handleRead(request, response, is_cancelled); + RTP_LLM_CHECK_WITH_INFO(coordinator_ != nullptr, "handleRead called before KVCacheManager initialized"); + coordinator_->handleRead(request, response, is_cancelled); +} + +// Write one KV block (optionally per-layer) from host/device tensors for test +bool KVCacheManager::writeKVBlockForTest(int block_index, + int layer_id, + const torch::Tensor& k_buffer, + const torch::Tensor& v_buffer) { + // Basic size/type validation to prevent out-of-bounds copy + auto& spec = config_.specForGroup(0); + size_t expected_k_bytes = spec->k_block_size_bytes(); + size_t expected_v_bytes = spec->v_block_size_bytes(); + size_t src_k_bytes = k_buffer.nbytes(); + size_t src_v_bytes = v_buffer.nbytes(); + if (src_k_bytes < expected_k_bytes || src_v_bytes < expected_v_bytes) { + RTP_LLM_LOG_ERROR("writeKVBlockForTest src bytes too small: k[%zu]<[%zu] or v[%zu]<[%zu]", + src_k_bytes, + expected_k_bytes, + src_v_bytes, + expected_v_bytes); + return 
false; } + + auto dst = allocator_->convertIndexToBuffer(layer_id, block_index); + RTP_LLM_CHECK_WITH_INFO( + !dst.empty(), "convertIndexToBuffer returned empty for layer %d, block %d", layer_id, block_index); + if (!dst[0].addr) { + RTP_LLM_LOG_ERROR("convertIndexToBuffer returned null for layer %d, block %d", layer_id, block_index); + return false; + } + + auto copyFunc = [&](const torch::Tensor& src_tensor, + const BlockInfo& dst_block, + size_t dst_byte_offset, + size_t copy_bytes) -> bool { + const size_t dst_bytes = dst_block.size_bytes; + if (dst_bytes < dst_byte_offset + copy_bytes) { + RTP_LLM_LOG_ERROR("dst block bytes[%zu] < dst_offset[%zu] + copy bytes[%zu] in writeKVBlockForTest(layer=%d)", + dst_bytes, + dst_byte_offset, + copy_bytes, + layer_id); + return false; + } + + auto* dst_ptr = static_cast(dst_block.addr) + dst_byte_offset; + auto dst_device = dst_block.is_cuda ? torch::kCUDA : torch::kCPU; + auto src_device = src_tensor.is_cuda() ? torch::kCUDA : torch::kCPU; + auto dst_t = torch::from_blob( + dst_ptr, {(int64_t)copy_bytes}, torch::TensorOptions().dtype(torch::kUInt8).device(dst_device)); + auto src_t = torch::from_blob(src_tensor.data_ptr(), + {(int64_t)copy_bytes}, + torch::TensorOptions().dtype(torch::kUInt8).device(src_device)); + dst_t.copy_(src_t); + return true; + }; + + if (!copyFunc(k_buffer, dst[0], 0, expected_k_bytes)) { + return false; + } + + if (!copyFunc(v_buffer, dst[0], expected_k_bytes, expected_v_bytes)) { + return false; + } + + cudaSyncAndCheck(); + return true; +} + +bool KVCacheManager::writeKVBlockForTest(int block_index, const torch::Tensor& k_buffer, const torch::Tensor& v_buffer) { + if (block_index < 0 || block_index >= config_.block_num) { + RTP_LLM_LOG_WARNING("Invalid block_index: %d, valid range: [0, %d)", block_index, config_.block_num); + return false; + } + + bool all_success = true; + for (int layer_id = 0; layer_id < config_.layer_num; ++layer_id) { + all_success = writeKVBlockForTest(block_index, 
layer_id, k_buffer, v_buffer) && all_success; + } + return all_success; } } // namespace rtp_llm diff --git a/rtp_llm/cpp/cache/KVCacheManager.h b/rtp_llm/cpp/cache/KVCacheManager.h index 7d84ed9f7f..69bc543ae2 100644 --- a/rtp_llm/cpp/cache/KVCacheManager.h +++ b/rtp_llm/cpp/cache/KVCacheManager.h @@ -3,6 +3,8 @@ #include #include #include +#include +#include #include #include @@ -10,7 +12,7 @@ #include "rtp_llm/cpp/cache/BufferTypes.h" #include "rtp_llm/cpp/cache/CacheConfig.h" #include "rtp_llm/cpp/cache/connector/AsyncContext.h" -#include "rtp_llm/cpp/cache/KVCacheAllocator.h" +#include "rtp_llm/cpp/cache/allocator/KVCacheAllocator.h" #include "rtp_llm/cpp/config/ConfigModules.h" #include "rtp_llm/cpp/cache/connector/KVCacheConnector.h" #include "rtp_llm/cpp/model_rpc/proto/model_rpc_service.grpc.pb.h" @@ -25,14 +27,15 @@ class KVCacheConnectorReadWriteContext; class KVCacheManager { public: KVCacheManager(const CacheConfig& config, - bool warmup = false, - const kmonitor::MetricsReporterPtr metrics_reporter = nullptr, - const KVCacheConfig& kv_cache_config = KVCacheConfig{}, - const ParallelismConfig& parallelism_config = ParallelismConfig{}, - const RuntimeConfig& runtime_config = RuntimeConfig{}, - const SpeculativeExecutionConfig& sp_config = SpeculativeExecutionConfig{}, - const PDSepConfig& pd_sep_config = PDSepConfig{}, - const CacheStoreConfig& cache_store_config = CacheStoreConfig{}); + bool warmup = false, + const kmonitor::MetricsReporterPtr metrics_reporter = nullptr, + const KVCacheConfig& kv_cache_config = KVCacheConfig{}, + const ParallelismConfig& parallelism_config = ParallelismConfig{}, + const RuntimeConfig& runtime_config = RuntimeConfig{}, + const SpeculativeExecutionConfig& sp_config = SpeculativeExecutionConfig{}, + const PDSepConfig& pd_sep_config = PDSepConfig{}, + const CacheStoreConfig& cache_store_config = CacheStoreConfig{}, + bool use_cuda_malloc_block_pool = false); ~KVCacheManager(); // 初始化和配置相关 @@ -60,16 +63,19 @@ class 
KVCacheManager { bool copy_last_block, std::vector& block_update_mapping); - // Write one KV block (optionally per-layer) from host/device tensors for test - virtual bool - setKVBlockValue(int block_index, int layer_id, const torch::Tensor& k_buffer, const torch::Tensor& v_buffer); - virtual bool setKVBlockValue(int block_index, const torch::Tensor& k_buffer, const torch::Tensor& v_buffer); - // 地址转换和缓冲区访问 BlockAddrInfo convertIndexToAddr(int block_index, int layer_id) const; std::vector convertIndexToBuffer(int block_index, int layer_id) const; std::vector - convertIndexToBuffer(int block_index, int layer_id, int partition_count, int partition_id) const; + convertIndexToBuffer(int block_index, int layer_id, int partition_count, int partition_id) const; + BlockAddrInfo convertIndexToAddr(int block_index, int layer_id, int group_id) const; + std::vector convertIndexToBuffer(int block_index, int layer_id, int group_id) const; + std::vector + convertIndexToBuffer(int block_index, int layer_id, int group_id, int partition_count, int partition_id) const; + BlockAddrInfo convertIndexToAddrByTag(int block_index, int layer_id, const std::string& tag) const; + std::vector convertIndexToBufferByTag(int block_index, int layer_id, const std::string& tag) const; + std::vector convertIndexToBufferByTag( + int block_index, int layer_id, const std::string& tag, int partition_count, int partition_id) const; CacheLayerLayout allLayerCacheBase() const; @@ -124,6 +130,18 @@ class KVCacheManager { std::shared_ptr incrKVCacheRef(const KVCacheResource& resource, const CacheKeysType& cache_keys, bool is_connector = true); + // CP page-level RR sharding context. Returns nullptr when sharding is not active + // (single-rank or kv_cache_sharded=false). Used by connector / cache_store to + // remap cacheKeys -> last-rank-key namespace. 
+ std::shared_ptr cpSlotMapper() const { + return cp_slot_mapper_; + } + + // Write one KV block (optionally per-layer) from host/device tensors for test + virtual bool + writeKVBlockForTest(int block_index, int layer_id, const torch::Tensor& k_buffer, const torch::Tensor& v_buffer); + virtual bool writeKVBlockForTest(int block_index, const torch::Tensor& k_buffer, const torch::Tensor& v_buffer); + private: void initConnectorCoordinator(); void allocateAndSync(); @@ -140,6 +158,9 @@ class KVCacheManager { const SpeculativeExecutionConfig sp_config_; const PDSepConfig pd_sep_config_; const CacheStoreConfig cache_store_config_; + const bool use_cuda_malloc_block_pool_; + + std::shared_ptr cp_slot_mapper_; std::atomic stop_{false}; std::thread metrics_reporter_thread_; @@ -150,4 +171,4 @@ class KVCacheManager { std::shared_ptr cache_store_; }; -} // namespace rtp_llm \ No newline at end of file +} // namespace rtp_llm diff --git a/rtp_llm/cpp/cache/KVCacheResource.cc b/rtp_llm/cpp/cache/KVCacheResource.cc index fda4bbbcb3..e7b72b36b6 100644 --- a/rtp_llm/cpp/cache/KVCacheResource.cc +++ b/rtp_llm/cpp/cache/KVCacheResource.cc @@ -1,5 +1,8 @@ #include "rtp_llm/cpp/cache/KVCacheResource.h" +#include + + namespace rtp_llm { size_t BlockIds::blocksNum() const { @@ -44,8 +47,18 @@ void BlockIds::remove(const std::vector& indices) { } void BlockIds::swap(size_t pos_a, size_t pos_b) { - RTP_LLM_CHECK(pos_a < block_indices.size()); - RTP_LLM_CHECK(pos_b < block_indices.size()); + if (pos_a >= block_indices.size() || pos_b >= block_indices.size()) { + RTP_LLM_LOG_ERROR("BlockIds::swap: pos_a=%d or pos_b=%d is out of range, block_indices.size()=%d", + pos_a, + pos_b, + block_indices.size()); + RTP_LLM_CHECK_WITH_INFO(false, + "BlockIds::swap: pos_a=%d or pos_b=%d is out of range, block_indices.size()=%d", + pos_a, + pos_b, + block_indices.size()); + } + if (pos_a == pos_b) { return; } @@ -109,36 +122,54 @@ void BlockIds::syncKernelBlocks() { } } -void 
KVCacheResource::initGroups(int group_num, - int layer_num, - const std::vector& layer_to_group_id, - size_t kernel_blocks_per_kv_block, - const std::vector& group_types) { +void KVCacheResource::initGroups(int group_num, + int layer_num, + const std::vector>& layer_group_ids, + size_t kernel_blocks_per_kv_block, + const std::vector& group_types) { group_block_ids.clear(); layer_block_ids.clear(); + layer_group_block_ids.clear(); + + if (!group_types.empty()) { + RTP_LLM_CHECK_WITH_INFO(group_types.size() >= static_cast(group_num), + "KVCacheResource::initGroups: group_types size %zu < group_num %d", + group_types.size(), + group_num); + } group_block_ids.reserve(static_cast(group_num)); for (int i = 0; i < group_num; i++) { - const bool is_full = group_types.empty() || group_types[static_cast(i)] == CacheGroupType::FULL; - const size_t group_kernel_blocks_per_kv_block = is_full ? kernel_blocks_per_kv_block : 1; - auto bid = std::make_shared(group_kernel_blocks_per_kv_block); + const bool is_full_group = group_types.empty() || group_types[static_cast(i)] == CacheGroupType::FULL; + const size_t bpk = is_full_group ? std::max(1, kernel_blocks_per_kv_block) : 1; + auto bid = std::make_shared(bpk); group_block_ids.push_back(std::move(bid)); } if (!group_block_ids.empty()) { - RTP_LLM_CHECK_WITH_INFO(layer_to_group_id.empty() || layer_to_group_id.size() >= static_cast(layer_num), - "KVCacheResource::initGroups: layer_to_group_id size %zu < layer_num %d", - layer_to_group_id.size(), + RTP_LLM_CHECK_WITH_INFO(layer_group_ids.size() >= static_cast(layer_num), + "KVCacheResource::initGroups: layer_group_ids size %zu < layer_num %d", + layer_group_ids.size(), layer_num); - layer_block_ids.resize(layer_num); - for (int i = 0; i < layer_num; ++i) { - int gid = layer_to_group_id.empty() ? 
0 : layer_to_group_id[i]; - RTP_LLM_CHECK_WITH_INFO(gid >= 0 && gid < group_num, - "KVCacheResource::initGroups: invalid group id %d for layer %d (group_num=%d)", - gid, - i, - group_num); - layer_block_ids[i] = group_block_ids[gid]; + layer_block_ids.resize(static_cast(layer_num)); + layer_group_block_ids.resize(static_cast(layer_num)); + for (int layer = 0; layer < layer_num; ++layer) { + auto& group_blocks = layer_group_block_ids[static_cast(layer)]; + group_blocks.assign(static_cast(group_num), nullptr); + + const auto& gids = layer_group_ids[static_cast(layer)]; + for (int gid : gids) { + RTP_LLM_CHECK_WITH_INFO( + gid >= 0 && gid < group_num, + "KVCacheResource::initGroups: invalid group id %d for layer %d (group_num=%d)", + gid, + layer, + group_num); + group_blocks[static_cast(gid)] = group_block_ids[static_cast(gid)]; + } + if (gids.size() == 1) { + layer_block_ids[static_cast(layer)] = group_block_ids[static_cast(gids.front())]; + } } } } @@ -159,16 +190,33 @@ const BlockIndicesType& KVCacheResource::blocks(int group_id) const { return group_block_ids[group_id]->blocks(); } +const BlockIndicesType& KVCacheResource::blocks(int layer_id, int group_id) const { + return mutableBlockIds(layer_id, group_id).blocks(); +} + const BlockIndicesType& KVCacheResource::kernelBlocks(int group_id) const { RTP_LLM_CHECK(group_block_ids.size() > static_cast(group_id)); return group_block_ids[group_id]->kernelBlocks(); } +const BlockIndicesType& KVCacheResource::kernelBlocks(int layer_id, int group_id) const { + return mutableBlockIds(layer_id, group_id).kernelBlocks(); +} + BlockIds& KVCacheResource::mutableBlockIds(int group_id) const { RTP_LLM_CHECK(group_block_ids.size() > static_cast(group_id)); return *group_block_ids[group_id]; } +BlockIds& KVCacheResource::mutableBlockIds(int layer_id, int group_id) const { + RTP_LLM_CHECK(static_cast(layer_id) < layer_group_block_ids.size()); + RTP_LLM_CHECK(static_cast(group_id) < 
layer_group_block_ids[static_cast(layer_id)].size()); + auto block_ids = layer_group_block_ids[static_cast(layer_id)][static_cast(group_id)]; + RTP_LLM_CHECK_WITH_INFO( + block_ids != nullptr, "KVCacheResource: missing block ids for layer %d group_id %d", layer_id, group_id); + return *block_ids; +} + int KVCacheResource::groupNums() const { return static_cast(group_block_ids.size()); } @@ -185,6 +233,19 @@ const LayerBlockIds& KVCacheResource::layerBlocks() const { return layer_block_ids; } +const LayerAttnBlockIds& KVCacheResource::layerGroupBlocks() const { + return layer_group_block_ids; +} + +int KVCacheResource::groupId(int layer_id, int group_id) const { + RTP_LLM_CHECK(static_cast(layer_id) < layer_group_block_ids.size()); + if (group_id < 0 || static_cast(group_id) >= layer_group_block_ids[static_cast(layer_id)].size() + || !layer_group_block_ids[static_cast(layer_id)][static_cast(group_id)]) { + return -1; + } + return group_id; +} + CacheKeysType& KVCacheResource::cacheKeys() { return cache_keys; } @@ -193,6 +254,62 @@ const CacheKeysType& KVCacheResource::cacheKeys() const { return cache_keys; } +void KVCacheResource::setCacheKeys(const CacheKeysType& keys) { + cache_keys = keys; + cache_keys_are_cp_canonical_ = false; + rebuildLinearBlockDependencies(); +} + +void KVCacheResource::setCacheKeys(CacheKeysType&& keys) { + cache_keys = std::move(keys); + cache_keys_are_cp_canonical_ = false; + rebuildLinearBlockDependencies(); +} + +bool KVCacheResource::cacheKeysAreCpCanonical() const { + return cache_keys_are_cp_canonical_; +} + +void KVCacheResource::setCacheKeysAreCpCanonical(bool cache_keys_are_cp_canonical) { + cache_keys_are_cp_canonical_ = cache_keys_are_cp_canonical; +} + +BlockDependenciesType& KVCacheResource::blockDependencies() { + return block_dependencies; +} + +const BlockDependenciesType& KVCacheResource::blockDependencies() const { + return block_dependencies; +} + +void KVCacheResource::setBlockDependencies(const BlockDependenciesType& 
dependencies) { + block_dependencies = dependencies; +} + +void KVCacheResource::setBlockDependencies(BlockDependenciesType&& dependencies) { + block_dependencies = std::move(dependencies); +} + +void KVCacheResource::rebuildLinearBlockDependencies() { + block_dependencies.clear(); + block_dependencies.reserve(cache_keys.size()); + for (size_t i = 0; i < cache_keys.size(); ++i) { + BlockDependency dependency; + dependency.ordinal = static_cast(i); + if (i > 0) { + dependency.has_parent = true; + dependency.parent_key = cache_keys[i - 1]; + } + block_dependencies.push_back(dependency); + } +} + +void KVCacheResource::ensureLinearBlockDependencies() { + if (block_dependencies.size() != cache_keys.size()) { + rebuildLinearBlockDependencies(); + } +} + size_t KVCacheResource::reuseBlockNum() const { return device_reuse_block_num_ + memory_reuse_block_num_ + remote_reuse_block_num_; } diff --git a/rtp_llm/cpp/cache/KVCacheResource.h b/rtp_llm/cpp/cache/KVCacheResource.h index a1ebe5219a..7de9c6c244 100644 --- a/rtp_llm/cpp/cache/KVCacheResource.h +++ b/rtp_llm/cpp/cache/KVCacheResource.h @@ -6,7 +6,7 @@ #include #include -#include "rtp_llm/cpp/cache/CacheGroupType.h" +#include "rtp_llm/cpp/cache/spec/CacheGroupType.h" #include "rtp_llm/cpp/utils/AssertUtils.h" namespace rtp_llm { @@ -23,6 +23,14 @@ inline bool isNullBlockIdx(BlockIdxType block_idx) { using CacheKeysType = std::vector; using BlockIndicesType = std::vector; +struct BlockDependency { + bool has_parent{false}; + CacheKeyType parent_key{0}; + uint32_t ordinal{0}; +}; + +using BlockDependenciesType = std::vector; + class BlockIds { public: explicit BlockIds(size_t kernel_blocks_per_kv_block = 1): @@ -67,32 +75,64 @@ class BlockIds { size_t kernel_blocks_per_kv_block_ = 1; }; -using GroupBlockIds = std::vector>; -using LayerBlockIds = std::vector>; +using GroupBlockIds = std::vector>; +using LayerBlockIds = std::vector>; +using LayerAttnBlockIds = std::vector>>; class KVCacheResource { public: - void 
initGroups(int group_num, - int layer_num, - const std::vector& layer_to_group_id = {}, - size_t kernel_blocks_per_kv_block = 1, - const std::vector& group_types = {}); + void initGroups(int group_num, + int layer_num, + const std::vector>& layer_group_ids = {}, + size_t kernel_blocks_per_kv_block = 1, + const std::vector& group_types = {}); void resizeBlocks(int reserver_blocks, int value = 0); int blocksNum(int group_id = 0) const; const BlockIndicesType& blocks(int group_id = 0) const; + const BlockIndicesType& blocks(int layer_id, int group_id) const; const BlockIndicesType& kernelBlocks(int group_id = 0) const; + const BlockIndicesType& kernelBlocks(int layer_id, int group_id) const; BlockIds& mutableBlockIds(int group_id = 0) const; + BlockIds& mutableBlockIds(int layer_id, int group_id) const; int groupNums() const; GroupBlockIds& groupBlocks(); const GroupBlockIds& groupBlocks() const; - const LayerBlockIds& layerBlocks() const; + const LayerBlockIds& layerBlocks() const; + const LayerAttnBlockIds& layerGroupBlocks() const; + int groupId(int layer_id, int group_id) const; CacheKeysType& cacheKeys(); const CacheKeysType& cacheKeys() const; + void setCacheKeys(const CacheKeysType& keys); + void setCacheKeys(CacheKeysType&& keys); + bool cacheKeysAreCpCanonical() const; + void setCacheKeysAreCpCanonical(bool cache_keys_are_cp_canonical); + + BlockDependenciesType& blockDependencies(); + const BlockDependenciesType& blockDependencies() const; + void setBlockDependencies(const BlockDependenciesType& dependencies); + void setBlockDependencies(BlockDependenciesType&& dependencies); + void rebuildLinearBlockDependencies(); + void ensureLinearBlockDependencies(); + + // Return rank-local cache keys: every cp_size-th key starting from cp_rank. + // localCacheKeys(r, s)[i] == cacheKeys()[i * s + r] + // Note: when cacheKeys().size() % cp_size != 0 (e.g. 1 real block, cp_size=2), + // localCacheKeys may return fewer entries than blocks().size(). 
This is + // intentional — padding blocks carry no real data and must NOT participate in + // device cache insert, PD transfer, or connector operations. Downstream code + // (e.g. insertIntoCache) already uses min(keys, blocks) to handle this. + CacheKeysType localCacheKeys(int cp_rank, int cp_size) const { + CacheKeysType local; + for (int i = cp_rank; i < static_cast(cache_keys.size()); i += cp_size) { + local.push_back(cache_keys[i]); + } + return local; + } size_t reuseBlockNum() const; @@ -108,9 +148,6 @@ class KVCacheResource { bool lastBlockAligned() const; void setLastBlockAligned(bool last_block_aligned); - size_t remoteReuseBlocksNum() const; - void setRemoteReuseBlocksNum(size_t remote_reuse_blocks_num); - void swapBlocks(size_t group_id, size_t rhs, size_t lhs); std::string debugString() const; @@ -118,9 +155,13 @@ class KVCacheResource { private: // layer_id -> block_indices LayerBlockIds layer_block_ids; + // layer_id -> group_id -> block_indices + LayerAttnBlockIds layer_group_block_ids; // group_id -> block_indices GroupBlockIds group_block_ids; CacheKeysType cache_keys; + BlockDependenciesType block_dependencies; + bool cache_keys_are_cp_canonical_{false}; size_t device_reuse_block_num_{0}; size_t memory_reuse_block_num_{0}; diff --git a/rtp_llm/cpp/cache/KVCacheSpecBase.h b/rtp_llm/cpp/cache/KVCacheSpecBase.h deleted file mode 100644 index d5d192ce1f..0000000000 --- a/rtp_llm/cpp/cache/KVCacheSpecBase.h +++ /dev/null @@ -1,86 +0,0 @@ -#pragma once - -#include -#include -#include - -#include "rtp_llm/cpp/cache/Types.h" -#include "rtp_llm/cpp/config/ConfigModules.h" -#include "rtp_llm/cpp/utils/AssertUtils.h" -#include "rtp_llm/models_py/bindings/core/Types.h" -#include "rtp_llm/cpp/model_utils/AttentionConfig.h" - -namespace rtp_llm { - -enum KVCacheSpecType { - MultiHeadAttention, - MultiHeadLatentAttention, - LinearAttention, -}; - -inline const char* KVCacheSpecTypeToString(KVCacheSpecType t) { - switch (t) { - case 
KVCacheSpecType::MultiHeadAttention: - return "MultiHeadAttention"; - case KVCacheSpecType::MultiHeadLatentAttention: - return "MultiHeadLatentAttention"; - case KVCacheSpecType::LinearAttention: - return "LinearAttention"; - default: - return "Unknown"; - } -} - -struct KVCacheSpec { - uint32_t layer_num; - uint32_t local_head_num_kv; - uint32_t seq_size_per_block = 1; - - KVCacheSpecType type; - rtp_llm::DataType dtype; - - virtual size_t block_size() const = 0; - virtual size_t k_block_size() const = 0; - virtual size_t v_block_size() const = 0; - - virtual size_t block_size_bytes() const = 0; - virtual size_t k_block_size_bytes() const = 0; - virtual size_t v_block_size_bytes() const = 0; - - virtual size_t scale_block_size_bytes() const { - return 0; - } - virtual size_t k_scale_block_size_bytes() const { - return 0; - } - virtual size_t v_scale_block_size_bytes() const { - return 0; - } - - virtual std::string debugString(size_t indent = 0) const = 0; - -protected: - // Helper method to generate common parts of debug string - std::string commonDebugString(size_t indent = 0) const { - const std::string indent_str = std::string(indent, ' '); - const std::string indent1 = indent_str + " "; - - std::ostringstream os; - os << indent1 << "type=" << KVCacheSpecTypeToString(type) << "(" << static_cast(type) << ")\n"; - os << indent1 << "dtype=" << static_cast(dtype) << "\n"; - os << indent1 << "layer_num=" << layer_num << "\n"; - os << indent1 << "local_head_num_kv=" << local_head_num_kv << "\n"; - os << indent1 << "seq_size_per_block=" << seq_size_per_block << "\n"; - os << indent1 << "block_size=" << block_size() << "\n"; - os << indent1 << "k_block_size=" << k_block_size() << "\n"; - os << indent1 << "v_block_size=" << v_block_size() << "\n"; - os << indent1 << "block_size_bytes=" << block_size_bytes() << "\n"; - os << indent1 << "k_block_size_bytes=" << k_block_size_bytes() << "\n"; - os << indent1 << "v_block_size_bytes=" << v_block_size_bytes() << "\n"; - 
return os.str(); - } -}; - -typedef std::shared_ptr KVCacheSpecPtr; - -} // namespace rtp_llm \ No newline at end of file diff --git a/rtp_llm/cpp/cache/KVCacheTransferPlanner.cc b/rtp_llm/cpp/cache/KVCacheTransferPlanner.cc new file mode 100644 index 0000000000..b6ce2d895a --- /dev/null +++ b/rtp_llm/cpp/cache/KVCacheTransferPlanner.cc @@ -0,0 +1,121 @@ +#include "rtp_llm/cpp/cache/KVCacheTransferPlanner.h" + +#include + +namespace rtp_llm { + +std::vector blockPositionsForCacheTransfer(size_t block_num, + size_t reuse_block_size, + bool use_hybrid, + CacheGroupType group_type, + bool hybrid_full_from_begin) { + return blockPositionsForCacheTransfer( + block_num, + reuse_block_size, + use_hybrid, + /*transfer_tail_blocks=*/group_type != CacheGroupType::FULL, + static_cast(defaultCacheGroupPolicy(group_type).active_tail_blocks), + hybrid_full_from_begin); +} + +std::vector blockPositionsForCacheTransfer(size_t block_num, + size_t reuse_block_size, + bool use_hybrid, + bool transfer_tail_blocks, + size_t tail_block_count, + bool hybrid_full_from_begin) { + std::vector block_pos_list; + block_pos_list.reserve(block_num); + if (use_hybrid && block_num > 0 && transfer_tail_blocks) { + const size_t tail_count = std::max(1, tail_block_count); + const size_t start = block_num > tail_count ? block_num - tail_count : 0; + for (size_t block_pos = start; block_pos < block_num; ++block_pos) { + block_pos_list.push_back(block_pos); + } + return block_pos_list; + } + const size_t start = use_hybrid && hybrid_full_from_begin ? 
0 : reuse_block_size; + for (size_t block_pos = start; block_pos < block_num; ++block_pos) { + block_pos_list.push_back(block_pos); + } + return block_pos_list; +} + +std::vector buildCacheStoreBlockPlan(size_t total_logical_blocks, + size_t reuse_block_size, + bool use_hybrid, + CacheGroupType group_type, + int cp_rank, + int cp_size) { + const auto policy = defaultCacheGroupPolicy(group_type); + return buildCacheStoreBlockPlan(total_logical_blocks, + reuse_block_size, + use_hybrid, + /*cp_shardable=*/group_type == CacheGroupType::FULL, + /*cp_compact_tail_blocks=*/group_type == CacheGroupType::SWA, + static_cast(policy.active_tail_blocks), + cp_rank, + cp_size); +} + +std::vector buildCacheStoreBlockPlan(size_t total_logical_blocks, + size_t reuse_block_size, + bool use_hybrid, + bool cp_shardable, + bool cp_compact_tail_blocks, + size_t tail_block_count, + int cp_rank, + int cp_size) { + std::vector plan; + + const bool sharded_full = (cp_size > 1) && cp_shardable; + const bool compact_swa_by_cp = (cp_size > 1) && cp_compact_tail_blocks; + if (compact_swa_by_cp) { + const size_t cp_size_t = static_cast(cp_size); + const size_t canonical_blocks = (total_logical_blocks + cp_size_t - 1) / cp_size_t; + const size_t tail_count = std::max(1, tail_block_count); + const size_t start = use_hybrid ? (canonical_blocks > tail_count ? 
canonical_blocks - tail_count : 0) : + std::min(reuse_block_size, canonical_blocks); + plan.reserve(canonical_blocks - start); + for (size_t compact_idx = start; compact_idx < canonical_blocks; ++compact_idx) { + const size_t key_index = std::min((compact_idx + 1) * cp_size_t - 1, total_logical_blocks - 1); + plan.push_back({static_cast(key_index), static_cast(compact_idx)}); + } + return plan; + } + + auto positions = blockPositionsForCacheTransfer(total_logical_blocks, + reuse_block_size, + use_hybrid, + /*transfer_tail_blocks=*/tail_block_count > 0, + tail_block_count, + /*hybrid_full_from_begin=*/true); + + plan.reserve(positions.size()); + + if (!sharded_full && !compact_swa_by_cp) { + for (auto pos : positions) { + const int p = static_cast(pos); + plan.push_back({p, p}); + } + return plan; + } + for (auto pos : positions) { + const int p = static_cast(pos); + if (p % cp_size != cp_rank) { + continue; + } + plan.push_back({p, p / cp_size}); + } + return plan; +} + +std::string layerTagCacheTransferKey(size_t request_id, size_t layer_id, const std::string& tag) { + auto key = std::to_string(request_id) + "-" + std::to_string(layer_id); + if (!tag.empty() && tag != "default") { + key += "-tag-" + tag; + } + return key; +} + +} // namespace rtp_llm diff --git a/rtp_llm/cpp/cache/KVCacheTransferPlanner.h b/rtp_llm/cpp/cache/KVCacheTransferPlanner.h new file mode 100644 index 0000000000..06afad8687 --- /dev/null +++ b/rtp_llm/cpp/cache/KVCacheTransferPlanner.h @@ -0,0 +1,66 @@ +#pragma once + +#include +#include +#include + +#include "rtp_llm/cpp/cache/spec/CacheGroupType.h" + +namespace rtp_llm { + +std::vector blockPositionsForCacheTransfer( + size_t block_num, size_t reuse_block_size, bool use_hybrid, CacheGroupType group_type, bool hybrid_full_from_begin); +std::vector blockPositionsForCacheTransfer(size_t block_num, + size_t reuse_block_size, + bool use_hybrid, + bool transfer_tail_blocks, + size_t tail_block_count, + bool hybrid_full_from_begin); + 
+std::string layerTagCacheTransferKey(size_t request_id, size_t layer_id, const std::string& tag); + +// One iteration step of cache_store registration: pair the cache_key at +// ``key_index`` (FULL-length namespace) with the kv_cache_offset slot at +// ``offset_index`` (rank-local namespace). Outside CP-page-RR sharding the +// two are equal; under sharding they diverge for FULL groups (see below). +struct CacheStoreBlockPair { + int key_index; + int offset_index; +}; + +// Build the per-prefill-write iteration plan for cache_store registration. +// +// Background: ``cache_keys`` is always the FULL logical-block hash sequence +// (length = total_logical_blocks). ``kv_cache_offset`` is per-group and +// per-rank: for non-FULL groups every rank holds the full block list (length +// = total_logical_blocks), for FULL groups under CP-page-RR sharding each +// rank holds only the 1/cp_size logical blocks it owns, **compactly**, in +// the order they appear within the rank — i.e. local index ``i`` ↔ logical +// position ``cp_rank + i*cp_size``. +// +// To register the right key with the right buffer the planner emits: +// * (pos, pos) — non-CP / non-FULL groups +// * (cp_rank + i*cp_size, i) for owned i — CP-sharded FULL groups +// * ((i+1)*cp_size-1, i) — CP-compact SWA/fixed groups +// +// Without this re-pairing the prefill side advertises ``cache_keys[i]`` +// (== key for logical position i) attached to data from logical position +// ``cp_rank + i*cp_size`` — decode then receives content shifted by +// ``cp_rank`` slots and produces coherent-but-wrong output (DSV4 PD reuse +// regression seen 2026-05-12). 
+std::vector buildCacheStoreBlockPlan(size_t total_logical_blocks, + size_t reuse_block_size, + bool use_hybrid, + CacheGroupType group_type, + int cp_rank, + int cp_size); +std::vector buildCacheStoreBlockPlan(size_t total_logical_blocks, + size_t reuse_block_size, + bool use_hybrid, + bool cp_shardable, + bool cp_compact_tail_blocks, + size_t tail_block_count, + int cp_rank, + int cp_size); + +} // namespace rtp_llm diff --git a/rtp_llm/cpp/cache/LinearKVCacheGroup.cc b/rtp_llm/cpp/cache/LinearKVCacheGroup.cc deleted file mode 100644 index 45e7989279..0000000000 --- a/rtp_llm/cpp/cache/LinearKVCacheGroup.cc +++ /dev/null @@ -1,210 +0,0 @@ -#include "rtp_llm/cpp/cache/LinearKVCacheGroup.h" - -#include -#include - -#include "rtp_llm/cpp/utils/Logger.h" - -namespace rtp_llm { - -void LinearKVCacheGroup::filterValidBlocks(const BlockIndicesType& in, BlockIndicesType& out) const { - out.clear(); - out.reserve(in.size()); - for (auto b : in) { - if (!isNullBlockIdx(b)) { - out.push_back(b); - } - } -} - -int LinearKVCacheGroup::needBlocksNum(int seq_len, int current_blocks, int reserve_step) const { - int extra_blocks = reserve_step ? reserve_step - 1 : 0; - return std::max((seq_len + seq_size_per_block_ - 1) / seq_size_per_block_ + extra_blocks - current_blocks, 0); -} - -NeedBlocksInfo LinearKVCacheGroup::getNeedBlocks( - int common_seq_len, int seq_len, int reserve_step, int reuse_blocks_len, bool reuse_enabled) const { - const int reuse_begin = reuse_blocks_len; - const int step = std::max(1, linear_step_); - - // calculate the number of blocks in the range (begin, end] - auto count_linear_sparse_range = [&](int begin, int end) -> int { - if (end <= begin) { - return 0; - } - if (!reuse_enabled) { - // keeps only the tail block - return 1; - } - const int eligible = (end + 1) / step - (begin + 1) / step; - const int tail = ((end + 1) % step == 0) ? 
0 : 1; - return eligible + tail; - }; - - NeedBlocksInfo info; - - // common_slots: blocks for common_seq_len (no reserve) - const int common_slots = needBlocksNum(common_seq_len, 0); - // seq_slots: blocks for seq_len (no reserve) - const int seq_slots = needBlocksNum(seq_len, 0); - // total_slots = seq_slots + reserve_step - const int total_slots = needBlocksNum(seq_len, 0, reserve_step); - - info.common_blocks = count_linear_sparse_range(reuse_begin, common_slots); - info.extra_blocks = count_linear_sparse_range(common_slots, seq_slots); - info.extra_blocks += std::max(total_slots - seq_slots, 0); // for reserve_step - - info.common_blocks = std::max(info.common_blocks, 0); - info.extra_blocks = std::max(info.extra_blocks, 0); - return info; -} - -MatchResult LinearKVCacheGroup::matchSingleKey(CacheKeyType cache_key) const { - MatchResult result; - auto matched = block_cache_->match(cache_key, group_id_); - if (!isNullBlockIdx(matched.matched_index)) { - result.block_indices = {matched.matched_index}; - } - return result; -} - -MatchResult LinearKVCacheGroup::match(const CacheKeysType& cache_keys) { - return {}; -} - -bool LinearKVCacheGroup::malloc(BlockIds& block_ids, int seq_len, bool enable_reuse_cache, int reserve_step) { - const int step = std::max(1, linear_step_); - const int current_blocks_len = static_cast(block_ids.blocksNum()); - const int seq_slots = needBlocksNum(seq_len, 0, 0); - const int new_blocks_len = needBlocksNum(seq_len, current_blocks_len, reserve_step); - - if (new_blocks_len == 0) { - return true; - } - - // LinearKVCacheGroup::malloc is responsible for: - // 1. allocating blocks for the current sequence length; - // 2. free unused blocks to reduce kvcache block usage; - - // Two policies to follow: - // 1. Linear Steps: keep N * linear_step blocks if cache reuse enabled; - // 2. 
Allocate Tail Blocks: allocate the last partial block when initialization and keep last 2 block during - // decoding; - - int need_alloc_blocks = 0; - - for (int i = current_blocks_len; i < current_blocks_len + new_blocks_len; i++) { - const bool is_seq_tail = (seq_slots > 0) && (i == seq_slots - 1); - const bool is_reserve = (reserve_step > 0) && (i >= seq_slots); - const bool step_hit = (((i + 1) % step) == 0); - const bool should_alloc = is_reserve || (enable_reuse_cache ? (step_hit || is_seq_tail) : is_seq_tail); - if (should_alloc) { - need_alloc_blocks++; - } - } - - if (need_alloc_blocks > 0) { - const auto free_blocks_num = freeBlocksNum(); - if (free_blocks_num < static_cast(need_alloc_blocks)) { - if (!ensureFreeBlocks(need_alloc_blocks)) { - RTP_LLM_LOG_WARNING("Insufficient free blocks for LinearKVCacheGroup: need %d, have %zu", - need_alloc_blocks, - free_blocks_num); - return false; - } - } - } - - BlockIndicesType new_ids; - new_ids.reserve(static_cast(new_blocks_len)); - for (int i = current_blocks_len; i < current_blocks_len + new_blocks_len; i++) { - const bool is_seq_tail = (seq_slots > 0) && (i == seq_slots - 1); - const bool is_reserve = (reserve_step > 0) && (i >= seq_slots); - const bool step_hit = (((i + 1) % step) == 0); - const bool should_alloc = is_reserve || (enable_reuse_cache ? 
(step_hit || is_seq_tail) : is_seq_tail); - if (should_alloc) { - auto result = block_pool_->malloc(1); - if (result.empty()) { - return false; - } - new_ids.push_back(result[0]); - } else { - new_ids.push_back(NULL_BLOCK_IDX); - } - } - block_ids.add(new_ids); - return true; -} - -void LinearKVCacheGroup::insertIntoCache(const CacheKeysType& cache_keys, - const BlockIndicesType& block_indices, - bool is_resident) { - if (cache_keys.empty() || block_indices.empty()) { - return; - } - const size_t n = std::min(cache_keys.size(), block_indices.size()); - for (size_t i = 0; i < n; ++i) { - const auto b = block_indices[i]; - if (isNullBlockIdx(b)) { - continue; - } - BlockCache::CacheItem item; - item.cache_key = cache_keys[i]; - item.group_id = group_id_; - item.block_index = b; - item.is_resident = is_resident; - if (block_cache_->put(item)) { - block_pool_->blockCacheReference(b); - } - } -} - -void LinearKVCacheGroup::removeSkippedBlocks(BlockIds& block_ids, bool enable_reuse_cache, int reserve_step) { - const auto& block_indices = block_ids.blocks(); // const view for reading current state - if (block_indices.empty()) { - return; - } - const int step = std::max(1, linear_step_); - const int block_size = static_cast(block_indices.size()); - - BlockIndicesType blocks_to_free; - std::vector pos_to_remove; - // keep last 2 and every reserve_step - for (int i = block_size - 3 - reserve_step; i >= 0; i--) { - if (isNullBlockIdx(block_indices[i])) { - break; - } - if (enable_reuse_cache && ((i + 1) % step) == 0) { - continue; - } - blocks_to_free.push_back(block_indices[i]); - pos_to_remove.push_back(static_cast(i)); - } - if (!blocks_to_free.empty()) { - block_pool_->requestFree(blocks_to_free); - block_ids.remove(pos_to_remove); // null-out by position, updates kernel slots incrementally - } -} - -void LinearKVCacheGroup::free(const BlockIndicesType& block_indices) { - if (block_indices.empty()) { - return; - } - BlockIndicesType valid; - 
filterValidBlocks(block_indices, valid); - if (valid.empty()) { - return; - } - block_pool_->requestFree(valid); -} - -void LinearKVCacheGroup::reference(BlockIds& block_ids, const BlockIndicesType& new_block_indices) { - block_ids.add(new_block_indices); - BlockIndicesType valid; - filterValidBlocks(new_block_indices, valid); - if (!valid.empty()) { - block_pool_->requestReference(valid); - } -} - -} // namespace rtp_llm diff --git a/rtp_llm/cpp/cache/MemoryLayoutConfig.h b/rtp_llm/cpp/cache/MemoryLayoutConfig.h index 088493556f..028f4ea09c 100644 --- a/rtp_llm/cpp/cache/MemoryLayoutConfig.h +++ b/rtp_llm/cpp/cache/MemoryLayoutConfig.h @@ -37,6 +37,13 @@ struct MemoryLayoutConfig { size_t local_head_num_kv = 0; size_t seq_size_per_block = 0; + // Number of kernel blocks packed inside one BlockPool block. When > 1, + // BlockPool allocates physical blocks (each = bpk × kernel block bytes), but + // kernels still address by kernel-block id; MemoryLayoutStrategy reshapes the + // KV tensor as (layer, block_num × bpk, kv_block_stride_bytes / bpk) so the + // kernel view sees per-kernel-block strides. 
+ size_t kernel_blocks_per_kv_block = 1; + bool enable_kv_scale = false; bool enable_hybrid_attention = false; diff --git a/rtp_llm/cpp/cache/MemoryLayoutStrategy.cc b/rtp_llm/cpp/cache/MemoryLayoutStrategy.cc index 91e555a0e7..8e86cb5c3b 100644 --- a/rtp_llm/cpp/cache/MemoryLayoutStrategy.cc +++ b/rtp_llm/cpp/cache/MemoryLayoutStrategy.cc @@ -1,7 +1,7 @@ #include "rtp_llm/cpp/cache/MemoryLayoutStrategy.h" #include "rtp_llm/models_py/bindings/core/torch_utils/TypeConvert.h" #include "rtp_llm/cpp/utils/Logger.h" -#include "rtp_llm/cpp/cache/KVCacheSpec.h" +#include "rtp_llm/cpp/cache/spec/KVCacheSpec.h" namespace rtp_llm { @@ -74,10 +74,24 @@ void MemoryLayoutStrategy::processKVTensor(torch::Tensor& kv_cache_tensor) { torch::str(layer_kv_tensors_[layer_id].sizes()).c_str()); } } else { - // MHA: [layer_num, block_num, kv_block_stride_elems], per layer 2D - torch::Tensor reshaped_tensor = kv_cache_typed.reshape({static_cast(config_.layer_num), - static_cast(config_.block_num), - static_cast(kv_block_stride_elems)}); + // MHA: [layer_num, block_num, kv_block_stride_elems], per layer 2D. + // When kernel_blocks_per_kv_block > 1 (e.g. DSV4 paged FULL pools with + // physical block > 256 tokens), reshape into the kernel-block view — + // (layer_num, block_num × bpk, kv_block_stride_elems / bpk) — so that + // kernels addressing by kernel-block id see per-kernel-block strides. + // The underlying memory is identical, only the shape interpretation + // changes; entries_per_block derived from tensor stride stays at the + // kernel-block size (e.g. 64 for compress_ratio=4) and FlashMLA's + // template instantiation constraint (block_kv == 64) holds. 
+ const size_t bpk = std::max(1, config_.kernel_blocks_per_kv_block); + RTP_LLM_CHECK_WITH_INFO(kv_block_stride_elems % bpk == 0, + "kv_block_stride_elems(%zu) must be divisible by kernel_blocks_per_kv_block(%zu)", + kv_block_stride_elems, + bpk); + const int64_t kernel_block_count = static_cast(config_.block_num) * static_cast(bpk); + const int64_t kernel_block_stride_elems = static_cast(kv_block_stride_elems / bpk); + torch::Tensor reshaped_tensor = kv_cache_typed.reshape( + {static_cast(config_.layer_num), kernel_block_count, kernel_block_stride_elems}); clearKVTensor(reshaped_tensor); for (uint32_t layer_id = 0; layer_id < config_.layer_num; ++layer_id) { layer_kv_tensors_.push_back(reshaped_tensor[layer_id]); @@ -215,13 +229,29 @@ std::vector MemoryLayoutStrategy::createBasicBlockInfo(int layer_id, checkLayerIdValidity(layer_id); auto& layer_tensor = layer_kv_tensors_[layer_id]; - void* kv_addr = getBlockPtr(layer_tensor, block_id); - auto kv_info = makeBlockInfo(layer_tensor, kv_addr, static_cast(config_.kv_block_stride_bytes)); + void* kv_addr = nullptr; + if (config_.kernel_blocks_per_kv_block > 1) { + RTP_LLM_CHECK_WITH_INFO(block_id >= 0 && static_cast(block_id) < config_.block_num, + "Physical block ID %d out of range (max: %zu)", + block_id, + config_.block_num); + kv_addr = + static_cast(layer_tensor.data_ptr()) + static_cast(block_id) * config_.kv_block_stride_bytes; + } else { + kv_addr = getBlockPtr(layer_tensor, block_id); + } + auto kv_info = makeBlockInfo(layer_tensor, kv_addr, static_cast(config_.kv_block_stride_bytes)); if (config_.hasScale()) { auto& layer_scale_tensor = layer_kv_scale_tensors_[layer_id]; - void* kv_scale_addr = getBlockPtr(layer_scale_tensor, block_id); - auto scale_info = + void* kv_scale_addr = nullptr; + if (config_.kernel_blocks_per_kv_block > 1) { + kv_scale_addr = static_cast(layer_scale_tensor.data_ptr()) + + static_cast(block_id) * config_.kv_scale_stride_bytes; + } else { + kv_scale_addr = 
getBlockPtr(layer_scale_tensor, block_id); + } + auto scale_info = makeBlockInfo(layer_scale_tensor, kv_scale_addr, static_cast(config_.kv_scale_stride_bytes)); return {kv_info, scale_info}; } @@ -307,4 +337,4 @@ void MemoryLayoutStrategy::checkLayerIdValidity(int layer_id) const { layer_kv_tensors_.size()); } -} // namespace rtp_llm \ No newline at end of file +} // namespace rtp_llm diff --git a/rtp_llm/cpp/cache/SharedBlockCache.cc b/rtp_llm/cpp/cache/SharedBlockCache.cc new file mode 100644 index 0000000000..537af2f902 --- /dev/null +++ b/rtp_llm/cpp/cache/SharedBlockCache.cc @@ -0,0 +1,904 @@ +#include "rtp_llm/cpp/cache/SharedBlockCache.h" + +#include + +#include "rtp_llm/cpp/utils/Logger.h" +#include "rtp_llm/cpp/utils/ProfilingScope.h" +#include "rtp_llm/cpp/utils/TimeUtil.h" + +namespace rtp_llm { + +void SharedBlockCache::init(int group_num, const std::vector& group_pools) { + std::lock_guard lock(mu_); + RTP_LLM_CHECK_WITH_INFO(static_cast(group_pools.size()) == group_num, + "group_pools size %zu != group_num %d", + group_pools.size(), + group_num); + group_num_ = group_num; + group_pools_ = group_pools; +} + +void SharedBlockCache::put(CacheKeyType cache_key, const std::vector& group_slots, bool is_resident) { + BlockDependency dependency; + put(cache_key, group_slots, is_resident, kDefaultNamespace, dependency); +} + +void SharedBlockCache::put(CacheKeyType cache_key, + const std::vector& group_slots, + bool is_resident, + NamespaceId namespace_id, + const BlockDependency& dependency, + const std::vector& matchable_slots) { + RTP_LLM_PROFILE_FUNCTION(); + std::lock_guard lock(mu_); + + if (lru_cache_.contains(cache_key)) { + auto [success, existing_item] = lru_cache_.get(cache_key); + if (success) { + const auto now_us = currentTimeUs(); + const bool resident = existing_item.is_resident || is_resident; + if (resident != existing_item.is_resident) { + existing_item.is_resident = resident; + } + const bool dependency_updated = 
updateItemDependencyLocked(existing_item, namespace_id, dependency); + bool updated = false; + for (size_t gid = 0; gid < group_slots.size(); ++gid) { + if (isNullBlockIdx(group_slots[gid])) { + continue; + } + if (gid >= existing_item.slots.size()) { + existing_item.slots.resize(gid + 1, NULL_BLOCK_IDX); + } + if (gid >= existing_item.matchable_slots.size()) { + existing_item.matchable_slots.resize(gid + 1, true); + } + if (gid >= existing_item.slot_created_time_us.size()) { + existing_item.slot_created_time_us.resize(gid + 1, 0); + } + if (isNullBlockIdx(existing_item.slots[gid])) { + existing_item.slots[gid] = group_slots[gid]; + existing_item.slot_created_time_us[gid] = now_us; + existing_item.matchable_slots[gid] = + matchable_slots.empty() || gid >= matchable_slots.size() ? true : matchable_slots[gid]; + updated = true; + if (static_cast(gid) < group_num_) { + group_pools_[gid]->blockCacheReference(group_slots[gid]); + } + } else if (!matchable_slots.empty() && gid < matchable_slots.size() && matchable_slots[gid] + && !existing_item.matchable_slots[gid]) { + existing_item.matchable_slots[gid] = true; + updated = true; + } + } + if (updated || existing_item.is_resident || dependency_updated) { + lru_cache_.put(cache_key, existing_item); + ++version_; + } + if (existing_item.is_resident) { + markAllTreeAliasesResidentLocked(cache_key); + } + upsertTreeNodeLocked(cache_key, namespace_id, dependency, existing_item.is_resident); + refreshAllTreeAliasesLocked(cache_key); + } + return; + } + + UnifiedCacheItem item; + const auto now_us = currentTimeUs(); + item.cache_key = cache_key; + item.is_resident = is_resident; + item.slots = group_slots; + item.created_time_us = now_us; + item.matchable_slots.resize(group_slots.size(), true); + item.slot_created_time_us.resize(group_slots.size(), 0); + for (size_t gid = 0; gid < group_slots.size() && gid < matchable_slots.size(); ++gid) { + item.matchable_slots[gid] = matchable_slots[gid]; + } + for (size_t gid = 0; gid < 
group_slots.size(); ++gid) { + if (!isNullBlockIdx(group_slots[gid])) { + item.slot_created_time_us[gid] = now_us; + } + } + updateItemDependencyLocked(item, namespace_id, dependency); + + lru_cache_.put(cache_key, item); + ++version_; + upsertTreeNodeLocked(cache_key, namespace_id, dependency, item.is_resident); + refreshAllTreeAliasesLocked(cache_key); + + for (int gid = 0; gid < static_cast(group_slots.size()) && gid < group_num_; ++gid) { + if (!isNullBlockIdx(group_slots[gid])) { + group_pools_[gid]->blockCacheReference(group_slots[gid]); + } + } +} + +SharedBlockCache::MatchResult SharedBlockCache::match(CacheKeyType cache_key) { + RTP_LLM_PROFILE_FUNCTION(); + std::lock_guard lock(mu_); + + auto [success, item] = lru_cache_.get(cache_key); + if (!success) { + return {false, {}}; + } + touchTreeAliasesLocked(cache_key); + return {true, item.slots}; +} + +BlockIdxType SharedBlockCache::matchGroup(CacheKeyType cache_key, int group_id) { + RTP_LLM_PROFILE_FUNCTION(); + std::lock_guard lock(mu_); + + auto [success, item] = lru_cache_.get(cache_key); + if (!success) { + return NULL_BLOCK_IDX; + } + touchTreeAliasesLocked(cache_key); + if (group_id < 0 || static_cast(group_id) >= item.slots.size()) { + return NULL_BLOCK_IDX; + } + if (!slotMatchable(item, static_cast(group_id))) { + return NULL_BLOCK_IDX; + } + const auto block = item.slots[group_id]; + return block; +} + +SharedBlockCache::EvictResult SharedBlockCache::selectAndEvict(size_t min_blocks) { + RTP_LLM_PROFILE_FUNCTION(); + std::lock_guard lock(mu_); + + EvictResult result; + if (lru_cache_.empty() || min_blocks == 0) { + return result; + } + + if (prefix_tree_enabled_ && !leaf_lru_.empty()) { + size_t selected_blocks = 0; + while (selected_blocks < min_blocks && !leaf_lru_.empty()) { + const auto leaf = *leaf_lru_.begin(); + const auto leaf_key = NamespacedKey{leaf.namespace_id, leaf.cache_key}; + auto chain = collectEvictChainLocked(leaf_key); + if (chain.empty()) { + removeTreeAliasLocked(leaf_key); 
+ continue; + } + std::vector ordered_chain(chain.rbegin(), chain.rend()); + for (const auto& tree_key : ordered_chain) { + UnifiedCacheItem removed_item; + if (!lru_cache_.remove(tree_key.cache_key, &removed_item)) { + removeAllTreeAliasesForCacheKeyLocked(tree_key.cache_key); + continue; + } + if (result.evicted_slots.find(tree_key.cache_key) == result.evicted_slots.end()) { + result.evicted_keys.push_back(tree_key.cache_key); + result.evicted_slots[tree_key.cache_key] = removed_item.slots; + result.evicted_lifetime_ms[tree_key.cache_key] = + std::max(0, (currentTimeUs() - removed_item.created_time_us) / 1000); + result.evicted_namespaces[tree_key.cache_key] = + removed_item.has_dependency ? removed_item.dependency_namespace : tree_key.namespace_id; + if (removed_item.has_dependency) { + result.evicted_dependencies[tree_key.cache_key] = removed_item.dependency; + } + for (const auto& slot : removed_item.slots) { + if (!isNullBlockIdx(slot)) { + selected_blocks++; + } + } + } + removeAllTreeAliasesForCacheKeyLocked(tree_key.cache_key); + } + } + return result; + } + + std::unordered_set resident_keys; + for (const auto& [key, item] : lru_cache_.items()) { + if (item.is_resident) { + resident_keys.insert(item.cache_key); + } + } + + std::vector lru_keys; + for (auto it = lru_cache_.items().rbegin(); it != lru_cache_.items().rend(); ++it) { + const auto& item = it->second; + if (item.is_resident || resident_keys.count(item.cache_key)) { + continue; + } + lru_keys.push_back(item.cache_key); + } + + size_t selected_blocks = 0; + for (const auto cache_key : lru_keys) { + UnifiedCacheItem removed_item; + if (!lru_cache_.remove(cache_key, &removed_item)) { + continue; + } + removeAllTreeAliasesForCacheKeyLocked(cache_key); + + result.evicted_keys.push_back(cache_key); + result.evicted_slots[cache_key] = removed_item.slots; + result.evicted_lifetime_ms[cache_key] = + std::max(0, (currentTimeUs() - removed_item.created_time_us) / 1000); + 
result.evicted_namespaces[cache_key] = + removed_item.has_dependency ? removed_item.dependency_namespace : kDefaultNamespace; + if (removed_item.has_dependency) { + result.evicted_dependencies[cache_key] = removed_item.dependency; + } + + for (const auto& slot : removed_item.slots) { + if (!isNullBlockIdx(slot)) { + selected_blocks++; + } + } + if (selected_blocks >= min_blocks) { + break; + } + } + + return result; +} + +SharedBlockCache::EvictResult SharedBlockCache::selectAndEvictForGroup(int group_id, size_t min_blocks) { + RTP_LLM_PROFILE_FUNCTION(); + if (min_blocks == 0) { + return {}; + } + + std::lock_guard lock(mu_); + EvictResult result; + if (independent_group_eviction_enabled_ && prefix_tree_enabled_ && isIndependentEvictionGroupLocked(group_id)) { + if (selectIndependentGroupEvictionsLocked(group_id, min_blocks, result)) { + return result; + } + } + if (!result.evicted_keys.empty()) { + return result; + } + + // Re-enter the normal selection path without taking the mutex twice. 
+ if (lru_cache_.empty()) { + return result; + } + if (prefix_tree_enabled_ && !leaf_lru_.empty()) { + size_t selected_blocks = 0; + bool made_progress = true; + while (selected_blocks < min_blocks && made_progress && !leaf_lru_.empty()) { + made_progress = false; + std::vector leaves(leaf_lru_.begin(), leaf_lru_.end()); + for (const auto& leaf : leaves) { + if (selected_blocks >= min_blocks) { + break; + } + const auto leaf_key = NamespacedKey{leaf.namespace_id, leaf.cache_key}; + auto chain = collectEvictChainLocked(leaf_key); + if (chain.empty()) { + removeTreeAliasLocked(leaf_key); + made_progress = true; + continue; + } + const bool chain_has_target = chainHasUsableSlotLocked(chain, group_id); + if (!chain_has_target && !chainHasReachableAncestorSlotLocked(chain, group_id)) { + continue; + } + std::vector ordered_chain(chain.rbegin(), chain.rend()); + for (const auto& tree_key : ordered_chain) { + UnifiedCacheItem removed_item; + if (!lru_cache_.remove(tree_key.cache_key, &removed_item)) { + removeAllTreeAliasesForCacheKeyLocked(tree_key.cache_key); + continue; + } + made_progress = true; + if (result.evicted_slots.find(tree_key.cache_key) == result.evicted_slots.end()) { + result.evicted_keys.push_back(tree_key.cache_key); + result.evicted_slots[tree_key.cache_key] = removed_item.slots; + result.evicted_lifetime_ms[tree_key.cache_key] = + std::max(0, (currentTimeUs() - removed_item.created_time_us) / 1000); + result.evicted_namespaces[tree_key.cache_key] = + removed_item.has_dependency ? 
removed_item.dependency_namespace : tree_key.namespace_id; + if (removed_item.has_dependency) { + result.evicted_dependencies[tree_key.cache_key] = removed_item.dependency; + } + if (hasUsableSlot(removed_item, group_id)) { + selected_blocks++; + } + } + removeAllTreeAliasesForCacheKeyLocked(tree_key.cache_key); + } + } + } + return result; + } + + std::unordered_set resident_keys; + for (const auto& [key, item] : lru_cache_.items()) { + if (item.is_resident) { + resident_keys.insert(item.cache_key); + } + } + + std::vector lru_keys; + for (auto it = lru_cache_.items().rbegin(); it != lru_cache_.items().rend(); ++it) { + const auto& item = it->second; + if (item.is_resident || resident_keys.count(item.cache_key)) { + continue; + } + lru_keys.push_back(item.cache_key); + } + + size_t selected_blocks = 0; + for (const auto cache_key : lru_keys) { + UnifiedCacheItem removed_item; + bool has_target_slot = false; + for (const auto& [key, item] : lru_cache_.items()) { + if (key == cache_key) { + has_target_slot = hasUsableSlot(item, group_id); + break; + } + } + if (!has_target_slot) { + continue; + } + if (!lru_cache_.remove(cache_key, &removed_item)) { + continue; + } + removeAllTreeAliasesForCacheKeyLocked(cache_key); + + result.evicted_keys.push_back(cache_key); + result.evicted_slots[cache_key] = removed_item.slots; + result.evicted_lifetime_ms[cache_key] = + std::max(0, (currentTimeUs() - removed_item.created_time_us) / 1000); + result.evicted_namespaces[cache_key] = + removed_item.has_dependency ? 
removed_item.dependency_namespace : kDefaultNamespace; + if (removed_item.has_dependency) { + result.evicted_dependencies[cache_key] = removed_item.dependency; + } + + if (hasUsableSlot(removed_item, group_id)) { + selected_blocks++; + } + if (selected_blocks >= min_blocks) { + break; + } + } + + return result; +} + +size_t SharedBlockCache::evictAndFree(size_t min_blocks) { + RTP_LLM_PROFILE_FUNCTION(); + + auto evict_result = selectAndEvict(min_blocks); + if (evict_result.evicted_keys.empty()) { + return 0; + } + + size_t freed = 0; + for (size_t i = 0; i < evict_result.evicted_keys.size(); ++i) { + const auto cache_key = evict_result.evicted_keys[i]; + const auto& slots = evict_result.evicted_slots.at(cache_key); + + for (int gid = 0; gid < static_cast(slots.size()) && gid < group_num_; ++gid) { + if (!isNullBlockIdx(slots[gid])) { + group_pools_[gid]->blockCacheFree(slots[gid]); + freed++; + } + } + } + return freed; +} + +size_t SharedBlockCache::evictAndFreeForGroup(int group_id, size_t min_blocks, EvictResult* evict_result_out) { + RTP_LLM_PROFILE_FUNCTION(); + + auto evict_result = selectAndEvictForGroup(group_id, min_blocks); + if (evict_result.evicted_keys.empty()) { + if (evict_result_out) { + *evict_result_out = std::move(evict_result); + } + return 0; + } + + size_t freed = 0; + for (size_t i = 0; i < evict_result.evicted_keys.size(); ++i) { + const auto cache_key = evict_result.evicted_keys[i]; + const auto& slots = evict_result.evicted_slots.at(cache_key); + + for (int gid = 0; gid < static_cast(slots.size()) && gid < group_num_; ++gid) { + if (!isNullBlockIdx(slots[gid])) { + group_pools_[gid]->blockCacheFree(slots[gid]); + if (gid == group_id) { + freed++; + } + } + } + } + if (evict_result_out) { + *evict_result_out = std::move(evict_result); + } + return freed; +} + +std::optional SharedBlockCache::remove(CacheKeyType cache_key) { + std::lock_guard lock(mu_); + + UnifiedCacheItem removed_item; + if (!lru_cache_.remove(cache_key, &removed_item)) { 
+ return std::nullopt; + } + removeAllTreeAliasesForCacheKeyLocked(cache_key); + return removed_item; +} + +bool SharedBlockCache::contains(CacheKeyType cache_key) const { + std::lock_guard lock(mu_); + return lru_cache_.contains(cache_key); +} + +bool SharedBlockCache::empty() const { + std::lock_guard lock(mu_); + return lru_cache_.empty(); +} + +size_t SharedBlockCache::size() const { + std::lock_guard lock(mu_); + return lru_cache_.size(); +} + +std::vector SharedBlockCache::allCacheKeys() const { + std::lock_guard lock(mu_); + std::vector keys; + keys.reserve(lru_cache_.size()); + for (const auto& [key, item] : lru_cache_.items()) { + keys.push_back(key); + } + return keys; +} + +int64_t SharedBlockCache::version() const { + std::lock_guard lock(mu_); + return version_; +} + +void SharedBlockCache::setPrefixTreeEnabled(bool enabled) { + std::lock_guard lock(mu_); + prefix_tree_enabled_ = enabled; +} + +bool SharedBlockCache::prefixTreeEnabled() const { + std::lock_guard lock(mu_); + return prefix_tree_enabled_; +} + +void SharedBlockCache::setIndependentGroupEviction(bool enabled, const std::vector& group_ids) { + std::lock_guard lock(mu_); + independent_group_eviction_enabled_ = enabled; + independent_eviction_group_ids_.clear(); + for (const auto gid : group_ids) { + if (gid >= 0) { + independent_eviction_group_ids_.insert(gid); + } + } +} + +void SharedBlockCache::upsertTreeNodeLocked(CacheKeyType cache_key, + NamespaceId namespace_id, + const BlockDependency& dependency, + bool is_resident) { + if (!prefix_tree_enabled_) { + return; + } + const NamespacedKey key{namespace_id, cache_key}; + const bool has_parent = dependency.has_parent && dependency.parent_key != cache_key; + const NamespacedKey parent{namespace_id, dependency.parent_key}; + auto it = tree_nodes_.find(key); + if (it == tree_nodes_.end()) { + PrefixTreeNode node; + node.key = key; + node.parent = parent; + node.has_parent = has_parent; + node.ordinal = dependency.ordinal; + node.resident = 
is_resident; + node.last_access_seq = ++tree_access_seq_; + auto [inserted_it, _] = tree_nodes_.emplace(key, std::move(node)); + it = inserted_it; + aliases_by_cache_key_[cache_key].insert(key); + } else { + eraseLeafLocked(it->second); + if (it->second.has_parent && (it->second.parent == parent) == false) { + if (auto parent_it = tree_nodes_.find(it->second.parent); parent_it != tree_nodes_.end()) { + parent_it->second.children.erase(key); + refreshLeafLocked(parent_it->first); + } else { + detachPendingChildLocked(it->second.parent, key); + } + } + it->second.parent = parent; + it->second.has_parent = has_parent; + it->second.ordinal = dependency.ordinal; + it->second.resident = it->second.resident || is_resident; + it->second.last_access_seq = ++tree_access_seq_; + } + + if (has_parent) { + auto parent_it = tree_nodes_.find(parent); + if (parent_it != tree_nodes_.end()) { + eraseLeafLocked(parent_it->second); + parent_it->second.children.insert(key); + } else { + pending_children_by_parent_[parent].insert(key); + } + } + attachPendingChildrenLocked(it->second); + insertLeafIfEligibleLocked(it->second); +} + +void SharedBlockCache::detachPendingChildLocked(const NamespacedKey& parent, const NamespacedKey& child) { + auto pending_it = pending_children_by_parent_.find(parent); + if (pending_it == pending_children_by_parent_.end()) { + return; + } + pending_it->second.erase(child); + if (pending_it->second.empty()) { + pending_children_by_parent_.erase(pending_it); + } +} + +void SharedBlockCache::attachPendingChildrenLocked(PrefixTreeNode& node) { + auto pending_it = pending_children_by_parent_.find(node.key); + if (pending_it == pending_children_by_parent_.end()) { + return; + } + for (const auto& child_key : pending_it->second) { + auto child_it = tree_nodes_.find(child_key); + if (child_it != tree_nodes_.end() && child_it->second.has_parent && child_it->second.parent == node.key) { + eraseLeafLocked(node); + node.children.insert(child_key); + } + } + 
pending_children_by_parent_.erase(pending_it); +} + +void SharedBlockCache::touchTreeAliasesLocked(CacheKeyType cache_key) { + if (!prefix_tree_enabled_) { + return; + } + auto aliases_it = aliases_by_cache_key_.find(cache_key); + if (aliases_it == aliases_by_cache_key_.end()) { + return; + } + std::vector aliases(aliases_it->second.begin(), aliases_it->second.end()); + for (const auto& key : aliases) { + auto node_it = tree_nodes_.find(key); + if (node_it != tree_nodes_.end()) { + touchTreeNodeLocked(node_it->second); + } + } +} + +void SharedBlockCache::touchTreeNodeLocked(PrefixTreeNode& node) { + eraseLeafLocked(node); + node.last_access_seq = ++tree_access_seq_; + insertLeafIfEligibleLocked(node); +} + +void SharedBlockCache::eraseLeafLocked(const PrefixTreeNode& node) { + leaf_lru_.erase(LeafKey{node.last_access_seq, node.key.namespace_id, node.key.cache_key}); +} + +void SharedBlockCache::insertLeafIfEligibleLocked(const PrefixTreeNode& node) { + if (node.resident || !node.children.empty() || !hasFlatItemLocked(node.key.cache_key) + || isFlatItemResidentLocked(node.key.cache_key)) { + return; + } + if (node.key.namespace_id != kGpuCpCanonicalNamespace && flatItemHasCanonicalDependencyLocked(node.key.cache_key)) { + return; + } + leaf_lru_.insert(LeafKey{node.last_access_seq, node.key.namespace_id, node.key.cache_key}); +} + +void SharedBlockCache::refreshLeafLocked(const NamespacedKey& key) { + auto it = tree_nodes_.find(key); + if (it == tree_nodes_.end()) { + return; + } + eraseLeafLocked(it->second); + insertLeafIfEligibleLocked(it->second); +} + +void SharedBlockCache::removeTreeAliasLocked(const NamespacedKey& key) { + auto it = tree_nodes_.find(key); + if (it == tree_nodes_.end()) { + return; + } + PrefixTreeNode node = it->second; + eraseLeafLocked(node); + if (node.has_parent) { + auto parent_it = tree_nodes_.find(node.parent); + if (parent_it != tree_nodes_.end()) { + parent_it->second.children.erase(key); + refreshLeafLocked(parent_it->first); + } 
else { + detachPendingChildLocked(node.parent, key); + } + } + for (const auto& child : node.children) { + auto child_it = tree_nodes_.find(child); + if (child_it != tree_nodes_.end() && child_it->second.parent == key) { + child_it->second.has_parent = false; + } + } + auto aliases_it = aliases_by_cache_key_.find(key.cache_key); + if (aliases_it != aliases_by_cache_key_.end()) { + aliases_it->second.erase(key); + if (aliases_it->second.empty()) { + aliases_by_cache_key_.erase(aliases_it); + } + } + tree_nodes_.erase(it); +} + +void SharedBlockCache::removeAllTreeAliasesForCacheKeyLocked(CacheKeyType cache_key) { + auto aliases_it = aliases_by_cache_key_.find(cache_key); + if (aliases_it == aliases_by_cache_key_.end()) { + return; + } + std::vector aliases(aliases_it->second.begin(), aliases_it->second.end()); + for (const auto& key : aliases) { + removeTreeAliasLocked(key); + } +} + +void SharedBlockCache::markAllTreeAliasesResidentLocked(CacheKeyType cache_key) { + auto aliases_it = aliases_by_cache_key_.find(cache_key); + if (aliases_it == aliases_by_cache_key_.end()) { + return; + } + for (const auto& key : aliases_it->second) { + auto node_it = tree_nodes_.find(key); + if (node_it == tree_nodes_.end() || node_it->second.resident) { + continue; + } + eraseLeafLocked(node_it->second); + node_it->second.resident = true; + } +} + +void SharedBlockCache::refreshAllTreeAliasesLocked(CacheKeyType cache_key) { + auto aliases_it = aliases_by_cache_key_.find(cache_key); + if (aliases_it == aliases_by_cache_key_.end()) { + return; + } + std::vector aliases(aliases_it->second.begin(), aliases_it->second.end()); + for (const auto& key : aliases) { + refreshLeafLocked(key); + } +} + +bool SharedBlockCache::flatItemHasCanonicalDependencyLocked(CacheKeyType cache_key) const { + for (const auto& [key, item] : lru_cache_.items()) { + if (key == cache_key) { + return item.has_dependency && item.dependency_namespace == kGpuCpCanonicalNamespace; + } + } + return false; +} + +bool 
SharedBlockCache::updateItemDependencyLocked(UnifiedCacheItem& item, + NamespaceId namespace_id, + const BlockDependency& dependency) const { + if (item.has_dependency && item.dependency_namespace == kGpuCpCanonicalNamespace + && namespace_id != kGpuCpCanonicalNamespace) { + return false; + } + if (item.has_dependency && item.dependency_namespace == namespace_id + && item.dependency.has_parent == dependency.has_parent && item.dependency.parent_key == dependency.parent_key + && item.dependency.ordinal == dependency.ordinal) { + return false; + } + item.dependency = dependency; + item.dependency_namespace = namespace_id; + item.has_dependency = true; + return true; +} + +bool SharedBlockCache::slotMatchable(const UnifiedCacheItem& item, size_t group_id) { + return group_id >= item.matchable_slots.size() || item.matchable_slots[group_id]; +} + +bool SharedBlockCache::hasUsableSlot(const UnifiedCacheItem& item, int group_id) { + return group_id >= 0 && static_cast(group_id) < item.slots.size() + && !isNullBlockIdx(item.slots[static_cast(group_id)]); +} + +std::vector +SharedBlockCache::collectEvictChainLocked(const NamespacedKey& leaf_key) const { + std::vector chain; + auto it = tree_nodes_.find(leaf_key); + if (it == tree_nodes_.end() || it->second.resident || !it->second.children.empty() + || !hasFlatItemLocked(it->second.key.cache_key) || isFlatItemResidentLocked(it->second.key.cache_key)) { + return chain; + } + + NamespacedKey cur = leaf_key; + while (true) { + auto node_it = tree_nodes_.find(cur); + if (node_it == tree_nodes_.end() || node_it->second.resident || !hasFlatItemLocked(cur.cache_key) + || isFlatItemResidentLocked(cur.cache_key)) { + break; + } + chain.push_back(cur); + if (!node_it->second.has_parent) { + break; + } + auto parent_it = tree_nodes_.find(node_it->second.parent); + if (parent_it == tree_nodes_.end() || parent_it->second.resident + || isFlatItemResidentLocked(parent_it->first.cache_key)) { + break; + } + if 
(parent_it->second.children.size() != 1) { + break; + } + cur = parent_it->first; + } + return chain; +} + +bool SharedBlockCache::chainHasUsableSlotLocked(const std::vector& chain, int group_id) const { + for (const auto& key : chain) { + for (const auto& [cache_key, item] : lru_cache_.items()) { + if (cache_key == key.cache_key && hasUsableSlot(item, group_id)) { + return true; + } + } + } + return false; +} + +bool SharedBlockCache::chainHasReachableAncestorSlotLocked(const std::vector& chain, + int group_id) const { + if (chain.empty()) { + return false; + } + auto node_it = tree_nodes_.find(chain.back()); + while (node_it != tree_nodes_.end() && node_it->second.has_parent) { + auto parent_it = tree_nodes_.find(node_it->second.parent); + if (parent_it == tree_nodes_.end() || parent_it->second.resident + || !hasFlatItemLocked(parent_it->first.cache_key) || isFlatItemResidentLocked(parent_it->first.cache_key)) { + return false; + } + bool parent_has_target_slot = false; + for (const auto& [cache_key, item] : lru_cache_.items()) { + if (cache_key == parent_it->first.cache_key && hasUsableSlot(item, group_id)) { + parent_has_target_slot = true; + break; + } + } + if (parent_has_target_slot) { + bool all_children_evictable = true; + for (const auto& child : parent_it->second.children) { + if (!subtreeEvictableForAncestorSlotLocked(child)) { + all_children_evictable = false; + break; + } + } + if (all_children_evictable) { + return true; + } + } + node_it = parent_it; + } + return false; +} + +bool SharedBlockCache::subtreeEvictableForAncestorSlotLocked(const NamespacedKey& key) const { + auto node_it = tree_nodes_.find(key); + if (node_it == tree_nodes_.end() || node_it->second.resident || !hasFlatItemLocked(key.cache_key) + || isFlatItemResidentLocked(key.cache_key)) { + return false; + } + for (const auto& child : node_it->second.children) { + if (!subtreeEvictableForAncestorSlotLocked(child)) { + return false; + } + } + return true; +} + +bool 
SharedBlockCache::selectIndependentGroupEvictionsLocked(int group_id, size_t min_blocks, EvictResult& result) { + if (group_id < 0 || (group_num_ > 0 && group_id >= group_num_) || min_blocks == 0) { + return false; + } + size_t selected_blocks = 0; + std::vector leaves(leaf_lru_.begin(), leaf_lru_.end()); + for (const auto& leaf : leaves) { + if (selected_blocks >= min_blocks) { + break; + } + const auto leaf_key = NamespacedKey{leaf.namespace_id, leaf.cache_key}; + auto chain = collectEvictChainLocked(leaf_key); + if (chain.size() <= 1) { + continue; + } + // Keep the leaf group tail block when possible. Scan from leaf-parent + // upward and drop the deepest non-tail slot first. + for (size_t chain_idx = 1; chain_idx < chain.size(); ++chain_idx) { + const auto& key = chain[chain_idx]; + auto [success, item] = lru_cache_.get(key.cache_key); + if (!success || item.is_resident || static_cast(group_id) >= item.slots.size() + || isNullBlockIdx(item.slots[static_cast(group_id)])) { + continue; + } + removeSlotFromItemLocked(key.cache_key, group_id, result); + ++selected_blocks; + break; + } + } + return selected_blocks >= min_blocks; +} + +void SharedBlockCache::removeSlotFromItemLocked(CacheKeyType cache_key, int group_id, EvictResult& result) { + UnifiedCacheItem item; + if (!lru_cache_.remove(cache_key, &item)) { + return; + } + if (group_id < 0 || static_cast(group_id) >= item.slots.size() + || isNullBlockIdx(item.slots[static_cast(group_id)])) { + lru_cache_.put(cache_key, item); + return; + } + + std::vector evicted_slots(item.slots.size(), NULL_BLOCK_IDX); + evicted_slots[static_cast(group_id)] = item.slots[static_cast(group_id)]; + result.evicted_keys.push_back(cache_key); + result.evicted_slots[cache_key] = std::move(evicted_slots); + result.evicted_namespaces[cache_key] = + item.has_dependency ? 
item.dependency_namespace : SharedBlockCache::kGpuLogicalNamespace; + if (item.has_dependency) { + result.evicted_dependencies[cache_key] = item.dependency; + } + const int64_t created_time_us = + static_cast(group_id) < item.slot_created_time_us.size() ? + item.slot_created_time_us[static_cast(group_id)] : + item.created_time_us; + result.evicted_lifetime_ms[cache_key] = std::max(0, (currentTimeUs() - created_time_us) / 1000); + result.evicted_independent_group[cache_key] = group_id; + + item.slots[static_cast(group_id)] = NULL_BLOCK_IDX; + if (static_cast(group_id) < item.matchable_slots.size()) { + item.matchable_slots[static_cast(group_id)] = false; + } + if (static_cast(group_id) < item.slot_created_time_us.size()) { + item.slot_created_time_us[static_cast(group_id)] = 0; + } + + const bool has_any_slot = std::any_of(item.slots.begin(), item.slots.end(), [](BlockIdxType slot) { + return !isNullBlockIdx(slot); + }); + if (has_any_slot) { + lru_cache_.put(cache_key, item); + refreshAllTreeAliasesLocked(cache_key); + } else { + removeAllTreeAliasesForCacheKeyLocked(cache_key); + } + ++version_; +} + +bool SharedBlockCache::hasFlatItemLocked(CacheKeyType cache_key) const { + return lru_cache_.contains(cache_key); +} + +bool SharedBlockCache::isFlatItemResidentLocked(CacheKeyType cache_key) const { + for (const auto& [key, item] : lru_cache_.items()) { + if (key == cache_key) { + return item.is_resident; + } + } + return false; +} + +bool SharedBlockCache::isIndependentEvictionGroupLocked(int group_id) const { + return independent_eviction_group_ids_.find(group_id) != independent_eviction_group_ids_.end(); +} + +} // namespace rtp_llm diff --git a/rtp_llm/cpp/cache/SharedBlockCache.h b/rtp_llm/cpp/cache/SharedBlockCache.h new file mode 100644 index 0000000000..2fc0b2d6ac --- /dev/null +++ b/rtp_llm/cpp/cache/SharedBlockCache.h @@ -0,0 +1,187 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +#include 
"rtp_llm/cpp/utils/LRUCache.h" +#include "rtp_llm/cpp/cache/Types.h" +#include "rtp_llm/cpp/cache/BlockPool.h" +#include "rtp_llm/cpp/cache/KVCacheResource.h" + +namespace rtp_llm { + +class SharedBlockCache { +public: + using NamespaceId = uint32_t; + + static constexpr NamespaceId kDefaultNamespace = 0; + static constexpr NamespaceId kGpuLogicalNamespace = 1; + static constexpr NamespaceId kGpuCpCanonicalNamespace = 2; + + struct NamespacedKey { + NamespaceId namespace_id{0}; + CacheKeyType cache_key{0}; + + bool operator==(const NamespacedKey& other) const { + return namespace_id == other.namespace_id && cache_key == other.cache_key; + } + }; + + struct NamespacedKeyHash { + size_t operator()(const NamespacedKey& key) const { + return std::hash()((static_cast(key.namespace_id) << 32) + ^ static_cast(key.cache_key)); + } + }; + + struct UnifiedCacheItem { + CacheKeyType cache_key; + bool is_resident = false; + std::vector slots; + std::vector matchable_slots; + std::vector slot_created_time_us; + int64_t created_time_us = 0; + BlockDependency dependency; + NamespaceId dependency_namespace = kDefaultNamespace; + bool has_dependency = false; + }; + + struct EvictResult { + std::vector evicted_keys; + std::unordered_map> evicted_slots; + std::unordered_map evicted_dependencies; + std::unordered_map evicted_namespaces; + std::unordered_map evicted_lifetime_ms; + std::unordered_map evicted_independent_group; + }; + + struct MatchResult { + bool found = false; + std::vector group_blocks; + }; + + using LRUCacheType = LRUCache; + +public: + explicit SharedBlockCache(): lru_cache_(kCacheMaxCapacity) {} + + void init(int group_num, const std::vector& group_pools); + + void put(CacheKeyType cache_key, const std::vector& group_slots, bool is_resident); + void put(CacheKeyType cache_key, + const std::vector& group_slots, + bool is_resident, + NamespaceId namespace_id, + const BlockDependency& dependency, + const std::vector& matchable_slots = {}); + + MatchResult 
match(CacheKeyType cache_key); + + BlockIdxType matchGroup(CacheKeyType cache_key, int group_id); + + EvictResult selectAndEvict(size_t min_blocks); + EvictResult selectAndEvictForGroup(int group_id, size_t min_blocks); + + size_t evictAndFree(size_t min_blocks); + size_t evictAndFreeForGroup(int group_id, size_t min_blocks, EvictResult* evict_result_out = nullptr); + + std::optional remove(CacheKeyType cache_key); + + bool contains(CacheKeyType cache_key) const; + + bool empty() const; + + size_t size() const; + + std::vector allCacheKeys() const; + + int64_t version() const; + void setPrefixTreeEnabled(bool enabled); + bool prefixTreeEnabled() const; + void setIndependentGroupEviction(bool enabled, const std::vector& group_ids); + +private: + static const size_t kCacheMaxCapacity = 10000000; + + struct PrefixTreeNode { + NamespacedKey key; + NamespacedKey parent; + bool has_parent{false}; + bool resident{false}; + uint32_t ordinal{0}; + uint64_t last_access_seq{0}; + std::unordered_set children; + }; + + struct LeafKey { + uint64_t last_access_seq{0}; + NamespaceId namespace_id{0}; + CacheKeyType cache_key{0}; + + bool operator<(const LeafKey& other) const { + if (last_access_seq != other.last_access_seq) { + return last_access_seq < other.last_access_seq; + } + if (namespace_id != other.namespace_id) { + return namespace_id < other.namespace_id; + } + return cache_key < other.cache_key; + } + }; + + void upsertTreeNodeLocked(CacheKeyType cache_key, + NamespaceId namespace_id, + const BlockDependency& dependency, + bool is_resident); + void detachPendingChildLocked(const NamespacedKey& parent, const NamespacedKey& child); + void attachPendingChildrenLocked(PrefixTreeNode& node); + void touchTreeAliasesLocked(CacheKeyType cache_key); + void touchTreeNodeLocked(PrefixTreeNode& node); + void eraseLeafLocked(const PrefixTreeNode& node); + void insertLeafIfEligibleLocked(const PrefixTreeNode& node); + void refreshLeafLocked(const NamespacedKey& key); + void 
removeTreeAliasLocked(const NamespacedKey& key); + void removeAllTreeAliasesForCacheKeyLocked(CacheKeyType cache_key); + void markAllTreeAliasesResidentLocked(CacheKeyType cache_key); + void refreshAllTreeAliasesLocked(CacheKeyType cache_key); + bool flatItemHasCanonicalDependencyLocked(CacheKeyType cache_key) const; + bool updateItemDependencyLocked(UnifiedCacheItem& item, + NamespaceId namespace_id, + const BlockDependency& dependency) const; + static bool slotMatchable(const UnifiedCacheItem& item, size_t group_id); + static bool hasUsableSlot(const UnifiedCacheItem& item, int group_id); + std::vector collectEvictChainLocked(const NamespacedKey& leaf_key) const; + bool chainHasUsableSlotLocked(const std::vector& chain, int group_id) const; + bool chainHasReachableAncestorSlotLocked(const std::vector& chain, int group_id) const; + bool subtreeEvictableForAncestorSlotLocked(const NamespacedKey& key) const; + bool selectIndependentGroupEvictionsLocked(int group_id, size_t min_blocks, EvictResult& result); + void removeSlotFromItemLocked(CacheKeyType cache_key, int group_id, EvictResult& result); + bool hasFlatItemLocked(CacheKeyType cache_key) const; + bool isFlatItemResidentLocked(CacheKeyType cache_key) const; + bool isIndependentEvictionGroupLocked(int group_id) const; + + LRUCacheType lru_cache_; + mutable std::mutex mu_; + int64_t version_{0}; + bool prefix_tree_enabled_{true}; + bool independent_group_eviction_enabled_{false}; + uint64_t tree_access_seq_{0}; + + int group_num_ = 0; + std::vector group_pools_; + std::unordered_map tree_nodes_; + std::unordered_map> aliases_by_cache_key_; + std::unordered_map, NamespacedKeyHash> + pending_children_by_parent_; + std::set leaf_lru_; + std::unordered_set independent_eviction_group_ids_; +}; + +using SharedBlockCachePtr = std::shared_ptr; + +} // namespace rtp_llm diff --git a/rtp_llm/cpp/cache/SingleConfigCreator.cc b/rtp_llm/cpp/cache/SingleConfigCreator.cc deleted file mode 100644 index 1a67d34057..0000000000 
--- a/rtp_llm/cpp/cache/SingleConfigCreator.cc +++ /dev/null @@ -1,71 +0,0 @@ -#include "rtp_llm/cpp/cache/SingleConfigCreator.h" - -#include "rtp_llm/cpp/cache/KVCacheSpec.h" -#include "rtp_llm/cpp/cache/MemoryEvaluationHelper.h" -#include "rtp_llm/cpp/utils/Logger.h" - -namespace rtp_llm { - -CacheConfig SingleConfigCreator::createSingleConfig(const ModelConfig& model_config, - const ParallelismConfig& parallelism_config, - bool is_mtp) { - auto dtype = MemoryEvaluationHelper::getDataTypeForCache(model_config); - - auto layer_num = model_config.num_layers; - - std::vector all_layer_ids(layer_num); - for (int i = 0; i < layer_num; ++i) { - all_layer_ids[i] = i; - } - - CacheConfig config; - config.layer_num = static_cast(layer_num); - config.layer_all_num = static_cast(layer_num); - config.block_num = 0; - config.seq_size_per_block = static_cast(model_config.attn_config.tokens_per_block); - - config.use_mla = model_config.attn_config.use_mla; - config.dtype = dtype; - config.is_sparse = model_config.attn_config.is_sparse; - - KVCacheSpecPtr spec; - if (model_config.attn_config.use_mla && model_config.mla_ops_type != rtp_llm::MlaOpsType::MHA) { - spec = std::make_shared(model_config.attn_config, parallelism_config); - } else { - spec = std::make_shared(model_config.attn_config, parallelism_config); - } - spec->dtype = dtype; - config.cache_specs.push_back(spec); - config.group_types.push_back(CacheGroupType::FULL); - - // Using spec interface for block size and scale - config.kv_block_stride_bytes = config.cache_specs[0]->block_size_bytes(); - config.kv_block_size_bytes = static_cast(config.layer_num) * config.kv_block_stride_bytes; - - // Scale handling - no need to check dtype as scale_block_size_bytes() returns 0 if no scale support - config.kv_scale_stride_bytes = config.cache_specs[0]->scale_block_size_bytes(); - config.kv_scale_size_bytes = static_cast(config.layer_num) * config.kv_scale_stride_bytes; - - if (config.is_sparse) { - auto indexer_dim = 
model_config.attn_config.indexer_head_dim; - config.kv_scale_stride_bytes = (indexer_dim + indexer_dim / 128 * 4) * spec->seq_size_per_block; - config.kv_scale_size_bytes = static_cast(config.layer_num) * config.kv_scale_stride_bytes; - } - - config.block_size_bytes = config.kv_block_size_bytes + config.kv_scale_size_bytes; - config.group_layer_num = layer_num; // only 1 group for SingleConfig - - // Per-layer block stride (kv + scale). - const size_t per_layer_stride_bytes = config.kv_block_stride_bytes + config.kv_scale_stride_bytes; - config.layer_to_block_stride_bytes.assign(static_cast(config.layer_all_num), - static_cast(per_layer_stride_bytes)); - - // Global layer ids are the indices used by BlockPool::convertIndexToAddr (0..N-1 in a single-model case). - config.global_layer_ids.push_back(all_layer_ids); - config.layer_ids.push_back(all_layer_ids); - config.layer_to_group_id.assign(config.layer_num, 0); - config.layer_attn_types.assign(config.layer_num, CacheGroupType::FULL); - return config; -} - -} // namespace rtp_llm \ No newline at end of file diff --git a/rtp_llm/cpp/cache/Types.cc b/rtp_llm/cpp/cache/Types.cc new file mode 100644 index 0000000000..f4a65b82d3 --- /dev/null +++ b/rtp_llm/cpp/cache/Types.cc @@ -0,0 +1,11 @@ +#include "rtp_llm/cpp/cache/Types.h" + +#include "rtp_llm/cpp/engine_base/stream/CompleteTokenIds.h" + +namespace rtp_llm { + +int MallocInfo::incrSeqLen() const { + return incr_seq_len_override >= 0 ? 
incr_seq_len_override : complete_token_ids->seqLength(); +} + +} // namespace rtp_llm diff --git a/rtp_llm/cpp/cache/Types.h b/rtp_llm/cpp/cache/Types.h index 3a025e06bc..75908ddf05 100644 --- a/rtp_llm/cpp/cache/Types.h +++ b/rtp_llm/cpp/cache/Types.h @@ -1,17 +1,21 @@ #pragma once #include +#include #include #include #include "rtp_llm/cpp/cache/BlockInfo.h" -#include "rtp_llm/cpp/cache/CacheGroupType.h" +#include "rtp_llm/cpp/cache/spec/CacheGroupType.h" +#include "rtp_llm/cpp/cache/CPSlotMapper.h" #include "rtp_llm/models_py/bindings/core/Types.h" #include "rtp_llm/cpp/cache/BatchKVCacheResource.h" -#include "rtp_llm/cpp/engine_base/stream/CompleteTokenIds.h" namespace rtp_llm { +class CompleteTokenIds; +using CompleteTokenIdsPtr = std::shared_ptr; + typedef int32_t GroupIdType; typedef std::vector LayerIdsType; @@ -48,14 +52,21 @@ struct KVPartitionBytes { }; struct MallocInfo { - BatchKVCacheResourcePtr batch_kv_cache_resource; - CompleteTokenIdsPtr complete_token_ids; - int64_t request_id = 0; - bool verbose = true; // for failed log - bool reuse_cache = true; - bool enable_device_cache = true; - // Sparse linear-block cleanup is only valid for incremental allocation. + BatchKVCacheResourcePtr batch_kv_cache_resource; + CompleteTokenIdsPtr complete_token_ids; + int64_t request_id = 0; + bool verbose = true; // for failed log + bool reuse_cache = true; + bool enable_device_cache = true; + // Sparse tail-group cleanup is only valid for incremental allocation. + // Prefill init keeps reused prefix slots intact because model-path kernels + // still read them by prefix_length. bool enable_remove_skipped_blocks = true; + // Override for incrMalloc's seqLength read; -1 = fall back to complete_token_ids->seqLength(). + // Lets the state machine feed the publish-time value instead of racing with the async worker. 
+ int incr_seq_len_override = -1; + + int incrSeqLen() const; }; struct MallocResult { @@ -73,9 +84,9 @@ struct FreeInfo { }; struct InsertInfo { - BatchKVCacheResourcePtr batch_kv_cache_resource; - CompleteTokenIdsPtr complete_token_ids; - bool is_resident; + BatchKVCacheResourcePtr batch_kv_cache_resource; + CompleteTokenIdsPtr complete_token_ids; + bool is_resident; }; -} // namespace rtp_llm \ No newline at end of file +} // namespace rtp_llm diff --git a/rtp_llm/cpp/cache/allocator/HybridKVCacheAllocator.cc b/rtp_llm/cpp/cache/allocator/HybridKVCacheAllocator.cc new file mode 100644 index 0000000000..e5e76ee768 --- /dev/null +++ b/rtp_llm/cpp/cache/allocator/HybridKVCacheAllocator.cc @@ -0,0 +1,755 @@ +#include "rtp_llm/cpp/cache/allocator/HybridKVCacheAllocator.h" + +#include +#include +#include + +#include "rtp_llm/cpp/cache/BlockPoolConfigHelper.h" +#include "rtp_llm/cpp/cache/CPSlotMapper.h" +#include "rtp_llm/cpp/engine_base/stream/CompleteTokenIds.h" +#include "rtp_llm/cpp/utils/Logger.h" +#include "rtp_llm/cpp/utils/TimeUtil.h" + +namespace rtp_llm { +namespace { + +// CP shard helpers: when mapper is null/passthrough, all helpers no-op. +inline int cpEffectiveSeqLen(const std::shared_ptr& mapper, int seq_len) { + return (mapper && mapper->isSharded()) ? mapper->effectiveSeqLenForAlloc(seq_len) : seq_len; +} + +inline CacheKeysType cpEffectiveCacheKeys(const std::shared_ptr& mapper, const CacheKeysType& full) { + if (!mapper || !mapper->isSharded()) { + return full; + } + CacheKeysType local; + const int cp_size = mapper->cpSize(); + const int start = cp_size - 1; + for (int i = start; i < static_cast(full.size()); i += cp_size) { + local.push_back(full[i]); + } + return local; +} + +inline int cpVirtualBlockSize(const std::shared_ptr& mapper, int block_size) { + return (mapper && mapper->isSharded()) ? 
mapper->virtualBlockSize() : block_size; +} + +inline bool containsGroupId(const std::vector& group_ids, int gid) { + return std::find(group_ids.begin(), group_ids.end(), gid) != group_ids.end(); +} + +inline bool cpShardThisGroup(const std::shared_ptr& mapper, const KVCacheGroupPtr& group) { + return mapper && mapper->isSharded() && group && group->isCpShardable(); +} + +inline int cpEffectiveSeqLenForGroup(const std::shared_ptr& mapper, + const KVCacheGroupPtr& group, + int seq_len) { + return cpShardThisGroup(mapper, group) ? mapper->effectiveSeqLenForAlloc(seq_len) : seq_len; +} + +inline int cpVirtualBlockSizeForGroup(const std::shared_ptr& mapper, + const KVCacheGroupPtr& group, + int block_size) { + return cpShardThisGroup(mapper, group) ? mapper->virtualBlockSize() : block_size; +} + +inline size_t groupSeqSize(const CacheConfig& config, int gid, size_t fallback) { + return (gid >= 0 && static_cast(gid) < config.group_seq_size_per_block.size() + && config.group_seq_size_per_block[static_cast(gid)] > 0) ? 
+ config.group_seq_size_per_block[static_cast(gid)] : + fallback; +} + +BlockIndicesType validBlocksAfter(const BlockIndicesType& blocks, size_t begin) { + BlockIndicesType valid; + if (begin >= blocks.size()) { + return valid; + } + valid.reserve(blocks.size() - begin); + for (size_t i = begin; i < blocks.size(); ++i) { + if (!isNullBlockIdx(blocks[i])) { + valid.push_back(blocks[i]); + } + } + return valid; +} + +} // namespace + +bool HybridKVCacheAllocator::skipReuseCacheGroup(int gid) const { + return gid >= 0 && static_cast(gid) < kv_cache_groups_.size() + && kv_cache_groups_[static_cast(gid)]->reusePolicy() == CacheReusePolicy::NON_REUSABLE; +} + +std::vector HybridKVCacheAllocator::independentEvictionGroupIds() const { + std::vector group_ids; + for (size_t gid = 0; gid < kv_cache_groups_.size(); ++gid) { + if (kv_cache_groups_[gid]->evictPolicy() == CacheEvictPolicy::INDEPENDENT) { + group_ids.push_back(static_cast(gid)); + } + } + return group_ids; +} + +bool HybridKVCacheAllocator::cpCompactSwaGroup(int gid, const std::shared_ptr& mapper) const { + if (!mapper || !mapper->isSharded() || gid < 0 || static_cast(gid) >= kv_cache_groups_.size() + || !kv_cache_groups_[static_cast(gid)]->cpCompactTailBlocks()) { + return false; + } + const auto row_tokens = groupSeqSize(config_, gid, seqSizePerBlock()); + return row_tokens == static_cast(mapper->virtualBlockSize()); +} + +HybridKVCacheAllocator::HybridKVCacheAllocator(const CacheConfig& config, + AllocationType allocation_type, + const kmonitor::MetricsReporterPtr metrics_reporter, + int64_t reserve_block_ratio): + KVCacheAllocator(config, allocation_type, metrics_reporter, reserve_block_ratio) {} + +int HybridKVCacheAllocator::reuseCache(const CacheKeysType& cache_keys, + BatchKVCacheResource& kv_resource, + const std::shared_ptr& cp_mapper) { + // Under cp shard, FULL groups index block_ids by cp-virtual-block units + // (one entry covers cp_size physical blocks). 
LINEAR/SWA groups index by + // raw block_size logical blocks. So when populating tail blocks for + // LINEAR/SWA we need to scale the array length and matched-block position + // back to the logical-block coordinate system. + const int cp_scale = (cp_mapper && cp_mapper->isSharded()) ? cp_mapper->cpSize() : 1; + int min_full_reuse_blocks = static_cast(cache_keys.size()); + std::vector full_matched_blocks(kv_cache_groups_.size()); + + for (int gid : full_group_ids_) { + auto match_result = kv_cache_groups_[static_cast(gid)]->match(cache_keys); + min_full_reuse_blocks = std::min(min_full_reuse_blocks, static_cast(match_result.reuse_blocks)); + full_matched_blocks[static_cast(gid)] = std::move(match_result.block_indices); + } + + int pos = min_full_reuse_blocks - 1; + std::vector linear_tail_blocks(linear_group_ids_.size(), NULL_BLOCK_IDX); + std::vector swa_tail_blocks(swa_group_ids_.size()); + const bool has_tail_groups = !linear_group_ids_.empty() || !swa_group_ids_.empty(); + for (; pos >= 0 && has_tail_groups; --pos) { + bool all_tail_groups_matched = true; + std::vector candidate_linear_tail_blocks(linear_group_ids_.size(), NULL_BLOCK_IDX); + std::vector candidate_swa_tail_blocks(swa_group_ids_.size()); + for (size_t i = 0; i < linear_group_ids_.size(); ++i) { + const int gid = linear_group_ids_[i]; + auto result = kv_cache_groups_[static_cast(gid)]->matchSingleKey(cache_keys[static_cast(pos)]); + if (result.block_indices.empty()) { + all_tail_groups_matched = false; + break; + } + candidate_linear_tail_blocks[i] = result.block_indices[0]; + } + if (!all_tail_groups_matched) { + continue; + } + for (size_t i = 0; i < swa_group_ids_.size(); ++i) { + const int gid = swa_group_ids_[i]; + if (skipReuseCacheGroup(gid)) { + continue; + } + auto result = kv_cache_groups_[static_cast(gid)]->matchSingleKey(cache_keys[static_cast(pos)]); + if (result.block_indices.empty()) { + all_tail_groups_matched = false; + break; + } + 
candidate_swa_tail_blocks[i].push_back(result.block_indices[0]); + } + if (all_tail_groups_matched) { + linear_tail_blocks = std::move(candidate_linear_tail_blocks); + swa_tail_blocks = std::move(candidate_swa_tail_blocks); + break; + } + } + + const int reuse_blocks_len = has_tail_groups ? std::max(pos + 1, 0) : std::max(min_full_reuse_blocks, 0); + if (reuse_blocks_len <= 0) { + return 0; + } + + for (int gid : full_group_ids_) { + BlockIndicesType full_blocks = full_matched_blocks[static_cast(gid)]; + if (static_cast(full_blocks.size()) > reuse_blocks_len) { + full_blocks.resize(static_cast(reuse_blocks_len)); + } + kv_resource.mutableBlockIds(0, gid).assign(std::move(full_blocks)); + } + + // LINEAR/SWA arrays are sized in logical-block units (cp_size× larger + // than the FULL groups' cp-virtual-block units). The matched tail block + // corresponds to the LAST logical block in the canonical (last-rank) + // namespace, so its index is `(reuse_blocks_len * cp_size) - 1` in + // logical units, NOT `reuse_blocks_len - 1`. + const int logical_reuse_len = reuse_blocks_len * cp_scale; + for (size_t i = 0; i < linear_group_ids_.size(); ++i) { + const int gid = linear_group_ids_[i]; + kv_resource.mutableBlockIds(0, gid).assign( + BlockIndicesType(static_cast(logical_reuse_len), NULL_BLOCK_IDX)); + kv_resource.mutableBlockIds(0, gid).setAt(static_cast(logical_reuse_len - 1), linear_tail_blocks[i]); + } + for (size_t i = 0; i < swa_group_ids_.size(); ++i) { + const int gid = swa_group_ids_[i]; + const int group_reuse_len = cpCompactSwaGroup(gid, cp_mapper) ? 
reuse_blocks_len : logical_reuse_len; + kv_resource.mutableBlockIds(0, gid).assign( + BlockIndicesType(static_cast(group_reuse_len), NULL_BLOCK_IDX)); + if (skipReuseCacheGroup(gid)) { + continue; + } + const size_t tail_begin = + static_cast(std::max(group_reuse_len - static_cast(swa_tail_blocks[i].size()), 0)); + for (size_t j = 0; j < swa_tail_blocks[i].size(); ++j) { + kv_resource.mutableBlockIds(0, gid).setAt(tail_begin + j, swa_tail_blocks[i][j]); + } + } + return reuse_blocks_len; +} + +MallocResult HybridKVCacheAllocator::initMallocForCommonLen(const MallocInfo& malloc_info) { + auto& kv_resource = malloc_info.batch_kv_cache_resource; + const int batch_size = kv_resource->batchSize(); + RTP_LLM_CHECK_WITH_INFO(batch_size == 1, "currently batch size should be 1 in hybrid attention but %d", batch_size); + + const int seq_len = malloc_info.complete_token_ids->seqLength(); + const int common_seq_len = std::min(malloc_info.complete_token_ids->commonSeqLength(), seq_len); + const auto& cp_mapper = cp_slot_mapper_; + // reuse_unit_tokens is computed against the canonical (paged FULL) group's + // block_size: cache_keys reuse only happens for paged groups so virtual block + // size = canonical block_size * cp_size; non-paged groups don't enter reuse. + const KVCacheGroupPtr reuse_group = + full_group_ids_.empty() ? KVCacheGroupPtr{} : kv_cache_groups_[static_cast(full_group_ids_.front())]; + const int reuse_unit_tokens = cpVirtualBlockSizeForGroup(cp_mapper, reuse_group, seqSizePerBlock()); + + const auto& cache_keys = kv_resource->cacheKeys(0); + int64_t match_cost_time_us = 0; + const size_t reserve_blocks = reserveBlockNum(); + int reuse_blocks = 0; + std::vector referenced_blocks(static_cast(kv_resource->groupNums())); + + if (malloc_info.enable_device_cache) { + // CP-sharded: subsample to last-rank canonical key namespace before matching. 
+ CacheKeysType cp_keys = cpEffectiveCacheKeys(cp_mapper, cache_keys); + // Off mode drops the last key to skip the partial trailing block. Under + // CP sharding cpEffectiveCacheKeys already excludes the partial block + // (last-rank stride lands inside completed full blocks only), so the + // extra drop would discard a valid full-block key — costing the SWA + // tail-loop its only matchable key (full_keys[cp_size-1 + (n-1)*cp_size] + // is exactly what the non-sharded SWA group caches). + const bool cp_active = cp_mapper && cp_mapper->isSharded(); + CacheKeysType match_keys(cp_keys.begin(), + cp_active ? cp_keys.end() : (cp_keys.empty() ? cp_keys.end() : cp_keys.end() - 1)); + auto begin_us = currentTimeUs(); + reuse_blocks = reuseCache(match_keys, *kv_resource, cp_mapper); + match_cost_time_us = currentTimeUs() - begin_us; + + for (int gid = 0; gid < kv_resource->groupNums(); ++gid) { + const auto& blocks = kv_resource->blocks(0, gid); + BlockIndicesType valid; + valid.reserve(blocks.size()); + for (auto b : blocks) { + if (!isNullBlockIdx(b)) { + valid.push_back(b); + } + } + if (!valid.empty()) { + referenceBlocksInGroup(gid, valid); + referenced_blocks[static_cast(gid)] = std::move(valid); + } + } + kv_resource->cacheResource(0).setDeviceReuseBlockNum(reuse_blocks); + } + + if (reserve_blocks > 0 && !hasAvailableBlocksForReserve(malloc_info, reserve_blocks)) { + rollbackInitMalloc(*kv_resource, referenced_blocks, {}); + return {false, 0}; + } + + std::vector original_sizes(static_cast(kv_resource->groupNums())); + for (int gid = 0; gid < kv_resource->groupNums(); ++gid) { + original_sizes[static_cast(gid)] = kv_resource->blocksNum(0, gid); + } + for (int gid = 0; gid < kv_resource->groupNums(); ++gid) { + auto& block_ids_0 = kv_resource->mutableBlockIds(0, gid); + const int group_seq_len = + cpEffectiveSeqLenForGroup(cp_mapper, kv_cache_groups_[static_cast(gid)], common_seq_len); + if (!kv_cache_groups_[static_cast(gid)]->malloc( + block_ids_0, group_seq_len, 
malloc_info.reuse_cache, 0)) { + rollbackInitMalloc(*kv_resource, referenced_blocks, original_sizes); + return {false, 0}; + } + } + + for (int b = 1; b < batch_size; ++b) { + for (int gid = 0; gid < kv_resource->groupNums(); ++gid) { + kv_cache_groups_[static_cast(gid)]->reference(kv_resource->mutableBlockIds(b, gid), + kv_resource->blocks(0, gid)); + } + } + return {true, reuse_blocks * reuse_unit_tokens, match_cost_time_us}; +} + +MallocResult HybridKVCacheAllocator::incrMalloc(const MallocInfo& malloc_info) { + auto& kv_resource = malloc_info.batch_kv_cache_resource; + const auto& cp_mapper = cp_slot_mapper_; + const int batch_size = kv_resource->batchSize(); + const int raw_seq_len = malloc_info.incrSeqLen(); + const int reserve_step = malloc_info.complete_token_ids->getReserveStep(); + + std::vector> original_blocks(static_cast(batch_size)); + for (int b = 0; b < batch_size; ++b) { + original_blocks[static_cast(b)].resize(static_cast(kv_resource->groupNums())); + for (int gid = 0; gid < kv_resource->groupNums(); ++gid) { + original_blocks[static_cast(b)][static_cast(gid)] = kv_resource->blocks(b, gid); + } + } + + bool all_success = true; + int failed_batch = -1; + int failed_group = -1; + for (int b = 0; b < batch_size; ++b) { + for (int gid = 0; gid < kv_resource->groupNums(); ++gid) { + auto& block_ids = kv_resource->mutableBlockIds(b, gid); + const int group_seq_len = + cpEffectiveSeqLenForGroup(cp_mapper, kv_cache_groups_[static_cast(gid)], raw_seq_len); + if (!kv_cache_groups_[static_cast(gid)]->malloc( + block_ids, group_seq_len, malloc_info.reuse_cache, reserve_step)) { + all_success = false; + failed_batch = b; + failed_group = gid; + break; + } + } + if (!all_success) { + break; + } + } + + if (all_success) { + if (!malloc_info.enable_remove_skipped_blocks) { + return {true, 0}; + } + for (int b = 0; b < batch_size; ++b) { + for (int gid = 0; gid < kv_resource->groupNums(); ++gid) { + kv_cache_groups_[static_cast(gid)]->removeSkippedBlocks( + 
kv_resource->mutableBlockIds(b, gid), malloc_info.reuse_cache, reserve_step); + } + } + return {true, 0}; + } + + for (int b = 0; b <= failed_batch && b < batch_size; ++b) { + for (int gid = 0; gid < kv_resource->groupNums(); ++gid) { + auto& block_ids = kv_resource->mutableBlockIds(b, gid); + const auto& original = original_blocks[static_cast(b)][static_cast(gid)]; + + std::unordered_set original_valid_blocks; + original_valid_blocks.reserve(original.size()); + for (auto block : original) { + if (!isNullBlockIdx(block)) { + original_valid_blocks.insert(block); + } + } + + BlockIndicesType blocks_to_free; + for (auto block : block_ids.blocks()) { + if (!isNullBlockIdx(block) && original_valid_blocks.find(block) == original_valid_blocks.end()) { + blocks_to_free.push_back(block); + } + } + if (!blocks_to_free.empty()) { + freeBlocksInGroup(gid, blocks_to_free); + } + block_ids.assign(original); + } + } + RTP_LLM_LOG_WARNING("Hybrid incrMalloc failed at batch=%d group=%d", failed_batch, failed_group); + return {false, 0}; +} + +void HybridKVCacheAllocator::free(const FreeInfo& free_info) { + auto& kv_cache_resource = free_info.batch_kv_cache_resource; + if (kv_cache_resource->curBlocksNum() == 0) { + return; + } + for (int batch_id = 0; batch_id < kv_cache_resource->batchSize(); ++batch_id) { + for (int gid = 0; gid < kv_cache_resource->groupNums(); ++gid) { + kv_cache_groups_[static_cast(gid)]->free(kv_cache_resource->blocks(batch_id, gid)); + } + } + kv_cache_resource->clearBlocks(); +} + +void HybridKVCacheAllocator::insertIntoCache(const InsertInfo& insert_info) { + auto& kv_cache_resource = insert_info.batch_kv_cache_resource; + RTP_LLM_CHECK(kv_cache_resource != nullptr); + if (!shared_block_cache_) { + return; + } + + const auto& cp_mapper = cp_slot_mapper_; + const bool cp_active = cp_mapper && cp_mapper->isSharded(); + const int group_nums = kv_cache_resource->groupNums(); + const int batch_size = kv_cache_resource->batchSize(); + + for (int batch_id = 0; 
batch_id < batch_size; ++batch_id) { + kv_cache_resource->cacheResource(batch_id).ensureLinearBlockDependencies(); + const auto& full_keys = kv_cache_resource->cacheKeys(batch_id); + if (full_keys.empty()) { + continue; + } + const auto& full_dependencies = kv_cache_resource->cacheResource(batch_id).blockDependencies(); + + if (!cp_active) { + // Preserve the legacy non-CP GPU reuse surface: aggregate all groups + // under one key. The prefix tree only receives extra dependency + // metadata here. + const size_t max_keys = full_keys.size(); + for (size_t pos = max_keys; pos > 0; --pos) { + const size_t i = pos - 1; + std::vector group_slots(static_cast(group_nums), NULL_BLOCK_IDX); + bool has_valid = false; + for (int gid = 0; gid < group_nums; ++gid) { + if (skipReuseCacheGroup(gid)) { + continue; + } + const auto& blocks = kv_cache_resource->blocks(batch_id, gid); + if (i >= blocks.size()) { + continue; + } + if (!isNullBlockIdx(blocks[i])) { + group_slots[static_cast(gid)] = blocks[i]; + has_valid = true; + } + } + if (has_valid) { + const auto dependency = + i < full_dependencies.size() ? full_dependencies[i] : + BlockDependency{false, 0, static_cast(i)}; + shared_block_cache_->put(full_keys[i], + group_slots, + insert_info.is_resident, + SharedBlockCache::kGpuLogicalNamespace, + dependency); + } + } + continue; + } + + // Per-group key namespace, per-(key, group) put. SharedBlockCache::put + // merges multiple puts on the same key into a single item with each group's slot + // populated independently (NULL_BLOCK_IDX entries are skipped by the merge path). + // + // CP per-group key namespace: paged FULL groups use cp-subsampled (last-rank) keys + // to align 1:1 with rank-local blocks; non-paged groups (SWA / LINEAR) keep the + // full key sequence so their tail blocks (real entries at positions >= length-2) + // get inserted alongside the keys that the reuseCache tail-loop later queries. 
+ CacheKeysType cp_keys = cpEffectiveCacheKeys(cp_mapper, full_keys); + BlockDependenciesType cp_dependencies; + cp_dependencies.reserve(cp_keys.size()); + for (size_t i = 0; i < cp_keys.size(); ++i) { + BlockDependency dependency; + dependency.ordinal = static_cast(i); + if (i > 0) { + dependency.has_parent = true; + dependency.parent_key = cp_keys[i - 1]; + } + cp_dependencies.push_back(dependency); + } + auto token_ids = insert_info.complete_token_ids->completeTokenIdsVec(batch_id); + if (token_ids.size() <= 1) { + continue; + } + const size_t token_len = token_ids.size() - 1; + + for (int gid = 0; gid < group_nums; ++gid) { + if (skipReuseCacheGroup(gid)) { + continue; + } + const int raw_group_seq = kv_cache_groups_[static_cast(gid)]->seqSizePerBlock(); + const bool gp_sharded = cpShardThisGroup(cp_mapper, kv_cache_groups_[static_cast(gid)]); + const bool compact_swa = cpCompactSwaGroup(gid, cp_mapper); + const bool use_cp_keys = cp_active && (gp_sharded || compact_swa); + const CacheKeysType& src_keys = use_cp_keys ? cp_keys : full_keys; + const auto& dependencies = use_cp_keys ? cp_dependencies : full_dependencies; + const auto namespace_id = use_cp_keys ? SharedBlockCache::kGpuCpCanonicalNamespace : + SharedBlockCache::kGpuLogicalNamespace; + if (src_keys.empty()) { + continue; + } + const int group_seq_size = + cpVirtualBlockSizeForGroup(cp_mapper, kv_cache_groups_[static_cast(gid)], raw_group_seq); + const size_t full_blocks_num = token_len / static_cast(group_seq_size); + const size_t n = std::min(src_keys.size(), full_blocks_num); + const auto& blocks = kv_cache_resource->blocks(batch_id, gid); + const size_t loop_end = std::min(n, blocks.size()); + + // Reverse iterate so prefix-base keys land at MRU end (matches non-CP path). 
+ for (size_t pos = loop_end; pos > 0; --pos) { + const size_t i = pos - 1; + if (isNullBlockIdx(blocks[i])) { + continue; + } + std::vector group_slots(static_cast(group_nums), NULL_BLOCK_IDX); + std::vector matchable_slots(static_cast(group_nums), true); + group_slots[static_cast(gid)] = blocks[i]; + const auto dependency = + i < dependencies.size() ? dependencies[i] : BlockDependency{false, 0, static_cast(i)}; + shared_block_cache_->put( + src_keys[i], group_slots, insert_info.is_resident, namespace_id, dependency, matchable_slots); + } + } + } +} + +std::shared_ptr HybridKVCacheAllocator::incrKVCacheRef(const KVCacheResource& kvcache_resource, + const CacheKeysType& cache_keys, + bool is_connector) { + if (cache_keys.empty() || kvcache_resource.groupNums() <= 0) { + return nullptr; + } + + std::unordered_map key_to_pos; + const auto& resource_keys = kvcache_resource.cacheKeys(); + for (size_t i = 0; i < resource_keys.size(); ++i) { + key_to_pos.emplace(resource_keys[i], i); + } + + auto selected_resource_ptr = new KVCacheResource(kvcache_resource); + auto deleter = [self = shared_from_this(), is_connector](KVCacheResource* resource) { + self->decrKVCacheRef(*resource, is_connector); + delete resource; + }; + std::shared_ptr selected_resource(selected_resource_ptr, deleter); + selected_resource->initGroups(kvcache_resource.groupNums(), + static_cast(config_.layer_all_num), + config_.layerGroupIdsSnapshot(), + config_.kernelBlocksPerKvBlock(), + config_.groupTypesSnapshot()); + + CacheKeysType selected_keys; + BlockDependenciesType selected_dependencies; + std::vector selected_blocks(static_cast(kvcache_resource.groupNums())); + const auto& source_dependencies = kvcache_resource.blockDependencies(); + + selected_dependencies.reserve(cache_keys.size()); + selected_keys.reserve(cache_keys.size()); + for (auto key : cache_keys) { + auto it = key_to_pos.find(key); + if (it == key_to_pos.end()) { + continue; + } + const size_t pos = it->second; + bool any_valid_block 
= false; + std::vector blocks_for_key(static_cast(kvcache_resource.groupNums()), NULL_BLOCK_IDX); + for (int gid = 0; gid < kvcache_resource.groupNums(); ++gid) { + const auto& src_blocks = kvcache_resource.blocks(gid); + const auto block = pos < src_blocks.size() ? src_blocks[pos] : NULL_BLOCK_IDX; + blocks_for_key[static_cast(gid)] = block; + any_valid_block = any_valid_block || (!isNullBlockIdx(block) && block > 0); + } + const bool preserve_connector_tail = is_connector && !kvcache_resource.lastBlockAligned() + && pos + 1 == resource_keys.size() && !selected_keys.empty(); + if (!any_valid_block && !preserve_connector_tail) { + continue; + } + selected_keys.push_back(key); + selected_dependencies.push_back( + pos < source_dependencies.size() ? + source_dependencies[pos] : + BlockDependency{false, 0, static_cast(selected_dependencies.size())}); + for (int gid = 0; gid < kvcache_resource.groupNums(); ++gid) { + selected_blocks[static_cast(gid)].push_back(blocks_for_key[static_cast(gid)]); + } + } + + if (selected_keys.empty()) { + return nullptr; + } + + selected_resource->cacheKeys() = std::move(selected_keys); + selected_resource->setBlockDependencies(std::move(selected_dependencies)); + for (int gid = 0; gid < kvcache_resource.groupNums(); ++gid) { + BlockIndicesType valid; + for (auto b : selected_blocks[static_cast(gid)]) { + if (!isNullBlockIdx(b) && b > 0) { + valid.push_back(b); + } + } + if (!valid.empty()) { + referenceBlocksInGroup(gid, valid, is_connector); + } + selected_resource->mutableBlockIds(gid).assign(std::move(selected_blocks[static_cast(gid)])); + } + return selected_resource; +} + +void HybridKVCacheAllocator::decrKVCacheRef(const KVCacheResource& kvcache_resource, bool is_connector) { + for (int gid = 0; gid < kvcache_resource.groupNums(); ++gid) { + BlockIndicesType valid; + for (auto b : kvcache_resource.blocks(gid)) { + if (!isNullBlockIdx(b) && b > 0) { + valid.push_back(b); + } + } + if (!valid.empty()) { + freeBlocksInGroup(gid, 
valid, is_connector); + } + } +} + +bool HybridKVCacheAllocator::updateKVBlock(const BatchKVCacheResourcePtr& batch_kv_cache_resource, + const std::vector& block_src_batch, + bool copy_last_block, + std::vector& block_update_mapping) { + (void)batch_kv_cache_resource; + (void)block_src_batch; + (void)copy_last_block; + (void)block_update_mapping; + RTP_LLM_FAIL("HybridKVCacheAllocator::updateKVBlock is not supported"); +} + +int HybridKVCacheAllocator::seqSizePerBlock() const { + return static_cast(config_.seq_size_per_block); +} + +bool HybridKVCacheAllocator::hasAvailableBlocksForReserve(const MallocInfo& malloc_info, size_t reserve_blocks) const { + const int need_blocks = getNeedBlocks(malloc_info); + if (need_blocks <= 0) { + return true; + } + const size_t available_blocks = availableBlocksNum(); + const bool accepted = available_blocks >= static_cast(need_blocks) + reserve_blocks; + if (!accepted && malloc_info.verbose) { + RTP_LLM_LOG_INFO("Hybrid initMalloc rejected by reserve blocks: request_id=%ld " + "need_blocks=%d available_blocks=%zu reserve_blocks=%zu", + malloc_info.request_id, + need_blocks, + available_blocks, + reserve_blocks); + } + return accepted; +} + +void HybridKVCacheAllocator::rollbackBlockIdsToSize(int gid, BlockIds& block_ids, size_t original_size) { + if (block_ids.blocksNum() <= original_size) { + return; + } + const auto blocks_to_free = validBlocksAfter(block_ids.blocks(), original_size); + block_ids.resize(original_size); + if (!blocks_to_free.empty()) { + freeBlocksInGroup(gid, blocks_to_free); + } +} + +void HybridKVCacheAllocator::rollbackInitMalloc(BatchKVCacheResource& kv_resource, + const std::vector& referenced_blocks, + const std::vector& original_sizes) { + for (int gid = 0; gid < kv_resource.groupNums(); ++gid) { + auto& block_ids = kv_resource.mutableBlockIds(0, gid); + if (!original_sizes.empty() && static_cast(gid) < original_sizes.size() + && block_ids.blocksNum() > original_sizes[static_cast(gid)]) { + 
rollbackBlockIdsToSize(gid, block_ids, original_sizes[static_cast(gid)]); + } + if (static_cast(gid) < referenced_blocks.size() + && !referenced_blocks[static_cast(gid)].empty()) { + freeBlocksInGroup(gid, referenced_blocks[static_cast(gid)]); + } + block_ids.resize(0); + } + kv_resource.cacheResource(0).setDeviceReuseBlockNum(0); +} + +void HybridKVCacheAllocator::rollbackIncrMalloc(BatchKVCacheResource& kv_resource, + const std::vector>& original_sizes, + int failed_batch) { + const int last_touched_batch = std::min(failed_batch, kv_resource.batchSize() - 1); + for (int b = 0; b <= last_touched_batch; ++b) { + for (int gid = 0; gid < kv_resource.groupNums(); ++gid) { + auto& block_ids = kv_resource.mutableBlockIds(b, gid); + const size_t original_num = original_sizes[static_cast(b)][static_cast(gid)]; + rollbackBlockIdsToSize(gid, block_ids, original_num); + } + } +} + +int HybridKVCacheAllocator::getNeedBlocks(const MallocInfo& malloc_info) const { + if (!malloc_info.batch_kv_cache_resource || !malloc_info.complete_token_ids) { + return 0; + } + const auto& cp_mapper = cp_slot_mapper_; + const int batch_size = malloc_info.batch_kv_cache_resource->batchSize(); + const int total_seq_len = malloc_info.complete_token_ids->totalSeqLength(); + const int raw_common_seq_len = std::min(malloc_info.complete_token_ids->commonSeqLength(), total_seq_len); + const int raw_seq_len = malloc_info.complete_token_ids->seqLength(); + const int reserve_step = malloc_info.complete_token_ids->getReserveStep(); + const bool reuse_enabled = malloc_info.reuse_cache; + const int reuse_blocks_len = reuse_enabled ? 
malloc_info.batch_kv_cache_resource->curBlocksNum() : 0; + + int common_blocks_total = 0; + int extra_blocks_total = 0; + for (int gid = 0; gid < static_cast(kv_cache_groups_.size()); ++gid) { + const auto group = kv_cache_groups_[static_cast(gid)]; + const int group_common_seq = cpEffectiveSeqLenForGroup(cp_mapper, group, raw_common_seq_len); + const int group_seq_len = cpEffectiveSeqLenForGroup(cp_mapper, group, raw_seq_len); + const auto need = kv_cache_groups_[static_cast(gid)]->getNeedBlocks( + group_common_seq, group_seq_len, reserve_step, reuse_blocks_len, reuse_enabled); + common_blocks_total += need.common_blocks; + extra_blocks_total += need.extra_blocks; + } + return common_blocks_total + batch_size * extra_blocks_total; +} + +void HybridKVCacheAllocator::checkCPShardedMallocResult(const MallocInfo& malloc_info) const { + if (!cp_slot_mapper_ || !cp_slot_mapper_->isSharded()) { + return; + } + + const auto& kv_resource = malloc_info.batch_kv_cache_resource; + const int seq_len = malloc_info.incrSeqLen(); + const int reserve_step = malloc_info.complete_token_ids->getReserveStep(); + + for (int batch_id = 0; batch_id < kv_resource->batchSize(); ++batch_id) { + for (int gid = 0; gid < kv_resource->groupNums(); ++gid) { + const auto group = kv_cache_groups_[static_cast(gid)]; + if (!cpShardThisGroup(cp_slot_mapper_, group)) { + continue; + } + const int effective_seq_len = cpEffectiveSeqLenForGroup(cp_slot_mapper_, group, seq_len); + const int expected_blocks = + kv_cache_groups_[static_cast(gid)]->needBlocksNum(effective_seq_len, 0, reserve_step); + const int actual_blocks = kv_resource->blocksNum(batch_id, gid); + RTP_LLM_CHECK_WITH_INFO(actual_blocks == expected_blocks, + "CP invariant violated: batch=%d group=%d blocks=%d != expected_local_blocks=%d " + "(seq_len=%d, effective_seq_len=%d, reserve_step=%d, cp_size=%d, " + "block_size=%d, cacheKeys=%zu)", + batch_id, + gid, + actual_blocks, + expected_blocks, + seq_len, + effective_seq_len, + reserve_step, 
+ cp_slot_mapper_->cpSize(), + cp_slot_mapper_->blockSize(), + kv_resource->cacheKeys(batch_id).size()); + } + } +} + +int HybridKVCacheAllocator::singleBatchNeedBlocks(const BatchKVCacheResourcePtr& batch_kv_cache_resource, + int seq_len, + int reserve_step) const { + int need_blocks = 0; + for (int gid = 0; gid < batch_kv_cache_resource->groupNums(); ++gid) { + const int effective_seq_len = + cpEffectiveSeqLenForGroup(cp_slot_mapper_, kv_cache_groups_[static_cast(gid)], seq_len); + const int cur_blocks = batch_kv_cache_resource->blocksNum(0, gid); + need_blocks += + kv_cache_groups_[static_cast(gid)]->needBlocksNum(effective_seq_len, cur_blocks, reserve_step); + } + return need_blocks; +} + +} // namespace rtp_llm diff --git a/rtp_llm/cpp/cache/allocator/HybridKVCacheAllocator.h b/rtp_llm/cpp/cache/allocator/HybridKVCacheAllocator.h new file mode 100644 index 0000000000..f68b5e9c00 --- /dev/null +++ b/rtp_llm/cpp/cache/allocator/HybridKVCacheAllocator.h @@ -0,0 +1,70 @@ +#pragma once + +#include +#include + +#include "rtp_llm/cpp/cache/group/FullKVCacheGroup.h" +#include "rtp_llm/cpp/cache/allocator/KVCacheAllocator.h" +#include "rtp_llm/cpp/cache/group/LinearKVCacheGroup.h" +#include "rtp_llm/cpp/cache/group/SWAKVCacheGroup.h" + +namespace rtp_llm { + +class HybridKVCacheAllocator: public KVCacheAllocator, public std::enable_shared_from_this { +public: + HybridKVCacheAllocator(const CacheConfig& config, + AllocationType allocation_type = AllocationType::DEVICE, + const kmonitor::MetricsReporterPtr metrics_reporter = nullptr, + int64_t reserve_block_ratio = 0); + + void free(const FreeInfo& free_info) override; + void insertIntoCache(const InsertInfo& insert_info) override; + + std::shared_ptr incrKVCacheRef(const KVCacheResource& kvcache_resource, + const CacheKeysType& cache_keys, + bool is_connector = false) override; + + bool updateKVBlock(const BatchKVCacheResourcePtr& batch_kv_cache_resource, + const std::vector& block_src_batch, + bool copy_last_block, + 
std::vector& block_update_mapping) override; + + int seqSizePerBlock() const override; + int singleBatchNeedBlocks(const BatchKVCacheResourcePtr& batch_kv_cache_resource, + int seq_len, + int reserve_step) const override; + std::vector independentEvictionGroupIds() const override; + +protected: + MallocResult incrMalloc(const MallocInfo& malloc_info) override; + MallocResult initMallocForCommonLen(const MallocInfo& malloc_info) override; + int getNeedBlocks(const MallocInfo& malloc_info) const override; + void checkCPShardedMallocResult(const MallocInfo& malloc_info) const override; + void decrKVCacheRef(const KVCacheResource& kvcache_resource, bool is_connector = false) override; + + int reuseCache(const CacheKeysType& cache_keys, + BatchKVCacheResource& kv_resource, + const std::shared_ptr& cp_mapper); + + virtual void referenceBlocksInGroup(int gid, const BlockIndicesType& blocks, bool is_connector = false) const = 0; + virtual void freeBlocksInGroup(int gid, const BlockIndicesType& blocks, bool is_connector = false) = 0; + virtual bool hasAvailableBlocksForReserve(const MallocInfo& malloc_info, size_t reserve_blocks) const; + bool skipReuseCacheGroup(int gid) const; + bool cpCompactSwaGroup(int gid, const std::shared_ptr& mapper) const; + void rollbackBlockIdsToSize(int gid, BlockIds& block_ids, size_t original_size); + void rollbackInitMalloc(BatchKVCacheResource& kv_resource, + const std::vector& referenced_blocks, + const std::vector& original_sizes); + void rollbackIncrMalloc(BatchKVCacheResource& kv_resource, + const std::vector>& original_sizes, + int failed_batch); + + std::vector kv_cache_groups_; + std::vector full_group_ids_; + std::vector linear_group_ids_; + std::vector swa_group_ids_; +}; + +using HybridKVCacheAllocatorPtr = std::shared_ptr; + +} // namespace rtp_llm diff --git a/rtp_llm/cpp/cache/allocator/HybridPoolKVCacheAllocator.cc b/rtp_llm/cpp/cache/allocator/HybridPoolKVCacheAllocator.cc new file mode 100644 index 0000000000..0d59ec02d1 --- 
/dev/null +++ b/rtp_llm/cpp/cache/allocator/HybridPoolKVCacheAllocator.cc @@ -0,0 +1,687 @@ +#include "rtp_llm/cpp/cache/allocator/HybridPoolKVCacheAllocator.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "rtp_llm/cpp/cache/BlockPoolConfigHelper.h" +#include "rtp_llm/cpp/cache/CPSlotMapper.h" +#include "rtp_llm/cpp/engine_base/stream/CompleteTokenIds.h" +#include "rtp_llm/cpp/metrics/RtpLLMMetrics.h" +#include "rtp_llm/cpp/utils/Logger.h" +#include "rtp_llm/models_py/bindings/core/ExecOps.h" +#include "rtp_llm/models_py/bindings/core/OpData.h" + +namespace rtp_llm { +namespace { + +inline bool cpShardThisGroupForReserve(const std::shared_ptr& mapper, CacheGroupType group_type) { + return mapper && mapper->isSharded() && group_type == CacheGroupType::FULL; +} + +inline int +cpEffectiveSeqLenForReserve(const std::shared_ptr& mapper, CacheGroupType group_type, int seq_len) { + return cpShardThisGroupForReserve(mapper, group_type) ? 
mapper->effectiveSeqLenForAlloc(seq_len) : seq_len; +} + +void appendPoolSummary(std::ostringstream& os, + bool& has_any, + int gid, + const std::string& tag, + CacheGroupType group_type, + const BlockPoolConfig& pool_config) { + static constexpr double kBytesPerMB = 1024.0 * 1024.0; + if (has_any) { + os << "; "; + } + has_any = true; + os << "pool_name=" << pool_config.pool_name << ", gid=" << gid << ", tag=" << tag + << ", type=" << cacheGroupTypeName(group_type) << ", size=" << pool_config.total_size_bytes << " bytes(" + << std::fixed << std::setprecision(2) << static_cast(pool_config.total_size_bytes) / kBytesPerMB + << " MB)" + << ", blocks=" << pool_config.block_num; +} + +} // namespace + +HybridPoolKVCacheAllocator::HybridPoolKVCacheAllocator(const CacheConfig& config, + AllocationType allocation_type, + const kmonitor::MetricsReporterPtr metrics_reporter, + int64_t reserve_block_ratio, + RoleType role_type): + HybridKVCacheAllocator(config, allocation_type, metrics_reporter, reserve_block_ratio), role_type_(role_type) {} + +bool HybridPoolKVCacheAllocator::doInit() { + RTP_LLM_CHECK_WITH_INFO(config_.groupNums() > 0, "no cache groups found in CacheConfig"); + + const int group_nums = config_.groupNums(); + group_block_pools_.reserve(static_cast(group_nums)); + kv_cache_groups_.reserve(static_cast(group_nums)); + + SharedBlockCache* shared_cache_raw = shared_block_cache_ ? 
shared_block_cache_.get() : nullptr; + static constexpr double kBytesPerMB = 1024.0 * 1024.0; + std::ostringstream pool_summary; + size_t pool_total_bytes = 0; + size_t pool_total_blocks = 0; + bool has_pool = false; + + std::vector group_pool_configs; + group_pool_configs.reserve(static_cast(group_nums)); + for (int gid = 0; gid < group_nums; ++gid) { + auto pool_config = BlockPoolConfigHelper::createConfigForGroup(config_, static_cast(gid)); + const auto tag = config_.tagForGroup(static_cast(gid)); + const auto group_type = config_.typeForGroup(static_cast(gid)); + appendPoolSummary(pool_summary, has_pool, gid, tag, group_type, pool_config); + pool_total_bytes += pool_config.total_size_bytes; + pool_total_blocks += pool_config.block_num; + group_pool_configs.push_back(std::move(pool_config)); + } + + if (has_pool) { + const auto summary = pool_summary.str(); + RTP_LLM_LOG_INFO("HybridPool pool summary: pools=[%s], total_size=%zu bytes total_size_mb=%.2f " + "total_blocks=%zu", + summary.c_str(), + pool_total_bytes, + static_cast(pool_total_bytes) / kBytesPerMB, + pool_total_blocks); + } + + for (int gid = 0; gid < group_nums; ++gid) { + const auto& pool_config = group_pool_configs[static_cast(gid)]; + const auto group_type = config_.typeForGroup(static_cast(gid)); + + auto group_pool = std::make_shared(pool_config, + allocation_type_, + /*use_pinned_cpu_backing=*/false, + use_cuda_malloc_block_pool_); + RTP_LLM_CHECK_WITH_INFO( + group_pool->init(), "Failed to initialize block pool %s(group %d)", pool_config.pool_name.c_str(), gid); + + const auto& ids = config_.layerIdsForGroup(static_cast(gid)); + auto spec = config_.specForGroup(static_cast(gid)); + const auto policy = config_.policyForGroup(static_cast(gid)); + + KVCacheGroupPtr group; + if (group_type == CacheGroupType::LINEAR) { + group = std::make_shared( + ids, spec, group_pool, gid, config_.linear_step, shared_cache_raw, metrics_reporter_, policy); + linear_group_ids_.push_back(gid); + } else if 
(group_type == CacheGroupType::SWA) { + group = std::make_shared( + ids, spec, group_pool, gid, config_.linear_step, shared_cache_raw, metrics_reporter_, policy); + swa_group_ids_.push_back(gid); + } else { + group = std::make_shared(ids, spec, group_pool, gid, shared_cache_raw, metrics_reporter_, policy); + full_group_ids_.push_back(gid); + } + + RTP_LLM_CHECK_WITH_INFO( + group->init(), "Failed to initialize KVCacheGroup %s(gid %d)", pool_config.pool_name.c_str(), gid); + group_block_pools_.push_back(group_pool); + kv_cache_groups_.push_back(group); + } + + if (shared_block_cache_) { + shared_block_cache_->init(group_nums, group_block_pools_); + } + + RTP_LLM_LOG_INFO("HybridPoolKVCacheAllocator init success, group pools=%zu", group_block_pools_.size()); + return true; +} + +int HybridPoolKVCacheAllocator::defaultGroupIdForLayer(int layer_id) const { + if (layer_id < 0 || static_cast(layer_id) >= config_.layer_all_num) { + RTP_LLM_FAIL("invalid layer_id=%d", layer_id); + } + const int gid = config_.groupIdFor(layer_id); + RTP_LLM_CHECK_WITH_INFO(gid >= 0 && gid < static_cast(kv_cache_groups_.size()), + "invalid default group id %d for layer %d", + gid, + layer_id); + return gid; +} + +int HybridPoolKVCacheAllocator::validateGroupIdForLayer(int layer_id, int group_id) const { + RTP_LLM_CHECK_WITH_INFO(group_id >= 0 && group_id < static_cast(kv_cache_groups_.size()), + "invalid group id %d for layer %d", + group_id, + layer_id); + RTP_LLM_CHECK_WITH_INFO(layer_id >= 0 && static_cast(layer_id) < config_.layer_all_num, + "invalid layer id %d for layer_all_num=%u", + layer_id, + config_.layer_all_num); + const auto& group_ids = config_.groupIdsForLayer(layer_id); + RTP_LLM_CHECK_WITH_INFO(std::find(group_ids.begin(), group_ids.end(), group_id) != group_ids.end(), + "layer %d does not own cache group %d", + layer_id, + group_id); + return group_id; +} + +void HybridPoolKVCacheAllocator::referenceBlocksInGroup(int gid, + const BlockIndicesType& blocks, + bool 
is_connector) const { + if (is_connector) { + group_block_pools_[static_cast(gid)]->connectorReference(blocks); + } else { + group_block_pools_[static_cast(gid)]->requestReference(blocks); + } +} + +void HybridPoolKVCacheAllocator::freeBlocksInGroup(int gid, const BlockIndicesType& blocks, bool is_connector) { + if (is_connector) { + group_block_pools_[static_cast(gid)]->connectorFree(blocks); + } else { + group_block_pools_[static_cast(gid)]->requestFree(blocks); + } +} + +CacheLayerLayout HybridPoolKVCacheAllocator::allLayerCacheBase() const { + CacheLayerLayout layout; + const auto layer_group_ids = config_.layerGroupIdsSnapshot(); + layout.layer_to_group_ids = layer_group_ids; + layout.group_types = config_.groupTypesSnapshot(); + layout.group_tags = config_.groupTagsSnapshot(); + layout.layer_tag_to_group_id = config_.layerTagToGroupIdSnapshot(); + layout.group_seq_size_per_block = config_.group_seq_size_per_block; + layout.layer_group_types.resize(config_.layer_all_num, CacheGroupType::FULL); + for (size_t layer_id = 0; layer_id < layer_group_ids.size() && layer_id < layout.layer_group_types.size(); + ++layer_id) { + if (!layer_group_ids[layer_id].empty()) { + layout.layer_group_types[layer_id] = + config_.typeForGroup(static_cast(layer_group_ids[layer_id].front())); + } + } + + layout.layers_to_kv_buffer_ptrs.resize(config_.layer_all_num); + layout.layers_to_scale_buffer_ptrs.resize(config_.layer_all_num); + const size_t group_count = kv_cache_groups_.size(); + layout.layers_to_kv_buffer_ptrs_by_group.resize(config_.layer_all_num); + layout.layers_to_scale_buffer_ptrs_by_group.resize(config_.layer_all_num); + for (size_t layer_id = 0; layer_id < static_cast(config_.layer_all_num); ++layer_id) { + layout.layers_to_kv_buffer_ptrs_by_group[layer_id].resize(group_count); + layout.layers_to_scale_buffer_ptrs_by_group[layer_id].resize(group_count); + } + + for (size_t layer_id = 0; layer_id < static_cast(config_.layer_all_num); ++layer_id) { + if (layer_id >= 
layer_group_ids.size() || layer_group_ids[layer_id].size() != 1) { + continue; + } + const int gid = layer_group_ids[layer_id][0]; + RTP_LLM_CHECK_WITH_INFO(gid >= 0 && gid < static_cast(kv_cache_groups_.size()), + "invalid single-tag group id %d for layer %zu", + gid, + layer_id); + const auto layer_tensors = kv_cache_groups_[static_cast(gid)]->allLayerCacheBase(); + const auto scale_tensors = kv_cache_groups_[static_cast(gid)]->allLayerScaleCacheBase(); + auto it = layer_tensors.find(static_cast(layer_id)); + if (it != layer_tensors.end()) { + layout.layers_to_kv_buffer_ptrs[layer_id] = it->second; + } + auto scale_it = scale_tensors.find(static_cast(layer_id)); + if (scale_it != scale_tensors.end()) { + layout.layers_to_scale_buffer_ptrs[layer_id] = scale_it->second; + } + } + + for (int gid = 0; gid < static_cast(kv_cache_groups_.size()); ++gid) { + const auto layer_tensors = kv_cache_groups_[static_cast(gid)]->allLayerCacheBase(); + const auto scale_tensors = kv_cache_groups_[static_cast(gid)]->allLayerScaleCacheBase(); + for (const auto& [layer_id, tensor] : layer_tensors) { + RTP_LLM_CHECK_WITH_INFO( + layer_id >= 0 && static_cast(layer_id) < layout.layers_to_kv_buffer_ptrs_by_group.size(), + "layer_id %d out of by-group kv layout range %zu", + layer_id, + layout.layers_to_kv_buffer_ptrs_by_group.size()); + layout.layers_to_kv_buffer_ptrs_by_group[static_cast(layer_id)][static_cast(gid)] = tensor; + } + for (const auto& [layer_id, tensor] : scale_tensors) { + RTP_LLM_CHECK_WITH_INFO( + layer_id >= 0 && static_cast(layer_id) < layout.layers_to_scale_buffer_ptrs_by_group.size(), + "layer_id %d out of by-group scale layout range %zu", + layer_id, + layout.layers_to_scale_buffer_ptrs_by_group.size()); + layout.layers_to_scale_buffer_ptrs_by_group[static_cast(layer_id)][static_cast(gid)] = + tensor; + } + } + return layout; +} + +BlockAddrInfo HybridPoolKVCacheAllocator::convertIndexToAddr(int layer_id, int block_id) const { + const int gid = 
defaultGroupIdForLayer(layer_id); + return kv_cache_groups_[static_cast(gid)]->convertIndexToAddr(layer_id, block_id); +} + +std::vector HybridPoolKVCacheAllocator::convertIndexToBuffer(int layer_id, int block_id) const { + const int gid = defaultGroupIdForLayer(layer_id); + return kv_cache_groups_[static_cast(gid)]->convertIndexToBuffer(layer_id, block_id); +} + +std::vector HybridPoolKVCacheAllocator::convertIndexToBuffer(int layer_id, + int block_id, + int partition_count, + int partition_id) const { + const int gid = defaultGroupIdForLayer(layer_id); + return kv_cache_groups_[static_cast(gid)]->convertIndexToBuffer( + layer_id, block_id, partition_count, partition_id); +} + +BlockAddrInfo HybridPoolKVCacheAllocator::convertIndexToAddr(int layer_id, int group_id, int block_id) const { + const int gid = validateGroupIdForLayer(layer_id, group_id); + return kv_cache_groups_[static_cast(gid)]->convertIndexToAddr(layer_id, block_id); +} + +std::vector HybridPoolKVCacheAllocator::convertIndexToBuffer(int layer_id, int group_id, int block_id) const { + const int gid = validateGroupIdForLayer(layer_id, group_id); + return kv_cache_groups_[static_cast(gid)]->convertIndexToBuffer(layer_id, block_id); +} + +std::vector HybridPoolKVCacheAllocator::convertIndexToBuffer( + int layer_id, int group_id, int block_id, int partition_count, int partition_id) const { + const int gid = validateGroupIdForLayer(layer_id, group_id); + return kv_cache_groups_[static_cast(gid)]->convertIndexToBuffer( + layer_id, block_id, partition_count, partition_id); +} + +void HybridPoolKVCacheAllocator::blockBatchCopy(const BlockIdPair* begin_ptr, const BlockIdPair* end_ptr) { + if (end_ptr == begin_ptr) { + return; + } + + size_t copy_nums[BatchCopyParams::TYPE_SIZE] = {}; + for (int gid = 0; gid < static_cast(kv_cache_groups_.size()); ++gid) { + RTP_LLM_CHECK_WITH_INFO( + static_cast(gid) < group_block_pools_.size(), "missing block pool for group %d", gid); + const auto copy_type = 
BatchCopyParams::get_copy_type(group_block_pools_[static_cast(gid)]->where(), + group_block_pools_[static_cast(gid)]->where()); + const auto& spec = config_.specForGroup(static_cast(gid)); + const size_t buffers_per_layer = spec->scale_block_size_bytes() > 0 ? 2 : 1; + copy_nums[copy_type] += config_.layerIdsForGroup(static_cast(gid)).size() + * static_cast(end_ptr - begin_ptr) * buffers_per_layer; + } + + BatchCopyParams copy_params; + for (size_t i = 0; i < BatchCopyParams::TYPE_SIZE; ++i) { + copy_params.reserve(static_cast(i), copy_nums[i]); + } + + for (auto it = begin_ptr; it != end_ptr; ++it) { + auto [src_block_index, dest_block_index] = *it; + + for (int gid = 0; gid < static_cast(kv_cache_groups_.size()); ++gid) { + const auto& spec = config_.specForGroup(static_cast(gid)); + const size_t kv_block_size_bytes = spec->block_size_bytes(); + const size_t scale_block_bytes = spec->scale_block_size_bytes(); + const auto copy_type = + BatchCopyParams::get_copy_type(group_block_pools_[static_cast(gid)]->where(), + group_block_pools_[static_cast(gid)]->where()); + + for (int layer_id : config_.layerIdsForGroup(static_cast(gid))) { + auto src_addr_info = + kv_cache_groups_[static_cast(gid)]->convertIndexToAddr(layer_id, src_block_index); + auto dst_addr_info = + kv_cache_groups_[static_cast(gid)]->convertIndexToAddr(layer_id, dest_block_index); + + if (!src_addr_info.kv_addr || !dst_addr_info.kv_addr) { + RTP_LLM_LOG_ERROR("Failed to get block address for pool %s(group %d) layer %d, src_block %d, " + "dst_block %d", + group_block_pools_[static_cast(gid)]->poolName().c_str(), + gid, + layer_id, + src_block_index, + dest_block_index); + continue; + } + + copy_params.add(dst_addr_info.kv_addr, src_addr_info.kv_addr, kv_block_size_bytes, copy_type); + + if (scale_block_bytes > 0 && src_addr_info.kv_scale_addr && dst_addr_info.kv_scale_addr) { + copy_params.add( + dst_addr_info.kv_scale_addr, src_addr_info.kv_scale_addr, scale_block_bytes, copy_type); + } + } + } + } + 
+ execBatchCopy(copy_params); +} + +size_t HybridPoolKVCacheAllocator::freeBlocksNum() const { + size_t total = 0; + for (const auto& pool : group_block_pools_) { + total += pool->freeBlocksNum(); + } + return total; +} + +size_t HybridPoolKVCacheAllocator::availableBlocksNum() const { + size_t total = 0; + for (const auto& pool : group_block_pools_) { + total += pool->availableBlocksNum(); + } + return total; +} + +BatchKVCacheResourcePtr HybridPoolKVCacheAllocator::popBlocksFromCache(size_t min_blocks_to_free) { + if (min_blocks_to_free == 0 || !shared_block_cache_) { + return nullptr; + } + + auto evict_result = shared_block_cache_->selectAndEvict(min_blocks_to_free); + if (evict_result.evicted_keys.empty()) { + return nullptr; + } + if (metrics_reporter_) { + for (const auto& [cache_key, lifetime_ms] : evict_result.evicted_lifetime_ms) { + RtpLLMCacheEvictionMetricsCollector collector; + collector.lifetime_ms = lifetime_ms; + kmonitor::MetricsTags tags("scope", "gpu"); + tags.AddTag("evict_policy", + evict_result.evicted_independent_group.count(cache_key) ? 
"independent" : "chain"); + tags.AddTag("backing", "device"); + metrics_reporter_->report(&tags, + &collector); + } + } + + auto batch_resource = std::make_shared(); + batch_resource->resetBatchSize(1); + batch_resource->initGroups(config_.groupNums(), + static_cast(config_.layer_all_num), + config_.layerGroupIdsSnapshot(), + config_.kernelBlocksPerKvBlock(), + config_.groupTypesSnapshot()); + batch_resource->setLastBlockAligned(true); + + for (int gid = 0; gid < config_.groupNums(); ++gid) { + batch_resource->mutableBlockIds(0, gid).resize(evict_result.evicted_keys.size(), NULL_BLOCK_IDX); + } + + CacheKeysType evicted_keys; + BlockDependenciesType evicted_dependencies; + evicted_keys.reserve(evict_result.evicted_keys.size()); + evicted_dependencies.reserve(evict_result.evicted_keys.size()); + for (size_t evicted_idx = 0; evicted_idx < evict_result.evicted_keys.size(); ++evicted_idx) { + const auto cache_key = evict_result.evicted_keys[evicted_idx]; + const auto& slots = evict_result.evicted_slots.at(cache_key); + evicted_keys.push_back(cache_key); + auto dep_it = evict_result.evicted_dependencies.find(cache_key); + if (dep_it != evict_result.evicted_dependencies.end()) { + evicted_dependencies.push_back(dep_it->second); + } else { + BlockDependency dependency; + dependency.ordinal = static_cast(evicted_idx); + if (evicted_idx > 0) { + dependency.has_parent = true; + dependency.parent_key = evict_result.evicted_keys[evicted_idx - 1]; + } + evicted_dependencies.push_back(dependency); + } + for (int gid = 0; gid < static_cast(slots.size()) && gid < config_.groupNums(); ++gid) { + if (!isNullBlockIdx(slots[gid])) { + batch_resource->mutableBlockIds(0, gid).setAt(evicted_idx, slots[gid]); + } + } + } + batch_resource->cacheResource(0).setCacheKeys(std::move(evicted_keys)); + batch_resource->cacheResource(0).setBlockDependencies(std::move(evicted_dependencies)); + // Evicted keys already come from the GPU cache's actual key namespace. 
+ // Under CP this can be a mixed batch of canonical paged keys and logical + // state/SWA keys, so coordinator must not remap the whole batch again. + batch_resource->cacheResource(0).setCacheKeysAreCpCanonical(true); + return batch_resource; +} + +void HybridPoolKVCacheAllocator::blockCacheFree(const BatchKVCacheResourcePtr& batch_kv_cache_resource) { + if (!batch_kv_cache_resource) { + return; + } + for (int batch_id = 0; batch_id < batch_kv_cache_resource->batchSize(); ++batch_id) { + for (int gid = 0; gid < batch_kv_cache_resource->groupNums(); ++gid) { + BlockIndicesType blocks_to_free; + std::unordered_set seen_blocks; + for (auto block_idx : batch_kv_cache_resource->blocks(batch_id, gid)) { + if (isNullBlockIdx(block_idx) || !seen_blocks.insert(block_idx).second) { + continue; + } + blocks_to_free.push_back(block_idx); + } + if (!blocks_to_free.empty()) { + group_block_pools_[static_cast(gid)]->blockCacheFree(blocks_to_free); + } + } + } +} + +size_t HybridPoolKVCacheAllocator::requestRefBlocksNum() const { + size_t total = 0; + for (const auto& pool : group_block_pools_) { + total += pool->requestRefBlocksNum(); + } + return total; +} + +size_t HybridPoolKVCacheAllocator::connectorRefBlocksNum() const { + size_t total = 0; + for (const auto& pool : group_block_pools_) { + total += pool->connectorRefBlocksNum(); + } + return total; +} + +size_t HybridPoolKVCacheAllocator::blockCacheRefBlocksNum() const { + size_t total = 0; + for (const auto& pool : group_block_pools_) { + total += pool->blockCacheRefBlocksNum(); + } + return total; +} + +size_t HybridPoolKVCacheAllocator::notInUseBlocksNum() const { + size_t total = 0; + for (const auto& pool : group_block_pools_) { + total += pool->notInUseBlocksNum(); + } + return total; +} + +size_t HybridPoolKVCacheAllocator::minTokenCapacity(bool use_available_blocks, bool full_groups_only) const { + if (group_block_pools_.empty()) { + return 0; + } + + auto calculate = [&](bool only_full_groups) { + size_t min_tokens 
= std::numeric_limits::max(); + bool saw_group = false; + for (size_t gid = 0; gid < group_block_pools_.size(); ++gid) { + if (only_full_groups && config_.typeForGroup(gid) != CacheGroupType::FULL) { + continue; + } + if (!group_block_pools_[gid]) { + continue; + } + saw_group = true; + const auto block = use_available_blocks ? group_block_pools_[gid]->availableBlocksNum() : + group_block_pools_[gid]->totalBlocksNum(); + min_tokens = std::min(min_tokens, block * logicalSeqSizePerBlockForCapacity(gid)); + } + return std::make_pair(saw_group, min_tokens); + }; + + if (full_groups_only) { + const auto [saw_full_group, min_tokens] = calculate(/*only_full_groups=*/true); + if (saw_full_group) { + return min_tokens; + } + } + + const auto [saw_group, min_tokens] = calculate(/*only_full_groups=*/false); + return saw_group ? min_tokens : 0; +} + +size_t HybridPoolKVCacheAllocator::availableTokensNum() const { + return minTokenCapacity(/*use_available_blocks=*/true, /*full_groups_only=*/true); +} + +size_t HybridPoolKVCacheAllocator::totalTokensNum() const { + return minTokenCapacity(/*use_available_blocks=*/false, /*full_groups_only=*/true); +} + +size_t HybridPoolKVCacheAllocator::totalBlocksNum() const { + size_t total = 0; + for (const auto& pool : group_block_pools_) { + total += pool->totalBlocksNum(); + } + return total; +} + +size_t HybridPoolKVCacheAllocator::maxAvailableTokensNum() const { + return minTokenCapacity(/*use_available_blocks=*/false, /*full_groups_only=*/true); +} + +KVCacheTokenCapacity HybridPoolKVCacheAllocator::tokenCapacity(size_t default_seq_size_per_block) const { + if (group_block_pools_.empty()) { + return {}; + } + size_t total_tokens = std::numeric_limits::max(); + size_t available_tokens = std::numeric_limits::max(); + bool has_pool = false; + for (size_t gid = 0; gid < group_block_pools_.size(); ++gid) { + const auto& pool = group_block_pools_[gid]; + if (!pool) { + continue; + } + const size_t seq_size = + (gid < 
config_.group_seq_size_per_block.size() && config_.group_seq_size_per_block[gid] > 0) ? + config_.group_seq_size_per_block[gid] : + default_seq_size_per_block; + total_tokens = std::min(total_tokens, pool->totalBlocksNum() * seq_size); + available_tokens = std::min(available_tokens, pool->availableBlocksNum() * seq_size); + has_pool = true; + } + return has_pool ? KVCacheTokenCapacity{total_tokens, available_tokens} : KVCacheTokenCapacity{}; +} + +std::vector HybridPoolKVCacheAllocator::poolMetricsSnapshots() const { + std::vector snapshots; + snapshots.reserve(group_block_pools_.size()); + const size_t reserve_blocks = reserveBlockNum(); + const size_t total_reservable_available_blocks = totalReservableAvailableBlocks(); + for (size_t gid = 0; gid < group_block_pools_.size(); ++gid) { + const auto& pool = group_block_pools_[gid]; + if (!pool) { + continue; + } + KVCachePoolMetricsSnapshot snapshot; + snapshot.pool_index = gid; + snapshot.pool_name = pool->poolName(); + snapshot.total_blocks = pool->totalBlocksNum(); + snapshot.available_blocks = pool->availableBlocksNum(); + snapshot.free_blocks = pool->freeBlocksNum(); + snapshot.request_ref_blocks = pool->requestRefBlocksNum(); + snapshot.connector_ref_blocks = pool->connectorRefBlocksNum(); + snapshot.reserve_blocks = reserveBlocksForPool(gid, reserve_blocks, total_reservable_available_blocks); + snapshot.used_ratio = (snapshot.total_blocks == 0) ? 
+ 0.0f : + static_cast(100.0 * (snapshot.total_blocks - snapshot.available_blocks) + / static_cast(snapshot.total_blocks)); + snapshots.push_back(snapshot); + } + return snapshots; +} + +void HybridPoolKVCacheAllocator::regUserMr(size_t model_id, std::shared_ptr cache_store) { + for (auto& pool : group_block_pools_) { + pool->regUserMr(model_id, cache_store); + } +} + +int64_t HybridPoolKVCacheAllocator::getMrCostTimeMs() const { + int64_t total = 0; + for (const auto& pool : group_block_pools_) { + total += pool->getMrCostTimeMs(); + } + return total; +} + +size_t HybridPoolKVCacheAllocator::totalReservableAvailableBlocks() const { + size_t total = 0; + for (size_t gid = 0; gid < group_block_pools_.size(); ++gid) { + if (!group_block_pools_[gid] || config_.usesExplicitIndependentBlocks(gid)) { + continue; + } + total += group_block_pools_[gid]->availableBlocksNum(); + } + return total; +} + +size_t HybridPoolKVCacheAllocator::reserveBlocksForPool(size_t gid, + size_t reserve_blocks, + size_t total_reservable_available_blocks) const { + if (gid >= group_block_pools_.size() || !group_block_pools_[gid] || config_.usesExplicitIndependentBlocks(gid) + || total_reservable_available_blocks == 0) { + return 0; + } + return reserve_blocks * group_block_pools_[gid]->availableBlocksNum() / total_reservable_available_blocks; +} + +bool HybridPoolKVCacheAllocator::hasAvailableBlocksForReserve(const MallocInfo& malloc_info, + size_t reserve_blocks) const { + if (!malloc_info.batch_kv_cache_resource || !malloc_info.complete_token_ids) { + return true; + } + const auto& cp_mapper = cp_slot_mapper_; + const int batch_size = malloc_info.batch_kv_cache_resource->batchSize(); + const int total_seq_len = malloc_info.complete_token_ids->totalSeqLength(); + const int raw_common_seq_len = std::min(malloc_info.complete_token_ids->commonSeqLength(), total_seq_len); + const int raw_seq_len = malloc_info.complete_token_ids->seqLength(); + const int reserve_step = 
malloc_info.complete_token_ids->getReserveStep(); + const bool reuse_enabled = malloc_info.reuse_cache; + + const size_t total_reservable_available_blocks = totalReservableAvailableBlocks(); + + for (int gid = 0; gid < static_cast(kv_cache_groups_.size()); ++gid) { + const auto group_type = config_.typeForGroup(static_cast(gid)); + const int group_common_seq = cpEffectiveSeqLenForReserve(cp_mapper, group_type, raw_common_seq_len); + const int group_seq_len = cpEffectiveSeqLenForReserve(cp_mapper, group_type, raw_seq_len); + const int group_reuse_blocks_len = reuse_enabled ? malloc_info.batch_kv_cache_resource->blocksNum(0, gid) : 0; + const auto need = kv_cache_groups_[static_cast(gid)]->getNeedBlocks( + group_common_seq, group_seq_len, reserve_step, group_reuse_blocks_len, reuse_enabled); + const int need_blocks = need.common_blocks + batch_size * need.extra_blocks; + if (need_blocks <= 0) { + continue; + } + const auto& pool = group_block_pools_[static_cast(gid)]; + const size_t available_blocks = pool->availableBlocksNum(); + const size_t total_blocks = pool->totalBlocksNum(); + const size_t group_reserve_blocks = + reserveBlocksForPool(static_cast(gid), reserve_blocks, total_reservable_available_blocks); + if (available_blocks < static_cast(need_blocks) + group_reserve_blocks) { + if (malloc_info.verbose) { + RTP_LLM_LOG_INFO("HybridPool initMalloc rejected by reserve blocks: request_id=%ld pool_name=%s " + "group=%d need_blocks=%d total_blocks=%zu available_blocks=%zu " + "reserve_blocks=%zu group_reserve_blocks=%zu", + malloc_info.request_id, + pool->poolName().c_str(), + gid, + need_blocks, + total_blocks, + available_blocks, + reserve_blocks, + group_reserve_blocks); + } + return false; + } + } + return true; +} + +} // namespace rtp_llm diff --git a/rtp_llm/cpp/cache/allocator/HybridPoolKVCacheAllocator.h b/rtp_llm/cpp/cache/allocator/HybridPoolKVCacheAllocator.h new file mode 100644 index 0000000000..4c5dc81c60 --- /dev/null +++ 
b/rtp_llm/cpp/cache/allocator/HybridPoolKVCacheAllocator.h @@ -0,0 +1,72 @@ +#pragma once + +#include +#include + +#include "rtp_llm/cpp/cache/allocator/HybridKVCacheAllocator.h" +#include "rtp_llm/cpp/config/ConfigModules.h" + +namespace rtp_llm { + +class HybridPoolKVCacheAllocator: public HybridKVCacheAllocator { +public: + HybridPoolKVCacheAllocator(const CacheConfig& config, + AllocationType allocation_type = AllocationType::DEVICE, + const kmonitor::MetricsReporterPtr metrics_reporter = nullptr, + int64_t reserve_block_ratio = 0, + RoleType role_type = RoleType::PDFUSION); + + BlockAddrInfo convertIndexToAddr(int layer_id, int block_id) const override; + std::vector convertIndexToBuffer(int layer_id, int block_id) const override; + std::vector + convertIndexToBuffer(int layer_id, int block_id, int partition_count, int partition_id) const override; + BlockAddrInfo convertIndexToAddr(int layer_id, int group_id, int block_id) const override; + std::vector convertIndexToBuffer(int layer_id, int group_id, int block_id) const override; + std::vector convertIndexToBuffer( + int layer_id, int group_id, int block_id, int partition_count, int partition_id) const override; + void blockBatchCopy(const BlockIdPair* copy_mapping_begin, const BlockIdPair* copy_mapping_end) override; + + CacheLayerLayout allLayerCacheBase() const override; + + size_t freeBlocksNum() const override; + size_t availableBlocksNum() const override; + BatchKVCacheResourcePtr popBlocksFromCache(size_t min_blocks_to_free) override; + void blockCacheFree(const BatchKVCacheResourcePtr& batch_kv_cache_resource) override; + size_t requestRefBlocksNum() const override; + size_t connectorRefBlocksNum() const override; + size_t blockCacheRefBlocksNum() const override; + size_t notInUseBlocksNum() const override; + size_t availableTokensNum() const override; + size_t totalTokensNum() const override; + size_t totalBlocksNum() const override; + size_t maxAvailableTokensNum() const override; + 
KVCacheTokenCapacity tokenCapacity(size_t default_seq_size_per_block) const override; + std::vector poolMetricsSnapshots() const override; + void regUserMr(size_t model_id, std::shared_ptr cache_store = nullptr) override; + int64_t getMrCostTimeMs() const override; + + // Per-pool access for diagnostics / per-pool metrics reporting. + const std::vector& groupBlockPools() const { + return group_block_pools_; + } + +private: + bool doInit() override; + + void referenceBlocksInGroup(int gid, const BlockIndicesType& blocks, bool is_connector = false) const override; + void freeBlocksInGroup(int gid, const BlockIndicesType& blocks, bool is_connector = false) override; + bool hasAvailableBlocksForReserve(const MallocInfo& malloc_info, size_t reserve_blocks) const override; + + int validateGroupIdForLayer(int layer_id, int group_id) const; + int defaultGroupIdForLayer(int layer_id) const; + size_t minTokenCapacity(bool use_available_blocks, bool full_groups_only) const; + size_t totalReservableAvailableBlocks() const; + size_t reserveBlocksForPool(size_t gid, size_t reserve_blocks, size_t total_reservable_available_blocks) const; + + std::vector group_block_pools_; + RoleType role_type_{RoleType::PDFUSION}; +}; + +using HybridPoolKVCacheAllocatorPtr = std::shared_ptr; + +} // namespace rtp_llm diff --git a/rtp_llm/cpp/cache/allocator/HybridTypeKVCacheAllocator.cc b/rtp_llm/cpp/cache/allocator/HybridTypeKVCacheAllocator.cc new file mode 100644 index 0000000000..0efe811580 --- /dev/null +++ b/rtp_llm/cpp/cache/allocator/HybridTypeKVCacheAllocator.cc @@ -0,0 +1,192 @@ +#include "rtp_llm/cpp/cache/allocator/HybridTypeKVCacheAllocator.h" + +#include +#include + +#include "rtp_llm/cpp/cache/BlockPoolConfigHelper.h" +#include "rtp_llm/cpp/utils/Logger.h" + +namespace rtp_llm { + +HybridTypeKVCacheAllocator::HybridTypeKVCacheAllocator(const CacheConfig& config, + AllocationType allocation_type, + const kmonitor::MetricsReporterPtr metrics_reporter, + int64_t reserve_block_ratio): 
+ HybridKVCacheAllocator(config, allocation_type, metrics_reporter, reserve_block_ratio) {} + +bool HybridTypeKVCacheAllocator::doInit() { + RTP_LLM_CHECK_WITH_INFO(config_.groupNums() > 0, "no cache groups found in CacheConfig"); + + auto pool_config = BlockPoolConfigHelper::createConfig(config_); + block_pool_ = std::make_shared( + pool_config, allocation_type_, /*use_pinned_cpu_backing=*/false, use_cuda_malloc_block_pool_); + RTP_LLM_CHECK_WITH_INFO(block_pool_->init(), "Failed to initialize block pool for HybridTypeKVCacheAllocator"); + + const int group_nums = config_.groupNums(); + kv_cache_groups_.reserve(group_nums); + + SharedBlockCache* shared_cache_raw = shared_block_cache_ ? shared_block_cache_.get() : nullptr; + + if (shared_block_cache_) { + std::vector group_pools(static_cast(group_nums), block_pool_); + shared_block_cache_->init(group_nums, group_pools); + } + + for (int gid = 0; gid < group_nums; ++gid) { + KVCacheSpecPtr spec = config_.specForGroup(static_cast(gid)); + const auto& ids = config_.layerIdsForGroup(static_cast(gid)); + + KVCacheGroupPtr group; + const auto group_type = config_.typeForGroup(static_cast(gid)); + const auto policy = config_.policyForGroup(static_cast(gid)); + if (group_type == CacheGroupType::SWA) { + group = std::make_shared( + ids, spec, block_pool_, gid, config_.linear_step, shared_cache_raw, nullptr, policy); + swa_group_ids_.push_back(gid); + } else if (group_type == CacheGroupType::LINEAR || (spec && spec->type == KVCacheSpecType::LinearAttention)) { + group = std::make_shared( + ids, spec, block_pool_, gid, config_.linear_step, shared_cache_raw, nullptr, policy); + linear_group_ids_.push_back(gid); + } else { + group = std::make_shared(ids, spec, block_pool_, gid, shared_cache_raw, nullptr, policy); + full_group_ids_.push_back(gid); + } + + RTP_LLM_CHECK_WITH_INFO(group->init(), "Failed to initialize KVCacheGroup gid %d", gid); + kv_cache_groups_.push_back(group); + } + + 
global_layer_to_local_id_.assign(static_cast(config_.layer_all_num), -1); + for (int gid = 0; gid < group_nums; ++gid) { + const auto& cur_group_layers = config_.layerIdsForGroup(static_cast(gid)); + for (size_t local_layer_idx = 0; local_layer_idx < cur_group_layers.size(); ++local_layer_idx) { + const int global_layer_idx = cur_group_layers[local_layer_idx]; + if (global_layer_idx >= 0 && static_cast(global_layer_idx) < global_layer_to_local_id_.size()) { + global_layer_to_local_id_[static_cast(global_layer_idx)] = static_cast(local_layer_idx); + } + } + } + + RTP_LLM_LOG_INFO("HybridTypeKVCacheAllocator init success"); + return true; +} + +void HybridTypeKVCacheAllocator::referenceBlocksInGroup(int gid, + const BlockIndicesType& blocks, + bool is_connector) const { + (void)gid; + if (is_connector) { + block_pool_->connectorReference(blocks); + } else { + block_pool_->requestReference(blocks); + } +} + +void HybridTypeKVCacheAllocator::freeBlocksInGroup(int gid, const BlockIndicesType& blocks, bool is_connector) { + (void)gid; + if (is_connector) { + block_pool_->connectorFree(blocks); + } else { + block_pool_->requestFree(blocks); + } +} + +CacheLayerLayout HybridTypeKVCacheAllocator::allLayerCacheBase() const { + CacheLayerLayout layout; + const auto layer_tensors = block_pool_->allLayerCacheBase(); + const auto scale_tensors = block_pool_->allLayerScaleCacheBase(); + + layout.layer_to_group_ids = config_.layerGroupIdsSnapshot(); + layout.layers_to_kv_buffer_ptrs.resize(config_.layer_all_num); + layout.layers_to_scale_buffer_ptrs.resize(config_.layer_all_num); + + for (size_t layer_id = 0; layer_id < static_cast(config_.layer_all_num); ++layer_id) { + int32_t local = global_layer_to_local_id_[layer_id]; + const size_t local_idx = static_cast(local); + + if (local_idx < layer_tensors.size() && layer_tensors[local_idx].defined() + && layer_tensors[local_idx].numel() > 0) { + layout.layers_to_kv_buffer_ptrs[layer_id] = layer_tensors[local_idx]; + } + + if 
(!scale_tensors.empty() && local_idx < scale_tensors.size() && scale_tensors[local_idx].defined() + && scale_tensors[local_idx].numel() > 0) { + layout.layers_to_scale_buffer_ptrs[layer_id] = scale_tensors[local_idx]; + } + } + return layout; +} + +int HybridTypeKVCacheAllocator::defaultGroupIdForLayer(int layer_id) const { + if (layer_id < 0 || static_cast(layer_id) >= config_.layer_all_num) { + RTP_LLM_FAIL("invalid layer_id=%d", layer_id); + } + const int gid = config_.groupIdFor(layer_id); + RTP_LLM_CHECK_WITH_INFO(gid >= 0 && gid < static_cast(kv_cache_groups_.size()), "invalid group id mapping"); + return gid; +} + +int HybridTypeKVCacheAllocator::validateGroupIdForLayer(int layer_id, int group_id) const { + RTP_LLM_CHECK_WITH_INFO(group_id >= 0 && group_id < static_cast(kv_cache_groups_.size()), + "invalid group id %d for layer %d", + group_id, + layer_id); + RTP_LLM_CHECK_WITH_INFO(layer_id >= 0 && static_cast(layer_id) < config_.layer_all_num, + "invalid layer id %d for layer_all_num=%u", + layer_id, + config_.layer_all_num); + const auto& group_ids = config_.groupIdsForLayer(layer_id); + RTP_LLM_CHECK_WITH_INFO(std::find(group_ids.begin(), group_ids.end(), group_id) != group_ids.end(), + "layer %d does not own cache group %d", + layer_id, + group_id); + return group_id; +} + +BlockAddrInfo HybridTypeKVCacheAllocator::convertIndexToAddr(int layer_id, int block_id) const { + if (layer_id < 0 || static_cast(layer_id) >= config_.layer_all_num) { + RTP_LLM_FAIL("convertIndexToAddr invalid layer_id=%d", layer_id); + } + const int gid = defaultGroupIdForLayer(layer_id); + return kv_cache_groups_[static_cast(gid)]->convertIndexToAddr(layer_id, block_id); +} + +std::vector HybridTypeKVCacheAllocator::convertIndexToBuffer(int layer_id, int block_id) const { + if (layer_id < 0 || static_cast(layer_id) >= config_.layer_all_num) { + RTP_LLM_FAIL("convertIndexToBuffer invalid layer_id=%d", layer_id); + } + const int gid = defaultGroupIdForLayer(layer_id); + return 
kv_cache_groups_[static_cast(gid)]->convertIndexToBuffer(layer_id, block_id); +} + +std::vector HybridTypeKVCacheAllocator::convertIndexToBuffer(int layer_id, + int block_id, + int partition_count, + int partition_id) const { + if (layer_id < 0 || static_cast(layer_id) >= config_.layer_all_num) { + RTP_LLM_FAIL("convertIndexToBuffer(partition) invalid layer_id=%d", layer_id); + } + const int gid = defaultGroupIdForLayer(layer_id); + return kv_cache_groups_[static_cast(gid)]->convertIndexToBuffer( + layer_id, block_id, partition_count, partition_id); +} + +BlockAddrInfo HybridTypeKVCacheAllocator::convertIndexToAddr(int layer_id, int group_id, int block_id) const { + const int gid = validateGroupIdForLayer(layer_id, group_id); + return kv_cache_groups_[static_cast(gid)]->convertIndexToAddr(layer_id, block_id); +} + +std::vector +HybridTypeKVCacheAllocator::convertIndexToBuffer(int layer_id, int group_id, int block_id) const { + const int gid = validateGroupIdForLayer(layer_id, group_id); + return kv_cache_groups_[static_cast(gid)]->convertIndexToBuffer(layer_id, block_id); +} + +std::vector HybridTypeKVCacheAllocator::convertIndexToBuffer( + int layer_id, int group_id, int block_id, int partition_count, int partition_id) const { + const int gid = validateGroupIdForLayer(layer_id, group_id); + return kv_cache_groups_[static_cast(gid)]->convertIndexToBuffer( + layer_id, block_id, partition_count, partition_id); +} + +} // namespace rtp_llm diff --git a/rtp_llm/cpp/cache/allocator/HybridTypeKVCacheAllocator.h b/rtp_llm/cpp/cache/allocator/HybridTypeKVCacheAllocator.h new file mode 100644 index 0000000000..7dc4b3f119 --- /dev/null +++ b/rtp_llm/cpp/cache/allocator/HybridTypeKVCacheAllocator.h @@ -0,0 +1,42 @@ +#pragma once + +#include +#include + +#include "rtp_llm/cpp/cache/allocator/HybridKVCacheAllocator.h" + +namespace rtp_llm { + +class HybridTypeKVCacheAllocator: public HybridKVCacheAllocator { +public: + HybridTypeKVCacheAllocator(const CacheConfig& config, + 
AllocationType allocation_type = AllocationType::DEVICE, + const kmonitor::MetricsReporterPtr metrics_reporter = nullptr, + int64_t reserve_block_ratio = 0); + + BlockAddrInfo convertIndexToAddr(int layer_id, int block_id) const override; + std::vector convertIndexToBuffer(int layer_id, int block_id) const override; + std::vector + convertIndexToBuffer(int layer_id, int block_id, int partition_count, int partition_id) const override; + BlockAddrInfo convertIndexToAddr(int layer_id, int group_id, int block_id) const override; + std::vector convertIndexToBuffer(int layer_id, int group_id, int block_id) const override; + std::vector convertIndexToBuffer( + int layer_id, int group_id, int block_id, int partition_count, int partition_id) const override; + CacheLayerLayout allLayerCacheBase() const override; + +private: + bool doInit() override; + + void referenceBlocksInGroup(int gid, const BlockIndicesType& blocks, bool is_connector = false) const override; + void freeBlocksInGroup(int gid, const BlockIndicesType& blocks, bool is_connector = false) override; + + int defaultGroupIdForLayer(int layer_id) const; + int validateGroupIdForLayer(int layer_id, int group_id) const; + + // global layer id -> local layer id + std::vector global_layer_to_local_id_; +}; + +using HybridTypeKVCacheAllocatorPtr = std::shared_ptr; + +} // namespace rtp_llm diff --git a/rtp_llm/cpp/cache/KVCacheAllocator.cc b/rtp_llm/cpp/cache/allocator/KVCacheAllocator.cc similarity index 57% rename from rtp_llm/cpp/cache/KVCacheAllocator.cc rename to rtp_llm/cpp/cache/allocator/KVCacheAllocator.cc index 484c520596..32d2e3db0d 100644 --- a/rtp_llm/cpp/cache/KVCacheAllocator.cc +++ b/rtp_llm/cpp/cache/allocator/KVCacheAllocator.cc @@ -7,7 +7,7 @@ #include "rtp_llm/models_py/bindings/core/OpData.h" #include "rtp_llm/cpp/cache/BlockPoolConfigHelper.h" #include "rtp_llm/cpp/engine_base/stream/CompleteTokenIds.h" -#include "rtp_llm/cpp/cache/KVCacheAllocator.h" +#include 
"rtp_llm/cpp/cache/allocator/KVCacheAllocator.h" #include "rtp_llm/cpp/metrics/RtpLLMMetrics.h" namespace rtp_llm { @@ -51,7 +51,8 @@ MallocResult KVCacheAllocator::initMalloc(const MallocInfo& malloc_info) { if (malloc_info.batch_kv_cache_resource) { const auto& cache_keys = malloc_info.batch_kv_cache_resource->cacheKeys(0); size_t match_keys_size = cache_keys.size(); - device_input_length = static_cast(match_keys_size) * config_.seq_size_per_block; + device_input_length = + static_cast(match_keys_size) * deviceCacheMetricTokensPerBlock(); } if (device_input_length > 0) { @@ -81,11 +82,16 @@ MallocResult KVCacheAllocator::malloc(const MallocInfo& malloc_info) { return {false, 0}; } + MallocResult result; if (malloc_info.batch_kv_cache_resource->curBlocksNum() == 0) { - return initMalloc(malloc_info); + result = initMalloc(malloc_info); } else { - return incrMalloc(malloc_info); + result = incrMalloc(malloc_info); } + if (result.success) { + checkCPShardedMallocResult(malloc_info); + } + return result; } uint32_t KVCacheAllocator::convertToGlobalLayerId(size_t model_id, int local_layer_id) const { @@ -110,17 +116,60 @@ uint32_t KVCacheAllocator::convertToGlobalLayerId(size_t model_id, int local_lay RTP_LLM_LOG_ERROR("convertToGlobalLayerId: mtp_sub_configs[%zu] is null", model_id - 1); return std::numeric_limits::max(); } - if (sub->global_layer_ids.empty()) { - RTP_LLM_LOG_ERROR("convertToGlobalLayerId: mtp_sub_configs[%zu] global_layer_ids is empty", model_id - 1); + if (sub->groupNums() <= 0) { + RTP_LLM_LOG_ERROR("convertToGlobalLayerId: mtp_sub_configs[%zu] cache groups are empty", model_id - 1); return std::numeric_limits::max(); } - if (local_layer_id >= 0 && static_cast(local_layer_id) < sub->global_layer_ids[0].size()) { - return sub->global_layer_ids[0][static_cast(local_layer_id)]; + // SWA-only DSV4 propose configs put the single MTP layer in the SWA group + // (gid=6), not FULL[0], so ``global_layer_ids[0]`` is empty. 
Flatten across + // all groups — matches ``KVCacheManager::getMTPModuleCacheLayerLayout``. + size_t flat_idx = 0; + for (int gid = 0; gid < sub->groupNums(); ++gid) { + const auto& group_ids = sub->layerIdsForGroup(static_cast(gid)); + for (int gid_val : group_ids) { + if (static_cast(flat_idx) == local_layer_id) { + return static_cast(gid_val); + } + ++flat_idx; + } } RTP_LLM_LOG_ERROR("convertToGlobalLayerId: local_layer_id=%d is invalid", local_layer_id); return std::numeric_limits::max(); } +BlockAddrInfo KVCacheAllocator::convertIndexToAddr(int layer_id, int group_id, int block_id) const { + (void)group_id; + return convertIndexToAddr(layer_id, block_id); +} + +std::vector KVCacheAllocator::convertIndexToBuffer(int layer_id, int group_id, int block_id) const { + (void)group_id; + return convertIndexToBuffer(layer_id, block_id); +} + +std::vector KVCacheAllocator::convertIndexToBuffer( + int layer_id, int group_id, int block_id, int partition_count, int partition_id) const { + (void)group_id; + return convertIndexToBuffer(layer_id, block_id, partition_count, partition_id); +} + +BlockAddrInfo KVCacheAllocator::convertIndexToAddrByTag(int layer_id, const std::string& tag, int block_id) const { + const int group_id = config_.groupIdForLayerTag(layer_id, tag); + return convertIndexToAddr(layer_id, group_id, block_id); +} + +std::vector +KVCacheAllocator::convertIndexToBufferByTag(int layer_id, const std::string& tag, int block_id) const { + const int group_id = config_.groupIdForLayerTag(layer_id, tag); + return convertIndexToBuffer(layer_id, group_id, block_id); +} + +std::vector KVCacheAllocator::convertIndexToBufferByTag( + int layer_id, const std::string& tag, int block_id, int partition_count, int partition_id) const { + const int group_id = config_.groupIdForLayerTag(layer_id, tag); + return convertIndexToBuffer(layer_id, group_id, block_id, partition_count, partition_id); +} + void KVCacheAllocator::blockCopy(int src_block_index, int dest_block_index) { 
BlockIdPair copy_mapping{src_block_index, dest_block_index}; blockBatchCopy(©_mapping, ©_mapping + 1); @@ -158,7 +207,7 @@ void KVCacheAllocator::blockBatchCopy(const BlockIdPair* begin_ptr, const BlockI copy_params.reserve(static_cast(i), copy_nums[i]); } - auto& spec = config_.cache_specs[0]; + auto& spec = config_.specForGroup(0); size_t kv_block_size_bytes = spec->block_size_bytes(); for (auto it = begin_ptr; it != end_ptr; ++it) { @@ -181,7 +230,7 @@ void KVCacheAllocator::blockBatchCopy(const BlockIdPair* begin_ptr, const BlockI if (src_addr_info.kv_scale_addr && dst_addr_info.kv_scale_addr) { copy_params.add(dst_addr_info.kv_scale_addr, src_addr_info.kv_scale_addr, - static_cast(config_.kv_scale_stride_bytes), + config_.kvScaleStrideBytesForGroup(0), copy_type); } } @@ -203,43 +252,72 @@ size_t KVCacheAllocator::availableBlocksNum() const { } BatchKVCacheResourcePtr KVCacheAllocator::popBlocksFromCache(size_t min_blocks_to_free) { - if (!block_pool_ || min_blocks_to_free == 0) { + if (!shared_block_cache_ || min_blocks_to_free == 0) { return nullptr; } - auto block_cache = block_pool_->blockCache(); - if (!block_cache) { - return nullptr; - } - - auto evict_result = block_cache->selectAndEvict(min_blocks_to_free); + auto evict_result = shared_block_cache_->selectAndEvict(min_blocks_to_free); if (evict_result.evicted_keys.empty()) { return nullptr; } + if (metrics_reporter_) { + for (const auto& [cache_key, lifetime_ms] : evict_result.evicted_lifetime_ms) { + RtpLLMCacheEvictionMetricsCollector collector; + collector.lifetime_ms = lifetime_ms; + kmonitor::MetricsTags tags("scope", "gpu"); + tags.AddTag("evict_policy", + evict_result.evicted_independent_group.count(cache_key) ? 
"independent" : "chain"); + tags.AddTag("backing", "device"); + metrics_reporter_->report(&tags, + &collector); + } + } auto batch_resource = std::make_shared(); batch_resource->resetBatchSize(1); - batch_resource->initGroups(config_.groupNums(), static_cast(config_.layer_all_num), config_.layer_to_group_id); + batch_resource->initGroups(config_.groupNums(), + static_cast(config_.layer_all_num), + config_.layerGroupIdsSnapshot(), + config_.kernelBlocksPerKvBlock(), + config_.groupTypesSnapshot()); batch_resource->setLastBlockAligned(true); for (int gid = 0; gid < config_.groupNums(); ++gid) { batch_resource->mutableBlockIds(0, gid).resize(evict_result.evicted_keys.size(), NULL_BLOCK_IDX); } - size_t evicted_idx = 0; - for (const auto cache_key : evict_result.evicted_keys) { - batch_resource->pushBackCacheKey(0, cache_key); - auto& items = evict_result.evicted_items.at(cache_key); - for (const auto& item : items) { - auto& block_ids = batch_resource->mutableBlockIds(0, item.group_id); - RTP_LLM_CHECK_WITH_INFO(evicted_idx < block_ids.blocksNum(), - "evicted index out of range: idx=%zu, blocks_num=%zu", - evicted_idx, - block_ids.blocksNum()); - block_ids.setAt(evicted_idx, item.block_index); + CacheKeysType evicted_keys; + BlockDependenciesType evicted_dependencies; + evicted_keys.reserve(evict_result.evicted_keys.size()); + evicted_dependencies.reserve(evict_result.evicted_keys.size()); + for (size_t evicted_idx = 0; evicted_idx < evict_result.evicted_keys.size(); ++evicted_idx) { + const auto cache_key = evict_result.evicted_keys[evicted_idx]; + const auto& slots = evict_result.evicted_slots.at(cache_key); + evicted_keys.push_back(cache_key); + auto dep_it = evict_result.evicted_dependencies.find(cache_key); + if (dep_it != evict_result.evicted_dependencies.end()) { + evicted_dependencies.push_back(dep_it->second); + } else { + BlockDependency dependency; + dependency.ordinal = static_cast(evicted_idx); + if (evicted_idx > 0) { + dependency.has_parent = true; + 
dependency.parent_key = evict_result.evicted_keys[evicted_idx - 1]; + } + evicted_dependencies.push_back(dependency); + } + for (int gid = 0; gid < static_cast(slots.size()) && gid < config_.groupNums(); ++gid) { + if (!isNullBlockIdx(slots[gid])) { + batch_resource->mutableBlockIds(0, gid).setAt(evicted_idx, slots[gid]); + } } - ++evicted_idx; } + batch_resource->cacheResource(0).setCacheKeys(std::move(evicted_keys)); + batch_resource->cacheResource(0).setBlockDependencies(std::move(evicted_dependencies)); + // Evicted keys already come from the GPU cache's actual key namespace. + // Under CP this can be a mixed batch of canonical paged keys and logical + // state/SWA keys, so coordinator must not remap the whole batch again. + batch_resource->cacheResource(0).setCacheKeysAreCpCanonical(true); return batch_resource; } @@ -282,7 +360,11 @@ size_t KVCacheAllocator::notInUseBlocksNum() const { } size_t KVCacheAllocator::availableTokensNum() const { - return block_pool_ ? (block_pool_->availableBlocksNum() * seqSizePerBlock()) : 0; + return block_pool_ ? (block_pool_->availableBlocksNum() * logicalSeqSizePerBlockForCapacity(/*gid=*/0)) : 0; +} + +size_t KVCacheAllocator::totalTokensNum() const { + return block_pool_ ? (block_pool_->totalBlocksNum() * logicalSeqSizePerBlockForCapacity(/*gid=*/0)) : 0; } size_t KVCacheAllocator::totalBlocksNum() const { @@ -290,7 +372,51 @@ size_t KVCacheAllocator::totalBlocksNum() const { } size_t KVCacheAllocator::maxAvailableTokensNum() const { - return block_pool_ ? 
(block_pool_->totalBlocksNum() * seqSizePerBlock()) : 0; + return totalTokensNum(); +} + +bool KVCacheAllocator::cpShardThisGroupForCapacity(size_t gid) const { + if (!cp_slot_mapper_ || !cp_slot_mapper_->isSharded()) { + return false; + } + return gid >= static_cast(config_.groupNums()) || config_.typeForGroup(gid) == CacheGroupType::FULL; +} + +size_t KVCacheAllocator::logicalSeqSizePerBlockForCapacity(size_t gid) const { + if (cpShardThisGroupForCapacity(gid)) { + return static_cast(cp_slot_mapper_->virtualBlockSize()); + } + return (gid < config_.group_seq_size_per_block.size() && config_.group_seq_size_per_block[gid] > 0) ? + config_.group_seq_size_per_block[gid] : + config_.seq_size_per_block; +} + +int KVCacheAllocator::cpEffectiveSeqLenForAlloc(size_t gid, int seq_len) const { + if (cpShardThisGroupForCapacity(gid)) { + return cp_slot_mapper_->effectiveSeqLenForAlloc(seq_len); + } + return seq_len; +} + +int KVCacheAllocator::deviceCacheMetricTokensPerBlock() const { + if (cp_slot_mapper_ && cp_slot_mapper_->isSharded()) { + return cp_slot_mapper_->virtualBlockSize(); + } + return seqSizePerBlock(); +} + +KVCacheTokenCapacity KVCacheAllocator::tokenCapacity(size_t default_seq_size_per_block) const { + const size_t total_blocks = totalBlocksNum(); + const size_t available_blocks = availableBlocksNum(); + return {total_blocks * default_seq_size_per_block, available_blocks * default_seq_size_per_block}; +} + +std::vector KVCacheAllocator::poolMetricsSnapshots() const { + return {}; +} + +std::vector KVCacheAllocator::independentEvictionGroupIds() const { + return {}; } void KVCacheAllocator::regUserMr(size_t model_id, std::shared_ptr cache_store) { diff --git a/rtp_llm/cpp/cache/KVCacheAllocator.h b/rtp_llm/cpp/cache/allocator/KVCacheAllocator.h similarity index 50% rename from rtp_llm/cpp/cache/KVCacheAllocator.h rename to rtp_llm/cpp/cache/allocator/KVCacheAllocator.h index 73c7584faa..5023a0d946 100644 --- a/rtp_llm/cpp/cache/KVCacheAllocator.h +++ 
b/rtp_llm/cpp/cache/allocator/KVCacheAllocator.h @@ -1,18 +1,38 @@ #pragma once +#include #include #include #include +#include #include #include "kmonitor/client/MetricsReporter.h" #include "rtp_llm/cpp/cache/Types.h" #include "rtp_llm/cpp/cache/CacheConfig.h" #include "rtp_llm/cpp/cache/BlockPool.h" +#include "rtp_llm/cpp/cache/SharedBlockCache.h" #include "rtp_llm/cpp/cache/BufferTypes.h" namespace rtp_llm { +struct KVCacheTokenCapacity { + size_t total_tokens = 0; + size_t available_tokens = 0; +}; + +struct KVCachePoolMetricsSnapshot { + size_t pool_index = 0; + std::string pool_name = "unnamed"; + size_t free_blocks = 0; + size_t available_blocks = 0; + size_t request_ref_blocks = 0; + size_t connector_ref_blocks = 0; + size_t total_blocks = 0; + size_t reserve_blocks = 0; + float used_ratio = 0.0f; +}; + class KVCacheAllocator { public: KVCacheAllocator(const CacheConfig& config, @@ -33,9 +53,17 @@ class KVCacheAllocator { virtual std::vector convertIndexToBuffer(int layer_id, int block_id) const = 0; virtual std::vector convertIndexToBuffer(int layer_id, int block_id, int partition_count, int partition_id) const = 0; + virtual BlockAddrInfo convertIndexToAddr(int layer_id, int group_id, int block_id) const; + virtual std::vector convertIndexToBuffer(int layer_id, int group_id, int block_id) const; + virtual std::vector + convertIndexToBuffer(int layer_id, int group_id, int block_id, int partition_count, int partition_id) const; + virtual BlockAddrInfo convertIndexToAddrByTag(int layer_id, const std::string& tag, int block_id) const; + virtual std::vector convertIndexToBufferByTag(int layer_id, const std::string& tag, int block_id) const; + virtual std::vector convertIndexToBufferByTag( + int layer_id, const std::string& tag, int block_id, int partition_count, int partition_id) const; virtual std::shared_ptr incrKVCacheRef(const KVCacheResource& kvcache_resource, const CacheKeysType& cache_keys, - bool is_connector = false) = 0; + bool is_connector = false) = 
0; virtual CacheLayerLayout allLayerCacheBase() const = 0; virtual bool updateKVBlock(const BatchKVCacheResourcePtr& batch_kv_cache_resource, @@ -45,18 +73,38 @@ class KVCacheAllocator { virtual int seqSizePerBlock() const = 0; virtual int singleBatchNeedBlocks(const BatchKVCacheResourcePtr& batch_kv_cache_resource, int seq_len, - int reserve_step) const = 0; + int reserve_step) const = 0; MallocResult malloc(const MallocInfo& malloc_info); - void blockCopy(int src_block_index, int dest_block_index); - void blockBatchCopy(const std::vector& copy_mapping); - void blockBatchCopy(const BlockIdPair* copy_mapping_begin, const BlockIdPair* copy_mapping_end); - void blockBatchCopy(const torch::Tensor& copy_mapping); + virtual void blockCopy(int src_block_index, int dest_block_index); + virtual void blockBatchCopy(const std::vector& copy_mapping); + virtual void blockBatchCopy(const BlockIdPair* copy_mapping_begin, const BlockIdPair* copy_mapping_end); + virtual void blockBatchCopy(const torch::Tensor& copy_mapping); BlockPoolPtr getBlockPool() const { return block_pool_; } + SharedBlockCachePtr sharedBlockCache() const { + return shared_block_cache_; + } + + void setSharedBlockCache(SharedBlockCachePtr shared_block_cache) { + shared_block_cache_ = std::move(shared_block_cache); + } + + void setUseCudaMallocBlockPool(bool use_cuda_malloc_block_pool) { + use_cuda_malloc_block_pool_ = use_cuda_malloc_block_pool; + } + + void setCPSlotMapper(std::shared_ptr cp_slot_mapper) { + cp_slot_mapper_ = std::move(cp_slot_mapper); + } + + std::shared_ptr cpSlotMapper() const { + return cp_slot_mapper_; + } + // Reserve some blocks for already-running streams' future allocations. // Only applied to "init malloc" requests where batch_kv_cache_resource has no blocks yet. 
void setReserveBlockNum(size_t reserve_block_num) { @@ -66,19 +114,23 @@ class KVCacheAllocator { return reserve_block_num_; } - void regUserMr(size_t model_id, std::shared_ptr cache_store = nullptr); - int64_t getMrCostTimeMs() const; - size_t freeBlocksNum() const; - size_t availableBlocksNum() const; - BatchKVCacheResourcePtr popBlocksFromCache(size_t min_blocks_to_free); - void blockCacheFree(const BatchKVCacheResourcePtr& batch_kv_cache_resource); - size_t requestRefBlocksNum() const; - size_t connectorRefBlocksNum() const; - size_t blockCacheRefBlocksNum() const; - size_t notInUseBlocksNum() const; - size_t availableTokensNum() const; - size_t totalBlocksNum() const; - size_t maxAvailableTokensNum() const; + virtual void regUserMr(size_t model_id, std::shared_ptr cache_store = nullptr); + virtual int64_t getMrCostTimeMs() const; + virtual size_t freeBlocksNum() const; + virtual size_t availableBlocksNum() const; + virtual BatchKVCacheResourcePtr popBlocksFromCache(size_t min_blocks_to_free); + virtual void blockCacheFree(const BatchKVCacheResourcePtr& batch_kv_cache_resource); + virtual size_t requestRefBlocksNum() const; + virtual size_t connectorRefBlocksNum() const; + virtual size_t blockCacheRefBlocksNum() const; + virtual size_t notInUseBlocksNum() const; + virtual size_t availableTokensNum() const; + virtual size_t totalTokensNum() const; + virtual size_t totalBlocksNum() const; + virtual size_t maxAvailableTokensNum() const; + virtual KVCacheTokenCapacity tokenCapacity(size_t default_seq_size_per_block) const; + virtual std::vector poolMetricsSnapshots() const; + virtual std::vector independentEvictionGroupIds() const; /// Returns global layer id; std::numeric_limits::max() indicates invalid (caller must check). 
uint32_t convertToGlobalLayerId(size_t model_id, int local_layer_id) const; @@ -88,12 +140,20 @@ class KVCacheAllocator { virtual MallocResult incrMalloc(const MallocInfo& malloc_info) = 0; virtual MallocResult initMallocForCommonLen(const MallocInfo& malloc_info) = 0; virtual int getNeedBlocks(const MallocInfo& malloc_info) const = 0; + virtual void checkCPShardedMallocResult(const MallocInfo&) const {} virtual void decrKVCacheRef(const KVCacheResource& kvcache_resource, bool is_connector = false) = 0; + bool cpShardThisGroupForCapacity(size_t gid) const; + size_t logicalSeqSizePerBlockForCapacity(size_t gid) const; + int cpEffectiveSeqLenForAlloc(size_t gid, int seq_len) const; + int deviceCacheMetricTokensPerBlock() const; CacheConfig config_; AllocationType allocation_type_; BlockPoolPtr block_pool_; - const kmonitor::MetricsReporterPtr metrics_reporter_ = nullptr; + SharedBlockCachePtr shared_block_cache_; + std::shared_ptr cp_slot_mapper_; + const kmonitor::MetricsReporterPtr metrics_reporter_ = nullptr; + bool use_cuda_malloc_block_pool_ = false; size_t reserve_block_num_{0}; int64_t reserve_block_ratio_{0}; diff --git a/rtp_llm/cpp/cache/SingleTypeKVCacheAllocator.cc b/rtp_llm/cpp/cache/allocator/SingleTypeKVCacheAllocator.cc similarity index 65% rename from rtp_llm/cpp/cache/SingleTypeKVCacheAllocator.cc rename to rtp_llm/cpp/cache/allocator/SingleTypeKVCacheAllocator.cc index 3475fe1023..c7bca4db4a 100644 --- a/rtp_llm/cpp/cache/SingleTypeKVCacheAllocator.cc +++ b/rtp_llm/cpp/cache/allocator/SingleTypeKVCacheAllocator.cc @@ -1,4 +1,4 @@ -#include "rtp_llm/cpp/cache/SingleTypeKVCacheAllocator.h" +#include "rtp_llm/cpp/cache/allocator/SingleTypeKVCacheAllocator.h" #include #include @@ -18,15 +18,49 @@ int SingleTypeKVCacheAllocator::getNeedBlocks(const MallocInfo& malloc_info) con const bool reuse_enabled = malloc_info.reuse_cache; const int reuse_blocks_len = reuse_enabled ? 
malloc_info.batch_kv_cache_resource->curBlocksNum() : 0; const int batch_size = malloc_info.batch_kv_cache_resource->batchSize(); - const int seq_len = malloc_info.complete_token_ids->seqLength(); + int seq_len = malloc_info.complete_token_ids->seqLength(); const int reserve_step = malloc_info.complete_token_ids->getReserveStep(); - const int common_seq_len = std::min(malloc_info.complete_token_ids->commonSeqLength(), seq_len); + int common_seq_len = std::min(malloc_info.complete_token_ids->commonSeqLength(), seq_len); + + if (cp_slot_mapper_ && cp_slot_mapper_->isSharded()) { + seq_len = cp_slot_mapper_->effectiveSeqLenForAlloc(seq_len); + common_seq_len = cp_slot_mapper_->effectiveSeqLenForAlloc(common_seq_len); + } const auto need = full_kv_cache_group_->getNeedBlocks(common_seq_len, seq_len, reserve_step, reuse_blocks_len, reuse_enabled); return (batch_size <= 0) ? 0 : (need.common_blocks + batch_size * need.extra_blocks); } +void SingleTypeKVCacheAllocator::checkCPShardedMallocResult(const MallocInfo& malloc_info) const { + if (!cp_slot_mapper_ || !cp_slot_mapper_->isSharded()) { + return; + } + + const auto& kv_resource = malloc_info.batch_kv_cache_resource; + const int seq_len = malloc_info.incrSeqLen(); + const int reserve_step = malloc_info.complete_token_ids->getReserveStep(); + const int effective_seq_len = cp_slot_mapper_->effectiveSeqLenForAlloc(seq_len); + const int expected_blocks = full_kv_cache_group_->needBlocksNum(effective_seq_len, 0, reserve_step); + + for (int batch_id = 0; batch_id < kv_resource->batchSize(); ++batch_id) { + const int actual_blocks = kv_resource->blocksNum(batch_id); + RTP_LLM_CHECK_WITH_INFO(actual_blocks == expected_blocks, + "CP invariant violated: batch=%d blocks=%d != expected_local_blocks=%d " + "(seq_len=%d, effective_seq_len=%d, reserve_step=%d, cp_size=%d, " + "block_size=%d, cacheKeys=%zu)", + batch_id, + actual_blocks, + expected_blocks, + seq_len, + effective_seq_len, + reserve_step, + cp_slot_mapper_->cpSize(), + 
cp_slot_mapper_->blockSize(), + kv_resource->cacheKeys(batch_id).size()); + } +} + SingleTypeKVCacheAllocator::SingleTypeKVCacheAllocator(const CacheConfig& config, AllocationType allocation_type, const kmonitor::MetricsReporterPtr metrics_reporter, @@ -34,8 +68,8 @@ SingleTypeKVCacheAllocator::SingleTypeKVCacheAllocator(const CacheConfig& KVCacheAllocator(config, allocation_type, metrics_reporter, reserve_block_ratio) {} bool SingleTypeKVCacheAllocator::doInit() { - RTP_LLM_CHECK_WITH_INFO(!config_.cache_specs.empty(), "cache specs must not be empty"); - auto& spec = config_.cache_specs[0]; + RTP_LLM_CHECK_WITH_INFO(config_.groupNums() > 0, "cache groups must not be empty"); + auto& spec = config_.specForGroup(0); RTP_LLM_CHECK_WITH_INFO(spec != nullptr, "cache spec[0] is null"); RTP_LLM_CHECK_WITH_INFO(spec->type == rtp_llm::KVCacheSpecType::MultiHeadAttention || spec->type == rtp_llm::KVCacheSpecType::MultiHeadLatentAttention, @@ -44,14 +78,22 @@ bool SingleTypeKVCacheAllocator::doInit() { BlockPoolConfig pool_config; pool_config = BlockPoolConfigHelper::createConfig(config_); - block_pool_ = std::make_shared(pool_config, allocation_type_); + block_pool_ = std::make_shared( + pool_config, allocation_type_, /*use_pinned_cpu_backing=*/false, use_cuda_malloc_block_pool_); if (!block_pool_->init()) { RTP_LLM_LOG_ERROR("Failed to initialize block pool for SingleTypeKVCacheAllocator"); return false; } - std::vector layer_ids(config_.global_layer_ids[0]); - full_kv_cache_group_ = std::make_shared(layer_ids, spec, block_pool_, 0); + SharedBlockCache* shared_cache_raw = shared_block_cache_ ? 
shared_block_cache_.get() : nullptr; + + if (shared_block_cache_) { + std::vector group_pools = {block_pool_}; + shared_block_cache_->init(1, group_pools); + } + + std::vector layer_ids(config_.layerIdsForGroup(0)); + full_kv_cache_group_ = std::make_shared(layer_ids, spec, block_pool_, 0, shared_cache_raw); if (!full_kv_cache_group_->init()) { RTP_LLM_LOG_ERROR("Failed to initialize FullKVCacheGroup"); @@ -68,6 +110,10 @@ MallocResult SingleTypeKVCacheAllocator::initMallocForCommonLen(const MallocInfo int common_seq_len = std::min(malloc_info.complete_token_ids->commonSeqLength(), malloc_info.complete_token_ids->totalSeqLength()); + if (cp_slot_mapper_ && cp_slot_mapper_->isSharded()) { + common_seq_len = cp_slot_mapper_->effectiveSeqLenForAlloc(common_seq_len); + } + const auto& cache_keys = kv_resource->cacheKeys(0); auto& block_ids_0 = kv_resource->mutableBlockIds(0); int64_t match_cost_time_us = 0; @@ -82,12 +128,27 @@ MallocResult SingleTypeKVCacheAllocator::initMallocForCommonLen(const MallocInfo // 2. if the last block is full and matched, the reuse length will be equal to the seq_len, which causes core dump // in computing ops. if (malloc_info.enable_device_cache) { - CacheKeysType match_keys(cache_keys.begin(), cache_keys.empty() ? cache_keys.end() : cache_keys.end() - 1); - auto match_begin_time_us = currentTimeUs(); - auto match_result = full_kv_cache_group_->match(match_keys); - match_cost_time_us = currentTimeUs() - match_begin_time_us; - reuse_len = static_cast(match_result.reuse_length); - reuse_blocks = static_cast(match_result.reuse_blocks); + CacheKeysType match_keys; + if (cp_slot_mapper_ && cp_slot_mapper_->isSharded()) { + // Drop the last virtual-block key (same reasoning as non-CP) to avoid + // a full-len reuse / empty-block crash. Use last-rank stride so all + // ranks share one canonical key namespace. 
+ int cp_size = cp_slot_mapper_->cpSize(); + auto vblock_keys = kv_resource->cacheResource(0).localCacheKeys(cp_size - 1, cp_size); + match_keys.assign(vblock_keys.begin(), vblock_keys.empty() ? vblock_keys.end() : vblock_keys.end() - 1); + } else { + match_keys.assign(cache_keys.begin(), cache_keys.empty() ? cache_keys.end() : cache_keys.end() - 1); + } + auto match_begin_time_us = currentTimeUs(); + MatchResult match_result = full_kv_cache_group_->match(match_keys); + if (cp_slot_mapper_ && cp_slot_mapper_->isSharded()) { + // virtual block ⇒ reuse_length covers cp_size physical blocks of + // tokens; reuse_blocks counts virtual blocks. + match_result.reuse_length = match_result.reuse_blocks * cp_slot_mapper_->virtualBlockSize(); + } + match_cost_time_us = currentTimeUs() - match_begin_time_us; + reuse_len = static_cast(match_result.reuse_length); + reuse_blocks = static_cast(match_result.reuse_blocks); kv_resource->cacheResource(0).setDeviceReuseBlockNum(reuse_blocks); full_kv_cache_group_->reference(block_ids_0, match_result.block_indices); } @@ -128,9 +189,13 @@ MallocResult SingleTypeKVCacheAllocator::incrMalloc(const MallocInfo& malloc_inf auto& kv_resource = malloc_info.batch_kv_cache_resource; int batch_size = kv_resource->batchSize(); int current_blocks = kv_resource->curBlocksNum(); - int seq_len = malloc_info.complete_token_ids->seqLength(); + int seq_len = malloc_info.incrSeqLen(); int reserve_step = malloc_info.complete_token_ids->getReserveStep(); + if (cp_slot_mapper_ && cp_slot_mapper_->isSharded()) { + seq_len = cp_slot_mapper_->effectiveSeqLenForAlloc(seq_len); + } + auto need_blocks = full_kv_cache_group_->needBlocksNum(seq_len, current_blocks, reserve_step); if (need_blocks == 0) { return {true, 0}; @@ -189,24 +254,55 @@ void SingleTypeKVCacheAllocator::free(const FreeInfo& free_info) { void SingleTypeKVCacheAllocator::insertIntoCache(const InsertInfo& insert_info) { auto& kv_resource = insert_info.batch_kv_cache_resource; - int batch_size = 
kv_resource->batchSize(); + if (!shared_block_cache_) { + return; + } - // TODO(chanyin): set batch_size to 1 for now - batch_size = 1; + int batch_size = kv_resource->batchSize(); + batch_size = 1; for (int batch_id = 0; batch_id < batch_size; ++batch_id) { - const auto& cache_keys = kv_resource->cacheKeys(batch_id); - const auto& blocks = kv_resource->blocks(batch_id); + kv_resource->cacheResource(batch_id).ensureLinearBlockDependencies(); + const auto& blocks = kv_resource->blocks(batch_id); + + // Under CP sharding, use the same last-rank-key namespace as match() + // (see initMallocForCommonLen) so the device cache stays consistent + // across ranks without any cross-rank coordination. + CacheKeysType insert_keys; + SharedBlockCache::NamespaceId namespace_id = SharedBlockCache::kGpuLogicalNamespace; + if (cp_slot_mapper_ && cp_slot_mapper_->isSharded()) { + int cp_size = cp_slot_mapper_->cpSize(); + insert_keys = kv_resource->cacheResource(batch_id).localCacheKeys(cp_size - 1, cp_size); + namespace_id = SharedBlockCache::kGpuCpCanonicalNamespace; + } else { + insert_keys = kv_resource->cacheKeys(batch_id); + } + BlockDependenciesType dependencies; + dependencies.reserve(insert_keys.size()); + for (size_t i = 0; i < insert_keys.size(); ++i) { + BlockDependency dependency; + dependency.ordinal = static_cast(i); + if (i > 0) { + dependency.has_parent = true; + dependency.parent_key = insert_keys[i - 1]; + } + dependencies.push_back(dependency); + } - size_t block_num = std::min(size_t(cache_keys.size()), size_t(blocks.size())); + size_t block_num = std::min(size_t(insert_keys.size()), size_t(blocks.size())); if (block_num == 0) { continue; } - CacheKeysType put_cache_keys(cache_keys.begin(), cache_keys.begin() + block_num); - BlockIndicesType put_block_ids(blocks.begin(), blocks.begin() + block_num); - - full_kv_cache_group_->insertIntoCache(put_cache_keys, put_block_ids, insert_info.is_resident); + for (size_t i = block_num; i > 0; --i) { + const size_t idx = i 
- 1; + if (isNullBlockIdx(blocks[idx])) { + continue; + } + std::vector group_slots = {blocks[idx]}; + shared_block_cache_->put( + insert_keys[idx], group_slots, insert_info.is_resident, namespace_id, dependencies[idx]); + } } } @@ -226,10 +322,10 @@ CacheLayerLayout SingleTypeKVCacheAllocator::allLayerCacheBase() const { layout.layers_to_scale_buffer_ptrs[layer_id] = scale_tensors[layer_id]; } } - layout.layer_to_groups.reserve(config_.layer_all_num); - int group_id = full_kv_cache_group_->group_id(); - for (int layed_id = 0; layed_id < config_.layer_all_num; layed_id++) { - layout.layer_to_groups.push_back(group_id); + layout.layer_to_group_ids.resize(config_.layer_all_num); + const int group_id = full_kv_cache_group_->group_id(); + for (int layer_id = 0; layer_id < config_.layer_all_num; ++layer_id) { + layout.layer_to_group_ids[static_cast(layer_id)] = {group_id}; } return layout; } @@ -272,13 +368,15 @@ std::shared_ptr SingleTypeKVCacheAllocator::incrKVCacheRef(cons delete resource; }; std::shared_ptr selected_resource(selected_resource_ptr, deleter); - selected_resource->initGroups( - 1, config_.layer_all_num, config_.layer_to_group_id, config_.kernelBlocksPerKvBlock()); + selected_resource->initGroups(1, config_.layer_all_num, config_.layerGroupIdsSnapshot(), config_.kernelBlocksPerKvBlock()); - CacheKeysType selected_cache_keys; - BlockIndicesType selected_blocks; + CacheKeysType selected_cache_keys; + BlockDependenciesType selected_dependencies; + BlockIndicesType selected_blocks; + BlockIndicesType referenced_blocks; - const auto& src_blocks = kvcache_resource.blocks(0); + const auto& src_blocks = kvcache_resource.blocks(0); + const auto& source_dependencies = kvcache_resource.blockDependencies(); for (auto key : cache_keys) { auto it = key_to_pos.find(key); @@ -286,27 +384,37 @@ std::shared_ptr SingleTypeKVCacheAllocator::incrKVCacheRef(cons continue; } const size_t pos = it->second; - if (pos >= src_blocks.size()) { + const bool preserve_connector_tail 
= is_connector && !kvcache_resource.lastBlockAligned() + && pos + 1 == resource_keys.size() && !selected_cache_keys.empty(); + if (pos >= src_blocks.size() && !preserve_connector_tail) { continue; } - const auto block = src_blocks[pos]; - if (block > 0 && !isNullBlockIdx(block)) { + const auto block = pos < src_blocks.size() ? src_blocks[pos] : NULL_BLOCK_IDX; + if ((block > 0 && !isNullBlockIdx(block)) || preserve_connector_tail) { selected_cache_keys.push_back(key); + selected_dependencies.push_back( + pos < source_dependencies.size() ? + source_dependencies[pos] : + BlockDependency{false, 0, static_cast(selected_dependencies.size())}); selected_blocks.push_back(block); + if (block > 0 && !isNullBlockIdx(block)) { + referenced_blocks.push_back(block); + } } } - if (selected_blocks.empty()) { + if (referenced_blocks.empty()) { return nullptr; } if (is_connector) { - block_pool_->connectorReference(selected_blocks); + block_pool_->connectorReference(referenced_blocks); } else { - block_pool_->requestReference(selected_blocks); + block_pool_->requestReference(referenced_blocks); } selected_resource->mutableBlockIds(0).assign(std::move(selected_blocks)); - selected_resource->cacheKeys() = std::move(selected_cache_keys); + selected_resource->setCacheKeys(std::move(selected_cache_keys)); + selected_resource->setBlockDependencies(std::move(selected_dependencies)); return selected_resource; } @@ -315,7 +423,12 @@ void SingleTypeKVCacheAllocator::decrKVCacheRef(const KVCacheResource& kvcache_r RTP_LLM_CHECK_WITH_INFO( kvcache_resource.groupNums() == 1, "decrKVCacheRef expects groupNums==1, got %d", kvcache_resource.groupNums()); - const auto& blocks_to_free = kvcache_resource.blocks(0); + BlockIndicesType blocks_to_free; + for (auto block : kvcache_resource.blocks(0)) { + if (block > 0 && !isNullBlockIdx(block)) { + blocks_to_free.push_back(block); + } + } if (!blocks_to_free.empty()) { if (is_connector) { block_pool_->connectorFree(blocks_to_free); @@ -380,8 +493,7 @@ 
bool SingleTypeKVCacheAllocator::updateKVBlock(const BatchKVCacheResourcePtr& kv kv_cache_resource->resetAndReturnOldResources(new_batch_size, old_resources); // init for all batch - kv_cache_resource->initGroups( - 1, config_.layer_all_num, config_.layer_to_group_id, config_.kernelBlocksPerKvBlock()); + kv_cache_resource->initGroups(1, config_.layer_all_num, config_.layerGroupIdsSnapshot(), config_.kernelBlocksPerKvBlock()); for (int new_batch_idx = 0; new_batch_idx < new_batch_size; ++new_batch_idx) { const int old_batch_idx = block_src_batch[new_batch_idx]; @@ -420,7 +532,10 @@ int SingleTypeKVCacheAllocator::seqSizePerBlock() const { int SingleTypeKVCacheAllocator::singleBatchNeedBlocks(const BatchKVCacheResourcePtr& batch_kv_cache_resource, int seq_len, int reserve_step) const { - return full_kv_cache_group_->needBlocksNum(seq_len, 0, reserve_step); + (void)batch_kv_cache_resource; + const int effective_seq_len = + (cp_slot_mapper_ && cp_slot_mapper_->isSharded()) ? cp_slot_mapper_->effectiveSeqLenForAlloc(seq_len) : seq_len; + return full_kv_cache_group_->needBlocksNum(effective_seq_len, 0, reserve_step); } } // namespace rtp_llm diff --git a/rtp_llm/cpp/cache/SingleTypeKVCacheAllocator.h b/rtp_llm/cpp/cache/allocator/SingleTypeKVCacheAllocator.h similarity index 90% rename from rtp_llm/cpp/cache/SingleTypeKVCacheAllocator.h rename to rtp_llm/cpp/cache/allocator/SingleTypeKVCacheAllocator.h index 28adcfe212..b0b3d39b18 100644 --- a/rtp_llm/cpp/cache/SingleTypeKVCacheAllocator.h +++ b/rtp_llm/cpp/cache/allocator/SingleTypeKVCacheAllocator.h @@ -1,12 +1,12 @@ #pragma once #include -#include "rtp_llm/cpp/cache/KVCacheAllocator.h" -#include "rtp_llm/cpp/cache/FullKVCacheGroup.h" +#include "rtp_llm/cpp/cache/allocator/KVCacheAllocator.h" +#include "rtp_llm/cpp/cache/group/FullKVCacheGroup.h" namespace rtp_llm { -// SingleTypedKVCacheAllocator is used for model with full attentions only +// SingleTypeKVCacheAllocator is used for model with full attentions only 
class SingleTypeKVCacheAllocator: public KVCacheAllocator, public std::enable_shared_from_this { @@ -42,6 +42,7 @@ class SingleTypeKVCacheAllocator: MallocResult incrMalloc(const MallocInfo& malloc_info) override; MallocResult initMallocForCommonLen(const MallocInfo& malloc_info) override; int getNeedBlocks(const MallocInfo& malloc_info) const override; + void checkCPShardedMallocResult(const MallocInfo& malloc_info) const override; void decrKVCacheRef(const KVCacheResource& kvcache_resource, bool is_connector = false) override; private: diff --git a/rtp_llm/cpp/cache/config_creator/CacheConfigCreator.cc b/rtp_llm/cpp/cache/config_creator/CacheConfigCreator.cc new file mode 100644 index 0000000000..4bf86b3ed0 --- /dev/null +++ b/rtp_llm/cpp/cache/config_creator/CacheConfigCreator.cc @@ -0,0 +1,317 @@ +#include "rtp_llm/cpp/cache/config_creator/CacheConfigCreator.h" + +#include +#include + +#include "rtp_llm/cpp/cache/config_creator/HybridPoolConfigCreator.h" +#include "rtp_llm/cpp/cache/config_creator/HybridConfigCreator.h" +#include "rtp_llm/cpp/cache/config_creator/MemoryEvaluationHelper.h" +#include "rtp_llm/cpp/cache/config_creator/SingleConfigCreator.h" +#include "rtp_llm/cpp/cache/spec/KVCacheSpecDesc.h" +#include "rtp_llm/cpp/utils/Logger.h" +#include "rtp_llm/cpp/utils/AssertUtils.h" + +namespace rtp_llm { + +namespace { + +size_t steppedBytes(size_t bytes, int step) { + return (bytes > 0 && step > 1) ? bytes / static_cast(step) : bytes; +} + +size_t nonExplicitFixedPoolHbmBytes(const CacheConfig& config) { + // Only independent-pool configs use per-group HBM accounting; SingleConfig + // and HybridConfig leave use_independent_block_pools false. 
+ if (!config.use_independent_block_pools) { + return 0; + } + + size_t bytes = 0; + for (size_t gid = 0; gid < static_cast(config.groupNums()); ++gid) { + const auto& spec = config.specForGroup(gid); + if (spec == nullptr || !spec->isFixedCache()) { + continue; + } + if (!config.usesExplicitIndependentBlocks(gid)) { + bytes += config.blockSizeBytesForGroup(gid); + } + } + return bytes; +} + +size_t effectivePagedBlockBytes(const CacheConfig& config, int step) { + return config.block_size_bytes + steppedBytes(nonExplicitFixedPoolHbmBytes(config), step); +} + +void setupKernelSeqSize(CacheConfig& config, const KVCacheConfig& kv_cache_config, const char* config_name) { + if (kv_cache_config.kernel_seq_size_per_block > 0) { + const auto kernel_seq_size_per_block = static_cast(kv_cache_config.kernel_seq_size_per_block); + // Generic divisibility check. Desc-based hybrid pool layouts validate + // their own stricter alignment during createBasicConfig(). + RTP_LLM_CHECK_WITH_INFO(config.seq_size_per_block % kernel_seq_size_per_block == 0, + "%s seq_size_per_block(%zu) must be divisible by kernel_seq_size_per_block(%zu)", + config_name, + config.seq_size_per_block, + kernel_seq_size_per_block); + config.kernel_seq_size_per_block = kernel_seq_size_per_block; + } else if (config.kernel_seq_size_per_block == 0 || config.kernel_seq_size_per_block == config.seq_size_per_block) { + config.kernel_seq_size_per_block = config.seq_size_per_block; + } +} + +uint32_t computeBlockNum(CacheConfig& config, + const ModelConfig& model_config, + const RuntimeConfig& runtime_config, + const KVCacheConfig& kv_cache_config, + const ParallelismConfig& parallelism_config, + const std::optional& warm_up_result, + const std::optional& sp_config) { + if (kv_cache_config.test_block_num > 0) { + RTP_LLM_LOG_INFO("KVCacheConfig explicitly specified kv cache block num %d", kv_cache_config.test_block_num); + config.finalizeBlockNums(kv_cache_config.test_block_num, runtime_config); + return 
static_cast(kv_cache_config.test_block_num); + } + + const auto kv_cache_mem_size = MemoryEvaluationHelper::getKVCacheMemorySize( + runtime_config, kv_cache_config, model_config, parallelism_config, warm_up_result, sp_config); + // Explicitly-sized pool reservation depends on runtime scheduler limits, + // so finalize it here after RuntimeConfig is available. + config.finalizeBlockNums(0, runtime_config); + + size_t paged_budget = kv_cache_mem_size; + if (config.explicitly_sized_pool_reserve_bytes > 0) { + RTP_LLM_CHECK_WITH_INFO(kv_cache_mem_size > config.explicitly_sized_pool_reserve_bytes, + "kv cache budget %zu MiB is smaller than explicitly-sized pool reservation %zu MiB " + "(reduce explicitly sized pool blocks if needed)", + kv_cache_mem_size / 1024 / 1024, + config.explicitly_sized_pool_reserve_bytes / 1024 / 1024); + paged_budget = kv_cache_mem_size - config.explicitly_sized_pool_reserve_bytes; + RTP_LLM_LOG_INFO("kv cache: total budget %zu MiB, explicitly-sized pool reserve %zu MiB, paged budget %zu MiB", + kv_cache_mem_size / 1024 / 1024, + config.explicitly_sized_pool_reserve_bytes / 1024 / 1024, + paged_budget / 1024 / 1024); + } + const int joint_step = std::max(1, config.linear_step); + return static_cast(paged_budget / effectivePagedBlockBytes(config, joint_step)); +} + +} // namespace + +LayerKVCacheSpecs CacheConfigCreator::buildLayerSpecsFromDescs(const LayerKVCacheSpecDescs& layer_descs, + const SpecBuildContext& ctx, + int64_t expected_layer_num) { + RTP_LLM_CHECK_WITH_INFO(layer_descs.size() == static_cast(expected_layer_num), + "kv_cache_spec_descs size %zu != num_layers %ld", + layer_descs.size(), + expected_layer_num); + LayerKVCacheSpecs layer_specs(layer_descs.size()); + for (size_t layer_id = 0; layer_id < layer_descs.size(); ++layer_id) { + const auto& descs = layer_descs[layer_id]; + RTP_LLM_CHECK_WITH_INFO(!descs.empty(), "kv_cache_spec_descs layer %zu has no descs", layer_id); + auto& specs = layer_specs[layer_id]; + 
specs.reserve(descs.size()); + for (const auto& desc : descs) { + specs.push_back(SpecBuilder::build(desc, ctx)); + } + } + return layer_specs; +} + +CacheConfig CacheConfigCreator::createBasicConfig(const ModelConfig& model_config, + const ParallelismConfig& parallelism_config, + const KVCacheConfig& kv_cache_config, + bool is_mtp, + int gen_num_per_cycle) { + // Routing priority: + // 1. enable_independent_kv_cache_pools=true → HybridPool (independent BlockPool per group) + // 2. enable_hybrid_attention=true → HybridType (shared BlockPool across groups) + // 3. else → Single (standard MHA/MLA path) + if (model_config.hybrid_attention_config.enable_independent_kv_cache_pools) { + return HybridPoolConfigCreator::createConfig( + model_config, parallelism_config, kv_cache_config, is_mtp, gen_num_per_cycle); + } else if (model_config.hybrid_attention_config.enable_hybrid_attention) { + return HybridConfigCreator::createHybridConfig( + model_config, parallelism_config, kv_cache_config, is_mtp, gen_num_per_cycle); + } else { + return SingleConfigCreator::createSingleConfig( + model_config, parallelism_config, kv_cache_config, is_mtp, gen_num_per_cycle); + } +} + +CacheConfig CacheConfigCreator::createConfig(const ModelConfig& model_config, + const ParallelismConfig& parallelism_config, + const RuntimeConfig& runtime_config, + const KVCacheConfig& kv_cache_config, + const std::optional& warm_up_result, + const std::optional& sp_config) { + CacheConfig config = + CacheConfigCreator::createBasicConfig(model_config, parallelism_config, kv_cache_config, false, 0); + + config.linear_step = kv_cache_config.linear_step; + setupKernelSeqSize(config, kv_cache_config, "cache"); + + uint32_t block_num = computeBlockNum(config, model_config, runtime_config, kv_cache_config, + parallelism_config, warm_up_result, sp_config); + RTP_LLM_CHECK_WITH_INFO(block_num > 0, + "kv cache needs at least 1 block but %ld, each block needs %ld MiB memory", + block_num, + 
static_cast(config.block_size_bytes / 1024 / 1024)); + + const auto kv_cache_seq_len = static_cast(block_num) * config.seq_size_per_block; + config.block_num = static_cast(block_num); + config.finalizeBlockNums(block_num, runtime_config); + RTP_LLM_LOG_INFO("kv cache block nums is %u, allows storing %ld tokens", block_num, kv_cache_seq_len); + if (kv_cache_seq_len < model_config.max_seq_len) { + RTP_LLM_LOG_WARNING("kv cache block nums %u can only store %ld tokens, less than max_seq_len %ld, " + "this is dangerous, consider decrease max_seq_len", + block_num, + kv_cache_seq_len, + model_config.max_seq_len); + } + return config; +} + +CacheConfig CacheConfigCreator::createSpConfig(const ModelConfig& score_model_config, + const ModelConfig& propose_model_config, + const ParallelismConfig& parallelism_config, + const RuntimeConfig& runtime_config, + const KVCacheConfig& kv_cache_config, + const SpeculativeExecutionConfig& sp_config, + const std::optional& warm_up_result, + bool is_mtp, + bool is_eagle) { + CacheConfig score_config = CacheConfigCreator::createBasicConfig( + score_model_config, parallelism_config, kv_cache_config, false, sp_config.gen_num_per_cycle); + CacheConfig propose_config = CacheConfigCreator::createBasicConfig( + propose_model_config, parallelism_config, kv_cache_config, is_mtp, sp_config.gen_num_per_cycle); + + setupKernelSeqSize(score_config, kv_cache_config, "score"); + setupKernelSeqSize(propose_config, kv_cache_config, "propose"); + + int num_mtp_modules = 1; + if (is_mtp) { + num_mtp_modules = sp_config.gen_num_per_cycle; + if (is_eagle) { + num_mtp_modules = 1; + } + } + + // Fixed-pool block counts depend on runtime scheduler limits. Finalize the + // score and propose configs before sizing the shared paged budget so fixed + // state pools are accounted outside the paged KV-cache block budget. 
+ score_config.finalizeBlockNums(0, runtime_config); + propose_config.finalizeBlockNums(0, runtime_config); + + uint32_t total_layer_num = score_config.layer_num; + for (int i = 0; i < num_mtp_modules; ++i) { + total_layer_num += propose_config.layer_num; + } + + size_t total_block_size_bytes = score_config.block_size_bytes; + for (int i = 0; i < num_mtp_modules; ++i) { + total_block_size_bytes += propose_config.block_size_bytes; + } + + const size_t explicit_pool_reserve = + score_config.explicitly_sized_pool_reserve_bytes + + propose_config.explicitly_sized_pool_reserve_bytes * static_cast(num_mtp_modules); + + size_t block_num = 0; + if (kv_cache_config.test_block_num > 0) { + block_num = kv_cache_config.test_block_num; + } else { + const auto kv_cache_mem_size = MemoryEvaluationHelper::getKVCacheMemorySize( + runtime_config, kv_cache_config, score_model_config, parallelism_config, warm_up_result, sp_config); + + size_t paged_budget = kv_cache_mem_size; + if (explicit_pool_reserve > 0) { + RTP_LLM_CHECK_WITH_INFO(kv_cache_mem_size > explicit_pool_reserve, + "sp kv cache budget %zu MiB is smaller than explicitly-sized pool reservation %zu MiB " + "(reduce explicitly sized pool blocks if needed)", + kv_cache_mem_size / 1024 / 1024, + explicit_pool_reserve / 1024 / 1024); + paged_budget = kv_cache_mem_size - explicit_pool_reserve; + RTP_LLM_LOG_INFO( + "sp kv cache: total budget %zu MiB, explicitly-sized pool reserve %zu MiB (score=%zu MiB + propose=%zu MiB x %d), paged budget %zu MiB", + kv_cache_mem_size / 1024 / 1024, + explicit_pool_reserve / 1024 / 1024, + score_config.explicitly_sized_pool_reserve_bytes / 1024 / 1024, + propose_config.explicitly_sized_pool_reserve_bytes / 1024 / 1024, + num_mtp_modules, + paged_budget / 1024 / 1024); + } + + const int joint_step = std::max(1, kv_cache_config.linear_step); + auto effective_size = [&](const CacheConfig& cfg) -> size_t { + return effectivePagedBlockBytes(cfg, joint_step); + }; + block_num = + paged_budget + / 
(effective_size(score_config) + effective_size(propose_config) * static_cast(num_mtp_modules)); + } + + RTP_LLM_CHECK_WITH_INFO(block_num > 0, "kv cache needs at least 1 block but %zu", block_num); + + CacheConfig config = score_config; + config.linear_step = std::max(1, kv_cache_config.linear_step); + config.layer_all_num = total_layer_num; + config.block_size_bytes = total_block_size_bytes; + config.block_num = block_num; + config.explicitly_sized_pool_reserve_bytes = explicit_pool_reserve; + + const uint32_t main_layer_num = score_config.layer_num; + const uint32_t mtp_layer_num = propose_config.layer_num; + + // Each sub-model needs an independent CacheConfig because global_layer_ids differs per module. + config.mtp_sub_configs.clear(); + config.mtp_sub_configs.reserve(num_mtp_modules); + config.resizeLayerRoutes(static_cast(total_layer_num)); + config.layer_to_block_stride_bytes.assign(static_cast(total_layer_num), 0); + + // Main(score) model per-layer stride (kv + scale). + // This is expected to be fully populated by createBasicConfig() (Single/Hybrid creators). 
+ const size_t score_layers = static_cast(main_layer_num); + RTP_LLM_CHECK_WITH_INFO(score_config.layer_to_block_stride_bytes.size() == score_layers, + "score_config.layer_to_block_stride_bytes size mismatch, got=%zu need=%zu", + score_config.layer_to_block_stride_bytes.size(), + score_layers); + for (size_t l = 0; l < score_layers; ++l) { + config.layer_to_block_stride_bytes[l] = score_config.layer_to_block_stride_bytes[l]; + } + + for (int m = 0; m < num_mtp_modules; ++m) { + RTP_LLM_CHECK_WITH_INFO(propose_config.layer_to_block_stride_bytes.size() == static_cast(mtp_layer_num), + "sub_cfg.layer_to_block_stride_bytes size mismatch, got=%zu need=%u", + propose_config.layer_to_block_stride_bytes.size(), + mtp_layer_num); + auto sub_cfg = config.mergeMTPModule(propose_config, m, main_layer_num); + sub_cfg->finalizeBlockNums(static_cast(block_num), runtime_config); + config.mtp_sub_configs.push_back(sub_cfg); + } + + config.finalizeBlockNums(static_cast(block_num), runtime_config); + config.explicitly_sized_pool_reserve_bytes = explicit_pool_reserve; + + const auto kv_cache_seq_len = static_cast(block_num) * config.seq_size_per_block; + RTP_LLM_LOG_INFO("CacheConfig created: is_mtp=%d, total_layers=%u, num_mtp_modules=%d, block_num=%zu, " + "allows storing %zu tokens, total_block_size=%zu bytes (main=%zu + %d*propose=%zu)", + is_mtp, + total_layer_num, + num_mtp_modules, + block_num, + kv_cache_seq_len, + total_block_size_bytes, + score_config.block_size_bytes, + num_mtp_modules, + propose_config.block_size_bytes); + + RTP_LLM_LOG_INFO("CacheConfig debugString(main_score_model):\n%s", score_config.debugString().c_str()); + for (size_t i = 0; i < config.mtp_sub_configs.size(); ++i) { + const auto& sub = config.mtp_sub_configs[i]; + RTP_LLM_LOG_INFO("CacheConfig debugString(sub_propose_model[%zu]):\n%s", i, sub->debugString().c_str()); + } + + return config; +} + +} // namespace rtp_llm diff --git a/rtp_llm/cpp/cache/CacheConfigCreator.h 
b/rtp_llm/cpp/cache/config_creator/CacheConfigCreator.h similarity index 76% rename from rtp_llm/cpp/cache/CacheConfigCreator.h rename to rtp_llm/cpp/cache/config_creator/CacheConfigCreator.h index fc52e975ed..f3abe53fa6 100644 --- a/rtp_llm/cpp/cache/CacheConfigCreator.h +++ b/rtp_llm/cpp/cache/config_creator/CacheConfigCreator.h @@ -6,6 +6,7 @@ #include "absl/status/statusor.h" #include "rtp_llm/cpp/cache/CacheConfig.h" #include "rtp_llm/cpp/cache/WarmUpResult.h" +#include "rtp_llm/cpp/cache/spec/KVCacheSpecDesc.h" #include "rtp_llm/cpp/config/ConfigModules.h" #include "rtp_llm/cpp/config/ModelConfig.h" @@ -15,7 +16,9 @@ class CacheConfigCreator { public: static CacheConfig createBasicConfig(const ModelConfig& model_config, const ParallelismConfig& parallelism_config, - bool is_mtp = false); + const KVCacheConfig& kv_cache_config, + bool is_mtp, + int gen_num_per_cycle); static CacheConfig createConfig(const ModelConfig& model_config, const ParallelismConfig& parallelism_config, const RuntimeConfig& runtime_config, @@ -32,15 +35,12 @@ class CacheConfigCreator { bool is_mtp, bool is_eagle); -private: - // Removed functions moved to MemoryEvaluationHelper: - // getDefaultRuntimeMemorySize - // getKVCacheMemorySize + // Unified desc->spec conversion. Callers provide the runtime build context; + // descs remain read-only. 
+ static LayerKVCacheSpecs buildLayerSpecsFromDescs(const LayerKVCacheSpecDescs& layer_descs, + const SpecBuildContext& ctx, + int64_t expected_layer_num); - // Removed functions moved to dedicated creators: - // createSingleConfig - // createHybridConfig - // splitIntoGroups (moved to HybridConfigCreator) }; } // namespace rtp_llm diff --git a/rtp_llm/cpp/cache/config_creator/HybridConfigCreator.cc b/rtp_llm/cpp/cache/config_creator/HybridConfigCreator.cc new file mode 100644 index 0000000000..93830012fb --- /dev/null +++ b/rtp_llm/cpp/cache/config_creator/HybridConfigCreator.cc @@ -0,0 +1,305 @@ +#include "rtp_llm/cpp/cache/config_creator/HybridConfigCreator.h" + +#include + +#include "rtp_llm/cpp/cache/spec/KVCacheSpec.h" +#include "rtp_llm/cpp/cache/config_creator/CacheConfigCreator.h" +#include "rtp_llm/cpp/cache/config_creator/MemoryEvaluationHelper.h" + +namespace rtp_llm { + +std::vector> HybridConfigCreator::splitIntoGroups(const std::vector& ids, int group_layer_num) { + std::vector> groups; + if (ids.empty()) { + return groups; + } + const int n = static_cast(ids.size()); + const int s = std::max(group_layer_num, 1); + groups.reserve((n + s - 1) / s); + for (int i = 0; i < n; i += s) { + const int end = std::min(i + s, n); + groups.emplace_back(ids.begin() + i, ids.begin() + end); + } + return groups; +} + +int HybridConfigCreator::calculateGroupLayerNum(int linear_layer_count, int full_layer_count) { + int group_layer_num = 0; + if (linear_layer_count > 0 && full_layer_count > 0) { + group_layer_num = std::gcd(linear_layer_count, full_layer_count); + } else { + group_layer_num = std::max(linear_layer_count, full_layer_count); + } + group_layer_num = std::max(group_layer_num, 1); + return group_layer_num; +} + +std::pair, std::vector> +HybridConfigCreator::splitLayersByAttentionType(const ModelConfig& model_config) { + int64_t layer_num = model_config.num_layers; + RTP_LLM_CHECK_WITH_INFO(layer_num > 0, "invalid model_config.num_layers=%ld", layer_num); 
+ + std::vector linear_layers; + std::vector full_layers; + linear_layers.reserve(layer_num); + full_layers.reserve(layer_num); + + const auto& types = model_config.hybrid_attention_config.hybrid_attention_types; + RTP_LLM_CHECK_WITH_INFO(types.size() == static_cast(layer_num), + "hybrid_attention_types size %zu != num_layers %ld", + types.size(), + layer_num); + for (int i = 0; i < static_cast(layer_num); ++i) { + if (types[static_cast(i)] == HybridAttentionType::LINEAR) { + linear_layers.push_back(i); + } else { + full_layers.push_back(i); + } + } + + return std::make_pair(std::move(linear_layers), std::move(full_layers)); +} + +CacheConfig HybridConfigCreator::initializeConfig(const ModelConfig& model_config, + const std::vector& linear_layers, + const std::vector& full_layers, + rtp_llm::DataType dtype) { + int64_t layer_num = model_config.num_layers; + + CacheConfig config; + config.layer_num = static_cast(layer_num); + config.layer_all_num = static_cast(layer_num); + config.block_num = 0; + config.seq_size_per_block = static_cast(model_config.attn_config.tokens_per_block); + config.use_mla = model_config.attn_config.use_mla; + config.dtype = dtype; + config.linear_step = 1; + + return config; +} + +KVCacheSpecPtr HybridConfigCreator::getSpecFromLayers(const LayerKVCacheSpecs& runtime_specs, + const std::vector& layer_ids, + const char* spec_role) { + KVCacheSpecPtr result; + std::string fingerprint; + for (int layer_id : layer_ids) { + RTP_LLM_CHECK_WITH_INFO(static_cast(layer_id) < runtime_specs.size() + && !runtime_specs[static_cast(layer_id)].empty(), + "missing runtime kv_cache specs for %s layer %d", + spec_role, + layer_id); + RTP_LLM_CHECK_WITH_INFO(runtime_specs[static_cast(layer_id)].size() == 1, + "%s layer %d must have exactly one runtime kv_cache spec, got %zu", + spec_role, + layer_id, + runtime_specs[static_cast(layer_id)].size()); + const auto& spec = runtime_specs[static_cast(layer_id)][0]; + RTP_LLM_CHECK_WITH_INFO(spec != nullptr, "%s layer 
%d has null kv_cache spec", spec_role, layer_id); + if (result == nullptr) { + result = spec; + fingerprint = spec->fingerprint(); + } else { + RTP_LLM_CHECK_WITH_INFO(fingerprint == spec->fingerprint(), + "%s layers have different kv_cache spec fingerprints", + spec_role); + } + } + RTP_LLM_CHECK_WITH_INFO(result != nullptr, "no %s layers found", spec_role); + return result->clone(); +} + +void HybridConfigCreator::prepareFullAttentionSpec(KVCacheSpecPtr spec, + const ModelConfig& model_config, + const ParallelismConfig& parallelism_config, + rtp_llm::DataType dtype, + uint32_t layer_num) { + if (model_config.attn_config.use_mla && model_config.mla_ops_type != rtp_llm::MlaOpsType::MHA) { + auto* mla_spec = dynamic_cast(spec.get()); + RTP_LLM_CHECK_WITH_INFO(mla_spec != nullptr && spec->type == KVCacheSpecType::MultiHeadLatentAttention, + "full kv_cache spec must be MLAKVCacheSpec for MLA model"); + // local_head_num_kv is already set to 1 by Python-side MLAKVCacheSpec default. + // kv_lora_rank, rope_head_dim, seq_size_per_block are already populated by Python. + } else { + auto* mha_spec = dynamic_cast(spec.get()); + RTP_LLM_CHECK_WITH_INFO(mha_spec != nullptr && spec->type == KVCacheSpecType::MultiHeadAttention, + "full kv_cache spec must be MHAKVCacheSpec for MHA/GQA model"); + // local_head_num_kv depends on TP and cannot be provided by Python-side spec. + spec->local_head_num_kv = static_cast( + (model_config.attn_config.kv_head_num % parallelism_config.get_attn_tp_size() == 0) ? + model_config.attn_config.kv_head_num / parallelism_config.get_attn_tp_size() : + model_config.attn_config.kv_head_num + / std::gcd(model_config.attn_config.kv_head_num, parallelism_config.get_attn_tp_size())); + // size_per_head, seq_size_per_block are already populated by Python. + } + // dtype depends on runtime quantization config and cannot be provided by Python-side spec. 
+ spec->dtype = dtype; +} + +void HybridConfigCreator::prepareLinearAttentionSpec(KVCacheSpecPtr spec, + const ModelConfig& model_config, + const ParallelismConfig& parallelism_config, + rtp_llm::DataType dtype, + uint32_t layer_num) { + auto* linear_spec = dynamic_cast(spec.get()); + RTP_LLM_CHECK_WITH_INFO(linear_spec != nullptr && spec->type == KVCacheSpecType::LinearAttention, + "linear kv_cache spec must be LinearKVCacheSpec"); + + const auto& linear_config = model_config.linear_attention_config; + RTP_LLM_CHECK_WITH_INFO(linear_config.linear_key_head_dim > 0 && linear_config.linear_value_head_dim > 0, + "invalid linear head dim"); + RTP_LLM_CHECK_WITH_INFO(linear_config.linear_conv_kernel_dim > 1, + "invalid linear_conv_kernel_dim=%d", + linear_config.linear_conv_kernel_dim); + RTP_LLM_CHECK_WITH_INFO(linear_config.linear_num_key_heads > 0 && linear_config.linear_num_value_heads > 0, + "invalid linear heads"); + RTP_LLM_CHECK_WITH_INFO(linear_config.linear_key_head_dim == linear_config.linear_value_head_dim, + "linear head dims must match (current impl): k=%d v=%d", + linear_config.linear_key_head_dim, + linear_config.linear_value_head_dim); + + // local_num_k_heads, local_num_v_heads, and local_head_num_kv depend on TP + // and cannot be provided by Python-side spec. + const int tp = std::max(1, static_cast(parallelism_config.get_attn_tp_size())); + linear_spec->local_num_k_heads = static_cast(linear_config.linear_num_key_heads / tp); + linear_spec->local_num_v_heads = static_cast(linear_config.linear_num_value_heads / tp); + RTP_LLM_CHECK_WITH_INFO(linear_spec->local_num_k_heads > 0 && linear_spec->local_num_v_heads > 0, + "invalid local heads for linear attention: k=%d v=%d tp=%d", + linear_spec->local_num_k_heads, + linear_spec->local_num_v_heads, + tp); + spec->local_head_num_kv = static_cast(std::max( + 1, + (linear_config.linear_num_value_heads > 1) ? 
+ static_cast(linear_config.linear_num_value_heads / parallelism_config.get_attn_tp_size()) : + static_cast(linear_config.linear_num_value_heads))); + // dtype depends on runtime quantization config and cannot be provided by Python-side spec. + spec->dtype = dtype; + // seq_size_per_block, head_k_dim, head_v_dim, conv_kernel_dim, + // ssm_state_dtype, conv_state_dtype are already populated by Python. +} + +std::pair>, std::vector>> HybridConfigCreator::createLayerGroups( + const std::vector& linear_layers, const std::vector& full_layers, int& group_layer_num) { + const int linear_cnt = static_cast(linear_layers.size()); + const int full_cnt = static_cast(full_layers.size()); + group_layer_num = HybridConfigCreator::calculateGroupLayerNum(linear_cnt, full_cnt); + + const auto linear_groups = HybridConfigCreator::splitIntoGroups(linear_layers, group_layer_num); + const auto full_groups = HybridConfigCreator::splitIntoGroups(full_layers, group_layer_num); + + return std::make_pair(std::move(linear_groups), std::move(full_groups)); +} + +void HybridConfigCreator::setupCacheConfigSpecs(CacheConfig& config, + const std::vector>& linear_groups, + const std::vector>& full_groups, + const KVCacheSpecPtr& linear_spec, + const KVCacheSpecPtr& full_spec) { + std::vector groups; + std::vector layers(static_cast(config.layer_num)); + + auto append_group = [&](const KVCacheSpecPtr& spec, CacheGroupType type, const std::vector& layer_ids) { + GroupBase group; + group.spec = spec; + group.policy = defaultCacheGroupPolicy(type); + group.layer_ids = layer_ids; + const int gid = static_cast(groups.size()); + groups.push_back(group); + for (int layer_id : layer_ids) { + auto& layer = layers[static_cast(layer_id)]; + layer.group_ids.push_back(gid); + layer.tag_to_gid[spec->tag] = gid; + } + }; + + // Keep order: all full groups first, then linear groups. 
+ for (const auto& g : full_groups) { + append_group(full_spec, CacheGroupType::FULL, g); + } + for (const auto& g : linear_groups) { + append_group(linear_spec, CacheGroupType::LINEAR, g); + } + config.setTopology(std::move(groups), std::move(layers)); +} + +void HybridConfigCreator::setupPhysicalSizes(CacheConfig& config, + const KVCacheSpecPtr& full_spec, + const KVCacheSpecPtr& linear_spec) { + // Decide the physical KV block/scale sizes by taking max between full and linear specs. + const size_t full_kv_block_stride_bytes = full_spec->block_size_bytes(); + const size_t linear_kv_block_stride_bytes = linear_spec->block_size_bytes(); + + // now we only support that linear attention block have padding + RTP_LLM_CHECK_WITH_INFO(full_kv_block_stride_bytes >= linear_kv_block_stride_bytes, + "not support full attention with padding now"); + + config.kv_block_stride_bytes = full_kv_block_stride_bytes; + config.kv_block_size_bytes = static_cast(config.group_layer_num) * config.kv_block_stride_bytes; + config.kv_scale_stride_bytes = full_spec->scale_block_size_bytes(); + config.kv_scale_size_bytes = static_cast(config.group_layer_num) * config.kv_scale_stride_bytes; + config.block_size_bytes = config.kv_block_size_bytes + config.kv_scale_size_bytes; +} + +CacheConfig HybridConfigCreator::createHybridConfig(const ModelConfig& model_config, + const ParallelismConfig& parallelism_config, + const KVCacheConfig& kv_cache_config, + bool is_mtp, + int gen_num_per_cycle) { + (void)is_mtp; + auto dtype = MemoryEvaluationHelper::getDataTypeForCache(model_config); + const auto physical_tokens_per_block = + kv_cache_config.seq_size_per_block > 0 ? static_cast(kv_cache_config.seq_size_per_block) : + static_cast(model_config.attn_config.tokens_per_block); + const auto kernel_tokens_per_block = + kv_cache_config.kernel_seq_size_per_block > 0 ? 
static_cast(kv_cache_config.kernel_seq_size_per_block) : + physical_tokens_per_block; + RTP_LLM_CHECK_WITH_INFO(physical_tokens_per_block > 0, "hybrid seq_size_per_block must be > 0"); + RTP_LLM_CHECK_WITH_INFO(kernel_tokens_per_block > 0, "hybrid kernel_seq_size_per_block must be > 0"); + SpecBuildContext ctx; + ctx.dtype = dtype; + ctx.seq_size_per_block = physical_tokens_per_block; + ctx.attn_tp_size = static_cast(parallelism_config.get_attn_tp_size()); + ctx.kernel_tokens_per_block = kernel_tokens_per_block; + ctx.gen_num_per_cycle = static_cast(gen_num_per_cycle); + const auto runtime_specs = + CacheConfigCreator::buildLayerSpecsFromDescs(model_config.kv_cache_spec_descs, ctx, model_config.num_layers); + + // Split layers by attention type + auto [linear_layers, full_layers] = HybridConfigCreator::splitLayersByAttentionType(model_config); + + // Initialize config + CacheConfig config = HybridConfigCreator::initializeConfig(model_config, linear_layers, full_layers, dtype); + config.seq_size_per_block = physical_tokens_per_block; + config.kernel_seq_size_per_block = kernel_tokens_per_block; + + auto full_spec = HybridConfigCreator::getSpecFromLayers(runtime_specs, full_layers, "full attention"); + auto linear_spec = HybridConfigCreator::getSpecFromLayers(runtime_specs, linear_layers, "linear attention"); + + // Create layer groups and calculate group layer number + int group_layer_num = 0; + auto [linear_groups, full_groups] = + HybridConfigCreator::createLayerGroups(linear_layers, full_layers, group_layer_num); + config.group_layer_num = group_layer_num; + + HybridConfigCreator::prepareFullAttentionSpec( + full_spec, model_config, parallelism_config, dtype, static_cast(full_layers.size())); + HybridConfigCreator::prepareLinearAttentionSpec( + linear_spec, model_config, parallelism_config, dtype, static_cast(linear_layers.size())); + + // Setup cache config specs + HybridConfigCreator::setupCacheConfigSpecs(config, linear_groups, full_groups, linear_spec, 
full_spec); + + // Setup physical sizes + HybridConfigCreator::setupPhysicalSizes(config, full_spec, linear_spec); + + // Per-layer block stride (kv + scale). + // For hybrid attention, the physical per-layer stride follows the selected physical layout stride. + const size_t per_layer_stride_bytes = config.kv_block_stride_bytes + config.kv_scale_stride_bytes; + config.layer_to_block_stride_bytes.assign(static_cast(config.layer_all_num), + static_cast(per_layer_stride_bytes)); + + return config; +} + +} // namespace rtp_llm diff --git a/rtp_llm/cpp/cache/HybridConfigCreator.h b/rtp_llm/cpp/cache/config_creator/HybridConfigCreator.h similarity index 59% rename from rtp_llm/cpp/cache/HybridConfigCreator.h rename to rtp_llm/cpp/cache/config_creator/HybridConfigCreator.h index c8cf684d01..e4542a373e 100644 --- a/rtp_llm/cpp/cache/HybridConfigCreator.h +++ b/rtp_llm/cpp/cache/config_creator/HybridConfigCreator.h @@ -3,7 +3,9 @@ #include #include #include +#include #include "rtp_llm/cpp/cache/CacheConfig.h" +#include "rtp_llm/cpp/config/ConfigModules.h" #include "rtp_llm/cpp/config/ModelConfig.h" namespace rtp_llm { @@ -12,7 +14,9 @@ class HybridConfigCreator { public: static CacheConfig createHybridConfig(const ModelConfig& model_config, const ParallelismConfig& parallelism_config, - bool is_mtp = false); + const KVCacheConfig& kv_cache_config, + bool is_mtp = false, + int gen_num_per_cycle = 0); static std::vector> splitIntoGroups(const std::vector& ids, int group_layer_num); // Calculate the number of layers per group based on linear and full layers count @@ -25,12 +29,19 @@ class HybridConfigCreator { const std::vector& linear_layers, const std::vector& full_layers, rtp_llm::DataType dtype); - static KVCacheSpecPtr createFullAttentionSpec(const ModelConfig& model_config, - const ParallelismConfig& parallelism_config, - rtp_llm::DataType dtype); - static KVCacheSpecPtr createLinearAttentionSpec(const ModelConfig& model_config, - const ParallelismConfig& 
parallelism_config, - rtp_llm::DataType dtype); + static KVCacheSpecPtr getSpecFromLayers(const LayerKVCacheSpecs& runtime_specs, + const std::vector& layer_ids, + const char* spec_role); + static void prepareFullAttentionSpec(KVCacheSpecPtr spec, + const ModelConfig& model_config, + const ParallelismConfig& parallelism_config, + rtp_llm::DataType dtype, + uint32_t layer_num); + static void prepareLinearAttentionSpec(KVCacheSpecPtr spec, + const ModelConfig& model_config, + const ParallelismConfig& parallelism_config, + rtp_llm::DataType dtype, + uint32_t layer_num); static std::pair>, std::vector>> createLayerGroups(const std::vector& linear_layers, const std::vector& full_layers, int& group_layer_num); static void setupCacheConfigSpecs(CacheConfig& config, @@ -40,7 +51,6 @@ class HybridConfigCreator { const KVCacheSpecPtr& full_spec); static void setupPhysicalSizes(CacheConfig& config, const KVCacheSpecPtr& full_spec, const KVCacheSpecPtr& linear_spec); - static void setupLayerToGroupMapping(CacheConfig& config); }; -} // namespace rtp_llm \ No newline at end of file +} // namespace rtp_llm diff --git a/rtp_llm/cpp/cache/config_creator/HybridPoolConfigCreator.cc b/rtp_llm/cpp/cache/config_creator/HybridPoolConfigCreator.cc new file mode 100644 index 0000000000..befe99de4f --- /dev/null +++ b/rtp_llm/cpp/cache/config_creator/HybridPoolConfigCreator.cc @@ -0,0 +1,370 @@ +#include "rtp_llm/cpp/cache/config_creator/HybridPoolConfigCreator.h" + +#include +#include +#include +#include + +#include "rtp_llm/cpp/cache/spec/KVCacheSpec.h" +#include "rtp_llm/cpp/cache/spec/KVCacheSpecDesc.h" +#include "rtp_llm/cpp/cache/config_creator/CacheConfigCreator.h" +#include "rtp_llm/cpp/cache/config_creator/MemoryEvaluationHelper.h" +#include "rtp_llm/cpp/utils/AssertUtils.h" + +namespace rtp_llm { + +namespace { + +uint32_t fixedRegionCpSize(const ParallelismConfig& parallelism_config) { + if (!parallelism_config.prefill_cp_config.kv_cache_sharded) { + return 1; + } + if 
(parallelism_config.role_type == RoleType::PREFILL && parallelism_config.tp_size > 1) { + return static_cast(parallelism_config.tp_size); + } + if (parallelism_config.role_type == RoleType::DECODE && parallelism_config.prefill_cp_config.is_prefill_enabled()) { + RTP_LLM_CHECK_WITH_INFO( + parallelism_config.prefill_cp_config.prefill_cp_size > 1, + "fixed/SWA CP sharding decode requires explicit prefill_cp_size when PREFILL_CP and kv_cache_sharded are enabled"); + return static_cast(parallelism_config.prefill_cp_config.prefill_cp_size); + } + return 1; +} + +bool isPrefillCpSliced(const ParallelismConfig& parallelism_config) { + return parallelism_config.role_type == RoleType::PREFILL && fixedRegionCpSize(parallelism_config) > 1; +} + +CacheGroupPolicy policyFromSpecDesc(const KVCacheSpecDesc& desc) { + CacheGroupPolicy policy = defaultCacheGroupPolicy(SpecBuilder::groupType(desc)); + if (desc.is_state_cache) { + policy.evict_policy = CacheEvictPolicy::INDEPENDENT; + } + if (desc.skip_prefix_reuse) { + policy.reuse_policy = CacheReusePolicy::NON_REUSABLE; + policy.active_tail_blocks = 1; + policy.validate_tail_blocks = false; + } + if (desc.has_reuse_policy) { + policy.reuse_policy = desc.reuse_policy; + } + if (desc.has_evict_policy) { + policy.evict_policy = desc.evict_policy; + } + if (desc.has_active_tail_blocks) { + policy.active_tail_blocks = desc.active_tail_blocks; + } + if (desc.has_validate_tail_blocks) { + policy.validate_tail_blocks = desc.validate_tail_blocks; + } + policy.explicit_block_num = desc.extra.explicit_block_num; + policy.reserve_from_paged_budget = desc.extra.reserve_from_paged_budget; + if (desc.has_prefix_reusable) { + policy.prefix_reusable = desc.prefix_reusable; + } + policy.uses_pinned_cpu_backing = desc.uses_pinned_cpu_backing; + if (desc.has_is_cp_shardable) { + policy.is_cp_shardable = desc.is_cp_shardable; + } + if (desc.cache_type == CacheType::COMPRESSED_KV) { + policy.has_sparse_slots = true; + } + if (desc.has_sparse_slots) { + 
policy.has_sparse_slots = desc.sparse_slots; + } + if (desc.has_kernel_block_subdiv) { + policy.has_kernel_block_subdiv = desc.kernel_block_subdiv; + } + if (desc.has_cp_compact_tail_blocks) { + policy.cp_compact_tail_blocks = desc.cp_compact_tail_blocks; + } + if (desc.has_is_reservable) { + policy.is_reservable = desc.is_reservable; + } + return policy; +} + +void validateHybridPoolDescs(const ModelConfig& model_config, + uint32_t kernel_tokens_per_block, + int gen_num_per_cycle) { + RTP_LLM_CHECK_WITH_INFO(model_config.kv_cache_spec_descs.size() == static_cast(model_config.num_layers), + "hybrid-pool desc config requires layer-wise kv_cache_spec_descs for every layer, got %zu/%ld", + model_config.kv_cache_spec_descs.size(), + model_config.num_layers); + RTP_LLM_CHECK_WITH_INFO(gen_num_per_cycle >= 0, + "hybrid-pool desc config requires non-negative gen_num_per_cycle, got %d", + gen_num_per_cycle); + + for (int64_t layer_id = 0; layer_id < model_config.num_layers; ++layer_id) { + const auto& layer_descs = model_config.kv_cache_spec_descs[static_cast(layer_id)]; + RTP_LLM_CHECK_WITH_INFO(!layer_descs.empty(), + "hybrid-pool desc config layer %ld has no descs", + layer_id); + for (const auto& desc : layer_descs) { + RTP_LLM_CHECK_WITH_INFO( + desc.cache_type != CacheType::MHA || desc.num_kv_heads > 0, + "hybrid-pool MHA desc tag=%s missing num_kv_heads (must be set by Python model)", + desc.tag.c_str()); + RTP_LLM_CHECK_WITH_INFO( + desc.cache_type != CacheType::LINEAR || (desc.num_k_heads > 0 && desc.num_v_heads > 0), + "hybrid-pool LINEAR desc tag=%s missing num_k_heads/num_v_heads (must be set by Python model)", + desc.tag.c_str()); + if (desc.extra.derive_entries_from_kernel_block) { + RTP_LLM_CHECK_WITH_INFO(desc.compression_ratio > 0, + "desc tag=%s derives entries from kernel block but has invalid compression_ratio=%u", + desc.tag.c_str(), + desc.compression_ratio); + RTP_LLM_CHECK_WITH_INFO(kernel_tokens_per_block > 0, + "desc tag=%s derives entries from 
kernel block but kernel_tokens_per_block is 0", + desc.tag.c_str()); + RTP_LLM_CHECK_WITH_INFO(kernel_tokens_per_block % desc.compression_ratio == 0, + "desc tag=%s compression_ratio=%u must divide kernel block %u", + desc.tag.c_str(), + desc.compression_ratio, + kernel_tokens_per_block); + } + if (desc.extra.state_ring_compression_ratio > 0) { + RTP_LLM_CHECK_WITH_INFO(desc.extra.state_ring_compression_ratio > 0, + "state ring desc tag=%s requires positive state_ring_compression_ratio", + desc.tag.c_str()); + } + } + } +} + +void populateGroupsFromLayerSpecs(CacheConfig& config, + const LayerKVCacheSpecDescs& layer_descs, + const LayerKVCacheSpecs& layer_specs) { + RTP_LLM_CHECK_WITH_INFO(layer_descs.size() == static_cast(config.layer_num), + "hybrid-pool layer desc count %zu != layer_num %u", + layer_descs.size(), + config.layer_num); + RTP_LLM_CHECK_WITH_INFO(layer_specs.size() == static_cast(config.layer_num), + "hybrid-pool layer spec count %zu != layer_num %u", + layer_specs.size(), + config.layer_num); + + struct GroupBuildState { + KVCacheSpecPtr spec; + std::string fingerprint; + CacheGroupType type; + CacheGroupPolicy policy; + std::vector layers; + }; + + std::map group_by_tag; + std::vector ordered_tags; + + for (uint32_t layer = 0; layer < config.layer_num; ++layer) { + const auto& descs = layer_descs[layer]; + const auto& specs = layer_specs[layer]; + RTP_LLM_CHECK_WITH_INFO(!descs.empty(), "hybrid-pool layer %u has no descs", layer); + RTP_LLM_CHECK_WITH_INFO(descs.size() == specs.size(), + "hybrid-pool layer %u desc count %zu != spec count %zu", + layer, + descs.size(), + specs.size()); + std::set layer_tags; + for (size_t idx = 0; idx < descs.size(); ++idx) { + const auto& desc = descs[idx]; + const auto& spec = specs[idx]; + RTP_LLM_CHECK_WITH_INFO(spec != nullptr, "hybrid-pool layer %u has null spec", layer); + RTP_LLM_CHECK_WITH_INFO(layer_tags.insert(spec->tag).second, + "hybrid-pool layer %u has duplicate tag=%s", + layer, + 
spec->tag.c_str()); + const auto policy = policyFromSpecDesc(desc); + const auto type = SpecBuilder::groupType(desc); + auto group_it = group_by_tag.find(spec->tag); + if (group_it == group_by_tag.end()) { + GroupBuildState state; + state.spec = spec; + state.fingerprint = spec->fingerprint(); + state.type = type; + state.policy = policy; + group_it = group_by_tag.emplace(spec->tag, std::move(state)).first; + ordered_tags.push_back(spec->tag); + } else { + RTP_LLM_CHECK_WITH_INFO(group_it->second.fingerprint == spec->fingerprint(), + "hybrid-pool tag=%s has multiple physical prototypes", + spec->tag.c_str()); + RTP_LLM_CHECK_WITH_INFO(group_it->second.type == type, + "hybrid-pool tag=%s has inconsistent group type", + spec->tag.c_str()); + RTP_LLM_CHECK_WITH_INFO(CacheConfig::samePolicy(group_it->second.policy, policy), + "hybrid-pool tag=%s has inconsistent policy", + spec->tag.c_str()); + } + group_it->second.layers.push_back(static_cast(layer)); + } + } + + std::vector groups; + std::vector layers(static_cast(config.layer_num)); + groups.reserve(ordered_tags.size()); + for (const auto& tag : ordered_tags) { + const auto& state = group_by_tag.at(tag); + GroupBase group; + group.spec = state.spec; + group.policy = state.policy; + group.layer_ids = state.layers; + const int gid = static_cast(groups.size()); + groups.push_back(group); + for (int layer_id : state.layers) { + auto& layer = layers[static_cast(layer_id)]; + layer.group_ids.push_back(gid); + layer.tag_to_gid[tag] = gid; + } + } + config.setTopology(std::move(groups), std::move(layers)); +} + +size_t kernelBlocksPerKvBlockForGroup(const CacheConfig& config, size_t group_id) { + const bool is_full = config.typeForGroup(group_id) == CacheGroupType::FULL; + return is_full ? 
config.kernelBlocksPerKvBlock() : 1; +} + +void setupIndependentPoolSizes(CacheConfig& config, bool is_mtp) { + config.use_independent_block_pools = true; + const auto group_num = static_cast(config.groupNums()); + std::vector group_block_nums(group_num, 0); + config.group_seq_size_per_block.resize(group_num, config.seq_size_per_block); + std::vector group_kv_block_stride_bytes(group_num, 0); + std::vector group_kv_scale_stride_bytes(group_num, 0); + + size_t max_kv_stride = 0; + size_t max_scale_stride = 0; + size_t total_kv_block_bytes = 0; + size_t total_scale_block_bytes = 0; + uint32_t max_group_layers = 0; + + config.layer_to_block_stride_bytes.assign(config.layer_all_num, 0); + for (size_t gid = 0; gid < group_num; ++gid) { + const auto& spec = config.specForGroup(gid); + RTP_LLM_CHECK_WITH_INFO(spec != nullptr, "cache_specs[%zu] is null", gid); + const auto layer_count = static_cast(config.layerIdsForGroup(gid).size()); + const size_t kernel_kv_stride = spec->block_size_bytes(); + const auto kernel_scale = spec->scale_block_size_bytes(); + const size_t group_bpk = kernelBlocksPerKvBlockForGroup(config, gid); + const size_t kv_stride = kernel_kv_stride * group_bpk; + const size_t scale_stride = kernel_scale * group_bpk; + group_kv_block_stride_bytes[gid] = kv_stride; + group_kv_scale_stride_bytes[gid] = scale_stride; + const auto type = config.typeForGroup(gid); + const bool is_state = spec->is_state_cache; + if (!is_state && type == CacheGroupType::FULL) { + total_kv_block_bytes += static_cast(layer_count) * kv_stride; + total_scale_block_bytes += static_cast(layer_count) * scale_stride; + } + max_kv_stride = std::max(max_kv_stride, kv_stride); + max_scale_stride = std::max(max_scale_stride, scale_stride); + max_group_layers = std::max(max_group_layers, layer_count); + + for (int layer_id : config.layerIdsForGroup(gid)) { + config.layer_to_block_stride_bytes[static_cast(layer_id)] = + static_cast(kv_stride + scale_stride); + } + } + + config.group_layer_num 
= static_cast(std::max(1, max_group_layers)); + config.kv_block_stride_bytes = max_kv_stride; + config.kv_scale_stride_bytes = max_scale_stride; + config.kv_block_size_bytes = total_kv_block_bytes; + config.kv_scale_size_bytes = total_scale_block_bytes; + const size_t paged_block_bytes = config.kv_block_size_bytes + config.kv_scale_size_bytes; + if (paged_block_bytes == 0) { + RTP_LLM_CHECK_WITH_INFO(is_mtp && config.use_typed_cache_regions, + "hybrid-pool paged groups produced zero block bytes"); + config.kv_block_size_bytes = 1; + config.kv_scale_size_bytes = 0; + config.block_size_bytes = 1; + } else { + config.block_size_bytes = paged_block_bytes; + } + config.explicitly_sized_pool_reserve_bytes = 0; + config.setGroupBlockLayout(group_block_nums, group_kv_block_stride_bytes, group_kv_scale_stride_bytes); +} + +CacheConfig createHybridAttentionPoolConfig(const ModelConfig& model_config, + const ParallelismConfig& parallelism_config, + const KVCacheConfig& kv_cache_config, + bool is_mtp, + int gen_num_per_cycle) { + const auto dtype = MemoryEvaluationHelper::getDataTypeForCache(model_config); + const auto physical_tokens_per_block = + kv_cache_config.seq_size_per_block > 0 ? static_cast(kv_cache_config.seq_size_per_block) : + static_cast(model_config.attn_config.tokens_per_block); + const auto kernel_tokens_per_block = + kv_cache_config.kernel_seq_size_per_block > 0 ? 
static_cast(kv_cache_config.kernel_seq_size_per_block) : + physical_tokens_per_block; + RTP_LLM_CHECK_WITH_INFO(physical_tokens_per_block > 0, "hybrid-pool seq_size_per_block must be > 0"); + RTP_LLM_CHECK_WITH_INFO(kernel_tokens_per_block > 0, "hybrid-pool kernel_seq_size_per_block must be > 0"); + RTP_LLM_CHECK_WITH_INFO(physical_tokens_per_block >= kernel_tokens_per_block + && physical_tokens_per_block % kernel_tokens_per_block == 0, + "hybrid-pool seq_size_per_block=%u must be >= kernel_seq_size_per_block=%u and divisible by it", + physical_tokens_per_block, + kernel_tokens_per_block); + + CacheConfig config; + config.layer_num = static_cast(model_config.num_layers); + config.layer_all_num = config.layer_num; + config.block_num = 0; + config.seq_size_per_block = physical_tokens_per_block; + config.kernel_seq_size_per_block = kernel_tokens_per_block; + config.use_mla = model_config.attn_config.use_mla; + config.dtype = dtype; + config.linear_step = 1; + config.is_sparse = model_config.attn_config.is_sparse; + + if (!model_config.kv_cache_spec_descs.empty()) { + validateHybridPoolDescs(model_config, kernel_tokens_per_block, gen_num_per_cycle); + SpecBuildContext ctx; + ctx.dtype = dtype; + ctx.seq_size_per_block = physical_tokens_per_block; + ctx.attn_tp_size = static_cast(parallelism_config.get_attn_tp_size()); + ctx.kernel_tokens_per_block = kernel_tokens_per_block; + ctx.gen_num_per_cycle = static_cast(gen_num_per_cycle); + ctx.cp_size = fixedRegionCpSize(parallelism_config); + ctx.cp_prefill_sliced = isPrefillCpSliced(parallelism_config); + auto refreshed_specs = CacheConfigCreator::buildLayerSpecsFromDescs( + model_config.kv_cache_spec_descs, ctx, model_config.num_layers); + populateGroupsFromLayerSpecs(config, model_config.kv_cache_spec_descs, refreshed_specs); + config.group_seq_size_per_block.resize(static_cast(config.groupNums()), config.seq_size_per_block); + for (size_t gid = 0; gid < static_cast(config.groupNums()); ++gid) { + const auto& spec = 
config.specForGroup(gid); + RTP_LLM_CHECK_WITH_INFO(spec != nullptr, "hybrid-pool desc config produced null spec gid=%zu", gid); + config.group_seq_size_per_block[gid] = spec->seq_size_per_block; + config.use_typed_cache_regions = + config.use_typed_cache_regions || spec->type == KVCacheSpecType::OpaqueKV + || spec->type == KVCacheSpecType::OpaqueState; + config.use_opaque_kv_cache_store = + config.use_opaque_kv_cache_store || spec->type == KVCacheSpecType::OpaqueKV + || spec->type == KVCacheSpecType::OpaqueState; + } + for (const auto& layer_descs : model_config.kv_cache_spec_descs) { + for (const auto& desc : layer_descs) { + config.is_sparse = config.is_sparse || desc.cache_type == CacheType::COMPRESSED_KV; + } + } + config.disable_decode_first_malloc_device_reuse = + config.disable_decode_first_malloc_device_reuse || config.use_opaque_kv_cache_store; + } else { + RTP_LLM_CHECK_WITH_INFO(false, "HybridPoolConfigCreator requires kv_cache_spec_descs"); + } + + RTP_LLM_CHECK_WITH_INFO(config.groupNums() > 0, "hybrid-pool config produced no cache specs"); + setupIndependentPoolSizes(config, is_mtp); + return config; +} + +} // namespace + +CacheConfig HybridPoolConfigCreator::createConfig(const ModelConfig& model_config, + const ParallelismConfig& parallelism_config, + const KVCacheConfig& kv_cache_config, + bool is_mtp, + int gen_num_per_cycle) { + return createHybridAttentionPoolConfig(model_config, parallelism_config, kv_cache_config, is_mtp, gen_num_per_cycle); +} + +} // namespace rtp_llm diff --git a/rtp_llm/cpp/cache/config_creator/HybridPoolConfigCreator.h b/rtp_llm/cpp/cache/config_creator/HybridPoolConfigCreator.h new file mode 100644 index 0000000000..dac00b5099 --- /dev/null +++ b/rtp_llm/cpp/cache/config_creator/HybridPoolConfigCreator.h @@ -0,0 +1,18 @@ +#pragma once + +#include "rtp_llm/cpp/cache/CacheConfig.h" +#include "rtp_llm/cpp/config/ConfigModules.h" +#include "rtp_llm/cpp/config/ModelConfig.h" + +namespace rtp_llm { + +class 
HybridPoolConfigCreator { +public: + static CacheConfig createConfig(const ModelConfig& model_config, + const ParallelismConfig& parallelism_config, + const KVCacheConfig& kv_cache_config, + bool is_mtp, + int gen_num_per_cycle); +}; + +} // namespace rtp_llm diff --git a/rtp_llm/cpp/cache/MemoryEvaluationHelper.cc b/rtp_llm/cpp/cache/config_creator/MemoryEvaluationHelper.cc similarity index 99% rename from rtp_llm/cpp/cache/MemoryEvaluationHelper.cc rename to rtp_llm/cpp/cache/config_creator/MemoryEvaluationHelper.cc index f2813ffeda..71838bef0d 100644 --- a/rtp_llm/cpp/cache/MemoryEvaluationHelper.cc +++ b/rtp_llm/cpp/cache/config_creator/MemoryEvaluationHelper.cc @@ -1,4 +1,4 @@ -#include "rtp_llm/cpp/cache/MemoryEvaluationHelper.h" +#include "rtp_llm/cpp/cache/config_creator/MemoryEvaluationHelper.h" #include @@ -20,12 +20,12 @@ namespace rtp_llm { // Helper function to update memory size if below minimum requirement void MemoryEvaluationHelper::updateMemoryIfNeeded(size_t& current_size, size_t min_required, const char* scenario) { if (current_size < min_required) { - current_size = min_required; RTP_LLM_LOG_INFO("%s needs at least %ld MiB memory for runtime by default, " "but only %ld MiB memory reserved. 
adjust to minimal value.", scenario, min_required / 1024 / 1024, current_size / 1024 / 1024); + current_size = min_required; } } diff --git a/rtp_llm/cpp/cache/MemoryEvaluationHelper.h b/rtp_llm/cpp/cache/config_creator/MemoryEvaluationHelper.h similarity index 100% rename from rtp_llm/cpp/cache/MemoryEvaluationHelper.h rename to rtp_llm/cpp/cache/config_creator/MemoryEvaluationHelper.h diff --git a/rtp_llm/cpp/cache/config_creator/SingleConfigCreator.cc b/rtp_llm/cpp/cache/config_creator/SingleConfigCreator.cc new file mode 100644 index 0000000000..855c688202 --- /dev/null +++ b/rtp_llm/cpp/cache/config_creator/SingleConfigCreator.cc @@ -0,0 +1,131 @@ +#include "rtp_llm/cpp/cache/config_creator/SingleConfigCreator.h" + +#include "rtp_llm/cpp/cache/spec/KVCacheSpec.h" +#include "rtp_llm/cpp/cache/config_creator/CacheConfigCreator.h" +#include "rtp_llm/cpp/cache/config_creator/MemoryEvaluationHelper.h" +#include "rtp_llm/cpp/utils/Logger.h" + +#include + +namespace rtp_llm { + +namespace { + +KVCacheSpecPtr getDefaultSpecFromRuntimeSpecs(const ModelConfig& model_config, + const LayerKVCacheSpecs& runtime_specs) { + RTP_LLM_CHECK_WITH_INFO(runtime_specs.size() == static_cast(model_config.num_layers), + "single cache config requires layer-wise runtime specs for every layer, got %zu/%ld", + runtime_specs.size(), + model_config.num_layers); + RTP_LLM_CHECK_WITH_INFO(!runtime_specs.empty(), "single cache config requires at least one runtime spec"); + RTP_LLM_CHECK_WITH_INFO(runtime_specs[0].size() == 1, + "single cache config requires exactly one spec for layer 0, got %zu", + runtime_specs[0].size()); + auto spec = runtime_specs[0][0]; + RTP_LLM_CHECK_WITH_INFO(spec != nullptr, "single cache config got null runtime spec for layer 0"); + RTP_LLM_CHECK_WITH_INFO(spec->tag == "default", + "single cache config requires tag=default for layer 0, got=%s", + spec->tag.c_str()); + const auto fingerprint = spec->fingerprint(); + for (int64_t layer_id = 1; layer_id < 
model_config.num_layers; ++layer_id) { + const auto layer = static_cast(layer_id); + RTP_LLM_CHECK_WITH_INFO(runtime_specs[layer].size() == 1, + "single cache config requires exactly one spec for layer %ld, got %zu", + layer_id, + runtime_specs[layer].size()); + const auto& layer_spec = runtime_specs[layer][0]; + RTP_LLM_CHECK_WITH_INFO(layer_spec != nullptr, "single cache config got null runtime spec for layer %ld", layer_id); + RTP_LLM_CHECK_WITH_INFO(layer_spec->tag == "default", + "single cache config requires tag=default for layer %ld, got=%s", + layer_id, + layer_spec->tag.c_str()); + RTP_LLM_CHECK_WITH_INFO(layer_spec->fingerprint() == fingerprint, + "single cache config default spec differs at layer %ld", + layer_id); + } + return spec->clone(); +} + +} // namespace + +CacheConfig SingleConfigCreator::createSingleConfig(const ModelConfig& model_config, + const ParallelismConfig& parallelism_config, + const KVCacheConfig& kv_cache_config, + bool is_mtp, + int gen_num_per_cycle) { + (void)is_mtp; + auto dtype = MemoryEvaluationHelper::getDataTypeForCache(model_config); + const auto physical_tokens_per_block = + kv_cache_config.seq_size_per_block > 0 ? static_cast(kv_cache_config.seq_size_per_block) : + static_cast(model_config.attn_config.tokens_per_block); + const auto kernel_tokens_per_block = + kv_cache_config.kernel_seq_size_per_block > 0 ? 
static_cast(kv_cache_config.kernel_seq_size_per_block) : + physical_tokens_per_block; + RTP_LLM_CHECK_WITH_INFO(physical_tokens_per_block > 0, "single seq_size_per_block must be > 0"); + RTP_LLM_CHECK_WITH_INFO(kernel_tokens_per_block > 0, "single kernel_seq_size_per_block must be > 0"); + SpecBuildContext ctx; + ctx.dtype = dtype; + ctx.seq_size_per_block = physical_tokens_per_block; + ctx.attn_tp_size = static_cast(parallelism_config.get_attn_tp_size()); + ctx.kernel_tokens_per_block = kernel_tokens_per_block; + ctx.gen_num_per_cycle = static_cast(gen_num_per_cycle); + const auto runtime_specs = + CacheConfigCreator::buildLayerSpecsFromDescs(model_config.kv_cache_spec_descs, ctx, model_config.num_layers); + + auto layer_num = model_config.num_layers; + + CacheConfig config; + config.layer_num = static_cast(layer_num); + config.layer_all_num = static_cast(layer_num); + config.block_num = 0; + config.seq_size_per_block = physical_tokens_per_block; + config.kernel_seq_size_per_block = kernel_tokens_per_block; + + config.use_mla = model_config.attn_config.use_mla; + config.dtype = dtype; + config.is_sparse = model_config.attn_config.is_sparse; + + auto spec = getDefaultSpecFromRuntimeSpecs(model_config, runtime_specs); + + std::vector layer_ids(static_cast(layer_num)); + std::iota(layer_ids.begin(), layer_ids.end(), 0); + GroupBase group; + group.spec = spec; + group.policy = defaultCacheGroupPolicy(CacheGroupType::FULL); + group.layer_ids = layer_ids; + + std::vector layers(static_cast(layer_num)); + for (int64_t layer_id = 0; layer_id < layer_num; ++layer_id) { + auto& layer = layers[static_cast(layer_id)]; + layer.group_ids = {0}; + layer.tag_to_gid[spec->tag] = 0; + } + config.setTopology({group}, std::move(layers)); + RTP_LLM_CHECK_WITH_INFO(config.groupNums() == 1, "single config expected one cache group"); + + // Using spec interface for block size and scale + config.kv_block_stride_bytes = spec->block_size_bytes(); + config.kv_block_size_bytes = 
static_cast(config.layer_num) * config.kv_block_stride_bytes; + + // Scale handling - no need to check dtype as scale_block_size_bytes() returns 0 if no scale support + config.kv_scale_stride_bytes = spec->scale_block_size_bytes(); + config.kv_scale_size_bytes = static_cast(config.layer_num) * config.kv_scale_stride_bytes; + + if (config.is_sparse) { + auto indexer_dim = model_config.attn_config.indexer_head_dim; + config.kv_scale_stride_bytes = (indexer_dim + indexer_dim / 128 * 4) * spec->seq_size_per_block; + config.kv_scale_size_bytes = static_cast(config.layer_num) * config.kv_scale_stride_bytes; + } + + config.block_size_bytes = config.kv_block_size_bytes + config.kv_scale_size_bytes; + config.group_layer_num = layer_num; // only 1 group for SingleConfig + + // Per-layer block stride (kv + scale). + const size_t per_layer_stride_bytes = config.kv_block_stride_bytes + config.kv_scale_stride_bytes; + config.layer_to_block_stride_bytes.assign(static_cast(config.layer_all_num), + static_cast(per_layer_stride_bytes)); + + return config; +} + +} // namespace rtp_llm diff --git a/rtp_llm/cpp/cache/SingleConfigCreator.h b/rtp_llm/cpp/cache/config_creator/SingleConfigCreator.h similarity index 72% rename from rtp_llm/cpp/cache/SingleConfigCreator.h rename to rtp_llm/cpp/cache/config_creator/SingleConfigCreator.h index 032a254636..0a64fd75f1 100644 --- a/rtp_llm/cpp/cache/SingleConfigCreator.h +++ b/rtp_llm/cpp/cache/config_creator/SingleConfigCreator.h @@ -12,7 +12,9 @@ class SingleConfigCreator { public: static CacheConfig createSingleConfig(const ModelConfig& model_config, const ParallelismConfig& parallelism_config, - bool is_mtp = false); + const KVCacheConfig& kv_cache_config, + bool is_mtp = false, + int gen_num_per_cycle = 0); }; } // namespace rtp_llm \ No newline at end of file diff --git a/rtp_llm/cpp/cache/connector/KVCacheConnectorCoordinator.cc b/rtp_llm/cpp/cache/connector/KVCacheConnectorCoordinator.cc index 16a748eaf3..5091874bdb 100644 --- 
a/rtp_llm/cpp/cache/connector/KVCacheConnectorCoordinator.cc +++ b/rtp_llm/cpp/cache/connector/KVCacheConnectorCoordinator.cc @@ -1,8 +1,9 @@ #include "rtp_llm/cpp/cache/connector/KVCacheConnectorCoordinator.h" #include +#include -#include "rtp_llm/cpp/cache/KVCacheAllocator.h" +#include "rtp_llm/cpp/cache/allocator/KVCacheAllocator.h" #include "rtp_llm/cpp/utils/Logger.h" #include "rtp_llm/cpp/utils/ProfilingScope.h" #include "rtp_llm/cpp/cache/connector/KVCacheConnectorReadWriteContext.h" @@ -14,6 +15,121 @@ #endif namespace rtp_llm { +namespace { + +CacheGroupType groupTypeForConnector(const CacheConfig& cache_config, int group_id) { + if (group_id >= 0 && group_id < cache_config.groupNums()) { + return cache_config.typeForGroup(static_cast(group_id)); + } + return CacheGroupType::FULL; +} + +bool isCpCompactSliceGroup(const CacheConfig& cache_config, int group_id, int cp_size) { + if (cp_size <= 1 || group_id < 0 || group_id >= cache_config.groupNums() + || static_cast(group_id) >= cache_config.group_seq_size_per_block.size()) { + return false; + } + const auto& spec = cache_config.specForGroup(static_cast(group_id)); + if (!spec || !spec->supportsCpSlice()) { + return false; + } + const auto row_tokens = cache_config.group_seq_size_per_block[static_cast(group_id)]; + return row_tokens > 0 && row_tokens == cache_config.seq_size_per_block * static_cast(cp_size); +} + +bool isCompactFullBlockList(const KVCacheResource& source, + const BlockIndicesType& src_blocks, + const CacheKeysType& selected_keys) { + return src_blocks.size() <= selected_keys.size() || src_blocks.size() < source.cacheKeys().size(); +} + +bool selectedLastRankKeysAreAligned(const KVCacheResource& source, int cp_size) { + if (source.lastBlockAligned()) { + return true; + } + const auto& keys = source.cacheKeys(); + if (keys.empty() || cp_size <= 1) { + return source.lastBlockAligned(); + } + const int partial_key_pos = static_cast(keys.size() - 1); + const int last_rank = cp_size - 1; + return 
partial_key_pos % cp_size != last_rank; +} + +KVCacheResource makeCpShardedConnectorResource(const KVCacheResource& source, + const CacheConfig& cache_config, + const CacheKeysType& selected_keys, + int cp_size) { + std::vector group_types; + group_types.reserve(static_cast(source.groupNums())); + for (int gid = 0; gid < source.groupNums(); ++gid) { + group_types.push_back(groupTypeForConnector(cache_config, gid)); + } + + KVCacheResource selected = source; + selected.initGroups(source.groupNums(), + static_cast(cache_config.layer_all_num), + cache_config.layerGroupIdsSnapshot(), + cache_config.kernelBlocksPerKvBlock(), + group_types); + selected.setCacheKeys(selected_keys); + const bool selected_aligned = selectedLastRankKeysAreAligned(source, cp_size); + selected.setLastBlockAligned(selected_aligned); + + // Memory connector intentionally drops the last key to avoid matching a + // partial tail. After CP Page-RR remap, a source partial can belong to a + // non-last rank, making the selected last-rank key complete. Append the + // original partial key as a connector-only dummy tail so the drop-last + // contract discards the dummy, not the usable selected key. + if (!source.lastBlockAligned() && selected_aligned && !source.cacheKeys().empty()) { + selected.cacheKeys().push_back(source.cacheKeys().back()); + selected.rebuildLinearBlockDependencies(); + selected.setLastBlockAligned(false); + } + + for (int gid = 0; gid < source.groupNums(); ++gid) { + const auto& src_blocks = source.blocks(gid); + BlockIndicesType dst_blocks; + dst_blocks.reserve(selected_keys.size()); + + if (isCpCompactSliceGroup(cache_config, gid, cp_size)) { + // Intra-block CP-sliced groups can be compact by using a row size + // of seq_size_per_block * cp_size, so their block list is already + // in the canonical last-rank key namespace. + for (size_t i = 0; i < selected_keys.size(); ++i) { + dst_blocks.push_back(i < src_blocks.size() ? 
src_blocks[i] : NULL_BLOCK_IDX); + } + } else if (group_types[static_cast(gid)] == CacheGroupType::FULL) { + // Prefill rank-local FULL blocks are compact already. Decode-side + // FULL blocks are full-logical, so select the canonical last-rank + // logical positions. + if (isCompactFullBlockList(source, src_blocks, selected_keys)) { + for (size_t i = 0; i < selected_keys.size(); ++i) { + dst_blocks.push_back(i < src_blocks.size() ? src_blocks[i] : NULL_BLOCK_IDX); + } + } else { + for (size_t logical_pos = static_cast(cp_size - 1); dst_blocks.size() < selected_keys.size(); + logical_pos += static_cast(cp_size)) { + dst_blocks.push_back(logical_pos < src_blocks.size() ? src_blocks[logical_pos] : NULL_BLOCK_IDX); + } + } + } else { + // SWA/state groups keep the non-sharded logical coordinate system. + // Select the block at the original logical key position instead of + // reinterpreting the group as rank-local compact storage. + for (size_t logical_pos = static_cast(cp_size - 1); dst_blocks.size() < selected_keys.size(); + logical_pos += static_cast(cp_size)) { + dst_blocks.push_back(logical_pos < src_blocks.size() ? 
src_blocks[logical_pos] : NULL_BLOCK_IDX); + } + } + + selected.mutableBlockIds(gid).assign(std::move(dst_blocks)); + } + + return selected; +} + +} // namespace KVCacheConnectorCoordinator::KVCacheConnectorCoordinator(const CacheConfig& cache_config, const KVCacheConfig& kv_cache_config, @@ -116,7 +232,23 @@ KVCacheConnectorCoordinator::asyncRead(const std::shared_ptrincrKVCacheRef(kvcache_resource, kvcache_resource.cacheKeys(), true); + const int cp_size = cpSize(); + CacheKeysType ref_keys = kvcache_resource.cacheKeys(); + KVCacheResource ref_resource = kvcache_resource; + if (cp_size > 1) { + if (!kvcache_resource.cacheKeysAreCpCanonical()) { + ref_keys = kvcache_resource.localCacheKeys(cp_size - 1, cp_size); + // Short requests (< cp_size logical blocks) have no complete virtual + // block, so the canonical last-rank-key namespace is empty by design. + // Skip silently — connector activity for these is a no-op anyway. + if (ref_keys.empty()) { + return nullptr; + } + ref_resource = makeCpShardedConnectorResource(kvcache_resource, cache_config_, ref_keys, cp_size); + ref_keys = ref_resource.cacheKeys(); + } + } + auto resource = allocator_->incrKVCacheRef(ref_resource, ref_keys, true); if (!resource) { RTP_LLM_LOG_WARNING("async read failed, incr kvcache ref failed, resource: [%s]", kvcache_resource.debugString().c_str()); @@ -154,7 +286,20 @@ KVCacheConnectorCoordinator::asyncWrite(const std::shared_ptrincrKVCacheRef(kvcache_resource, kvcache_resource.cacheKeys(), true); + const int cp_size = cpSize(); + CacheKeysType ref_keys = kvcache_resource.cacheKeys(); + KVCacheResource ref_resource = kvcache_resource; + if (cp_size > 1) { + if (!kvcache_resource.cacheKeysAreCpCanonical()) { + ref_keys = kvcache_resource.localCacheKeys(cp_size - 1, cp_size); + if (ref_keys.empty()) { + return nullptr; // request shorter than one virtual block — nothing to write + } + ref_resource = makeCpShardedConnectorResource(kvcache_resource, cache_config_, ref_keys, cp_size); + 
ref_keys = ref_resource.cacheKeys(); + } + } + auto resource = allocator_->incrKVCacheRef(ref_resource, ref_keys, true); if (!resource) { RTP_LLM_LOG_WARNING("async write failed, incr kvcache ref failed, resource: [%s]", kvcache_resource.debugString().c_str()); @@ -193,8 +338,12 @@ KVCacheConnectorCoordinator::asyncWriteByLayer(int } std::shared_ptr KVCacheConnectorCoordinator::initMemoryConnector() { - auto memory_connector = std::make_shared( - cache_config_, kv_cache_config_, allocator_, runtime_config_.worker_grpc_addrs, metrics_reporter_); + auto memory_connector = std::make_shared(cache_config_, + kv_cache_config_, + parallelism_config_, + allocator_, + runtime_config_.worker_grpc_addrs, + metrics_reporter_); RTP_LLM_CHECK_WITH_INFO(memory_connector->init(), "memory connector init failed"); return memory_connector; } @@ -220,6 +369,21 @@ std::shared_ptr KVCacheConnectorCoordinator::initRemoteConnecto #endif } +int KVCacheConnectorCoordinator::cpSize() const { + const auto& cp_cfg = parallelism_config_.prefill_cp_config; + if (!cp_cfg.kv_cache_sharded) { + return 1; + } + if (parallelism_config_.tp_size > 1) { + return static_cast(parallelism_config_.tp_size); + } + if (parallelism_config_.role_type == RoleType::DECODE && cp_cfg.is_prefill_enabled() + && cp_cfg.prefill_cp_size > 1) { + return static_cast(cp_cfg.prefill_cp_size); + } + return 1; +} + void KVCacheConnectorCoordinator::updateOnce() { RTP_LLM_PROFILE_FUNCTION(); processReadContexts(); diff --git a/rtp_llm/cpp/cache/connector/KVCacheConnectorCoordinator.h b/rtp_llm/cpp/cache/connector/KVCacheConnectorCoordinator.h index ba8bfaa2a8..253bf86d10 100644 --- a/rtp_llm/cpp/cache/connector/KVCacheConnectorCoordinator.h +++ b/rtp_llm/cpp/cache/connector/KVCacheConnectorCoordinator.h @@ -11,7 +11,7 @@ #include "rtp_llm/cpp/cache/connector/AsyncContext.h" #include "rtp_llm/cpp/cache/connector/IKVCacheConnectorCoordinator.h" #include "rtp_llm/cpp/cache/connector/KVCacheConnector.h" -#include 
"rtp_llm/cpp/cache/KVCacheAllocator.h" +#include "rtp_llm/cpp/cache/allocator/KVCacheAllocator.h" #include "rtp_llm/cpp/config/ConfigModules.h" #include "rtp_llm/cpp/model_rpc/proto/model_rpc_service.grpc.pb.h" #include "rtp_llm/cpp/model_rpc/proto/model_rpc_service.pb.h" @@ -67,10 +67,12 @@ class KVCacheConnectorCoordinator: public IKVCacheConnectorCoordinator { std::shared_ptr initMemoryConnector(); std::shared_ptr initRemoteConnector(); bool initP2PConnectorInternal(); - void initUpdateThread(); - void updateOnce(); - void processReadContexts(); - void processWriteContexts(); + // Returns CP size when page-level RR sharding is active; 1 otherwise. + int cpSize() const; + void initUpdateThread(); + void updateOnce(); + void processReadContexts(); + void processWriteContexts(); void asyncReadAfterMatch(std::shared_ptr fused_read_context); bool isPdInvertMode() const; diff --git a/rtp_llm/cpp/cache/connector/memory/BUILD b/rtp_llm/cpp/cache/connector/memory/BUILD index e2a7955837..7d6ec080a3 100644 --- a/rtp_llm/cpp/cache/connector/memory/BUILD +++ b/rtp_llm/cpp/cache/connector/memory/BUILD @@ -12,10 +12,9 @@ cc_library( deps = [ "//rtp_llm/cpp/cache/connector:connector_base", "//rtp_llm/cpp/config:config_modules", - "//rtp_llm/models_py/bindings/core:exec_ops_hdr", "//rtp_llm/cpp/metrics:metrics", "//rtp_llm/cpp/model_rpc:broadcast_manager", - "//rtp_llm/cpp/cache:kv_cache_allocator", + "//rtp_llm/cpp/cache:kv_cache_allocator_hdr", "//rtp_llm/cpp/utils:lru_cache", "//rtp_llm/cpp/utils:profiling_scope", "@havenask//aios/autil:json", diff --git a/rtp_llm/cpp/cache/connector/memory/CacheBlockKind.h b/rtp_llm/cpp/cache/connector/memory/CacheBlockKind.h new file mode 100644 index 0000000000..1ccd56dccd --- /dev/null +++ b/rtp_llm/cpp/cache/connector/memory/CacheBlockKind.h @@ -0,0 +1,30 @@ +#pragma once + +namespace rtp_llm { + +enum class CacheBlockKind { + COMPLETE = 0, + INCOMPLETE = 1, + COMPRESSED_KV = 2, + STATE_SWA_KV = 3, +}; + +inline CacheBlockKind 
blockKindFromComplete(bool is_complete) { + return is_complete ? CacheBlockKind::COMPLETE : CacheBlockKind::INCOMPLETE; +} + +inline const char* cacheBlockKindName(CacheBlockKind kind) { + switch (kind) { + case CacheBlockKind::COMPLETE: + return "complete"; + case CacheBlockKind::INCOMPLETE: + return "incomplete"; + case CacheBlockKind::COMPRESSED_KV: + return "compressed_kv"; + case CacheBlockKind::STATE_SWA_KV: + return "state_swa_kv"; + } + return "unknown"; +} + +} // namespace rtp_llm diff --git a/rtp_llm/cpp/cache/connector/memory/DiskBlockIO.cc b/rtp_llm/cpp/cache/connector/memory/DiskBlockIO.cc new file mode 100644 index 0000000000..fe9a480c19 --- /dev/null +++ b/rtp_llm/cpp/cache/connector/memory/DiskBlockIO.cc @@ -0,0 +1,151 @@ +#include "rtp_llm/cpp/cache/connector/memory/DiskBlockIO.h" + +#include +#include +#include +#include +#include + +#include "rtp_llm/cpp/utils/Logger.h" + +namespace rtp_llm { +namespace { + +constexpr size_t kDirectIOAlignment = 4096; + +} // namespace + +PosixDiskBlockIO::~PosixDiskBlockIO() { + close(); +} + +bool PosixDiskBlockIO::openAndPreallocate(const std::string& file_path, size_t bytes, bool buffered_io) { + close(); + file_path_ = file_path; + bytes_ = bytes; + buffered_io_ = buffered_io; + + int flags = O_CREAT | O_EXCL | O_RDWR | O_CLOEXEC; + if (!buffered_io_) { +#ifdef O_DIRECT + flags |= O_DIRECT; +#else + RTP_LLM_LOG_ERROR("O_DIRECT is not supported on this platform"); + return false; +#endif + } + + fd_ = ::open(file_path.c_str(), flags, 0600); + if (fd_ < 0) { + RTP_LLM_LOG_ERROR("open disk kv file failed, file=%s, error=%s", file_path.c_str(), std::strerror(errno)); + return false; + } + + const int rc = ::posix_fallocate(fd_, 0, static_cast(bytes)); + if (rc != 0) { + RTP_LLM_LOG_ERROR("posix_fallocate disk kv file failed, file=%s, bytes=%zu, error=%s", + file_path.c_str(), + bytes, + std::strerror(rc)); + close(); + return false; + } + return true; +} + +bool PosixDiskBlockIO::checkDirectIOAlignment(uint64_t 
offset, const void* buffer, size_t bytes) const { + if (buffered_io_) { + return true; + } + const auto addr = reinterpret_cast(buffer); + if (offset % kDirectIOAlignment != 0 || addr % kDirectIOAlignment != 0 || bytes % kDirectIOAlignment != 0) { + RTP_LLM_LOG_ERROR("direct disk io alignment failed, file=%s, offset=%lu, addr=%p, bytes=%zu", + file_path_.c_str(), + offset, + buffer, + bytes); + return false; + } + return true; +} + +bool PosixDiskBlockIO::read(uint64_t offset, void* dst, size_t bytes) { + if (fd_ < 0 || dst == nullptr || offset + bytes > bytes_ || !checkDirectIOAlignment(offset, dst, bytes)) { + return false; + } + size_t done = 0; + while (done < bytes) { + const auto rc = ::pread(fd_, static_cast(dst) + done, bytes - done, static_cast(offset + done)); + if (rc < 0) { + if (errno == EINTR) { + continue; + } + RTP_LLM_LOG_ERROR("pread disk kv file failed, file=%s, offset=%lu, bytes=%zu, done=%zu, error=%s", + file_path_.c_str(), + offset, + bytes, + done, + std::strerror(errno)); + return false; + } + if (rc == 0) { + RTP_LLM_LOG_ERROR("pread disk kv file got EOF, file=%s, offset=%lu, bytes=%zu, done=%zu", + file_path_.c_str(), + offset, + bytes, + done); + return false; + } + done += static_cast(rc); + } + return true; +} + +bool PosixDiskBlockIO::write(uint64_t offset, const void* src, size_t bytes) { + if (fd_ < 0 || src == nullptr || offset + bytes > bytes_ || !checkDirectIOAlignment(offset, src, bytes)) { + return false; + } + size_t done = 0; + while (done < bytes) { + const auto rc = + ::pwrite(fd_, static_cast(src) + done, bytes - done, static_cast(offset + done)); + if (rc < 0) { + if (errno == EINTR) { + continue; + } + RTP_LLM_LOG_ERROR("pwrite disk kv file failed, file=%s, offset=%lu, bytes=%zu, done=%zu, error=%s", + file_path_.c_str(), + offset, + bytes, + done, + std::strerror(errno)); + return false; + } + if (rc == 0) { + RTP_LLM_LOG_ERROR("pwrite disk kv file made no progress, file=%s, offset=%lu, bytes=%zu, done=%zu", + 
file_path_.c_str(), + offset, + bytes, + done); + return false; + } + done += static_cast(rc); + } + return true; +} + +void PosixDiskBlockIO::close() { + if (fd_ >= 0) { + ::close(fd_); + fd_ = -1; + } +} + +std::string PosixDiskBlockIO::debugString() const { + std::ostringstream oss; + oss << "PosixDiskBlockIO{file=" << file_path_ << ", bytes=" << bytes_ + << ", io=" << (buffered_io_ ? "buffered" : "direct") << "}"; + return oss.str(); +} + +} // namespace rtp_llm diff --git a/rtp_llm/cpp/cache/connector/memory/DiskBlockIO.h b/rtp_llm/cpp/cache/connector/memory/DiskBlockIO.h new file mode 100644 index 0000000000..6c818eab6b --- /dev/null +++ b/rtp_llm/cpp/cache/connector/memory/DiskBlockIO.h @@ -0,0 +1,41 @@ +#pragma once + +#include +#include +#include + +namespace rtp_llm { + +class IDiskBlockIO { +public: + virtual ~IDiskBlockIO() = default; + + virtual bool openAndPreallocate(const std::string& file_path, size_t bytes, bool buffered_io) = 0; + virtual bool read(uint64_t offset, void* dst, size_t bytes) = 0; + virtual bool write(uint64_t offset, const void* src, size_t bytes) = 0; + virtual void close() = 0; + virtual std::string debugString() const = 0; +}; + +class PosixDiskBlockIO: public IDiskBlockIO { +public: + PosixDiskBlockIO() = default; + ~PosixDiskBlockIO() override; + + bool openAndPreallocate(const std::string& file_path, size_t bytes, bool buffered_io) override; + bool read(uint64_t offset, void* dst, size_t bytes) override; + bool write(uint64_t offset, const void* src, size_t bytes) override; + void close() override; + std::string debugString() const override; + +private: + bool checkDirectIOAlignment(uint64_t offset, const void* buffer, size_t bytes) const; + +private: + int fd_{-1}; + std::string file_path_; + size_t bytes_{0}; + bool buffered_io_{true}; +}; + +} // namespace rtp_llm diff --git a/rtp_llm/cpp/cache/connector/memory/DiskBlockPool.cc b/rtp_llm/cpp/cache/connector/memory/DiskBlockPool.cc new file mode 100644 index 
0000000000..917fac1193 --- /dev/null +++ b/rtp_llm/cpp/cache/connector/memory/DiskBlockPool.cc @@ -0,0 +1,344 @@ +#include "rtp_llm/cpp/cache/connector/memory/DiskBlockPool.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "rtp_llm/cpp/utils/Logger.h" +#include "rtp_llm/cpp/utils/StringUtil.h" + +namespace rtp_llm { +namespace { + +constexpr size_t kDiskIOAlignment = 4096; + +std::string joinPath(const std::string& parent, const std::string& child) { + if (parent.empty() || parent.back() == '/') { + return parent + child; + } + return parent + "/" + child; +} + +bool mkdirIfMissing(const std::string& path) { + if (::mkdir(path.c_str(), 0755) == 0) { + return true; + } + if (errno == EEXIST) { + struct stat st; + return ::stat(path.c_str(), &st) == 0 && S_ISDIR(st.st_mode); + } + return false; +} + +bool directoryExists(const std::string& path) { + struct stat st; + return ::stat(path.c_str(), &st) == 0 && S_ISDIR(st.st_mode); +} + +} // namespace + +DiskMountGuard::~DiskMountGuard() { + unlockAndClose(); +} + +bool DiskMountGuard::init(const std::string& mount_path) { + mount_path_ = mount_path; + work_dir_ = joinPath(mount_path_, "rtp_llm_disk_kv"); + lock_path_ = joinPath(work_dir_, ".lock"); + if (!initDirectoryAndLock() || !cleanupStaleFiles()) { + unlockAndClose(); + return false; + } + RTP_LLM_LOG_INFO("disk kv mount guard init success: %s", debugString().c_str()); + return true; +} + +const std::string& DiskMountGuard::workDir() const { + return work_dir_; +} + +const std::string& DiskMountGuard::mountPath() const { + return mount_path_; +} + +std::string DiskMountGuard::debugString() const { + std::ostringstream oss; + oss << "DiskMountGuard{mount=" << mount_path_ << ", work_dir=" << work_dir_ << ", lock=" << lock_path_ << "}"; + return oss.str(); +} + +bool DiskMountGuard::initDirectoryAndLock() { + if (!directoryExists(mount_path_)) { + RTP_LLM_LOG_ERROR("disk kv mount path does not exist or is not 
a directory, mount=%s, error=%s", + mount_path_.c_str(), + std::strerror(errno)); + return false; + } + + if (!mkdirIfMissing(work_dir_)) { + RTP_LLM_LOG_ERROR("create disk kv directory failed, mount=%s, work_dir=%s, error=%s", + mount_path_.c_str(), + work_dir_.c_str(), + std::strerror(errno)); + return false; + } + + lock_fd_ = ::open(lock_path_.c_str(), O_CREAT | O_RDWR | O_CLOEXEC, 0600); + if (lock_fd_ < 0) { + RTP_LLM_LOG_ERROR("open disk kv lock failed, lock=%s, error=%s", lock_path_.c_str(), std::strerror(errno)); + return false; + } + if (::flock(lock_fd_, LOCK_EX | LOCK_NB) != 0) { + RTP_LLM_LOG_ERROR("lock disk kv mount failed, lock=%s, error=%s", lock_path_.c_str(), std::strerror(errno)); + unlockAndClose(); + return false; + } + return true; +} + +bool DiskMountGuard::cleanupStaleFiles() { + DIR* dir = ::opendir(work_dir_.c_str()); + if (dir == nullptr) { + RTP_LLM_LOG_ERROR("open disk kv work dir failed, dir=%s, error=%s", work_dir_.c_str(), std::strerror(errno)); + return false; + } + while (auto* entry = ::readdir(dir)) { + const std::string name(entry->d_name); + if (name == "." || name == ".." 
|| name == ".lock") { + continue; + } + const bool framework_file = + (startsWith(name, "rank_") && (name.size() >= 3 && name.substr(name.size() - 3) == ".kv")) + || (name.size() >= 4 && name.substr(name.size() - 4) == ".tmp"); + if (!framework_file) { + continue; + } + const auto path = joinPath(work_dir_, name); + if (::unlink(path.c_str()) != 0 && errno != ENOENT) { + RTP_LLM_LOG_ERROR( + "remove stale disk kv file failed, file=%s, error=%s", path.c_str(), std::strerror(errno)); + ::closedir(dir); + return false; + } + } + ::closedir(dir); + return true; +} + +void DiskMountGuard::unlockAndClose() { + if (lock_fd_ >= 0) { + ::flock(lock_fd_, LOCK_UN); + ::close(lock_fd_); + lock_fd_ = -1; + } +} + +DiskBlockPool::DiskBlockPool(DiskBlockPoolConfig config, std::unique_ptr io): + config_(std::move(config)), io_(std::move(io)) { + if (!io_) { + io_ = std::make_unique(); + } +} + +DiskBlockPool::~DiskBlockPool() { + if (io_) { + io_->close(); + } +} + +size_t DiskBlockPool::alignUp(size_t value, size_t alignment) { + return ((value + alignment - 1) / alignment) * alignment; +} + +bool DiskBlockPool::init() { + if (config_.work_dir.empty() || config_.disk_size_bytes == 0 || config_.block_size_bytes == 0) { + RTP_LLM_LOG_ERROR("init disk block pool failed, invalid config: %s", debugString().c_str()); + return false; + } + slot_stride_bytes_ = alignUp(config_.block_size_bytes, kDiskIOAlignment); + slot_count_ = config_.disk_size_bytes / slot_stride_bytes_; + if (slot_count_ == 0) { + RTP_LLM_LOG_ERROR("init disk block pool failed, disk size too small, disk=%zu, block=%zu, stride=%zu", + config_.disk_size_bytes, + config_.block_size_bytes, + slot_stride_bytes_); + return false; + } + + if (!initFile()) { + if (!file_path_.empty()) { + ::unlink(file_path_.c_str()); + } + return false; + } + + { + std::lock_guard lock(mutex_); + slots_.assign(slot_count_, SlotState{}); + free_slots_.clear(); + for (size_t i = 0; i < slot_count_; ++i) { + free_slots_.insert(static_cast(i)); 
+ } + } + + RTP_LLM_LOG_INFO("disk kv block pool init success: %s", debugString().c_str()); + return true; +} + +bool DiskBlockPool::initFile() { + file_path_ = joinPath(config_.work_dir, + fmtstr("rank_%ld_world_%ld_%s.kv", + config_.local_rank, + config_.world_rank, + cacheBlockKindName(config_.pool_kind))); + return io_->openAndPreallocate(file_path_, slot_count_ * slot_stride_bytes_, config_.buffered_io); +} + +std::optional DiskBlockPool::malloc() { + std::lock_guard lock(mutex_); + if (free_slots_.empty()) { + return std::nullopt; + } + const auto slot = *free_slots_.begin(); + free_slots_.erase(free_slots_.begin()); + slots_[static_cast(slot)].request_ref++; + return slot; +} + +bool DiskBlockPool::validSlot(int32_t slot) const { + return slot >= 0 && static_cast(slot) < slot_count_; +} + +void DiskBlockPool::requestReference(int32_t slot) { + std::lock_guard lock(mutex_); + if (!validSlot(slot)) { + return; + } + auto& state = slots_[static_cast(slot)]; + state.request_ref++; + free_slots_.erase(slot); +} + +void DiskBlockPool::requestFree(int32_t slot) { + std::lock_guard lock(mutex_); + if (!validSlot(slot)) { + return; + } + auto& state = slots_[static_cast(slot)]; + if (state.request_ref > 0) { + state.request_ref--; + } + tryFreeSlotLocked(slot); +} + +void DiskBlockPool::blockCacheReference(int32_t slot) { + std::lock_guard lock(mutex_); + if (!validSlot(slot)) { + return; + } + auto& state = slots_[static_cast(slot)]; + state.cache_ref++; + free_slots_.erase(slot); +} + +void DiskBlockPool::blockCacheFree(int32_t slot) { + std::lock_guard lock(mutex_); + if (!validSlot(slot)) { + return; + } + auto& state = slots_[static_cast(slot)]; + if (state.cache_ref > 0) { + state.cache_ref--; + } + tryFreeSlotLocked(slot); +} + +void DiskBlockPool::tryFreeSlotLocked(int32_t slot) { + auto& state = slots_[static_cast(slot)]; + if (state.request_ref == 0 && state.cache_ref == 0) { + free_slots_.insert(slot); + } +} + +bool DiskBlockPool::read(int32_t slot, void* 
dst, size_t bytes) { + if (!validSlot(slot) || bytes > slot_stride_bytes_) { + return false; + } + const uint64_t offset = static_cast(slot) * slot_stride_bytes_; + if (!io_->read(offset, dst, bytes)) { + return false; + } + read_bytes_.fetch_add(bytes, std::memory_order_relaxed); + return true; +} + +bool DiskBlockPool::write(int32_t slot, const void* src, size_t bytes) { + if (!validSlot(slot) || bytes > slot_stride_bytes_) { + return false; + } + const uint64_t offset = static_cast(slot) * slot_stride_bytes_; + if (!io_->write(offset, src, bytes)) { + return false; + } + write_bytes_.fetch_add(bytes, std::memory_order_relaxed); + return true; +} + +size_t DiskBlockPool::totalSlots() const { + return slot_count_; +} + +size_t DiskBlockPool::freeSlots() const { + std::lock_guard lock(mutex_); + return free_slots_.size(); +} + +size_t DiskBlockPool::availableSlots() const { + std::lock_guard lock(mutex_); + size_t available = 0; + for (const auto& state : slots_) { + if (state.request_ref == 0) { + ++available; + } + } + return available; +} + +size_t DiskBlockPool::blockSizeBytes() const { + return config_.block_size_bytes; +} + +size_t DiskBlockPool::slotStrideBytes() const { + return slot_stride_bytes_; +} + +size_t DiskBlockPool::readBytes() const { + return read_bytes_.load(std::memory_order_relaxed); +} + +size_t DiskBlockPool::writeBytes() const { + return write_bytes_.load(std::memory_order_relaxed); +} + +const std::string& DiskBlockPool::filePath() const { + return file_path_; +} + +std::string DiskBlockPool::debugString() const { + std::ostringstream oss; + oss << "DiskBlockPool{work_dir=" << config_.work_dir << ", file=" << file_path_ + << ", local_rank=" << config_.local_rank << ", world_rank=" << config_.world_rank + << ", kind=" << cacheBlockKindName(config_.pool_kind) << ", disk_size=" << config_.disk_size_bytes + << ", block_size=" << config_.block_size_bytes + << ", stride=" << slot_stride_bytes_ << ", slots=" << slot_count_ + << ", io=" << 
(config_.buffered_io ? "buffered" : "direct") << "}"; + return oss.str(); +} + +} // namespace rtp_llm diff --git a/rtp_llm/cpp/cache/connector/memory/DiskBlockPool.h b/rtp_llm/cpp/cache/connector/memory/DiskBlockPool.h new file mode 100644 index 0000000000..b4c6e21654 --- /dev/null +++ b/rtp_llm/cpp/cache/connector/memory/DiskBlockPool.h @@ -0,0 +1,107 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "rtp_llm/cpp/cache/connector/memory/CacheBlockKind.h" +#include "rtp_llm/cpp/cache/connector/memory/DiskBlockIO.h" + +namespace rtp_llm { + +class DiskMountGuard { +public: + DiskMountGuard() = default; + ~DiskMountGuard(); + + bool init(const std::string& mount_path); + const std::string& workDir() const; + const std::string& mountPath() const; + std::string debugString() const; + +private: + bool initDirectoryAndLock(); + bool cleanupStaleFiles(); + void unlockAndClose(); + +private: + std::string mount_path_; + std::string work_dir_; + std::string lock_path_; + int lock_fd_{-1}; +}; + +struct DiskBlockPoolConfig { + std::string work_dir; + int64_t local_rank{0}; + int64_t world_rank{0}; + size_t disk_size_bytes{0}; + size_t block_size_bytes{0}; + bool buffered_io{true}; + CacheBlockKind pool_kind{CacheBlockKind::COMPLETE}; +}; + +class DiskBlockPool { +public: + explicit DiskBlockPool(DiskBlockPoolConfig config, std::unique_ptr io = nullptr); + ~DiskBlockPool(); + + bool init(); + + // Slot allocation is driven by the copy-plan owner, matching the existing + // memory connector metadata model. Follower ranks receive the slot id in + // the broadcast copy plan and use it as an externally assigned file offset; + // they do not independently allocate or evict disk slots. 
+ std::optional malloc(); + void requestReference(int32_t slot); + void requestFree(int32_t slot); + void blockCacheReference(int32_t slot); + void blockCacheFree(int32_t slot); + + bool read(int32_t slot, void* dst, size_t bytes); + bool write(int32_t slot, const void* src, size_t bytes); + + bool validSlot(int32_t slot) const; + size_t totalSlots() const; + size_t freeSlots() const; + size_t availableSlots() const; + size_t blockSizeBytes() const; + size_t slotStrideBytes() const; + size_t readBytes() const; + size_t writeBytes() const; + const std::string& filePath() const; + std::string debugString() const; + + static size_t alignUp(size_t value, size_t alignment); + +private: + struct SlotState { + uint32_t request_ref{0}; + uint32_t cache_ref{0}; + }; + + bool initFile(); + void tryFreeSlotLocked(int32_t slot); + +private: + DiskBlockPoolConfig config_; + std::unique_ptr io_; + std::string file_path_; + size_t slot_stride_bytes_{0}; + size_t slot_count_{0}; + mutable std::mutex mutex_; + std::set free_slots_; + std::vector slots_; + std::atomic read_bytes_{0}; + std::atomic write_bytes_{0}; +}; + +using DiskBlockPoolPtr = std::shared_ptr; + +} // namespace rtp_llm diff --git a/rtp_llm/cpp/cache/connector/memory/KVCacheMemoryConnector.cc b/rtp_llm/cpp/cache/connector/memory/KVCacheMemoryConnector.cc index 3f1b7109d2..99fe7aceae 100644 --- a/rtp_llm/cpp/cache/connector/memory/KVCacheMemoryConnector.cc +++ b/rtp_llm/cpp/cache/connector/memory/KVCacheMemoryConnector.cc @@ -1,21 +1,24 @@ #include "rtp_llm/cpp/cache/connector/memory/KVCacheMemoryConnector.h" +#include +#include +#include + #include "rtp_llm/cpp/cache/BlockPool.h" #include "rtp_llm/cpp/cache/BlockPoolConfigHelper.h" #include "rtp_llm/cpp/cache/connector/memory/MemoryAsyncContext.h" #include "rtp_llm/cpp/cache/connector/Meta.h" -#include "rtp_llm/cpp/cache/KVCacheAllocator.h" -#include "rtp_llm/models_py/bindings/core/ExecOps.h" +#include "rtp_llm/cpp/cache/allocator/KVCacheAllocator.h" #include 
"rtp_llm/models_py/bindings/NoBlockCopy.h" #include "rtp_llm/cpp/utils/Logger.h" #include "rtp_llm/cpp/metrics/RtpLLMMetrics.h" #include "rtp_llm/cpp/utils/ProfilingScope.h" +#include "rtp_llm/cpp/utils/StringUtil.h" +#include "rtp_llm/cpp/utils/TimeUtil.h" namespace rtp_llm { - // When set on MultiCopyParams, execNoBlockCopy may try CUDA split scatter/gather (SplitKvCacheCopy; not on PPU). -// Eligibility for the fast path is decided only by enable_memory_cache_sm_copy; splitKvMultiCopy falls back if layout -// mismatches. +// This legacy SM-copy path is only used for non typed layer-region layouts. static void applySplitKvMultiCopyFieldsIfEligible(bool enable_sm_copy, const CacheConfig& cfg, MultiCopyParams& out) { if (!enable_sm_copy) { return; @@ -25,17 +28,62 @@ static void applySplitKvMultiCopyFieldsIfEligible(bool enable_sm_copy, const Cac out.split_kv_scale_stride_bytes = cfg.kv_scale_stride_bytes; } +static void +appendBatchedMemoryCopyTile(void* dst, const void* src, size_t bytes, std::vector& tiles) { + if (bytes > 0) { + tiles.push_back(BatchedMemoryCopyTile{dst, src, bytes}); + } +} + +static void +appendStagedMemoryCopyTile(void* gpu, size_t host_offset, size_t bytes, std::vector& tiles) { + if (gpu != nullptr && bytes > 0) { + tiles.push_back(StagedMemoryCopyTile{gpu, host_offset, bytes}); + } +} + +static void appendStagedMemoryCopyHostSegment(void* host, + size_t host_offset, + size_t bytes, + std::vector& segments) { + if (host == nullptr || bytes == 0) { + return; + } + if (!segments.empty()) { + auto& prev = segments.back(); + if (static_cast(prev.host) + prev.bytes == host && prev.host_offset + prev.bytes == host_offset) { + prev.bytes += bytes; + return; + } + } + segments.push_back(StagedMemoryCopyHostSegment{host, host_offset, bytes}); +} + +static size_t alignUp(size_t value, size_t alignment) { + RTP_LLM_CHECK_WITH_INFO(alignment != 0, "alignment must not be zero"); + return ((value + alignment - 1) / alignment) * alignment; +} + 
KVCacheMemoryConnector::KVCacheMemoryConnector(const CacheConfig& cache_config, const KVCacheConfig& kv_cache_config, + const ParallelismConfig& parallelism_config, const std::shared_ptr& allocator, const std::vector& tp_addrs, const kmonitor::MetricsReporterPtr& metrics_reporter): cache_config_(cache_config), kv_cache_config_(kv_cache_config), + parallelism_config_(parallelism_config), allocator_(allocator), tp_addrs_(tp_addrs), metrics_reporter_(metrics_reporter) {} +KVCacheMemoryConnector::KVCacheMemoryConnector(const CacheConfig& cache_config, + const KVCacheConfig& kv_cache_config, + const std::shared_ptr& allocator, + const std::vector& tp_addrs, + const kmonitor::MetricsReporterPtr& metrics_reporter): + KVCacheMemoryConnector(cache_config, kv_cache_config, ParallelismConfig{}, allocator, tp_addrs, metrics_reporter) {} + KVCacheMemoryConnector::~KVCacheMemoryConnector() { RTP_LLM_LOG_INFO("KVCacheMemoryConnector destructor"); stop_.store(true); @@ -50,6 +98,17 @@ KVCacheMemoryConnector::~KVCacheMemoryConnector() { broadcast_manager_.reset(); block_pool_.reset(); block_cache_.reset(); + complete_pool_.reset(); + incomplete_pool_.reset(); + { + std::lock_guard lock(staged_copy_scratch_mutex_); + for (auto& [_, scratch] : staged_copy_scratch_by_device_) { + if (scratch) { + releaseStagedMemoryCopyScratch(*scratch); + } + } + staged_copy_scratch_by_device_.clear(); + } } bool KVCacheMemoryConnector::init() { @@ -61,7 +120,7 @@ bool KVCacheMemoryConnector::init() { checkLayerBlockStrideBytes(); initBlockPool(); - block_cache_ = std::make_shared(); + block_cache_ = std::make_shared(); broadcast_manager_ = std::make_shared(tp_addrs_); RTP_LLM_CHECK_WITH_INFO(broadcast_manager_->init(), "init failed, broadcast manager init failed"); @@ -76,16 +135,27 @@ bool KVCacheMemoryConnector::init() { } void KVCacheMemoryConnector::checkLayerBlockStrideBytes() const { - const size_t layer_num = cache_config_.layer_all_num; - const auto& layer_block_stride = 
cache_config_.layer_to_block_stride_bytes; - RTP_LLM_CHECK_WITH_INFO(layer_block_stride.size() == layer_num, - "layer block stride size must equal to layer num, got=%zu need=%zu", - layer_block_stride.size(), - layer_num); - for (size_t i = 0; i < layer_num; ++i) { - RTP_LLM_CHECK_WITH_INFO( - layer_block_stride[i] > 0, "invalid block stride bytes at layer=%zu: %d", i, layer_block_stride[i]); + const auto slots = layerTagSlots(); + RTP_LLM_CHECK_WITH_INFO(!slots.empty(), "layer-attn slots must not be empty"); + for (const auto& slot : slots) { + RTP_LLM_CHECK_WITH_INFO(slot.stride_bytes > 0, + "invalid block stride bytes at layer=%d tag=%s group=%d: %zu", + slot.layer_id, + slot.tag.c_str(), + slot.group_id, + slot.stride_bytes); + } +} + +bool KVCacheMemoryConnector::isDualPool() const { + return complete_pool_ != nullptr; +} + +bool KVCacheMemoryConnector::isFullOnlySlot(const LayerTagSlot& slot) const { + if (slot.group_id < 0 || slot.group_id >= cache_config_.groupNums()) { + return true; } + return cache_config_.typeForGroup(static_cast(slot.group_id)) == CacheGroupType::FULL; } void KVCacheMemoryConnector::initBlockPool() { @@ -94,15 +164,139 @@ void KVCacheMemoryConnector::initBlockPool() { "init block pool failed, memory size is invalid, memory size: %ld MB", memory_cache_size_mb); - const auto& layer_block_stride = cache_config_.layer_to_block_stride_bytes; + const auto slots = layerTagSlots(); + + size_t total_block_size = 0; + size_t full_only_block_size = 0; + for (const auto& slot : slots) { + total_block_size += slot.stride_bytes; + if (isFullOnlySlot(slot)) { + full_only_block_size += slot.stride_bytes; + } + } + RTP_LLM_CHECK_WITH_INFO(total_block_size > 0, "block size is invalid: %zu", total_block_size); - // block_size here means "one cache-key across all layers" total bytes (kv + scale). - // Use per-layer block strides so NULL_BLOCK_IDX layers still occupy space in merged layout. 
- size_t block_size = std::accumulate(layer_block_stride.begin(), layer_block_stride.end(), 0); - RTP_LLM_CHECK_WITH_INFO(block_size > 0, "block size is invalid: %zu", block_size); + const bool use_dual = + hasTypedLayerTagSlots(slots) && full_only_block_size > 0 && full_only_block_size < total_block_size; - block_pool_ = createBlockPool(block_size, memory_cache_size_mb); - RTP_LLM_CHECK_WITH_INFO(block_pool_ != nullptr, "init block pool failed, create block pool failed"); + if (!use_dual) { + block_pool_ = createBlockPool(total_block_size, memory_cache_size_mb); + RTP_LLM_CHECK_WITH_INFO(block_pool_ != nullptr, "init block pool failed, create block pool failed"); + return; + } + + complete_block_size_ = total_block_size; + incomplete_block_size_ = full_only_block_size; + + const int step = std::max(1, cache_config_.linear_step); + const size_t total_bytes = static_cast(memory_cache_size_mb) * 1024ULL * 1024ULL; + + size_t complete_block_num; + size_t incomplete_block_num; + if (step > 1) { + const size_t effective_block_bytes = + complete_block_size_ + incomplete_block_size_ * static_cast(step - 1); + RTP_LLM_CHECK_WITH_INFO(effective_block_bytes > 0, "effective block bytes is zero"); + complete_block_num = total_bytes / effective_block_bytes; + incomplete_block_num = complete_block_num * static_cast(step - 1); + } else { + complete_block_num = total_bytes / complete_block_size_; + incomplete_block_num = 0; + } + RTP_LLM_CHECK_WITH_INFO(complete_block_num > 0, + "pool_size_mb=%ld too small for complete_block_size=%zu", + memory_cache_size_mb, + complete_block_size_); + + RTP_LLM_LOG_INFO( + "dual pool init: complete_size=%zu complete_num=%zu incomplete_size=%zu incomplete_num=%zu step=%d", + complete_block_size_, + complete_block_num, + incomplete_block_size_, + incomplete_block_num, + step); + + auto make_pool = [](size_t block_size, size_t block_num) -> std::shared_ptr { + if (block_num == 0) { + return nullptr; + } + RTP_LLM_LOG_INFO("create memory block pool, 
block num: %zu, block size: %zu", block_num, block_size); + const auto pool_config = BlockPoolConfigHelper::createConfig( + /*layer_num=*/1, static_cast(block_num), static_cast(block_size), rtp_llm::TYPE_INT8); + auto pool = std::make_shared(pool_config, AllocationType::HOST); + RTP_LLM_CHECK_WITH_INFO(pool->init(), "memory block pool init failed, block size: %zu", block_size); + return pool; + }; + + complete_pool_ = make_pool(complete_block_size_, complete_block_num); + RTP_LLM_CHECK_WITH_INFO(complete_pool_ != nullptr, "init complete pool failed"); + if (incomplete_block_num > 0) { + incomplete_pool_ = make_pool(incomplete_block_size_, incomplete_block_num); + RTP_LLM_CHECK_WITH_INFO(incomplete_pool_ != nullptr, "init incomplete pool failed"); + } +} + +size_t KVCacheMemoryConnector::memoryCacheBlockSizeBytes() const { + const auto slots = layerTagSlots(); + size_t block_size = 0; + for (const auto& slot : slots) { + block_size += slot.stride_bytes; + } + return block_size; +} + +std::vector KVCacheMemoryConnector::layerTagSlots() const { + std::vector slots; + const size_t layer_num = cache_config_.layer_all_num; + + auto group_stride = [this](int gid, int layer_id) -> size_t { + if (gid >= 0 && gid < cache_config_.groupNums()) { + const size_t kv_stride = cache_config_.kvBlockStrideBytesForGroup(static_cast(gid)); + const size_t scale_stride = cache_config_.kvScaleStrideBytesForGroup(static_cast(gid)); + if (kv_stride + scale_stride > 0) { + return kv_stride + scale_stride; + } + } + if (layer_id >= 0 && static_cast(layer_id) < cache_config_.layer_to_block_stride_bytes.size()) { + return static_cast(cache_config_.layer_to_block_stride_bytes[static_cast(layer_id)]); + } + return cache_config_.kv_block_stride_bytes + cache_config_.kv_scale_stride_bytes; + }; + + const auto layer_group_ids = cache_config_.layerGroupIdsSnapshot(); + for (size_t layer = 0; layer < layer_num; ++layer) { + if (layer < layer_group_ids.size()) { + for (int gid : layer_group_ids[layer]) 
{ + if (gid < 0) { + continue; + } + const auto policy = cache_config_.policyForGroup(static_cast(gid)); + if (policy.reuse_policy == CacheReusePolicy::NON_REUSABLE) { + continue; + } + const std::string tag = gid < cache_config_.groupNums() ? + cache_config_.tagForGroup(static_cast(gid)) : + std::string("group_") + std::to_string(gid); + slots.push_back(LayerTagSlot{static_cast(layer), + tag, + gid, + group_stride(gid, static_cast(layer))}); + } + } + } + return slots; +} + +bool KVCacheMemoryConnector::hasTypedLayerTagSlots(const std::vector& slots) const { + if (slots.size() != cache_config_.layer_all_num) { + return true; + } + for (size_t i = 0; i < slots.size(); ++i) { + if (slots[i].layer_id != static_cast(i) || slots[i].tag != "default") { + return true; + } + } + return false; } std::shared_ptr KVCacheMemoryConnector::asyncMatch(const std::shared_ptr& resource, @@ -115,16 +309,19 @@ std::shared_ptr KVCacheMemoryConnector::asyncMatch(const std: } const auto& cache_keys = resource->cacheKeys(); - // do not match last block, whether it is aligned or not, otherwise may cause core dump in computing ops. + // Do not match the last key. It is either a real partial tail or a + // connector-level dummy tail used to preserve the same contract after CP + // Page-RR remap. const auto cache_keys_size = cache_keys.empty() ? 
0 : cache_keys.size() - 1; if (cache_keys_size == 0) { RTP_LLM_LOG_DEBUG("async match skip, cache keys is empty"); return nullptr; } - const auto& layer_block_ids = resource->layerBlocks(); - if (!checkLayerBlocks(layer_block_ids, cache_keys_size)) { - RTP_LLM_LOG_WARNING("async match failed, invalid layer_block_ids, cache_keys_size=%zu", cache_keys_size); + const auto slots = layerTagSlots(); + const auto layer_attn_block_ids = resourceLayerRegionBlocks(*resource, slots); + if (!checkLayerRegionBlocks(layer_attn_block_ids, slots, cache_keys_size)) { + RTP_LLM_LOG_WARNING("async match failed, invalid layer_attn_block_ids, cache_keys_size=%zu", cache_keys_size); return nullptr; } @@ -154,7 +351,8 @@ std::shared_ptr KVCacheMemoryConnector::asyncMatch(const std: if (isNullBlockIdx(match_result.matched_index)) { break; // only continuous prefix } - if (match_result.is_complete && gpuBlocksAllValid(layer_block_ids, i)) { + const bool gpu_blocks_all_valid = gpuBlocksAllValid(layer_attn_block_ids, slots, i); + if (match_result.is_complete && gpu_blocks_all_valid) { matched_num = i + 1; } } @@ -163,15 +361,29 @@ std::shared_ptr KVCacheMemoryConnector::asyncMatch(const std: RTP_LLM_LOG_DEBUG("not matched cache in memory, cache keys size: %zu, already_reuse_num: %zu", cache_keys_size, already_reuse_num); - reportMatchMetrics(/*success=*/false, timer.done_us(), cache_keys_size, matched_num); + reportMatchMetrics(/*success=*/true, timer.done_us(), cache_keys_size, matched_num); + return nullptr; + } + const int start_read_block_index = static_cast(already_reuse_num); + const int read_block_num = static_cast(matched_num - already_reuse_num); + auto copy_plan = + buildCopyPlanForRead(cache_keys, layer_attn_block_ids, slots, start_read_block_index, read_block_num); + if (!copy_plan || copy_plan->copy_infos.empty()) { + RTP_LLM_LOG_DEBUG( + "memory cache match dropped because read copy plan is empty, already_reuse=%zu matched=%zu cache_keys=%zu", + already_reuse_num, + 
matched_num, + cache_keys_size); + reportMatchMetrics(/*success=*/false, timer.done_us(), cache_keys_size, already_reuse_num); return nullptr; } - RTP_LLM_LOG_INFO("memory cache matched blocks: already_reuse=%zu matched=%zu cache_keys=%zu", - already_reuse_num, - matched_num, - cache_keys_size); + + RTP_LLM_LOG_DEBUG("memory cache matched blocks: already_reuse=%zu matched=%zu cache_keys=%zu", + already_reuse_num, + matched_num, + cache_keys_size); reportMatchMetrics(/*success=*/true, timer.done_us(), cache_keys_size, matched_num); - return std::make_shared(matched_num); + return std::make_shared(matched_num, start_read_block_index, read_block_num, copy_plan); } bool KVCacheMemoryConnector::gpuBlocksAllValid(const LayerBlockIds& layer_block_ids, size_t key_index) const { @@ -184,6 +396,31 @@ bool KVCacheMemoryConnector::gpuBlocksAllValid(const LayerBlockIds& layer_block_ return true; } +bool KVCacheMemoryConnector::gpuBlocksAllValid(const LayerAttnBlockIds& layer_attn_block_ids, + const std::vector& slots, + size_t key_index) const { + for (const auto& slot : slots) { + const auto layer = static_cast(slot.layer_id); + const auto attn = static_cast(slot.group_id); + if (layer >= layer_attn_block_ids.size() || attn >= layer_attn_block_ids[layer].size() + || layer_attn_block_ids[layer][attn] == nullptr) { + return false; + } + const auto& blocks = layer_attn_block_ids[layer][attn]->blocks(); + if (key_index >= blocks.size() || isNullBlockIdx(blocks[key_index])) { + return false; + } + } + return true; +} + +CacheGroupPolicy KVCacheMemoryConnector::groupPolicyForSlot(const LayerTagSlot& slot) const { + if (slot.group_id < 0 || slot.group_id >= cache_config_.groupNums()) { + return CacheGroupPolicy{}; + } + return cache_config_.policyForGroup(static_cast(slot.group_id)); +} + std::shared_ptr KVCacheMemoryConnector::asyncRead(const std::shared_ptr& resource, const std::shared_ptr& meta, const std::shared_ptr& match_context, @@ -200,8 +437,9 @@ std::shared_ptr 
KVCacheMemoryConnector::asyncRead(const std::share autil::ScopedTime2 timer; - const auto& layer_block_ids = resource->layerBlocks(); - if (!checkLayerBlocks(layer_block_ids, cache_keys_size)) { + const auto slots = layerTagSlots(); + const auto layer_attn_block_ids = resourceLayerRegionBlocks(*resource, slots); + if (!checkLayerRegionBlocks(layer_attn_block_ids, slots, cache_keys_size)) { reportReadMetrics(false, timer.done_us(), cache_keys_size, 0); return nullptr; } @@ -217,7 +455,26 @@ std::shared_ptr KVCacheMemoryConnector::asyncRead(const std::share return nullptr; } - auto copy_plan = buildCopyPlanForRead(cache_keys, layer_block_ids, start_read_block_index, read_block_num); + std::shared_ptr copy_plan; + auto memory_match_context = std::dynamic_pointer_cast(match_context); + if (memory_match_context && memory_match_context->readCopyPlan()) { + if (memory_match_context->startReadBlockIndex() == start_read_block_index + && memory_match_context->readBlockNum() == read_block_num) { + copy_plan = std::static_pointer_cast(memory_match_context->readCopyPlan()); + memory_match_context->clearReadCopyPlan(); + } else { + RTP_LLM_LOG_WARNING( + "async read ignored read copy plan because range mismatched, plan_start=%d plan_num=%d read_start=%d read_num=%d", + memory_match_context->startReadBlockIndex(), + memory_match_context->readBlockNum(), + start_read_block_index, + read_block_num); + } + } + if (!copy_plan) { + copy_plan = + buildCopyPlanForRead(cache_keys, layer_attn_block_ids, slots, start_read_block_index, read_block_num); + } if (!copy_plan || copy_plan->copy_infos.empty()) { reportReadMetrics(false, timer.done_us(), cache_keys_size, 0); return nullptr; @@ -229,11 +486,12 @@ std::shared_ptr KVCacheMemoryConnector::asyncRead(const std::share if (success) { resource->setMemoryReuseBlockNum(read_block_num); for (const auto& copy_info : copy_plan->copy_infos) { - const auto removed_item = block_cache_->removeIfMatch(copy_info.cache_key, copy_info.mem_block); + 
const auto removed_item = block_cache_->removeIfMatch( + copy_info.cache_key, CacheBackingType::MEMORY, copy_info.mem_block, /*disk_slot=*/-1); if (!removed_item.has_value()) { continue; } - freeBlocks({removed_item->block_index}, /*cache_free=*/true); + releaseCacheBacking(*removed_item); } RTP_LLM_LOG_INFO("memory cache read success: read_blocks=%d released_blocks=%zu total_blocks=%zu", read_block_num, @@ -254,31 +512,40 @@ std::shared_ptr KVCacheMemoryConnector::asyncRead(const std::share return context; } -std::shared_ptr KVCacheMemoryConnector::buildCopyPlanForRead( - const CacheKeysType& cache_keys, const LayerBlockIds& layer_block_ids, int start_index, int read_num) { +std::shared_ptr +KVCacheMemoryConnector::buildCopyPlanForRead(const CacheKeysType& cache_keys, + const LayerAttnBlockIds& layer_attn_block_ids, + const std::vector& slots, + int start_index, + int read_num) { std::vector copy_infos; - const auto layer_num = cache_config_.layer_all_num; - bool success = true; + bool success = true; for (int i = start_index; i < start_index + read_num; ++i) { const auto cache_key = cache_keys.at(i); - const auto match_result = block_cache_->match(static_cast(cache_key)); + const auto match_result = block_cache_->matchAndMarkInFlight(static_cast(cache_key)); if (isNullBlockIdx(match_result.matched_index)) { RTP_LLM_LOG_WARNING("build copy plan for read failed, cache key not found, cache key: %ld", cache_key); success = false; break; } // 每次都加引用的原因是为了确保match到的block不会被释放(避免在写时malloc如果cache满弹出该block) - referenceBlocks({match_result.matched_index}, /*cache_ref=*/false); + auto source_pool = memoryPoolFor(blockKindFromComplete(match_result.is_complete)); + if (!source_pool) { + RTP_LLM_LOG_WARNING("build copy plan for read failed, missing memory pool, cache key: %ld", cache_key); + success = false; + break; + } + referenceBlocksInPool(source_pool, {match_result.matched_index}, /*cache_ref=*/false); CopyInfoPerKey copy_info; copy_info.cache_key = cache_key; 
copy_info.mem_block = match_result.matched_index; - copy_info.gpu_blocks.reserve(layer_num); - for (size_t layer = 0; layer < layer_num; ++layer) { - // Do NOT skip NULL_BLOCK_IDX here. The merged memory block layout requires reserving - // per-layer stride even when this layer has no gpu block (-1). - copy_info.gpu_blocks.push_back(layer_block_ids.at(layer)->blocks().at(i)); + copy_info.gpu_blocks.reserve(slots.size()); + for (const auto& slot : slots) { + const auto layer = static_cast(slot.layer_id); + const auto attn = static_cast(slot.group_id); + copy_info.gpu_blocks.push_back(layer_attn_block_ids.at(layer).at(attn)->blocks().at(i)); } copy_info.is_complete = match_result.is_complete; copy_infos.emplace_back(std::move(copy_info)); @@ -315,13 +582,16 @@ std::shared_ptr KVCacheMemoryConnector::asyncWrite(const std::shar autil::ScopedTime2 timer; - const auto& layer_block_ids = resource->layerBlocks(); - if (!checkLayerBlocks(layer_block_ids, cache_keys_size)) { + const auto slots = layerTagSlots(); + const auto layer_attn_block_ids = resourceLayerRegionBlocks(*resource, slots); + if (!checkLayerRegionBlocks(layer_attn_block_ids, slots, cache_keys_size)) { + RTP_LLM_LOG_WARNING("async write failed, invalid layer_attn_block_ids, cache_keys_size=%zu resource_keys=%zu", + cache_keys_size, + cache_keys.size()); reportWriteMetrics(false, timer.done_us(), cache_keys_size, 0); return nullptr; } - // 计算内存中已存在的前缀长度 size_t mem_matched_num = 0; for (; mem_matched_num < cache_keys_size; ++mem_matched_num) { if (!block_cache_->contains(static_cast(cache_keys[mem_matched_num]))) { @@ -339,28 +609,32 @@ std::shared_ptr KVCacheMemoryConnector::asyncWrite(const std::shar bool no_need_write = false; auto copy_plan = buildCopyPlanForWrite( - cache_keys, layer_block_ids, mem_matched_num, cache_keys_size - mem_matched_num, no_need_write); + cache_keys, layer_attn_block_ids, slots, mem_matched_num, cache_keys_size - mem_matched_num, no_need_write); if (!copy_plan || 
copy_plan->copy_infos.empty()) { + RTP_LLM_LOG_DEBUG( + "async write skip, no copy plan, cache_keys=%zu write_start=%zu write_num=%zu no_need_write=%d", + cache_keys_size, + mem_matched_num, + cache_keys_size - mem_matched_num, + no_need_write); reportWriteMetrics(no_need_write, timer.done_us(), static_cast(cache_keys_size), 0); return nullptr; } auto write_done = [copy_plan, resource_copy = resource, timer, total_block_num = cache_keys_size, this](bool success) mutable { - RTP_LLM_LOG_DEBUG("async write done, success: %d", success); + RTP_LLM_LOG_DEBUG("memory cache write done: success=%d write_blocks=%zu total_blocks=%zu", + success, + copy_plan ? copy_plan->copy_infos.size() : 0, + total_block_num); if (success) { - for (const auto& copy_info : copy_plan->copy_infos) { - MemoryBlockCache::CacheItem item; - item.cache_key = copy_info.cache_key; - item.block_index = copy_info.mem_block; - item.is_resident = false; - item.is_complete = copy_info.is_complete; - putToCache(item); + for (auto& copy_info : copy_plan->copy_infos) { + putToCache(copy_info); } - // reset resource to decrease block ref count in destructor - resource_copy.reset(); } + // reset resource to decrease block ref count in destructor + resource_copy.reset(); const int64_t write_block_num = success ? 
static_cast(copy_plan->copy_infos.size()) : 0; // reset copy plan to release memory block refs copy_plan.reset(); @@ -377,12 +651,12 @@ std::shared_ptr KVCacheMemoryConnector::asyncWrite(const std::shar } std::shared_ptr -KVCacheMemoryConnector::buildCopyPlanForWrite(const CacheKeysType& cache_keys, - const LayerBlockIds& layer_block_ids, - int start_index, - int write_num, - bool& no_need_write) { - const auto layer_num = cache_config_.layer_all_num; +KVCacheMemoryConnector::buildCopyPlanForWrite(const CacheKeysType& cache_keys, + const LayerAttnBlockIds& layer_attn_block_ids, + const std::vector& slots, + int start_index, + int write_num, + bool& no_need_write) { std::vector copy_infos; copy_infos.reserve(write_num); @@ -395,11 +669,13 @@ KVCacheMemoryConnector::buildCopyPlanForWrite(const CacheKeysType& cache_keys, for (int i = start_index; i < start_index + write_num; ++i) { const auto cache_key = cache_keys.at(i); std::vector gpu_blocks; - gpu_blocks.reserve(layer_num); + gpu_blocks.reserve(slots.size()); size_t null_block_num = 0; - for (size_t layer = 0; layer < layer_num; ++layer) { - const int gpu_block_idx = layer_block_ids.at(layer)->blocks().at(i); - // Do NOT skip NULL_BLOCK_IDX here. We must keep per-layer stride slots in the merged big block. + for (const auto& slot : slots) { + const auto layer = static_cast(slot.layer_id); + const auto attn = static_cast(slot.group_id); + const int gpu_block_idx = layer_attn_block_ids.at(layer).at(attn)->blocks().at(i); + // Do NOT skip NULL_BLOCK_IDX here. We must keep per-layer+attn stride slots in the merged big block. 
if (isNullBlockIdx(gpu_block_idx)) { ++null_block_num; } @@ -422,6 +698,8 @@ KVCacheMemoryConnector::buildCopyPlanForWrite(const CacheKeysType& cache_keys, // ensure the final written key is complete no_need_write = last_complete_index < start_index; if (no_need_write) { + RTP_LLM_LOG_DEBUG( + "build copy plan for write found no complete key, start=%d write_num=%d", start_index, write_num); return nullptr; } @@ -429,15 +707,24 @@ KVCacheMemoryConnector::buildCopyPlanForWrite(const CacheKeysType& cache_keys, const size_t keep_cnt = static_cast(last_complete_index - start_index + 1); copy_infos.resize(keep_cnt); - std::vector mem_blocks; - if (!mallocBlocks(copy_infos.size(), mem_blocks)) { - RTP_LLM_LOG_WARNING("build copy plan for write failed, malloc blocks failed, need blocks: %zu", + if (isDualPool() && !incomplete_pool_) { + const auto before = copy_infos.size(); + copy_infos.erase( + std::remove_if(copy_infos.begin(), copy_infos.end(), [](const auto& ci) { return !ci.is_complete; }), + copy_infos.end()); + if (copy_infos.size() != before) { + RTP_LLM_LOG_DEBUG("build copy plan for write skip incomplete blocks because incomplete pool is disabled, " + "before=%zu after=%zu", + before, + copy_infos.size()); + } + } + + if (!allocateBackingsForWrite(copy_infos)) { + RTP_LLM_LOG_WARNING("build copy plan for write failed, allocate backing failed, need blocks: %zu", copy_infos.size()); return nullptr; } - for (size_t i = 0; i < copy_infos.size(); ++i) { - copy_infos[i].mem_block = mem_blocks[i]; - } // free blocks in destructor auto plan = createCopyPlan(copy_infos, CopyDirection::D2H); @@ -450,12 +737,13 @@ KVCacheMemoryConnector::createCopyPlan(const std::vector& copy_i plan->copy_infos = copy_infos; plan->direction = direction; auto deleter = [this](CopyPlan* plan) { - std::vector blocks; - blocks.reserve(plan->copy_infos.size()); for (const auto& copy_info : plan->copy_infos) { - blocks.push_back(copy_info.mem_block); + releaseRequestBacking(copy_info); + if 
(plan->direction == CopyDirection::H2D) { + block_cache_->releaseInFlight( + copy_info.cache_key, CacheBackingType::MEMORY, copy_info.mem_block, /*disk_slot=*/-1); + } } - freeBlocks(blocks, /*cache_free=*/false); delete plan; }; return std::shared_ptr(plan, deleter); @@ -466,9 +754,11 @@ bool KVCacheMemoryConnector::startCopyAsync(const std::shared_ptrpushTask([this, context, copy_plan]() mutable { - auto send_result = sendCopyPlan(copy_plan); + auto task_copy_plan = copy_plan; + auto code = wait_done_thread_pool_->pushTask([this, context, task_copy_plan]() mutable { + auto send_result = sendCopyPlan(task_copy_plan); context->setBroadcastResult(send_result); + task_copy_plan.reset(); context->waitDone(); }); if (code != autil::ThreadPoolBase::ERROR_NONE) { @@ -486,11 +776,18 @@ KVCacheMemoryConnector::sendCopyPlan(const std::shared_ptr& copy_plan) for (const auto& copy_info : copy_plan->copy_infos) { auto* item = mem_req.add_copy_items(); item->set_mem_block(copy_info.mem_block); + item->set_is_complete(copy_info.is_complete); + item->set_backing_type(MemoryOperationRequestPB::MEMORY); for (const auto& block : copy_info.gpu_blocks) { item->add_gpu_blocks(block); } } + return sendMemoryRequest(mem_req, copyPlanTimeoutMs(copy_plan)); +} + +std::shared_ptr> +KVCacheMemoryConnector::sendMemoryRequest(const MemoryOperationRequestPB& mem_req, int64_t timeout_ms) const { std::vector requests; requests.reserve(broadcast_manager_->workerNum()); for (size_t i = 0; i < broadcast_manager_->workerNum(); ++i) { @@ -498,15 +795,13 @@ KVCacheMemoryConnector::sendCopyPlan(const std::shared_ptr& copy_plan) req.mutable_mem_request()->CopyFrom(mem_req); requests.emplace_back(std::move(req)); } - auto rpc_call = [](const std::shared_ptr& stub, const std::shared_ptr& context, const FunctionRequestPB& request, grpc::CompletionQueue* completion_queue) { return stub->AsyncExecuteFunction(context.get(), request, completion_queue); }; - return broadcast_manager_->broadcast( - requests, 
kv_cache_config_.memory_cache_sync_timeout_ms, rpc_call); + return broadcast_manager_->broadcast(requests, timeout_ms, rpc_call); } void KVCacheMemoryConnector::printCopyPlan(const std::shared_ptr& copy_plan) const { @@ -515,7 +810,8 @@ void KVCacheMemoryConnector::printCopyPlan(const std::shared_ptr& copy << ", copy infos size: " << copy_plan->copy_infos.size() << "\n"; for (int i = 0; i < copy_plan->copy_infos.size(); ++i) { const auto& copy_info = copy_plan->copy_infos.at(i); - oss << "copy info " << i << ": cache key: " << copy_info.cache_key << ", mem block: " << copy_info.mem_block + oss << "copy info " << i << ": cache key: " << copy_info.cache_key + << ", mem block: " << copy_info.mem_block << ", gpu layer blocks: ["; for (const auto& gpu_block : copy_info.gpu_blocks) { oss << gpu_block << ", "; @@ -528,45 +824,329 @@ void KVCacheMemoryConnector::printCopyPlan(const std::shared_ptr& copy bool KVCacheMemoryConnector::copyCache(const MemoryOperationRequestPB& request, MemoryOperationResponsePB& response) { RTP_LLM_PROFILE_FUNCTION(); autil::ScopedTime2 timer; - const auto copy_direction = - (request.copy_direction() == MemoryOperationRequestPB::H2D) ? 
CopyDirection::H2D : CopyDirection::D2H; + CopyDirection copy_direction = CopyDirection::D2H; + if (request.copy_direction() == MemoryOperationRequestPB::H2D) { + copy_direction = CopyDirection::H2D; + } + const auto slots = layerTagSlots(); + const bool has_typed_slots = hasTypedLayerTagSlots(slots); + + if (request.copy_items_size() == 0) { + RTP_LLM_LOG_WARNING("copy cache failed, copy_items is empty"); + response.set_success(false); + reportCopyMetrics(false, timer.done_us(), copy_direction); + return false; + } + + for (int i = 0; i < request.copy_items_size(); ++i) { + const auto& item = request.copy_items(i); + if (!validateCopyItemBacking(item)) { + response.set_success(false); + reportCopyMetrics(false, timer.done_us(), copy_direction); + return false; + } + } + + if (tryCopyCacheWithStagedMemoryCopy(request, copy_direction, slots)) { + response.set_success(true); + reportCopyMetrics(true, timer.done_us(), copy_direction); + return true; + } + if (cache_config_.use_typed_cache_regions && cache_config_.use_opaque_kv_cache_store) { + RTP_LLM_LOG_WARNING("typed opaque memory copy failed for typed layout"); + response.set_success(false); + reportCopyMetrics(false, timer.done_us(), copy_direction); + return false; + } + if (has_typed_slots && tryCopyCacheWithBatchedMemoryCopy(request, copy_direction, slots)) { + response.set_success(true); + reportCopyMetrics(true, timer.done_us(), copy_direction); + return true; + } + + if (!copyMemoryItemsGeneric(request, copy_direction, slots)) { + response.set_success(false); + reportCopyMetrics(false, timer.done_us(), copy_direction); + return false; + } + + response.set_success(true); + reportCopyMetrics(true, timer.done_us(), copy_direction); + return true; +} + +bool KVCacheMemoryConnector::validateCopyItemBacking(const MemoryOperationRequestPB::CopyItem& item) const { + if (item.backing_type() != MemoryOperationRequestPB::MEMORY) { + RTP_LLM_LOG_WARNING("copy item has unsupported backing_type=%d", 
static_cast(item.backing_type())); + return false; + } + return true; +} +bool KVCacheMemoryConnector::copyMemoryItemsGeneric(const MemoryOperationRequestPB& request, + CopyDirection direction, + const std::vector& slots) { std::vector dst_buffers; std::vector src_buffers; for (int i = 0; i < request.copy_items_size(); ++i) { - const auto& item = request.copy_items(i); + const auto& item = request.copy_items(i); + if (item.backing_type() != MemoryOperationRequestPB::MEMORY) { + continue; + } const auto mem_block = static_cast(item.mem_block()); const std::vector gpu_blocks(item.gpu_blocks().begin(), item.gpu_blocks().end()); - if (!prepareCopyBuffers(mem_block, gpu_blocks, copy_direction, dst_buffers, src_buffers)) { - RTP_LLM_LOG_WARNING("copy cache failed, prepare copy buffers failed, mem_block=%d, direction=%s", + if (!prepareCopyBuffers(mem_block, gpu_blocks, direction, item.is_complete(), dst_buffers, src_buffers)) { + RTP_LLM_LOG_WARNING("copy cache failed, prepare memory copy buffers failed, mem_block=%d, direction=%s", mem_block, - copy_direction == CopyDirection::H2D ? "H2D" : "D2H"); - response.set_success(false); - reportCopyMetrics(false, timer.done_us(), copy_direction); + direction == CopyDirection::H2D ? 
"H2D" : "D2H"); return false; } } if (!dst_buffers.empty()) { MultiCopyParams mc{dst_buffers, src_buffers}; - applySplitKvMultiCopyFieldsIfEligible(kv_cache_config_.enable_memory_cache_sm_copy, cache_config_, mc); + const bool can_use_split_kv_copy = !hasTypedLayerTagSlots(slots); + applySplitKvMultiCopyFieldsIfEligible( + kv_cache_config_.enable_memory_cache_sm_copy && can_use_split_kv_copy, cache_config_, mc); execNoBlockCopy(mc); } + return true; +} - response.set_success(true); - reportCopyMetrics(true, timer.done_us(), copy_direction); +bool KVCacheMemoryConnector::tryCopyCacheWithStagedMemoryCopy(const MemoryOperationRequestPB& request, + CopyDirection direction, + const std::vector& slots) { + RTP_LLM_PROFILE_SCOPE("reuse_cache.memory.copy.plan_staged"); + if (!isDualPool() && block_pool_ == nullptr) { + return false; + } + if (isDualPool() && !complete_pool_) { + return false; + } + if (allocator_ == nullptr) { + return false; + } + + StagedMemoryCopyParams params; + params.direction = + direction == CopyDirection::H2D ? StagedMemoryCopyDirection::H2D : StagedMemoryCopyDirection::D2H; + size_t logical_rows = 0; + size_t payload_bytes = 0; + + for (int i = 0; i < request.copy_items_size(); ++i) { + const auto& item = request.copy_items(i); + const auto mem_block = static_cast(item.mem_block()); + const std::vector gpu_blocks(item.gpu_blocks().begin(), item.gpu_blocks().end()); + const bool item_is_complete = item.is_complete(); + + if (isNullBlockIdx(mem_block) || gpu_blocks.size() != slots.size()) { + return false; + } + + auto& pool_ref = isDualPool() ? (item_is_complete ? 
complete_pool_ : incomplete_pool_) : block_pool_; + if (!pool_ref) { + return false; + } + auto mem_buffers = pool_ref->convertIndexToBuffer(/*layer_id=*/0, mem_block); + if (mem_buffers.size() != 1u || mem_buffers[0].addr == nullptr || mem_buffers[0].size_bytes == 0 + || mem_buffers[0].is_cuda) { + return false; + } + const auto& mem_buffer = mem_buffers[0]; + auto* mem_addr = static_cast(mem_buffer.addr); + + size_t byte_off = 0; + for (size_t slot_idx = 0; slot_idx < slots.size(); ++slot_idx) { + const auto& slot = slots[slot_idx]; + const auto gpu_block = gpu_blocks.at(slot_idx); + const auto layer_stride = slot.stride_bytes; + + if (!item_is_complete && !isFullOnlySlot(slot)) { + continue; + } + + if (isNullBlockIdx(gpu_block)) { + byte_off += layer_stride; + continue; + } + + const auto gpu_buffers = allocator_->convertIndexToBuffer(slot.layer_id, slot.group_id, gpu_block); + size_t within_layer_off = 0; + for (const auto& gpu_buffer : gpu_buffers) { + if (gpu_buffer.addr == nullptr || gpu_buffer.size_bytes == 0) { + within_layer_off += gpu_buffer.size_bytes; + continue; + } + if (within_layer_off + gpu_buffer.size_bytes > layer_stride + || byte_off + within_layer_off + gpu_buffer.size_bytes > mem_buffer.size_bytes) { + return false; + } + auto* host_addr = mem_addr + byte_off + within_layer_off; + if (!gpu_buffer.is_cuda) { + return false; + } + if (params.device_index < 0) { + params.device_index = gpu_buffer.device_index; + } else if (params.device_index != gpu_buffer.device_index) { + return false; + } + + // The SM copy kernels vectorize with int4/int2. Keep every staged tile aligned so compact + // staging does not trade fewer memcpy calls for misaligned vector accesses. 
+ constexpr size_t kStagedTileAlignment = 16; + const size_t staging_offset = alignUp(params.host_bytes, kStagedTileAlignment); + params.host_bytes = staging_offset; + appendStagedMemoryCopyHostSegment( + host_addr, staging_offset, gpu_buffer.size_bytes, params.host_segments); + appendStagedMemoryCopyTile(gpu_buffer.addr, staging_offset, gpu_buffer.size_bytes, params.tiles); + params.host_bytes += gpu_buffer.size_bytes; + ++logical_rows; + payload_bytes += gpu_buffer.size_bytes; + within_layer_off += gpu_buffer.size_bytes; + } + byte_off += layer_stride; + } + } + + if (params.tiles.empty()) { + return true; + } + + RTP_LLM_LOG_DEBUG("cuda staged memory copy, direction=%s, rows=%zu, tiles=%zu, bytes=%zu, span=%zu, device=%d", + direction == CopyDirection::H2D ? "H2D" : "D2H", + logical_rows, + params.tiles.size(), + payload_bytes, + params.host_bytes, + params.device_index); + RTP_LLM_PROFILE_SCOPE("reuse_cache.memory.copy.exec_staged"); + std::lock_guard scratch_lock(staged_copy_scratch_mutex_); + if (!execStagedMemoryCopy(params, &stagedCopyScratchForDevice(params.device_index))) { + return false; + } return true; } +StagedMemoryCopyScratch& KVCacheMemoryConnector::stagedCopyScratchForDevice(int device_index) { + auto& scratch = staged_copy_scratch_by_device_[device_index]; + if (!scratch) { + scratch = std::make_unique(); + } + return *scratch; +} + +bool KVCacheMemoryConnector::tryCopyCacheWithBatchedMemoryCopy(const MemoryOperationRequestPB& request, + CopyDirection direction, + const std::vector& slots) { + RTP_LLM_PROFILE_SCOPE("reuse_cache.memory.copy.plan_batch"); + if (!isDualPool() && block_pool_ == nullptr) { + return false; + } + if (isDualPool() && !complete_pool_) { + return false; + } + if (allocator_ == nullptr) { + return false; + } + + BatchedMemoryCopyParams params; + size_t logical_rows = 0; + size_t payload_bytes = 0; + + for (int i = 0; i < request.copy_items_size(); ++i) { + const auto& item = request.copy_items(i); + const auto mem_block = 
static_cast(item.mem_block()); + const std::vector gpu_blocks(item.gpu_blocks().begin(), item.gpu_blocks().end()); + const bool item_is_complete = item.is_complete(); + + if (isNullBlockIdx(mem_block) || gpu_blocks.size() != slots.size()) { + return false; + } + + auto& pool_ref = isDualPool() ? (item_is_complete ? complete_pool_ : incomplete_pool_) : block_pool_; + if (!pool_ref) { + return false; + } + auto mem_buffers = pool_ref->convertIndexToBuffer(/*layer_id=*/0, mem_block); + if (mem_buffers.size() != 1u || mem_buffers[0].addr == nullptr || mem_buffers[0].size_bytes == 0) { + return false; + } + const auto& mem_buffer = mem_buffers[0]; + + size_t byte_off = 0; + for (size_t slot_idx = 0; slot_idx < slots.size(); ++slot_idx) { + const auto& slot = slots[slot_idx]; + const auto gpu_block = gpu_blocks.at(slot_idx); + const auto layer_stride = slot.stride_bytes; + + if (!item_is_complete && !isFullOnlySlot(slot)) { + continue; + } + + if (isNullBlockIdx(gpu_block)) { + byte_off += layer_stride; + continue; + } + + const auto gpu_buffers = allocator_->convertIndexToBuffer(slot.layer_id, slot.group_id, gpu_block); + size_t within_layer_off = 0; + for (const auto& gpu_buffer : gpu_buffers) { + if (gpu_buffer.addr == nullptr || gpu_buffer.size_bytes == 0) { + within_layer_off += gpu_buffer.size_bytes; + continue; + } + if (!gpu_buffer.is_cuda) { + return false; + } + if (within_layer_off + gpu_buffer.size_bytes > layer_stride + || byte_off + within_layer_off + gpu_buffer.size_bytes > mem_buffer.size_bytes) { + return false; + } + if (params.device_index < 0) { + params.device_index = gpu_buffer.device_index; + } else if (params.device_index != gpu_buffer.device_index) { + return false; + } + + auto* mem_addr = static_cast(static_cast(mem_buffer.addr) + byte_off + within_layer_off); + if (direction == CopyDirection::H2D) { + appendBatchedMemoryCopyTile(gpu_buffer.addr, mem_addr, gpu_buffer.size_bytes, params.tiles); + } else { + appendBatchedMemoryCopyTile(mem_addr, 
gpu_buffer.addr, gpu_buffer.size_bytes, params.tiles); + } + ++logical_rows; + payload_bytes += gpu_buffer.size_bytes; + within_layer_off += gpu_buffer.size_bytes; + } + byte_off += layer_stride; + } + } + + if (params.tiles.empty()) { + return true; + } + + RTP_LLM_LOG_DEBUG("cuda memcpy batch, direction=%s, rows=%zu, tiles=%zu, bytes=%zu, device=%d", + direction == CopyDirection::H2D ? "H2D" : "D2H", + logical_rows, + params.tiles.size(), + payload_bytes, + params.device_index); + RTP_LLM_PROFILE_SCOPE("reuse_cache.memory.copy.exec_batch"); + return execBatchedMemoryCopy(params); +} + bool KVCacheMemoryConnector::prepareCopyBuffers(BlockIdxType mem_block, const std::vector& gpu_blocks, CopyDirection direction, + bool is_complete, std::vector& dst, std::vector& src) { RTP_LLM_CHECK_WITH_INFO(mem_block != NULL_BLOCK_IDX, "mem block is null"); - RTP_LLM_CHECK_WITH_INFO(block_pool_ != nullptr, "block pool is null"); - auto mem_buffers = block_pool_->convertIndexToBuffer(/*layer_id=*/0, mem_block); + auto& pool_ref = isDualPool() ? (is_complete ? complete_pool_ : incomplete_pool_) : block_pool_; + RTP_LLM_CHECK_WITH_INFO(pool_ref != nullptr, "block pool is null"); + auto mem_buffers = pool_ref->convertIndexToBuffer(/*layer_id=*/0, mem_block); if (mem_buffers.empty()) { RTP_LLM_LOG_WARNING("prepare copy buffers failed, mem buffers are empty, block=%d, direction=%s", mem_block, @@ -583,29 +1163,36 @@ bool KVCacheMemoryConnector::prepareCopyBuffers(BlockIdxType mem_block, direction == CopyDirection::H2D ? 
"H2D" : "D2H"); - const size_t layer_num = cache_config_.layer_all_num; - RTP_LLM_CHECK_WITH_INFO(gpu_blocks.size() == layer_num, - "gpu_blocks must contain all layers, got=%zu need=%zu", + const auto slots = layerTagSlots(); + RTP_LLM_CHECK_WITH_INFO(gpu_blocks.size() == slots.size(), + "gpu_blocks must contain all layer-attn slots, got=%zu need=%zu", gpu_blocks.size(), - layer_num); + slots.size()); size_t byte_off = 0; - for (int layer = 0; layer < layer_num; ++layer) { - const auto gpu_block = gpu_blocks.at(layer); - const auto layer_stride = cache_config_.layer_to_block_stride_bytes[layer]; + for (size_t slot_idx = 0; slot_idx < slots.size(); ++slot_idx) { + const auto& slot = slots[slot_idx]; + const auto gpu_block = gpu_blocks.at(slot_idx); + const auto layer_stride = slot.stride_bytes; + + if (!is_complete && !isFullOnlySlot(slot)) { + continue; + } if (isNullBlockIdx(gpu_block)) { byte_off += layer_stride; continue; } - const auto gpu_buffers = allocator_->convertIndexToBuffer(layer, gpu_block); + const auto gpu_buffers = allocator_->convertIndexToBuffer(slot.layer_id, slot.group_id, gpu_block); size_t within_layer_off = 0; for (const auto& gpu_buffer : gpu_buffers) { if (within_layer_off + gpu_buffer.size_bytes > layer_stride) { RTP_LLM_LOG_WARNING("prepare copy buffers failed, gpu buffer overflow: " - "layer=%zu byte_off=%zu within_layer_off=%zu gpu_buffer_size=%zu", - layer, + "layer=%d tag=%s group=%d byte_off=%zu within_layer_off=%zu gpu_buffer_size=%zu", + slot.layer_id, + slot.tag.c_str(), + slot.group_id, byte_off, within_layer_off, gpu_buffer.size_bytes); @@ -687,33 +1274,45 @@ bool KVCacheMemoryConnector::checkLayerBlocks(const LayerBlockIds& layer_block_i return true; } -bool KVCacheMemoryConnector::mallocBlocks(size_t need_blocks, std::vector& malloced_blocks) { - RTP_LLM_PROFILE_FUNCTION(); - if (need_blocks == 0) { - RTP_LLM_LOG_WARNING("malloc memory blocks failed, need blocks cannot be 0"); - return false; +LayerAttnBlockIds 
KVCacheMemoryConnector::resourceLayerRegionBlocks(const KVCacheResource& resource, + const std::vector& slots) const { + if (!resource.layerGroupBlocks().empty()) { + return resource.layerGroupBlocks(); } - // make sure `eusure + malloc` is atomic - std::unique_lock lock(malloc_mutex_); + return {}; +} - if (!ensureEnoughFreeBlocks(need_blocks)) { - RTP_LLM_LOG_WARNING( - "malloc memory blocks failed, ensure enough free blocks failed, need blocks: %zu, free blocks: %zu", - need_blocks, - block_pool_->freeBlocksNum()); +bool KVCacheMemoryConnector::checkLayerRegionBlocks(const LayerAttnBlockIds& layer_attn_block_ids, + const std::vector& slots, + size_t required_len) const { + if (layer_attn_block_ids.empty()) { + RTP_LLM_LOG_WARNING("check layer-attn blocks failed, layer_attn_block_ids is empty (required_len=%zu)", + required_len); return false; } - - auto blocks = block_pool_->malloc(need_blocks); - if (blocks.size() != need_blocks) { - RTP_LLM_LOG_WARNING("malloc memory blocks failed, malloc failed, need blocks: %zu, allocated blocks: %zu", - need_blocks, - blocks.size()); - freeBlocks(blocks, /*cache_free=*/false); - return false; + for (const auto& slot : slots) { + const auto layer = static_cast(slot.layer_id); + const auto attn = static_cast(slot.group_id); + if (layer >= layer_attn_block_ids.size() || attn >= layer_attn_block_ids[layer].size() + || layer_attn_block_ids[layer][attn] == nullptr) { + RTP_LLM_LOG_WARNING("check layer-group blocks failed, missing slot layer=%d tag=%s group=%d", + slot.layer_id, + slot.tag.c_str(), + slot.group_id); + return false; + } + if (layer_attn_block_ids[layer][attn]->blocksNum() < required_len) { + RTP_LLM_LOG_WARNING( + "check layer-group blocks failed, blocksNum is less than required_len, layer=%d tag=%s group=%d blocksNum=%zu required_len=%zu", + slot.layer_id, + slot.tag.c_str(), + slot.group_id, + layer_attn_block_ids[layer][attn]->blocksNum(), + required_len); + return false; + } } - malloced_blocks = 
std::move(blocks); return true; } @@ -751,6 +1350,86 @@ void KVCacheMemoryConnector::referenceBlocks(const std::vector& bl } } +bool KVCacheMemoryConnector::allocateBackingsForWrite(std::vector& copy_infos) { + std::unique_lock lock(malloc_mutex_); + std::vector allocated_indices; + allocated_indices.reserve(copy_infos.size()); + for (size_t i = 0; i < copy_infos.size(); ++i) { + if (!allocateOneBacking(copy_infos[i])) { + for (const auto idx : allocated_indices) { + releaseRequestBacking(copy_infos[idx]); + } + return false; + } + allocated_indices.push_back(i); + } + return true; +} + +bool KVCacheMemoryConnector::allocateOneBacking(CopyInfoPerKey& copy_info) { + const auto kind = blockKindFromComplete(copy_info.is_complete); + BlockIdxType mem_block = NULL_BLOCK_IDX; + if (tryMallocMemoryBlock(kind, mem_block)) { + copy_info.mem_block = mem_block; + return true; + } + + while (true) { + auto evicted = block_cache_->popOldestEvictable(kind); + if (!evicted.has_value()) { + return false; + } + reportEvictionLifetime(kind, evicted->backing_type, evicted->created_time_us); + releaseCacheBacking(*evicted); + if (tryMallocMemoryBlock(kind, mem_block)) { + copy_info.mem_block = mem_block; + return true; + } + } +} + +bool KVCacheMemoryConnector::tryMallocMemoryBlock(CacheBlockKind kind, BlockIdxType& block) { + block = NULL_BLOCK_IDX; + auto pool = memoryPoolFor(kind); + if (pool == nullptr || pool->freeBlocksNum() == 0) { + return false; + } + auto blocks = pool->malloc(1); + if (blocks.size() != 1) { + return false; + } + block = blocks[0]; + return true; +} + +void KVCacheMemoryConnector::releaseRequestBacking(const CopyInfoPerKey& copy_info) { + auto pool = memoryPoolFor(blockKindFromComplete(copy_info.is_complete)); + if (pool) { + freeBlocksFromPool(pool, {copy_info.mem_block}, /*cache_free=*/false); + } +} + +void KVCacheMemoryConnector::releaseCacheBacking(const MemoryDiskBlockCache::CacheItem& item) { + auto pool = 
memoryPoolFor(blockKindFromComplete(item.is_complete)); + if (pool) { + freeBlocksFromPool(pool, {item.block_index}, /*cache_free=*/true); + } +} + +void KVCacheMemoryConnector::referenceCacheBacking(const MemoryDiskBlockCache::CacheItem& item) { + auto pool = memoryPoolFor(blockKindFromComplete(item.is_complete)); + if (pool) { + referenceBlocksInPool(pool, {item.block_index}, /*cache_ref=*/true); + } +} + +std::shared_ptr KVCacheMemoryConnector::memoryPoolFor(CacheBlockKind kind) const { + if (!isDualPool()) { + return block_pool_; + } + return kind == CacheBlockKind::COMPLETE ? complete_pool_ : incomplete_pool_; +} + std::shared_ptr KVCacheMemoryConnector::createBlockPool(size_t block_size, size_t pool_size_mb) const { RTP_LLM_CHECK_WITH_INFO(pool_size_mb > 0, "pool size must be > 0"); const int64_t block_num = pool_size_mb * 1024 * 1024 / static_cast(block_size); @@ -769,40 +1448,80 @@ std::shared_ptr KVCacheMemoryConnector::createBlockPool(size_t block_ std::string KVCacheMemoryConnector::blockPoolDebugString() const { std::stringstream oss; - oss << "total blocks num: " << block_pool_->totalBlocksNum() - << ", free blocks num: " << block_pool_->freeBlocksNum() - << ", available blocks num: " << block_pool_->availableBlocksNum(); + if (isDualPool()) { + oss << "complete pool: total=" << complete_pool_->totalBlocksNum() + << " free=" << complete_pool_->freeBlocksNum() << " available=" << complete_pool_->availableBlocksNum(); + if (incomplete_pool_) { + oss << " | incomplete pool: total=" << incomplete_pool_->totalBlocksNum() + << " free=" << incomplete_pool_->freeBlocksNum() + << " available=" << incomplete_pool_->availableBlocksNum(); + } + } else { + oss << "total blocks num: " << block_pool_->totalBlocksNum() + << ", free blocks num: " << block_pool_->freeBlocksNum() + << ", available blocks num: " << block_pool_->availableBlocksNum(); + } return oss.str(); } void KVCacheMemoryConnector::putToCache(const MemoryBlockCache::CacheItem& item) { 
RTP_LLM_PROFILE_FUNCTION(); - if (auto [success, popped_item_opt] = block_cache_->put(item); success) { - RTP_LLM_LOG_DEBUG("write cache, cache key: %ld, block index: %d, block size: %zu", - item.cache_key, - item.block_index, - item.block_size); - referenceBlocks({item.block_index}, /*cache_ref=*/true); - if (popped_item_opt.has_value()) { - const auto popped_item = popped_item_opt.value(); - freeBlocks({popped_item.block_index}, /*cache_free=*/true); - } + MemoryDiskBlockCache::CacheItem new_item; + new_item.cache_key = item.cache_key; + new_item.backing_type = CacheBackingType::MEMORY; + new_item.block_index = item.block_index; + new_item.disk_slot = -1; + new_item.block_size = item.block_size; + new_item.is_resident = item.is_resident; + new_item.is_complete = item.is_complete; + putToCache(new_item, /*already_has_cache_ref=*/false); +} + +void KVCacheMemoryConnector::putToCache(CopyInfoPerKey& copy_info) { + const auto kind = blockKindFromComplete(copy_info.is_complete); + MemoryDiskBlockCache::CacheItem item; + item.cache_key = copy_info.cache_key; + item.backing_type = CacheBackingType::MEMORY; + item.block_index = copy_info.mem_block; + item.disk_slot = -1; + item.block_size = isDualPool() ? + (kind == CacheBlockKind::COMPLETE ? complete_block_size_ : incomplete_block_size_) : + memoryCacheBlockSizeBytes(); + item.is_resident = false; + item.is_complete = copy_info.is_complete; + + // Add cache ref. The request ref will be released by the CopyPlan deleter. 
+ if (!putToCache(item, /*already_has_cache_ref=*/false)) { + return; } } -// this function is called under lock -bool KVCacheMemoryConnector::ensureEnoughFreeBlocks(size_t need_blocks) { +bool KVCacheMemoryConnector::putToCache(const MemoryDiskBlockCache::CacheItem& item, bool already_has_cache_ref) { RTP_LLM_PROFILE_FUNCTION(); - auto free_blocks = block_pool_->freeBlocksNum(); - if (free_blocks >= need_blocks) { - return true; + if (!already_has_cache_ref) { + referenceCacheBacking(item); } - const auto need_evict_blocks = need_blocks - free_blocks; - const auto evict_blocks = block_cache_->pop(need_evict_blocks); - if (!evict_blocks.empty()) { - freeBlocks(evict_blocks, /*cache_free=*/true); + auto [success, popped_item_opt] = block_cache_->putCommitted(item); + if (!success) { + releaseCacheBacking(item); + return false; + } + + RTP_LLM_LOG_DEBUG("write cache, cache key: %ld, backing: %d, block index: %d, disk slot: %d, block size: %zu", + item.cache_key, + static_cast(item.backing_type), + item.block_index, + item.disk_slot, + item.block_size); + if (popped_item_opt.has_value()) { + const auto popped_item = popped_item_opt.value(); + releaseCacheBacking(popped_item); } - return block_pool_->freeBlocksNum() >= need_blocks; + return true; +} + +int64_t KVCacheMemoryConnector::copyPlanTimeoutMs(const std::shared_ptr& copy_plan) const { + return kv_cache_config_.memory_cache_sync_timeout_ms; } std::vector KVCacheMemoryConnector::cacheKeys() const { @@ -819,10 +1538,11 @@ void KVCacheMemoryConnector::reportMatchMetrics(bool success, } RtpLLMMemoryCacheMatchMetricsCollector collector; + const int64_t tokens_per_block = cacheKeyTokensPerBlockForMetrics(); collector.failed = !success; collector.latency_us = latency_us; - collector.input_token = input_block_num * cache_config_.seq_size_per_block; - collector.matched_token = matched_block_num * cache_config_.seq_size_per_block; + collector.input_token = input_block_num * tokens_per_block; + collector.matched_token = 
matched_block_num * tokens_per_block; metrics_reporter_->report(nullptr, &collector); } @@ -836,10 +1556,11 @@ void KVCacheMemoryConnector::reportReadMetrics(bool success, } RtpLLMMemoryCacheReadMetricsCollector collector; + const int64_t tokens_per_block = cacheKeyTokensPerBlockForMetrics(); collector.failed = !success; collector.latency_us = latency_us; - collector.input_token = input_block_num * cache_config_.seq_size_per_block; - collector.read_token = read_block_num * cache_config_.seq_size_per_block; + collector.input_token = input_block_num * tokens_per_block; + collector.read_token = read_block_num * tokens_per_block; metrics_reporter_->report(nullptr, &collector); } @@ -853,10 +1574,11 @@ void KVCacheMemoryConnector::reportWriteMetrics(bool success, } RtpLLMMemoryCacheWriteMetricsCollector collector; + const int64_t tokens_per_block = cacheKeyTokensPerBlockForMetrics(); collector.failed = !success; collector.latency_us = latency_us; - collector.input_token = input_block_num * cache_config_.seq_size_per_block; - collector.write_token = write_block_num * cache_config_.seq_size_per_block; + collector.input_token = input_block_num * tokens_per_block; + collector.write_token = write_block_num * tokens_per_block; metrics_reporter_->report(nullptr, &collector); } @@ -874,27 +1596,172 @@ void KVCacheMemoryConnector::reportCopyMetrics(bool success, int64_t latency_us, metrics_reporter_->report(nullptr, &collector); } +int KVCacheMemoryConnector::cpSizeForMetrics() const { + const auto& cp_cfg = parallelism_config_.prefill_cp_config; + if (!cp_cfg.kv_cache_sharded) { + return 1; + } + if (parallelism_config_.tp_size > 1) { + return static_cast(parallelism_config_.tp_size); + } + if (parallelism_config_.role_type == RoleType::DECODE && cp_cfg.is_prefill_enabled() + && cp_cfg.prefill_cp_size > 1) { + return static_cast(cp_cfg.prefill_cp_size); + } + return 1; +} + +int KVCacheMemoryConnector::cacheKeyTokensPerBlockForMetrics() const { + return 
static_cast(cache_config_.seq_size_per_block) * cpSizeForMetrics(); +} + +void KVCacheMemoryConnector::reportEvictionLifetime(CacheBlockKind kind, + CacheBackingType backing_type, + int64_t created_time_us) { + if (!metrics_reporter_ || created_time_us <= 0) { + return; + } + RtpLLMCacheEvictionMetricsCollector collector; + collector.lifetime_ms = std::max(0, (currentTimeUs() - created_time_us) / 1000); + kmonitor::MetricsTags tags("scope", "memory"); + tags.AddTag("kind", cacheBlockKindName(kind)); + tags.AddTag("backing", "memory"); + metrics_reporter_->report(&tags, &collector); +} + void KVCacheMemoryConnector::reportMetricsLoop() { while (!stop_.load()) { if (metrics_reporter_) { - if (!block_pool_) { - std::this_thread::sleep_for(std::chrono::seconds(1)); - continue; + const auto item_num = block_cache_ ? block_cache_->size() : 0; + if (isDualPool()) { + if (!complete_pool_) { + std::this_thread::sleep_for(std::chrono::seconds(1)); + continue; + } + const auto total = + complete_pool_->totalBlocksNum() + (incomplete_pool_ ? incomplete_pool_->totalBlocksNum() : 0); + const auto free = + complete_pool_->freeBlocksNum() + (incomplete_pool_ ? incomplete_pool_->freeBlocksNum() : 0); + const auto avail = complete_pool_->availableBlocksNum() + + (incomplete_pool_ ? incomplete_pool_->availableBlocksNum() : 0); + + RtpLLMMemoryCacheStatusMetricsCollector collector; + collector.item_num = static_cast(item_num); + collector.total_block_num = total; + collector.allocated_block_num = total - free; + collector.available_block_num = avail; + collector.used_ratio = + total == 0 ? 
0.0f : static_cast(100.0 * (total - avail) / static_cast(total)); + metrics_reporter_->report( + nullptr, &collector); + } else { + if (!block_pool_) { + std::this_thread::sleep_for(std::chrono::seconds(1)); + continue; + } + const auto total_blocks = block_pool_->totalBlocksNum(); + const auto free_blocks = block_pool_->freeBlocksNum(); + const auto available_blocks = block_pool_->availableBlocksNum(); + + RtpLLMMemoryCacheStatusMetricsCollector collector; + collector.item_num = static_cast(item_num); + collector.total_block_num = total_blocks; + collector.allocated_block_num = total_blocks - free_blocks; + collector.available_block_num = available_blocks; + collector.used_ratio = total_blocks == 0 ? + 0.0f : + static_cast(100.0 * (total_blocks - available_blocks) + / static_cast(total_blocks)); + metrics_reporter_->report( + nullptr, &collector); } - const auto total_blocks = block_pool_->totalBlocksNum(); - const auto free_blocks = block_pool_->freeBlocksNum(); - const auto available_blocks = block_pool_->availableBlocksNum(); + } + std::this_thread::sleep_for(std::chrono::seconds(1)); + } +} - RtpLLMMemoryCacheStatusMetricsCollector collector; - collector.total_block_num = total_blocks; - collector.allocated_block_num = total_blocks - free_blocks; - collector.available_block_num = available_blocks; +bool KVCacheMemoryConnector::mallocBlocksFromPool(const std::shared_ptr& pool, + const std::shared_ptr& cache, + size_t need_blocks, + std::vector& malloced_blocks) { + RTP_LLM_PROFILE_FUNCTION(); + if (need_blocks == 0) { + return true; + } + std::unique_lock lock(malloc_mutex_); + if (!ensureEnoughFreeBlocksInPool(pool, cache, need_blocks)) { + RTP_LLM_LOG_WARNING("malloc blocks from pool failed, need=%zu free=%zu", need_blocks, pool->freeBlocksNum()); + return false; + } + auto blocks = pool->malloc(need_blocks); + if (blocks.size() != need_blocks) { + RTP_LLM_LOG_WARNING("malloc blocks from pool failed, need=%zu got=%zu", need_blocks, blocks.size()); + 
freeBlocksFromPool(pool, std::vector(blocks.begin(), blocks.end()), false); + return false; + } + malloced_blocks.insert(malloced_blocks.end(), blocks.begin(), blocks.end()); + return true; +} - metrics_reporter_->report(nullptr, - &collector); +bool KVCacheMemoryConnector::freeBlocksFromPool(const std::shared_ptr& pool, + const std::vector& blocks, + bool cache_free) { + std::vector need_free; + need_free.reserve(blocks.size()); + for (const auto& b : blocks) { + if (!isNullBlockIdx(b)) { + need_free.push_back(static_cast(b)); + } + } + if (need_free.empty()) { + return true; + } + RTP_LLM_CHECK_WITH_INFO(pool != nullptr, "pool is null"); + if (cache_free) { + pool->blockCacheFree(need_free); + } else { + pool->requestFree(need_free); + } + return true; +} + +void KVCacheMemoryConnector::referenceBlocksInPool(const std::shared_ptr& pool, + const std::vector& blocks, + bool cache_ref) { + RTP_LLM_CHECK_WITH_INFO(pool != nullptr, "pool is null"); + if (cache_ref) { + pool->blockCacheReference(blocks); + } else { + pool->requestReference(blocks); + } +} + +bool KVCacheMemoryConnector::ensureEnoughFreeBlocksInPool(const std::shared_ptr& pool, + const std::shared_ptr& cache, + size_t need_blocks) { + RTP_LLM_PROFILE_FUNCTION(); + auto free_blocks = pool->freeBlocksNum(); + if (free_blocks >= need_blocks) { + return true; + } + const auto need_evict = need_blocks - free_blocks; + const auto evicted = cache->pop(need_evict); + if (!evicted.empty()) { + freeBlocksFromPool(pool, evicted, true); + } + return pool->freeBlocksNum() >= need_blocks; +} + +void KVCacheMemoryConnector::putToCacheInPool(const std::shared_ptr& pool, + const std::shared_ptr& cache, + const MemoryBlockCache::CacheItem& item) { + RTP_LLM_PROFILE_FUNCTION(); + if (auto [success, popped_item_opt] = cache->put(item); success) { + referenceBlocksInPool(pool, {item.block_index}, true); + if (popped_item_opt.has_value()) { + freeBlocksFromPool(pool, {popped_item_opt->block_index}, true); } - 
std::this_thread::sleep_for(std::chrono::seconds(1)); } } diff --git a/rtp_llm/cpp/cache/connector/memory/KVCacheMemoryConnector.h b/rtp_llm/cpp/cache/connector/memory/KVCacheMemoryConnector.h index 73f20ee99d..ffc0ae10f6 100644 --- a/rtp_llm/cpp/cache/connector/memory/KVCacheMemoryConnector.h +++ b/rtp_llm/cpp/cache/connector/memory/KVCacheMemoryConnector.h @@ -2,6 +2,7 @@ #include #include +#include #include #include #include @@ -13,6 +14,7 @@ #include "rtp_llm/cpp/cache/CacheConfig.h" #include "rtp_llm/cpp/cache/connector/KVCacheConnector.h" #include "rtp_llm/cpp/cache/connector/memory/MemoryBlockCache.h" +#include "rtp_llm/cpp/cache/connector/memory/MemoryDiskBlockCache.h" #include "rtp_llm/cpp/cache/Types.h" #include "rtp_llm/cpp/config/ConfigModules.h" #include "rtp_llm/cpp/model_rpc/BroadcastManager.h" @@ -24,9 +26,16 @@ class BlockPool; class BroadcastManager; class KVCacheAllocator; class MemoryAsyncContext; +struct StagedMemoryCopyScratch; class KVCacheMemoryConnector: public KVCacheConnector { public: + KVCacheMemoryConnector(const CacheConfig& cache_config, + const KVCacheConfig& kv_cache_config, + const ParallelismConfig& parallelism_config, + const std::shared_ptr& allocator, + const std::vector& tp_addrs, + const kmonitor::MetricsReporterPtr& metrics_reporter = nullptr); KVCacheMemoryConnector(const CacheConfig& cache_config, const KVCacheConfig& kv_cache_config, const std::shared_ptr& allocator, @@ -56,9 +65,17 @@ class KVCacheMemoryConnector: public KVCacheConnector { std::vector cacheKeys() const; private: + struct LayerTagSlot { + int layer_id{-1}; + std::string tag; + int group_id{-1}; + size_t stride_bytes{0}; + }; struct CopyInfoPerKey { CacheKeyType cache_key{0}; + CacheBlockKind kind{CacheBlockKind::COMPLETE}; BlockIdxType mem_block{NULL_BLOCK_IDX}; + size_t block_size{0}; std::vector gpu_blocks; bool is_complete{true}; }; @@ -71,65 +88,132 @@ class KVCacheMemoryConnector: public KVCacheConnector { CopyDirection direction; }; - 
std::shared_ptr buildCopyPlanForRead(const CacheKeysType& cache_keys, - const LayerBlockIds& layer_block_ids, - int start_index, - int read_num); - std::shared_ptr buildCopyPlanForWrite(const CacheKeysType& cache_keys, - const LayerBlockIds& layer_block_ids, - int start_index, - int write_num, - bool& no_need_write); + std::shared_ptr buildCopyPlanForRead(const CacheKeysType& cache_keys, + const LayerAttnBlockIds& layer_attn_block_ids, + const std::vector& slots, + int start_index, + int read_num); + std::shared_ptr buildCopyPlanForWrite(const CacheKeysType& cache_keys, + const LayerAttnBlockIds& layer_attn_block_ids, + const std::vector& slots, + int start_index, + int write_num, + bool& no_need_write); std::shared_ptr createCopyPlan(const std::vector& copy_infos, const CopyDirection& direction); bool startCopyAsync(const std::shared_ptr& context, const std::shared_ptr& copy_plan); std::shared_ptr> sendCopyPlan(const std::shared_ptr& copy_plan) const; + std::shared_ptr> + sendMemoryRequest(const MemoryOperationRequestPB& mem_req, int64_t timeout_ms) const; void printCopyPlan(const std::shared_ptr& copy_plan) const; - bool prepareCopyBuffers(BlockIdxType mem_block, - const std::vector& gpu_blocks, - CopyDirection direction, - std::vector& dst, - std::vector& src); - bool appendCopyBytesToBuffers(const BlockInfo& mem_block, - const BlockInfo& gpu_block, - size_t byte_off, - CopyDirection direction, - std::vector& dst, - std::vector& src); - - void checkLayerBlockStrideBytes() const; - bool checkLayerBlocks(const LayerBlockIds& layer_block_ids, size_t required_len) const; - bool gpuBlocksAllValid(const LayerBlockIds& layer_block_ids, size_t key_index) const; - - bool mallocBlocks(size_t need_blocks, std::vector& malloced_blocks); + bool prepareCopyBuffers(BlockIdxType mem_block, + const std::vector& gpu_blocks, + CopyDirection direction, + bool is_complete, + std::vector& dst, + std::vector& src); + bool tryCopyCacheWithBatchedMemoryCopy(const 
MemoryOperationRequestPB& request, + CopyDirection direction, + const std::vector& slots); + bool tryCopyCacheWithStagedMemoryCopy(const MemoryOperationRequestPB& request, + CopyDirection direction, + const std::vector& slots); + StagedMemoryCopyScratch& stagedCopyScratchForDevice(int device_index); + bool appendCopyBytesToBuffers(const BlockInfo& mem_block, + const BlockInfo& gpu_block, + size_t byte_off, + CopyDirection direction, + std::vector& dst, + std::vector& src); + bool copyMemoryItemsGeneric(const MemoryOperationRequestPB& request, + CopyDirection direction, + const std::vector& slots); + bool validateCopyItemBacking(const MemoryOperationRequestPB::CopyItem& item) const; + + void checkLayerBlockStrideBytes() const; + std::vector layerTagSlots() const; + bool hasTypedLayerTagSlots(const std::vector& slots) const; + bool checkLayerBlocks(const LayerBlockIds& layer_block_ids, size_t required_len) const; + LayerAttnBlockIds resourceLayerRegionBlocks(const KVCacheResource& resource, + const std::vector& slots) const; + bool checkLayerRegionBlocks(const LayerAttnBlockIds& layer_attn_block_ids, + const std::vector& slots, + size_t required_len) const; + bool gpuBlocksAllValid(const LayerBlockIds& layer_block_ids, size_t key_index) const; + bool gpuBlocksAllValid(const LayerAttnBlockIds& layer_attn_block_ids, + const std::vector& slots, + size_t key_index) const; + CacheGroupPolicy groupPolicyForSlot(const LayerTagSlot& slot) const; + bool freeBlocks(const std::vector& blocks, bool cache_free = true); void referenceBlocks(const std::vector& blocks, bool cache_ref = true); - bool ensureEnoughFreeBlocks(size_t need_blocks); + bool allocateBackingsForWrite(std::vector& copy_infos); + bool allocateOneBacking(CopyInfoPerKey& copy_info); + bool tryMallocMemoryBlock(CacheBlockKind kind, BlockIdxType& block); + void releaseRequestBacking(const CopyInfoPerKey& copy_info); + void releaseCacheBacking(const MemoryDiskBlockCache::CacheItem& item); + void 
referenceCacheBacking(const MemoryDiskBlockCache::CacheItem& item); + std::shared_ptr memoryPoolFor(CacheBlockKind kind) const; + + bool isDualPool() const; + bool isFullOnlySlot(const LayerTagSlot& slot) const; + bool mallocBlocksFromPool(const std::shared_ptr& pool, + const std::shared_ptr& cache, + size_t need_blocks, + std::vector& malloced_blocks); + bool freeBlocksFromPool(const std::shared_ptr& pool, + const std::vector& blocks, + bool cache_free); + void referenceBlocksInPool(const std::shared_ptr& pool, + const std::vector& blocks, + bool cache_ref); + bool ensureEnoughFreeBlocksInPool(const std::shared_ptr& pool, + const std::shared_ptr& cache, + size_t need_blocks); + void putToCacheInPool(const std::shared_ptr& pool, + const std::shared_ptr& cache, + const MemoryBlockCache::CacheItem& item); void initBlockPool(); + int64_t copyPlanTimeoutMs(const std::shared_ptr& copy_plan) const; std::shared_ptr createBlockPool(size_t block_size, size_t pool_size_mb) const; std::string blockPoolDebugString() const; + size_t memoryCacheBlockSizeBytes() const; void putToCache(const MemoryBlockCache::CacheItem& item); + void putToCache(CopyInfoPerKey& copy_info); + bool putToCache(const MemoryDiskBlockCache::CacheItem& item, + bool already_has_cache_ref = false); void reportMatchMetrics(bool success, int64_t latency_us, int64_t input_block_num, int64_t matched_block_num); void reportReadMetrics(bool success, int64_t latency_us, int64_t input_block_num, int64_t read_block_num); void reportWriteMetrics(bool success, int64_t latency_us, int64_t input_block_num, int64_t write_block_num); void reportCopyMetrics(bool success, int64_t latency_us, CopyDirection direction); + int cpSizeForMetrics() const; + int cacheKeyTokensPerBlockForMetrics() const; + void reportEvictionLifetime(CacheBlockKind kind, CacheBackingType backing_type, int64_t created_time_us); void reportMetricsLoop(); private: const CacheConfig& cache_config_; const KVCacheConfig& kv_cache_config_; + const 
ParallelismConfig parallelism_config_; std::shared_ptr allocator_; const std::vector tp_addrs_; - std::shared_ptr block_pool_; - mutable std::mutex malloc_mutex_; - std::shared_ptr block_cache_; - std::shared_ptr broadcast_manager_; - std::shared_ptr wait_done_thread_pool_; + std::shared_ptr block_pool_; + mutable std::mutex malloc_mutex_; + mutable std::mutex staged_copy_scratch_mutex_; + std::map> staged_copy_scratch_by_device_; + std::shared_ptr block_cache_; + std::shared_ptr broadcast_manager_; + std::shared_ptr wait_done_thread_pool_; + + std::shared_ptr complete_pool_; + std::shared_ptr incomplete_pool_; + size_t complete_block_size_{0}; + size_t incomplete_block_size_{0}; // metrics reporter kmonitor::MetricsReporterPtr metrics_reporter_; diff --git a/rtp_llm/cpp/cache/connector/memory/MemoryAsyncContext.cc b/rtp_llm/cpp/cache/connector/memory/MemoryAsyncContext.cc index 0412e1f285..f142899807 100644 --- a/rtp_llm/cpp/cache/connector/memory/MemoryAsyncContext.cc +++ b/rtp_llm/cpp/cache/connector/memory/MemoryAsyncContext.cc @@ -20,13 +20,29 @@ size_t MemoryAsyncMatchContext::matchedBlockCount() const { return matched_block_count_; } +int MemoryAsyncMatchContext::startReadBlockIndex() const { + return start_read_block_index_; +} + +int MemoryAsyncMatchContext::readBlockNum() const { + return read_block_num_; +} + +std::shared_ptr MemoryAsyncMatchContext::readCopyPlan() const { + return read_copy_plan_; +} + +void MemoryAsyncMatchContext::clearReadCopyPlan() { + read_copy_plan_.reset(); +} + // ----------------------------- MemoryAsyncContext --------------------------------- bool MemoryAsyncContext::done() const { return already_done_.load(); } -bool MemoryAsyncContext::success() const { +bool MemoryAsyncContext::successLocked() const { if (!broadcast_result_ || !broadcast_result_->success()) { return false; } @@ -39,22 +55,58 @@ bool MemoryAsyncContext::success() const { return true; } +bool MemoryAsyncContext::success() const { + std::lock_guard 
lock(mutex_); + return successLocked(); +} + void MemoryAsyncContext::waitDone() { - if (done()) { - return; + std::shared_ptr> result; + { + std::unique_lock lock(mutex_); + cv_.wait(lock, [this]() { return result_ready_ || already_done_.load(); }); + if (already_done_.load()) { + return; + } + if (finalizing_) { + cv_.wait(lock, [this]() { return already_done_.load(); }); + return; + } + finalizing_ = true; + result = broadcast_result_; + } + + if (result) { + result->waitDone(); } - if (broadcast_result_) { - broadcast_result_->waitDone(); + + bool ok = false; + std::function done_callback; + { + std::lock_guard lock(mutex_); + ok = successLocked(); + done_callback = std::move(done_callback_); } - if (done_callback_) { - done_callback_(success()); + if (done_callback) { + done_callback(ok); } - already_done_.store(true); + + { + std::lock_guard lock(mutex_); + already_done_.store(true); + finalizing_ = false; + } + cv_.notify_all(); } void MemoryAsyncContext::setBroadcastResult( const std::shared_ptr>& result) { - broadcast_result_ = result; + { + std::lock_guard lock(mutex_); + broadcast_result_ = result; + result_ready_ = true; + } + cv_.notify_all(); } -} // namespace rtp_llm \ No newline at end of file +} // namespace rtp_llm diff --git a/rtp_llm/cpp/cache/connector/memory/MemoryAsyncContext.h b/rtp_llm/cpp/cache/connector/memory/MemoryAsyncContext.h index 4d45c9cf41..6fc57f8219 100644 --- a/rtp_llm/cpp/cache/connector/memory/MemoryAsyncContext.h +++ b/rtp_llm/cpp/cache/connector/memory/MemoryAsyncContext.h @@ -1,8 +1,11 @@ #pragma once #include +#include #include #include +#include +#include #include "rtp_llm/cpp/cache/connector/AsyncContext.h" #include "rtp_llm/cpp/model_rpc/BroadcastManager.h" @@ -12,7 +15,14 @@ namespace rtp_llm { // 用于 memory connector match class MemoryAsyncMatchContext: public AsyncMatchContext { public: - explicit MemoryAsyncMatchContext(size_t matched_block_count): matched_block_count_(matched_block_count) {} + explicit 
MemoryAsyncMatchContext(size_t matched_block_count, + int start_read_block_index = -1, + int read_block_num = 0, + std::shared_ptr read_copy_plan = nullptr): + matched_block_count_(matched_block_count), + start_read_block_index_(start_read_block_index), + read_block_num_(read_block_num), + read_copy_plan_(std::move(read_copy_plan)) {} ~MemoryAsyncMatchContext() override = default; public: @@ -20,9 +30,16 @@ class MemoryAsyncMatchContext: public AsyncMatchContext { bool done() const override; bool success() const override; size_t matchedBlockCount() const override; + int startReadBlockIndex() const; + int readBlockNum() const; + std::shared_ptr readCopyPlan() const; + void clearReadCopyPlan(); private: - size_t matched_block_count_{0}; + size_t matched_block_count_{0}; + int start_read_block_index_{-1}; + int read_block_num_{0}; + std::shared_ptr read_copy_plan_; }; // 用于 memory connector read/write @@ -38,8 +55,15 @@ class MemoryAsyncContext: public AsyncContext { void setBroadcastResult(const std::shared_ptr>& result); private: + bool successLocked() const; + +private: + mutable std::mutex mutex_; + std::condition_variable cv_; std::shared_ptr> broadcast_result_; std::function done_callback_; + bool result_ready_{false}; + bool finalizing_{false}; std::atomic already_done_{false}; }; diff --git a/rtp_llm/cpp/cache/connector/memory/MemoryDiskBlockCache.cc b/rtp_llm/cpp/cache/connector/memory/MemoryDiskBlockCache.cc new file mode 100644 index 0000000000..374b0cee62 --- /dev/null +++ b/rtp_llm/cpp/cache/connector/memory/MemoryDiskBlockCache.cc @@ -0,0 +1,327 @@ +#include "rtp_llm/cpp/cache/connector/memory/MemoryDiskBlockCache.h" + +#include +#include + +#include "rtp_llm/cpp/utils/AssertUtils.h" +#include "rtp_llm/cpp/utils/Logger.h" +#include "rtp_llm/cpp/utils/ProfilingScope.h" +#include "rtp_llm/cpp/utils/TimeUtil.h" + +namespace rtp_llm { + +MemoryDiskBlockCache::MatchResult MemoryDiskBlockCache::match(CacheKeyType cache_key) { + RTP_LLM_PROFILE_FUNCTION(); + 
std::unique_lock lock(mutex_); + auto it = items_.find(cache_key); + if (it == items_.end()) { + return {}; + } + touchLocked(it->second); + const auto& item = it->second; + return {item.backing_type, item.block_index, item.disk_slot, item.block_size, item.is_complete}; +} + +MemoryDiskBlockCache::MatchResult MemoryDiskBlockCache::matchAndMarkInFlight(CacheKeyType cache_key) { + RTP_LLM_PROFILE_FUNCTION(); + std::unique_lock lock(mutex_); + auto it = items_.find(cache_key); + if (it == items_.end()) { + return {}; + } + touchLocked(it->second); + it->second.in_flight_ref++; + const auto& item = it->second; + return {item.backing_type, item.block_index, item.disk_slot, item.block_size, item.is_complete}; +} + +bool MemoryDiskBlockCache::contains(CacheKeyType cache_key) const { + std::shared_lock lock(mutex_); + return items_.find(cache_key) != items_.end(); +} + +std::pair> +MemoryDiskBlockCache::putCommitted(const CacheItem& input_item) { + RTP_LLM_PROFILE_FUNCTION(); + RTP_LLM_CHECK_WITH_INFO(validItem(input_item), "invalid cache item backing fields"); + + std::unique_lock lock(mutex_); + auto item = input_item; + item.in_flight_ref = 0; + + auto existing = items_.find(item.cache_key); + if (existing != items_.end()) { + touchLocked(existing->second); + if (!existing->second.is_complete && item.is_complete) { + if (existing->second.in_flight_ref > 0) { + return {false, std::nullopt}; + } + auto old_item = existing->second; + eraseEvictKeyLocked(existing->second); + item.last_access_seq = ++access_seq_; + item.created_time_us = item.created_time_us > 0 ? item.created_time_us : currentTimeUs(); + existing->second = item; + insertEvictKeyLocked(existing->second); + return {true, old_item}; + } + return {false, std::nullopt}; + } + + item.last_access_seq = ++access_seq_; + item.created_time_us = item.created_time_us > 0 ? 
item.created_time_us : currentTimeUs(); + auto [it, inserted] = items_.emplace(item.cache_key, item); + (void)inserted; + insertEvictKeyLocked(it->second); + return {true, std::nullopt}; +} + +std::optional MemoryDiskBlockCache::removeIfMatch(CacheKeyType cache_key, + CacheBackingType backing_type, + BlockIdxType expected_block_index, + int32_t expected_disk_slot) { + std::unique_lock lock(mutex_); + auto it = items_.find(cache_key); + if (it == items_.end() || it->second.backing_type != backing_type) { + return std::nullopt; + } + if (backing_type == CacheBackingType::MEMORY && it->second.block_index != expected_block_index) { + return std::nullopt; + } + if (backing_type == CacheBackingType::DISK && it->second.disk_slot != expected_disk_slot) { + return std::nullopt; + } + auto removed_item = it->second; + eraseEvictKeyLocked(it->second); + items_.erase(it); + return removed_item; +} + +std::pair> +MemoryDiskBlockCache::put(const MemoryBlockCache::CacheItem& input_item) { + CacheItem item; + item.cache_key = input_item.cache_key; + item.backing_type = CacheBackingType::MEMORY; + item.block_index = input_item.block_index; + item.disk_slot = -1; + item.block_size = input_item.block_size; + item.is_resident = input_item.is_resident; + item.is_complete = input_item.is_complete; + auto [ok, popped] = putCommitted(item); + if (!popped.has_value()) { + return {ok, std::nullopt}; + } + return {ok, toMemoryCacheItem(*popped)}; +} + +std::optional MemoryDiskBlockCache::remove(CacheKeyType cache_key) { + std::unique_lock lock(mutex_); + auto it = items_.find(cache_key); + if (it == items_.end()) { + return std::nullopt; + } + auto removed_item = it->second; + eraseEvictKeyLocked(it->second); + items_.erase(it); + return toMemoryCacheItem(removed_item); +} + +std::optional MemoryDiskBlockCache::removeIfMatch(CacheKeyType cache_key, + BlockIdxType expected_block_index) { + auto removed = removeIfMatch(cache_key, CacheBackingType::MEMORY, expected_block_index, -1); + if 
(!removed.has_value()) { + return std::nullopt; + } + return toMemoryCacheItem(*removed); +} + +std::optional MemoryDiskBlockCache::popOldestEvictable() { + std::unique_lock lock(mutex_); + std::optional selected; + auto consider = [&selected](const std::optional& candidate) { + if (!candidate.has_value()) { + return; + } + if (!selected.has_value() || candidate->last_access_seq < selected->last_access_seq) { + selected = candidate; + } + }; + consider(oldestFromSetLocked(memory_complete_lru_)); + consider(oldestFromSetLocked(memory_incomplete_lru_)); + consider(oldestFromSetLocked(disk_complete_lru_)); + consider(oldestFromSetLocked(disk_incomplete_lru_)); + if (!selected.has_value()) { + return std::nullopt; + } + auto it = items_.find(selected->cache_key); + if (it != items_.end()) { + eraseEvictKeyLocked(it->second); + items_.erase(it); + } + return selected; +} + +std::optional MemoryDiskBlockCache::popOldestEvictable(CacheBlockKind kind) { + std::unique_lock lock(mutex_); + return popOldestEvictableLocked(kind); +} + +std::optional MemoryDiskBlockCache::popOldestEvictableLocked(CacheBlockKind kind) { + auto memory_item = oldestFromSetLocked(lruSetLocked(CacheBackingType::MEMORY, kind)); + auto disk_item = oldestFromSetLocked(lruSetLocked(CacheBackingType::DISK, kind)); + if (!memory_item.has_value()) { + if (!disk_item.has_value()) { + return std::nullopt; + } + auto it = items_.find(disk_item->cache_key); + if (it != items_.end()) { + eraseEvictKeyLocked(it->second); + items_.erase(it); + } + return disk_item; + } + if (!disk_item.has_value() || memory_item->last_access_seq <= disk_item->last_access_seq) { + auto it = items_.find(memory_item->cache_key); + if (it != items_.end()) { + eraseEvictKeyLocked(it->second); + items_.erase(it); + } + return memory_item; + } + auto it = items_.find(disk_item->cache_key); + if (it != items_.end()) { + eraseEvictKeyLocked(it->second); + items_.erase(it); + } + return disk_item; +} + +bool 
MemoryDiskBlockCache::markInFlight(CacheKeyType cache_key, + CacheBackingType backing_type, + BlockIdxType block_index, + int32_t disk_slot) { + std::unique_lock lock(mutex_); + auto it = items_.find(cache_key); + if (it == items_.end() || it->second.backing_type != backing_type) { + return false; + } + if (backing_type == CacheBackingType::MEMORY && it->second.block_index != block_index) { + return false; + } + if (backing_type == CacheBackingType::DISK && it->second.disk_slot != disk_slot) { + return false; + } + it->second.in_flight_ref++; + return true; +} + +void MemoryDiskBlockCache::releaseInFlight(CacheKeyType cache_key, + CacheBackingType backing_type, + BlockIdxType block_index, + int32_t disk_slot) { + std::unique_lock lock(mutex_); + auto it = items_.find(cache_key); + if (it == items_.end() || it->second.backing_type != backing_type) { + return; + } + if (backing_type == CacheBackingType::MEMORY && it->second.block_index != block_index) { + return; + } + if (backing_type == CacheBackingType::DISK && it->second.disk_slot != disk_slot) { + return; + } + if (it->second.in_flight_ref > 0) { + it->second.in_flight_ref--; + } +} + +bool MemoryDiskBlockCache::empty() const { + std::shared_lock lock(mutex_); + return items_.empty(); +} + +size_t MemoryDiskBlockCache::size() const { + std::shared_lock lock(mutex_); + return items_.size(); +} + +std::vector MemoryDiskBlockCache::cacheKeys() const { + std::shared_lock lock(mutex_); + std::vector values; + values.reserve(items_.size()); + for (const auto& [_, item] : items_) { + values.push_back(item); + } + std::sort(values.begin(), values.end(), [](const CacheItem& lhs, const CacheItem& rhs) { + return lhs.last_access_seq > rhs.last_access_seq; + }); + std::vector keys; + keys.reserve(values.size()); + for (const auto& item : values) { + keys.push_back(item.cache_key); + } + return keys; +} + +bool MemoryDiskBlockCache::validItem(const CacheItem& item) const { + if (item.backing_type == CacheBackingType::MEMORY) 
{ + return !isNullBlockIdx(item.block_index) && item.disk_slot < 0; + } + if (item.backing_type == CacheBackingType::DISK) { + return isNullBlockIdx(item.block_index) && item.disk_slot >= 0; + } + return false; +} + +MemoryBlockCache::CacheItem MemoryDiskBlockCache::toMemoryCacheItem(const CacheItem& item) { + MemoryBlockCache::CacheItem memory_item; + memory_item.cache_key = item.cache_key; + memory_item.block_index = item.block_index; + memory_item.block_size = item.block_size; + memory_item.is_resident = item.is_resident; + memory_item.is_complete = item.is_complete; + return memory_item; +} + +void MemoryDiskBlockCache::insertEvictKeyLocked(const CacheItem& item) { + auto& eviction_set = lruSetLocked(item.backing_type, blockKindFromComplete(item.is_complete)); + eviction_set.insert(EvictKey{item.last_access_seq, item.cache_key}); +} + +void MemoryDiskBlockCache::eraseEvictKeyLocked(const CacheItem& item) { + auto& eviction_set = lruSetLocked(item.backing_type, blockKindFromComplete(item.is_complete)); + eviction_set.erase(EvictKey{item.last_access_seq, item.cache_key}); +} + +void MemoryDiskBlockCache::touchLocked(CacheItem& item) { + eraseEvictKeyLocked(item); + item.last_access_seq = ++access_seq_; + insertEvictKeyLocked(item); +} + +std::optional +MemoryDiskBlockCache::oldestFromSetLocked(std::set& eviction_set) { + for (auto evict_it = eviction_set.begin(); evict_it != eviction_set.end();) { + const auto key = *evict_it; + auto it = items_.find(key.cache_key); + if (it == items_.end() || it->second.last_access_seq != key.last_access_seq) { + evict_it = eviction_set.erase(evict_it); + continue; + } + if (it->second.is_resident || it->second.in_flight_ref > 0) { + ++evict_it; + continue; + } + return it->second; + } + return std::nullopt; +} + +std::set& MemoryDiskBlockCache::lruSetLocked(CacheBackingType backing_type, + CacheBlockKind kind) { + if (backing_type == CacheBackingType::MEMORY) { + return kind == CacheBlockKind::COMPLETE ? 
memory_complete_lru_ : memory_incomplete_lru_; + } + return kind == CacheBlockKind::COMPLETE ? disk_complete_lru_ : disk_incomplete_lru_; +} + +} // namespace rtp_llm diff --git a/rtp_llm/cpp/cache/connector/memory/MemoryDiskBlockCache.h b/rtp_llm/cpp/cache/connector/memory/MemoryDiskBlockCache.h new file mode 100644 index 0000000000..52998c4b63 --- /dev/null +++ b/rtp_llm/cpp/cache/connector/memory/MemoryDiskBlockCache.h @@ -0,0 +1,105 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +#include "rtp_llm/cpp/cache/Types.h" +#include "rtp_llm/cpp/cache/KVCacheResource.h" +#include "rtp_llm/cpp/cache/connector/memory/CacheBlockKind.h" +#include "rtp_llm/cpp/cache/connector/memory/MemoryBlockCache.h" + +namespace rtp_llm { + +enum class CacheBackingType { + MEMORY = 0, + DISK = 1, +}; + +class MemoryDiskBlockCache { +public: + struct CacheItem { + CacheKeyType cache_key{0}; + CacheBackingType backing_type{CacheBackingType::MEMORY}; + BlockIdxType block_index{NULL_BLOCK_IDX}; + int32_t disk_slot{-1}; + size_t block_size{0}; + bool is_resident{false}; + bool is_complete{true}; + uint64_t last_access_seq{0}; + int64_t created_time_us{0}; + uint32_t in_flight_ref{0}; + }; + + struct MatchResult { + CacheBackingType backing_type{CacheBackingType::MEMORY}; + BlockIdxType matched_index{NULL_BLOCK_IDX}; + int32_t disk_slot{-1}; + size_t block_size{0}; + bool is_complete{false}; + }; + +public: + MatchResult match(CacheKeyType cache_key); + MatchResult matchAndMarkInFlight(CacheKeyType cache_key); + bool contains(CacheKeyType cache_key) const; + + std::pair> putCommitted(const CacheItem& item); + std::optional removeIfMatch(CacheKeyType cache_key, + CacheBackingType backing_type, + BlockIdxType expected_block_index, + int32_t expected_disk_slot); + std::pair> put(const MemoryBlockCache::CacheItem& item); + std::optional remove(CacheKeyType cache_key); + std::optional removeIfMatch(CacheKeyType cache_key, BlockIdxType 
expected_block_index); + std::optional popOldestEvictable(); + std::optional popOldestEvictable(CacheBlockKind kind); + + bool + markInFlight(CacheKeyType cache_key, CacheBackingType backing_type, BlockIdxType block_index, int32_t disk_slot); + void + releaseInFlight(CacheKeyType cache_key, CacheBackingType backing_type, BlockIdxType block_index, int32_t disk_slot); + + bool empty() const; + size_t size() const; + std::vector cacheKeys() const; + +private: + struct EvictKey { + uint64_t last_access_seq{0}; + CacheKeyType cache_key{0}; + + bool operator<(const EvictKey& other) const { + if (last_access_seq != other.last_access_seq) { + return last_access_seq < other.last_access_seq; + } + return cache_key < other.cache_key; + } + }; + + bool validItem(const CacheItem& item) const; + static MemoryBlockCache::CacheItem toMemoryCacheItem(const CacheItem& item); + void insertEvictKeyLocked(const CacheItem& item); + void eraseEvictKeyLocked(const CacheItem& item); + void touchLocked(CacheItem& item); + std::optional oldestFromSetLocked(std::set& eviction_set); + std::optional popOldestEvictableLocked(CacheBlockKind kind); + std::set& lruSetLocked(CacheBackingType backing_type, CacheBlockKind kind); + +private: + mutable std::shared_mutex mutex_; + std::unordered_map items_; + std::set memory_complete_lru_; + std::set memory_incomplete_lru_; + std::set disk_complete_lru_; + std::set disk_incomplete_lru_; + uint64_t access_seq_{0}; +}; + +using MemoryDiskBlockCachePtr = std::shared_ptr; + +} // namespace rtp_llm diff --git a/rtp_llm/cpp/cache/connector/memory/PrefixTreeMemoryBlockCache.cc b/rtp_llm/cpp/cache/connector/memory/PrefixTreeMemoryBlockCache.cc new file mode 100644 index 0000000000..4b745d37f0 --- /dev/null +++ b/rtp_llm/cpp/cache/connector/memory/PrefixTreeMemoryBlockCache.cc @@ -0,0 +1,768 @@ +#include "rtp_llm/cpp/cache/connector/memory/PrefixTreeMemoryBlockCache.h" + +#include +#include + +#include "rtp_llm/cpp/utils/AssertUtils.h" +#include 
"rtp_llm/cpp/utils/TimeUtil.h" + +namespace rtp_llm { + +size_t PrefixTreeMemoryBlockCache::kindIndex(CacheBlockKind kind) { + RTP_LLM_CHECK_WITH_INFO(validKind(kind), "invalid prefix-tree memory kind %d", static_cast(kind)); + return kind == CacheBlockKind::COMPRESSED_KV ? 0 : 1; +} + +bool PrefixTreeMemoryBlockCache::validKind(CacheBlockKind kind) { + return kind == CacheBlockKind::COMPRESSED_KV || kind == CacheBlockKind::STATE_SWA_KV; +} + +bool PrefixTreeMemoryBlockCache::slotMaskCovers(const std::vector& stored, + const std::vector& required) { + for (size_t i = 0; i < required.size(); ++i) { + if (required[i] == 0) { + continue; + } + if (i >= stored.size() || stored[i] == 0) { + return false; + } + } + return true; +} + +bool PrefixTreeMemoryBlockCache::contains(CacheKeyType cache_key, CacheBlockKind kind) const { + static const std::vector empty_required_mask; + return contains(cache_key, kind, empty_required_mask); +} + +bool PrefixTreeMemoryBlockCache::contains(CacheKeyType cache_key, + CacheBlockKind kind, + const std::vector& required_slot_mask) const { + std::shared_lock lock(mutex_); + auto it = nodes_.find(cache_key); + if (it == nodes_.end() || !validKind(kind)) { + return false; + } + const auto& state = it->second.kinds[kindIndex(kind)]; + return state.has_value && !state.detached && slotMaskCovers(state.slot_valid_mask, required_slot_mask); +} + +PrefixTreeMemoryBlockCache::MatchResult +PrefixTreeMemoryBlockCache::match(CacheKeyType cache_key, CacheBlockKind kind) { + static const std::vector empty_required_mask; + return match(cache_key, kind, empty_required_mask); +} + +PrefixTreeMemoryBlockCache::MatchResult +PrefixTreeMemoryBlockCache::match(CacheKeyType cache_key, + CacheBlockKind kind, + const std::vector& required_slot_mask) { + std::unique_lock lock(mutex_); + auto it = nodes_.find(cache_key); + if (it == nodes_.end() || !validKind(kind)) { + return {}; + } + auto& state = it->second.kinds[kindIndex(kind)]; + if (!state.has_value || 
state.detached || !slotMaskCovers(state.slot_valid_mask, required_slot_mask)) { + return {}; + } + touchLocked(it->second, kind); + return {true, + state.backing_type, + state.block_index, + state.disk_slot, + state.block_size, + state.generation, + state.created_time_us, + state.slot_valid_mask}; +} + +PrefixTreeMemoryBlockCache::MatchResult +PrefixTreeMemoryBlockCache::matchAndMarkInFlight(CacheKeyType cache_key, CacheBlockKind kind) { + static const std::vector empty_required_mask; + return matchAndMarkInFlight(cache_key, kind, empty_required_mask); +} + +PrefixTreeMemoryBlockCache::MatchResult +PrefixTreeMemoryBlockCache::matchAndMarkInFlight(CacheKeyType cache_key, + CacheBlockKind kind, + const std::vector& required_slot_mask) { + std::unique_lock lock(mutex_); + auto it = nodes_.find(cache_key); + if (it == nodes_.end() || !validKind(kind)) { + return {}; + } + auto& state = it->second.kinds[kindIndex(kind)]; + if (!state.has_value || state.detached || !slotMaskCovers(state.slot_valid_mask, required_slot_mask)) { + return {}; + } + touchLocked(it->second, kind); + state.in_flight_ref++; + eraseEvictKeyLocked(it->second, kind); + return {true, + state.backing_type, + state.block_index, + state.disk_slot, + state.block_size, + state.generation, + state.created_time_us, + state.slot_valid_mask}; +} + +std::pair> +PrefixTreeMemoryBlockCache::putCommitted(CacheKeyType cache_key, + const BlockDependency& dependency, + const CacheItem& input_item) { + RTP_LLM_CHECK_WITH_INFO(validKind(input_item.kind), "invalid prefix-tree memory kind"); + RTP_LLM_CHECK_WITH_INFO(input_item.cache_key == cache_key, "cache key mismatch"); + std::unique_lock lock(mutex_); + auto& node = upsertNodeLocked(cache_key, dependency); + auto& state = node.kinds[kindIndex(input_item.kind)]; + std::optional old_item; + if (state.has_value && !state.detached) { + if (slotMaskCovers(state.slot_valid_mask, input_item.slot_valid_mask)) { + return {false, std::nullopt}; + } + if 
(!slotMaskCovers(input_item.slot_valid_mask, state.slot_valid_mask)) { + return {false, std::nullopt}; + } + old_item = toItemLocked(node, input_item.kind); + eraseEvictKeyLocked(node, input_item.kind); + if (state.in_flight_ref > 0 && old_item.has_value()) { + node.retired_items[kindIndex(input_item.kind)].push_back(RetiredItem{*old_item, state.in_flight_ref}); + old_item.reset(); + } + } else { + incrementAncestorsLocked(cache_key, input_item.kind); + } + + state.has_value = true; + state.detached = false; + state.backing_type = input_item.backing_type; + state.block_index = input_item.block_index; + state.disk_slot = input_item.disk_slot; + state.block_size = input_item.block_size; + state.is_resident = input_item.is_resident; + state.generation = ++generation_seq_; + state.last_access_seq = ++access_seq_; + state.created_time_us = input_item.created_time_us > 0 ? input_item.created_time_us : currentTimeUs(); + state.in_flight_ref = 0; + state.slot_valid_mask = input_item.slot_valid_mask; + insertEvictKeyLocked(node, input_item.kind); + return {true, old_item}; +} + +std::optional +PrefixTreeMemoryBlockCache::detachIfMatch(CacheKeyType cache_key, + CacheBlockKind kind, + CacheBackingType backing_type, + BlockIdxType expected_block_index, + int32_t expected_disk_slot, + uint64_t expected_generation) { + std::unique_lock lock(mutex_); + auto it = nodes_.find(cache_key); + if (it == nodes_.end() || !validKind(kind)) { + return std::nullopt; + } + auto& state = it->second.kinds[kindIndex(kind)]; + if (!state.has_value || state.detached || state.backing_type != backing_type + || state.generation != expected_generation) { + return std::nullopt; + } + if (backing_type == CacheBackingType::MEMORY && state.block_index != expected_block_index) { + return std::nullopt; + } + if (backing_type == CacheBackingType::DISK && state.disk_slot != expected_disk_slot) { + return std::nullopt; + } + auto item = toItemLocked(it->second, kind); + if (!item.has_value()) { + return 
std::nullopt; + } + eraseEvictKeyLocked(it->second, kind); + state.detached = true; + decrementAncestorsLocked(cache_key, kind); + const auto descendant_ref_count = state.subtree_ref_count; + if (state.in_flight_ref == 0) { + state = KindState{}; + state.subtree_ref_count = descendant_ref_count; + pruneLocked(cache_key); + return item; + } + it->second.retired_items[kindIndex(kind)].push_back(RetiredItem{*item, state.in_flight_ref}); + state = KindState{}; + state.subtree_ref_count = descendant_ref_count; + pruneLocked(cache_key); + return std::nullopt; +} + +std::optional +PrefixTreeMemoryBlockCache::releaseInFlight(CacheKeyType cache_key, + CacheBlockKind kind, + CacheBackingType backing_type, + BlockIdxType block_index, + int32_t disk_slot, + uint64_t generation) { + std::unique_lock lock(mutex_); + auto it = nodes_.find(cache_key); + if (it == nodes_.end() || !validKind(kind)) { + return std::nullopt; + } + auto& state = it->second.kinds[kindIndex(kind)]; + if (!state.has_value || state.backing_type != backing_type || state.generation != generation) { + auto& retired_items = it->second.retired_items[kindIndex(kind)]; + for (auto retired_it = retired_items.begin(); retired_it != retired_items.end(); ++retired_it) { + auto& item = retired_it->item; + if (item.backing_type != backing_type || item.generation != generation) { + continue; + } + if (backing_type == CacheBackingType::MEMORY && item.block_index != block_index) { + continue; + } + if (backing_type == CacheBackingType::DISK && item.disk_slot != disk_slot) { + continue; + } + if (retired_it->in_flight_ref > 0) { + retired_it->in_flight_ref--; + } + if (retired_it->in_flight_ref == 0) { + auto released = item; + retired_items.erase(retired_it); + pruneLocked(cache_key); + return released; + } + return std::nullopt; + } + return std::nullopt; + } + if (backing_type == CacheBackingType::MEMORY && state.block_index != block_index) { + return std::nullopt; + } + if (backing_type == CacheBackingType::DISK && 
state.disk_slot != disk_slot) { + return std::nullopt; + } + if (state.in_flight_ref > 0) { + state.in_flight_ref--; + } + if (state.detached && state.in_flight_ref == 0) { + auto released = toItemLocked(it->second, kind); + state = KindState{}; + pruneLocked(cache_key); + return released; + } else if (!state.detached && state.in_flight_ref == 0) { + refreshEvictKeyLocked(it->second, kind); + } + return std::nullopt; +} + +std::optional +PrefixTreeMemoryBlockCache::popOldestEvictable(CacheBlockKind kind) { + std::unique_lock lock(mutex_); + if (!validKind(kind)) { + return std::nullopt; + } + auto& lru = leaf_lru_[kindIndex(kind)]; + for (auto it = lru.begin(); it != lru.end();) { + auto node_it = nodes_.find(it->cache_key); + if (node_it == nodes_.end()) { + it = lru.erase(it); + continue; + } + auto& state = node_it->second.kinds[kindIndex(kind)]; + if (!state.has_value || state.detached || state.last_access_seq != it->last_access_seq + || state.generation != it->generation) { + it = lru.erase(it); + continue; + } + if (state.is_resident || state.in_flight_ref > 0 || !isKindLeafLocked(node_it->second, kind)) { + ++it; + continue; + } + auto item = toItemLocked(node_it->second, kind); + it = lru.erase(it); + state = KindState{}; + decrementAncestorsLocked(item->cache_key, kind); + pruneLocked(item->cache_key); + return item; + } + return std::nullopt; +} + +std::optional +PrefixTreeMemoryBlockCache::popOldestEvictable(CacheBlockKind kind, CacheBackingType backing_type) { + std::unique_lock lock(mutex_); + if (!validKind(kind)) { + return std::nullopt; + } + auto& lru = leaf_lru_[kindIndex(kind)]; + for (auto it = lru.begin(); it != lru.end();) { + auto node_it = nodes_.find(it->cache_key); + if (node_it == nodes_.end()) { + it = lru.erase(it); + continue; + } + auto& state = node_it->second.kinds[kindIndex(kind)]; + if (!state.has_value || state.detached || state.last_access_seq != it->last_access_seq + || state.generation != it->generation) { + it = 
lru.erase(it); + continue; + } + if (state.backing_type != backing_type || state.is_resident || state.in_flight_ref > 0 + || !isKindLeafLocked(node_it->second, kind)) { + ++it; + continue; + } + auto item = toItemLocked(node_it->second, kind); + it = lru.erase(it); + state = KindState{}; + decrementAncestorsLocked(item->cache_key, kind); + pruneLocked(item->cache_key); + return item; + } + return std::nullopt; +} + +std::vector +PrefixTreeMemoryBlockCache::popOldestStateOrChainEvictable(CacheBackingType backing_type) { + std::unique_lock lock(mutex_); + std::vector leaf_keys; + const auto& state_lru = leaf_lru_[kindIndex(CacheBlockKind::STATE_SWA_KV)]; + leaf_keys.reserve(state_lru.size()); + for (const auto& evict_key : state_lru) { + leaf_keys.push_back(evict_key.cache_key); + } + + for (const auto leaf_key : leaf_keys) { + auto item = popStateOnlyFromChainLocked(leaf_key, backing_type); + if (item.has_value()) { + return {*item}; + } + } + for (const auto leaf_key : leaf_keys) { + auto items = popChainLocked(leaf_key, backing_type); + if (!items.empty()) { + return items; + } + } + return {}; +} + +std::vector PrefixTreeMemoryBlockCache::cacheKeys() const { + std::shared_lock lock(mutex_); + std::vector> entries; + for (const auto& [key, node] : nodes_) { + uint64_t latest = 0; + for (const auto& state : node.kinds) { + if (state.has_value && !state.detached) { + latest = std::max(latest, state.last_access_seq); + } + } + if (latest > 0) { + entries.emplace_back(latest, key); + } + } + std::sort(entries.begin(), entries.end(), [](const auto& lhs, const auto& rhs) { + if (lhs.first != rhs.first) { + return lhs.first > rhs.first; + } + return lhs.second < rhs.second; + }); + std::vector keys; + keys.reserve(entries.size()); + for (const auto& [_, key] : entries) { + keys.push_back(key); + } + return keys; +} + +std::vector PrefixTreeMemoryBlockCache::cacheKeysUnorderedForStatus() const { + std::shared_lock lock(mutex_); + std::vector keys; + 
keys.reserve(nodes_.size()); + for (const auto& [key, node] : nodes_) { + for (const auto& state : node.kinds) { + if (state.has_value && !state.detached) { + keys.push_back(key); + break; + } + } + } + return keys; +} + +size_t PrefixTreeMemoryBlockCache::size() const { + std::shared_lock lock(mutex_); + size_t count = 0; + for (const auto& [_, node] : nodes_) { + for (const auto& state : node.kinds) { + if (state.has_value && !state.detached) { + ++count; + } + } + } + return count; +} + +PrefixTreeMemoryBlockCache::Node& +PrefixTreeMemoryBlockCache::upsertNodeLocked(CacheKeyType cache_key, const BlockDependency& dependency) { + auto it = nodes_.find(cache_key); + if (it == nodes_.end()) { + Node node; + node.cache_key = cache_key; + node.parent_key = dependency.parent_key; + node.has_parent = dependency.has_parent && dependency.parent_key != cache_key; + node.ordinal = dependency.ordinal; + auto [inserted_it, _] = nodes_.emplace(cache_key, std::move(node)); + it = inserted_it; + } else { + if (it->second.has_parent + && (it->second.parent_key != dependency.parent_key || !dependency.has_parent + || dependency.parent_key == cache_key)) { + auto old_parent_it = nodes_.find(it->second.parent_key); + if (old_parent_it != nodes_.end()) { + subtractSubtreeRefsFromAncestorsLocked(old_parent_it->first, it->second); + old_parent_it->second.children.erase(cache_key); + } else { + detachPendingChildLocked(it->second.parent_key, cache_key); + } + } + it->second.parent_key = dependency.parent_key; + it->second.has_parent = dependency.has_parent && dependency.parent_key != cache_key; + it->second.ordinal = dependency.ordinal; + } + if (it->second.has_parent) { + auto parent_it = nodes_.find(it->second.parent_key); + if (parent_it != nodes_.end()) { + auto [_, inserted] = parent_it->second.children.insert(cache_key); + if (inserted) { + detachPendingChildLocked(it->second.parent_key, cache_key); + addSubtreeRefsToAncestorsLocked(parent_it->first, it->second); + } + } else { + 
pending_children_by_parent_[it->second.parent_key].insert(cache_key); + } + } + attachPendingChildrenLocked(it->second); + return it->second; +} + +void PrefixTreeMemoryBlockCache::incrementAncestorsLocked(CacheKeyType cache_key, CacheBlockKind kind) { + CacheKeyType cur = cache_key; + while (true) { + auto it = nodes_.find(cur); + if (it == nodes_.end()) { + break; + } + it->second.kinds[kindIndex(kind)].subtree_ref_count++; + refreshEvictKeyLocked(it->second, kind); + if (!it->second.has_parent) { + break; + } + cur = it->second.parent_key; + } +} + +void PrefixTreeMemoryBlockCache::decrementAncestorsLocked(CacheKeyType cache_key, CacheBlockKind kind) { + CacheKeyType cur = cache_key; + while (true) { + auto it = nodes_.find(cur); + if (it == nodes_.end()) { + break; + } + auto& count = it->second.kinds[kindIndex(kind)].subtree_ref_count; + if (count > 0) { + count--; + } + refreshEvictKeyLocked(it->second, kind); + if (!it->second.has_parent) { + break; + } + cur = it->second.parent_key; + } +} + +void PrefixTreeMemoryBlockCache::addSubtreeRefsToAncestorsLocked(CacheKeyType ancestor_key, const Node& child) { + CacheKeyType cur = ancestor_key; + while (true) { + auto it = nodes_.find(cur); + if (it == nodes_.end()) { + break; + } + for (size_t kind_idx = 0; kind_idx < kKindCount; ++kind_idx) { + const auto delta = child.kinds[kind_idx].subtree_ref_count; + if (delta == 0) { + continue; + } + auto kind = kind_idx == 0 ? 
CacheBlockKind::COMPRESSED_KV : CacheBlockKind::STATE_SWA_KV; + eraseEvictKeyLocked(it->second, kind); + it->second.kinds[kind_idx].subtree_ref_count += delta; + insertEvictKeyLocked(it->second, kind); + } + if (!it->second.has_parent) { + break; + } + cur = it->second.parent_key; + } +} + +void PrefixTreeMemoryBlockCache::subtractSubtreeRefsFromAncestorsLocked(CacheKeyType ancestor_key, const Node& child) { + CacheKeyType cur = ancestor_key; + while (true) { + auto it = nodes_.find(cur); + if (it == nodes_.end()) { + break; + } + for (size_t kind_idx = 0; kind_idx < kKindCount; ++kind_idx) { + const auto delta = child.kinds[kind_idx].subtree_ref_count; + if (delta == 0) { + continue; + } + auto kind = kind_idx == 0 ? CacheBlockKind::COMPRESSED_KV : CacheBlockKind::STATE_SWA_KV; + eraseEvictKeyLocked(it->second, kind); + auto& count = it->second.kinds[kind_idx].subtree_ref_count; + count = count > delta ? count - delta : 0; + insertEvictKeyLocked(it->second, kind); + } + if (!it->second.has_parent) { + break; + } + cur = it->second.parent_key; + } +} + +void PrefixTreeMemoryBlockCache::detachPendingChildLocked(CacheKeyType parent_key, CacheKeyType child_key) { + auto pending_it = pending_children_by_parent_.find(parent_key); + if (pending_it == pending_children_by_parent_.end()) { + return; + } + pending_it->second.erase(child_key); + if (pending_it->second.empty()) { + pending_children_by_parent_.erase(pending_it); + } +} + +void PrefixTreeMemoryBlockCache::attachPendingChildrenLocked(Node& node) { + auto pending_it = pending_children_by_parent_.find(node.cache_key); + if (pending_it == pending_children_by_parent_.end()) { + return; + } + auto pending_children = std::move(pending_it->second); + pending_children_by_parent_.erase(pending_it); + for (const auto child_key : pending_children) { + auto child_it = nodes_.find(child_key); + if (child_it == nodes_.end() || !child_it->second.has_parent || child_it->second.parent_key != node.cache_key) { + continue; + } + 
auto [_, inserted] = node.children.insert(child_key); + if (inserted) { + addSubtreeRefsToAncestorsLocked(node.cache_key, child_it->second); + } + } +} + +void PrefixTreeMemoryBlockCache::touchLocked(Node& node, CacheBlockKind kind) { + eraseEvictKeyLocked(node, kind); + auto& state = node.kinds[kindIndex(kind)]; + state.last_access_seq = ++access_seq_; + insertEvictKeyLocked(node, kind); +} + +void PrefixTreeMemoryBlockCache::insertEvictKeyLocked(const Node& node, CacheBlockKind kind) { + const auto& state = node.kinds[kindIndex(kind)]; + if (!state.has_value || state.detached || state.is_resident || state.in_flight_ref > 0 + || !isKindLeafLocked(node, kind)) { + return; + } + leaf_lru_[kindIndex(kind)].insert(EvictKey{state.last_access_seq, node.cache_key, state.generation}); +} + +void PrefixTreeMemoryBlockCache::eraseEvictKeyLocked(const Node& node, CacheBlockKind kind) { + const auto& state = node.kinds[kindIndex(kind)]; + leaf_lru_[kindIndex(kind)].erase(EvictKey{state.last_access_seq, node.cache_key, state.generation}); +} + +void PrefixTreeMemoryBlockCache::refreshEvictKeyLocked(const Node& node, CacheBlockKind kind) { + eraseEvictKeyLocked(node, kind); + insertEvictKeyLocked(node, kind); +} + +void PrefixTreeMemoryBlockCache::pruneLocked(CacheKeyType cache_key) { + auto it = nodes_.find(cache_key); + while (it != nodes_.end()) { + bool has_state = false; + for (const auto& state : it->second.kinds) { + if (state.has_value) { + has_state = true; + break; + } + } + if (!has_state) { + for (const auto& retired_items : it->second.retired_items) { + if (!retired_items.empty()) { + has_state = true; + break; + } + } + } + if (has_state || !it->second.children.empty()) { + break; + } + const bool has_parent = it->second.has_parent; + const auto parent_key = it->second.parent_key; + nodes_.erase(it); + if (!has_parent) { + break; + } + auto parent_it = nodes_.find(parent_key); + if (parent_it == nodes_.end()) { + detachPendingChildLocked(parent_key, cache_key); + 
break; + } + parent_it->second.children.erase(cache_key); + cache_key = parent_key; + it = parent_it; + } +} + +std::optional +PrefixTreeMemoryBlockCache::toItemLocked(const Node& node, CacheBlockKind kind) const { + if (!validKind(kind)) { + return std::nullopt; + } + const auto& state = node.kinds[kindIndex(kind)]; + if (!state.has_value) { + return std::nullopt; + } + return CacheItem{ + node.cache_key, kind, state.backing_type, state.block_index, state.disk_slot, state.block_size, + state.is_resident, state.generation, state.created_time_us, state.slot_valid_mask}; +} + +bool PrefixTreeMemoryBlockCache::isKindLeafLocked(const Node& node, CacheBlockKind kind) const { + const auto& state = node.kinds[kindIndex(kind)]; + if (!state.has_value || state.detached) { + return false; + } + return state.subtree_ref_count <= 1; +} + +std::optional +PrefixTreeMemoryBlockCache::popStateOnlyFromChainLocked(const CacheKeyType& leaf_key, CacheBackingType backing_type) { + auto leaf_it = nodes_.find(leaf_key); + if (leaf_it == nodes_.end()) { + return std::nullopt; + } + std::vector chain; + CacheKeyType cur = leaf_key; + while (true) { + auto node_it = nodes_.find(cur); + if (node_it == nodes_.end()) { + break; + } + chain.push_back(cur); + if (!node_it->second.has_parent) { + break; + } + auto parent_it = nodes_.find(node_it->second.parent_key); + if (parent_it == nodes_.end() || parent_it->second.children.size() != 1) { + break; + } + cur = parent_it->first; + } + if (chain.size() <= 1) { + return std::nullopt; + } + for (size_t idx = 1; idx < chain.size(); ++idx) { + auto node_it = nodes_.find(chain[idx]); + if (node_it == nodes_.end()) { + continue; + } + auto& state = node_it->second.kinds[kindIndex(CacheBlockKind::STATE_SWA_KV)]; + if (!state.has_value || state.detached || state.backing_type != backing_type || state.is_resident + || state.in_flight_ref > 0) { + continue; + } + auto item = toItemLocked(node_it->second, CacheBlockKind::STATE_SWA_KV); + 
eraseEvictKeyLocked(node_it->second, CacheBlockKind::STATE_SWA_KV); + state.detached = true; + decrementAncestorsLocked(item->cache_key, CacheBlockKind::STATE_SWA_KV); + const auto descendant_ref_count = state.subtree_ref_count; + state = KindState{}; + state.subtree_ref_count = descendant_ref_count; + pruneLocked(item->cache_key); + return item; + } + return std::nullopt; +} + +std::vector +PrefixTreeMemoryBlockCache::popChainLocked(const CacheKeyType& leaf_key, CacheBackingType backing_type) { + std::vector items; + auto leaf_it = nodes_.find(leaf_key); + if (leaf_it == nodes_.end()) { + return items; + } + std::vector chain; + CacheKeyType cur = leaf_key; + while (true) { + auto node_it = nodes_.find(cur); + if (node_it == nodes_.end()) { + break; + } + chain.push_back(cur); + if (!node_it->second.has_parent) { + break; + } + auto parent_it = nodes_.find(node_it->second.parent_key); + if (parent_it == nodes_.end() || parent_it->second.children.size() != 1) { + break; + } + cur = parent_it->first; + } + + bool has_target_state = false; + for (const auto key : chain) { + auto node_it = nodes_.find(key); + if (node_it == nodes_.end()) { + continue; + } + const auto& state = node_it->second.kinds[kindIndex(CacheBlockKind::STATE_SWA_KV)]; + if (state.has_value && !state.detached && state.backing_type == backing_type && !state.is_resident + && state.in_flight_ref == 0) { + has_target_state = true; + break; + } + } + if (!has_target_state) { + return items; + } + + for (auto chain_it = chain.begin(); chain_it != chain.end(); ++chain_it) { + auto node_it = nodes_.find(*chain_it); + if (node_it == nodes_.end()) { + continue; + } + for (auto kind : {CacheBlockKind::COMPRESSED_KV, CacheBlockKind::STATE_SWA_KV}) { + auto& state = node_it->second.kinds[kindIndex(kind)]; + if (!state.has_value || state.detached || state.backing_type != backing_type || state.is_resident + || state.in_flight_ref > 0) { + continue; + } + auto item = toItemLocked(node_it->second, kind); + if 
(!item.has_value()) { + continue; + } + eraseEvictKeyLocked(node_it->second, kind); + state.detached = true; + decrementAncestorsLocked(item->cache_key, kind); + const auto descendant_ref_count = state.subtree_ref_count; + state = KindState{}; + state.subtree_ref_count = descendant_ref_count; + items.push_back(*item); + } + pruneLocked(*chain_it); + } + return items; +} + +} // namespace rtp_llm diff --git a/rtp_llm/cpp/cache/connector/memory/PrefixTreeMemoryBlockCache.h b/rtp_llm/cpp/cache/connector/memory/PrefixTreeMemoryBlockCache.h new file mode 100644 index 0000000000..4cb1db4fad --- /dev/null +++ b/rtp_llm/cpp/cache/connector/memory/PrefixTreeMemoryBlockCache.h @@ -0,0 +1,158 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "rtp_llm/cpp/cache/KVCacheResource.h" +#include "rtp_llm/cpp/cache/connector/memory/CacheBlockKind.h" +#include "rtp_llm/cpp/cache/connector/memory/MemoryDiskBlockCache.h" + +namespace rtp_llm { + +class PrefixTreeMemoryBlockCache { +public: + static constexpr size_t kKindCount = 2; + + struct KindState { + bool has_value{false}; + CacheBackingType backing_type{CacheBackingType::MEMORY}; + BlockIdxType block_index{NULL_BLOCK_IDX}; + int32_t disk_slot{-1}; + size_t block_size{0}; + bool is_resident{false}; + bool detached{false}; + uint64_t generation{0}; + uint64_t last_access_seq{0}; + int64_t created_time_us{0}; + uint32_t in_flight_ref{0}; + uint32_t subtree_ref_count{0}; + std::vector slot_valid_mask; + }; + + struct CacheItem { + CacheKeyType cache_key{0}; + CacheBlockKind kind{CacheBlockKind::COMPRESSED_KV}; + CacheBackingType backing_type{CacheBackingType::MEMORY}; + BlockIdxType block_index{NULL_BLOCK_IDX}; + int32_t disk_slot{-1}; + size_t block_size{0}; + bool is_resident{false}; + uint64_t generation{0}; + int64_t created_time_us{0}; + std::vector slot_valid_mask; + }; + + struct MatchResult { + bool found{false}; + CacheBackingType 
backing_type{CacheBackingType::MEMORY}; + BlockIdxType block_index{NULL_BLOCK_IDX}; + int32_t disk_slot{-1}; + size_t block_size{0}; + uint64_t generation{0}; + int64_t created_time_us{0}; + std::vector slot_valid_mask; + }; + + bool contains(CacheKeyType cache_key, CacheBlockKind kind) const; + bool contains(CacheKeyType cache_key, CacheBlockKind kind, const std::vector& required_slot_mask) const; + MatchResult match(CacheKeyType cache_key, CacheBlockKind kind); + MatchResult match(CacheKeyType cache_key, CacheBlockKind kind, const std::vector& required_slot_mask); + MatchResult matchAndMarkInFlight(CacheKeyType cache_key, CacheBlockKind kind); + MatchResult matchAndMarkInFlight(CacheKeyType cache_key, + CacheBlockKind kind, + const std::vector& required_slot_mask); + + std::pair> + putCommitted(CacheKeyType cache_key, const BlockDependency& dependency, const CacheItem& item); + std::optional detachIfMatch(CacheKeyType cache_key, + CacheBlockKind kind, + CacheBackingType backing_type, + BlockIdxType expected_block_index, + int32_t expected_disk_slot, + uint64_t expected_generation); + std::optional releaseInFlight(CacheKeyType cache_key, + CacheBlockKind kind, + CacheBackingType backing_type, + BlockIdxType block_index, + int32_t disk_slot, + uint64_t generation); + + std::optional popOldestEvictable(CacheBlockKind kind); + std::optional popOldestEvictable(CacheBlockKind kind, CacheBackingType backing_type); + std::vector popOldestStateOrChainEvictable(CacheBackingType backing_type); + std::vector cacheKeys() const; + std::vector cacheKeysUnorderedForStatus() const; + size_t size() const; + +private: + struct RetiredItem { + CacheItem item; + uint32_t in_flight_ref{0}; + }; + + struct Node { + CacheKeyType cache_key{0}; + CacheKeyType parent_key{0}; + bool has_parent{false}; + uint32_t ordinal{0}; + std::unordered_set children; + std::array kinds; + std::array, kKindCount> retired_items; + }; + + struct EvictKey { + uint64_t last_access_seq{0}; + CacheKeyType 
cache_key{0}; + uint64_t generation{0}; + + bool operator<(const EvictKey& other) const { + if (last_access_seq != other.last_access_seq) { + return last_access_seq < other.last_access_seq; + } + if (cache_key != other.cache_key) { + return cache_key < other.cache_key; + } + return generation < other.generation; + } + }; + + static size_t kindIndex(CacheBlockKind kind); + static bool validKind(CacheBlockKind kind); + static bool slotMaskCovers(const std::vector& stored, const std::vector& required); + + Node& upsertNodeLocked(CacheKeyType cache_key, const BlockDependency& dependency); + void incrementAncestorsLocked(CacheKeyType cache_key, CacheBlockKind kind); + void decrementAncestorsLocked(CacheKeyType cache_key, CacheBlockKind kind); + void addSubtreeRefsToAncestorsLocked(CacheKeyType ancestor_key, const Node& child); + void subtractSubtreeRefsFromAncestorsLocked(CacheKeyType ancestor_key, const Node& child); + void detachPendingChildLocked(CacheKeyType parent_key, CacheKeyType child_key); + void attachPendingChildrenLocked(Node& node); + void touchLocked(Node& node, CacheBlockKind kind); + void insertEvictKeyLocked(const Node& node, CacheBlockKind kind); + void eraseEvictKeyLocked(const Node& node, CacheBlockKind kind); + void refreshEvictKeyLocked(const Node& node, CacheBlockKind kind); + void pruneLocked(CacheKeyType cache_key); + std::optional toItemLocked(const Node& node, CacheBlockKind kind) const; + bool isKindLeafLocked(const Node& node, CacheBlockKind kind) const; + std::optional popStateOnlyFromChainLocked(const CacheKeyType& leaf_key, CacheBackingType backing_type); + std::vector popChainLocked(const CacheKeyType& leaf_key, CacheBackingType backing_type); + +private: + mutable std::shared_mutex mutex_; + std::unordered_map nodes_; + std::unordered_map> pending_children_by_parent_; + std::array, kKindCount> leaf_lru_; + uint64_t access_seq_{0}; + uint64_t generation_seq_{0}; +}; + +using PrefixTreeMemoryBlockCachePtr = std::shared_ptr; + +} // 
namespace rtp_llm diff --git a/rtp_llm/cpp/cache/connector/memory/test/BUILD b/rtp_llm/cpp/cache/connector/memory/test/BUILD index 473bb33475..bc8b5c2050 100644 --- a/rtp_llm/cpp/cache/connector/memory/test/BUILD +++ b/rtp_llm/cpp/cache/connector/memory/test/BUILD @@ -7,6 +7,7 @@ test_deps = [ "//rtp_llm/models_py/bindings/cuda/ops:cuda_impl", "//rtp_llm/models_py/bindings/core:exec_ops_test_lib", "//rtp_llm/models_py/bindings/cuda:no_block_copy", + "//rtp_llm/cpp/cache:kv_cache_allocator", "//rtp_llm/cpp/config:config_modules", "//rtp_llm/cpp/config:model_config", "//rtp_llm/cpp/utils:core_utils", @@ -36,6 +37,31 @@ cc_test( exec_properties = {'gpu':'H20'}, ) +cc_test( + name = "memory_connector_batch_copy_test", + srcs = [ + "KVCacheBatchedMemoryCopyTest.cc", + ], + data = [], + copts = test_copts, + deps = [ + "//rtp_llm/cpp/cache:block_pool", + "//rtp_llm/cpp/cache:cache_core", + "//rtp_llm/cpp/cache:kv_cache_allocator_hdr", + "//rtp_llm/cpp/cache/connector/memory:memory_connector", + "//rtp_llm/cpp/cache/test:cache_config_test_utils", + "//rtp_llm/cpp/config:config_modules", + "//rtp_llm/cpp/config:model_config", + "//rtp_llm/cpp/utils:core_utils", + "//rtp_llm/models_py/bindings/cuda:no_block_copy", + "@com_google_googletest//:gtest", + "@local_config_cuda//cuda:cuda_headers", + "@local_config_cuda//cuda:cudart", + ] + torch_deps(), + env = {}, + exec_properties = {'gpu':'H20'}, +) + cc_test( name = "memory_block_cache_test", srcs = [ @@ -50,6 +76,48 @@ cc_test( exec_properties = {'gpu':'H20'}, ) +cc_test( + name = "memory_disk_block_cache_test", + srcs = [ + "MemoryDiskBlockCacheTest.cc", + ], + data = [], + copts = test_copts, + deps = test_deps + [ + "//rtp_llm/cpp/cache/connector/memory:memory_connector", + ], + env = {}, + exec_properties = {'gpu':'H20'}, +) + +cc_test( + name = "prefix_tree_memory_block_cache_test", + srcs = [ + "PrefixTreeMemoryBlockCacheTest.cc", + ], + data = [], + copts = test_copts, + deps = test_deps + [ + 
"//rtp_llm/cpp/cache/connector/memory:memory_connector", + ], + env = {}, + exec_properties = {'gpu':'H20'}, +) + +cc_test( + name = "disk_block_pool_test", + srcs = [ + "DiskBlockPoolTest.cc", + ], + data = [], + copts = test_copts, + deps = test_deps + [ + "//rtp_llm/cpp/cache/connector/memory:memory_connector", + ], + env = {}, + exec_properties = {'gpu':'H20'}, +) + cc_test( name = "memory_async_context_test", srcs = [ @@ -62,4 +130,4 @@ cc_test( ], env = {}, exec_properties = {'gpu':'H20'}, -) \ No newline at end of file +) diff --git a/rtp_llm/cpp/cache/connector/memory/test/DiskBlockPoolTest.cc b/rtp_llm/cpp/cache/connector/memory/test/DiskBlockPoolTest.cc new file mode 100644 index 0000000000..384e2af160 --- /dev/null +++ b/rtp_llm/cpp/cache/connector/memory/test/DiskBlockPoolTest.cc @@ -0,0 +1,193 @@ +#include "gtest/gtest.h" + +#include +#include +#include +#include + +#include +#include +#include +#include + +#include "rtp_llm/cpp/cache/connector/memory/DiskBlockPool.h" + +namespace rtp_llm::test { +namespace { + +class TempDir { +public: + TempDir() { + char tmpl[] = "/tmp/rtp_disk_pool_test_XXXXXX"; + auto path = ::mkdtemp(tmpl); + EXPECT_NE(path, nullptr); + if (path != nullptr) { + path_ = path; + } + } + ~TempDir() { + if (path_.empty()) { + return; + } + const auto work_dir = path_ + "/rtp_llm_disk_kv"; + if (auto* dir = ::opendir(work_dir.c_str())) { + while (auto* entry = ::readdir(dir)) { + const std::string name(entry->d_name); + if (name == "." 
|| name == "..") { + continue; + } + ::unlink((work_dir + "/" + name).c_str()); + } + ::closedir(dir); + } + ::rmdir(work_dir.c_str()); + ::rmdir(path_.c_str()); + } + const std::string& path() const { + return path_; + } + +private: + std::string path_; +}; + +DiskBlockPoolConfig makeConfig(const std::string& path, size_t disk_size_bytes = 3 * 4096) { + DiskBlockPoolConfig config; + config.work_dir = path; + config.local_rank = 0; + config.world_rank = 0; + config.disk_size_bytes = disk_size_bytes; + config.block_size_bytes = 1024; + config.buffered_io = true; + config.pool_kind = CacheBlockKind::COMPLETE; + return config; +} + +} // namespace + +TEST(DiskBlockPoolTest, InitPreallocatesFileAndCleansStaleFiles) { + TempDir temp_dir; + ASSERT_FALSE(temp_dir.path().empty()); + + const auto work_dir = temp_dir.path() + "/rtp_llm_disk_kv"; + ASSERT_EQ(::mkdir(work_dir.c_str(), 0755), 0); + const auto stale = work_dir + "/rank_stale.kv"; + int fd = ::open(stale.c_str(), O_CREAT | O_WRONLY, 0600); + ASSERT_GE(fd, 0); + ::close(fd); + ASSERT_EQ(::access(stale.c_str(), F_OK), 0); + + DiskMountGuard guard; + ASSERT_TRUE(guard.init(temp_dir.path())); + + DiskBlockPool pool(makeConfig(guard.workDir())); + ASSERT_TRUE(pool.init()); + EXPECT_EQ(::access(stale.c_str(), F_OK), -1); + EXPECT_EQ(::access(pool.filePath().c_str(), F_OK), 0); + EXPECT_NE(pool.filePath().find("rank_0_world_0_complete.kv"), std::string::npos); + EXPECT_EQ(pool.totalSlots(), 3u); + EXPECT_EQ(pool.freeSlots(), 3u); +} + +TEST(DiskBlockPoolTest, InitFailsWhenMountPathDoesNotExist) { + TempDir temp_dir; + ASSERT_FALSE(temp_dir.path().empty()); + + DiskMountGuard guard; + EXPECT_FALSE(guard.init(temp_dir.path() + "/missing_mount")); +} + +TEST(DiskBlockPoolTest, MountGuardAllowsTwoPoolsOnSameMountWithoutDeletingFirst) { + TempDir temp_dir; + ASSERT_FALSE(temp_dir.path().empty()); + + DiskMountGuard guard; + ASSERT_TRUE(guard.init(temp_dir.path())); + + DiskBlockPool 
complete_pool(makeConfig(guard.workDir())); + ASSERT_TRUE(complete_pool.init()); + ASSERT_EQ(::access(complete_pool.filePath().c_str(), F_OK), 0); + + auto incomplete_cfg = makeConfig(guard.workDir(), 6 * 4096); + incomplete_cfg.pool_kind = CacheBlockKind::INCOMPLETE; + incomplete_cfg.local_rank = 0; + incomplete_cfg.world_rank = 0; + DiskBlockPool incomplete_pool(incomplete_cfg); + ASSERT_TRUE(incomplete_pool.init()); + + EXPECT_EQ(::access(complete_pool.filePath().c_str(), F_OK), 0); + EXPECT_EQ(::access(incomplete_pool.filePath().c_str(), F_OK), 0); + EXPECT_NE(complete_pool.filePath(), incomplete_pool.filePath()); +} + +TEST(DiskBlockPoolTest, ReserveCommitAbortAndFreeSlots) { + TempDir temp_dir; + DiskMountGuard guard; + ASSERT_TRUE(guard.init(temp_dir.path())); + DiskBlockPool pool(makeConfig(guard.workDir())); + ASSERT_TRUE(pool.init()); + + auto slot = pool.malloc(); + ASSERT_TRUE(slot.has_value()); + EXPECT_EQ(pool.freeSlots(), 2u); + + pool.blockCacheReference(*slot); + pool.requestFree(*slot); + EXPECT_EQ(pool.freeSlots(), 2u); + EXPECT_EQ(pool.availableSlots(), 3u); + + pool.blockCacheFree(*slot); + EXPECT_EQ(pool.freeSlots(), 3u); +} + +TEST(DiskBlockPoolTest, RequestRefPreventsReuseUntilReleased) { + TempDir temp_dir; + DiskMountGuard guard; + ASSERT_TRUE(guard.init(temp_dir.path())); + DiskBlockPool pool(makeConfig(guard.workDir())); + ASSERT_TRUE(pool.init()); + + auto slot = pool.malloc(); + ASSERT_TRUE(slot.has_value()); + pool.blockCacheReference(*slot); + pool.requestReference(*slot); + + pool.blockCacheFree(*slot); + pool.requestFree(*slot); + EXPECT_EQ(pool.freeSlots(), 2u); + + pool.requestFree(*slot); + EXPECT_EQ(pool.freeSlots(), 3u); +} + +TEST(DiskBlockPoolTest, ReadWriteFullSlot) { + TempDir temp_dir; + DiskMountGuard guard; + ASSERT_TRUE(guard.init(temp_dir.path())); + DiskBlockPool pool(makeConfig(guard.workDir())); + ASSERT_TRUE(pool.init()); + + auto slot = pool.malloc(); + ASSERT_TRUE(slot.has_value()); + std::vector 
write_buf(pool.slotStrideBytes(), 0x5a); + std::vector read_buf(pool.slotStrideBytes(), 0); + + ASSERT_TRUE(pool.write(*slot, write_buf.data(), write_buf.size())); + ASSERT_TRUE(pool.read(*slot, read_buf.data(), read_buf.size())); + EXPECT_EQ(read_buf, write_buf); + EXPECT_EQ(pool.writeBytes(), write_buf.size()); + EXPECT_EQ(pool.readBytes(), read_buf.size()); +} + +TEST(DiskBlockPoolTest, FullPoolReturnsNullopt) { + TempDir temp_dir; + DiskMountGuard guard; + ASSERT_TRUE(guard.init(temp_dir.path())); + DiskBlockPool pool(makeConfig(guard.workDir(), 2 * 4096)); + ASSERT_TRUE(pool.init()); + ASSERT_TRUE(pool.malloc().has_value()); + ASSERT_TRUE(pool.malloc().has_value()); + EXPECT_FALSE(pool.malloc().has_value()); +} + +} // namespace rtp_llm::test diff --git a/rtp_llm/cpp/cache/connector/memory/test/KVCacheBatchedMemoryCopyTest.cc b/rtp_llm/cpp/cache/connector/memory/test/KVCacheBatchedMemoryCopyTest.cc new file mode 100644 index 0000000000..9b4eebc35b --- /dev/null +++ b/rtp_llm/cpp/cache/connector/memory/test/KVCacheBatchedMemoryCopyTest.cc @@ -0,0 +1,469 @@ +// Copyright (c) RTP-LLM + +#include +#include +#include +#include +#include +#include + +#include +#include +#include "gtest/gtest.h" + +#include "rtp_llm/cpp/cache/BlockPool.h" +#include "rtp_llm/cpp/cache/config_creator/CacheConfigCreator.h" +#include "rtp_llm/cpp/cache/config_creator/HybridPoolConfigCreator.h" +#include "rtp_llm/cpp/cache/allocator/KVCacheAllocator.h" +#include "rtp_llm/cpp/cache/spec/MHAKVCacheSpec.h" +#include "rtp_llm/cpp/cache/connector/memory/KVCacheMemoryConnector.h" +#include "rtp_llm/cpp/config/ConfigModules.h" +#include "rtp_llm/cpp/config/ModelConfig.h" +#include "rtp_llm/cpp/utils/Logger.h" +#include "rtp_llm/models_py/bindings/core/OpData.h" + +namespace rtp_llm { + +void execBatchCopy(const BatchCopyParams&) {} + +} // namespace rtp_llm + +namespace rtp_llm::test { +namespace { + +CacheConfig makeCompactDsv4TypedMemoryCopyConfig(bool use_flash) { + CacheConfig config; + 
config.dtype = rtp_llm::DataType::TYPE_UINT8; + config.layer_num = use_flash ? 43 : 61; + config.layer_all_num = config.layer_num; + config.block_num = 512; + config.seq_size_per_block = 256; + config.kernel_seq_size_per_block = 256; + config.use_independent_block_pools = true; + config.use_typed_cache_regions = true; + config.use_opaque_kv_cache_store = true; + config.is_sparse = true; + + constexpr size_t kDsv4PoolNum = 7; + const std::vector group_tags = { + "csa_kv", "hca_kv", "indexer_kv", "indexer_state", "csa_state", "hca_state", "swa_kv"}; + const std::vector group_types = {CacheGroupType::FULL, + CacheGroupType::FULL, + CacheGroupType::FULL, + CacheGroupType::SWA, + CacheGroupType::SWA, + CacheGroupType::SWA, + CacheGroupType::SWA}; + std::vector group_policies; + group_policies.reserve(kDsv4PoolNum); + for (const auto type : group_types) { + group_policies.push_back(defaultCacheGroupPolicy(type)); + } + group_policies[5].reuse_policy = CacheReusePolicy::NON_REUSABLE; + group_policies[5].active_tail_blocks = 1; + group_policies[5].validate_tail_blocks = false; + for (size_t gid : {3u, 4u, 5u, 6u}) { + group_policies[gid].evict_policy = CacheEvictPolicy::INDEPENDENT; + } + const std::vector group_kv_block_stride_bytes = {64, 16, 32, 48, 80, 40, 96}; + const std::vector group_kv_scale_stride_bytes(kDsv4PoolNum, 0); + config.group_seq_size_per_block = std::vector(kDsv4PoolNum, config.seq_size_per_block); + const std::vector group_block_nums(kDsv4PoolNum, config.block_num); + std::vector> layers_by_group(kDsv4PoolNum); + config.layer_to_block_stride_bytes = std::vector(config.layer_all_num, 0); + + auto make_spec = [&](size_t gid) -> KVCacheSpecPtr { + if (group_types[gid] == CacheGroupType::FULL) { + auto spec = std::make_shared(); + spec->type = KVCacheSpecType::OpaqueKV; + spec->dtype = config.dtype; + spec->store_dtype = config.dtype; + spec->entry_elems = static_cast(group_kv_block_stride_bytes[gid]); + spec->entries_per_block = 1; + 
spec->seq_size_per_block = static_cast(config.seq_size_per_block); + spec->tag = group_tags[gid]; + return spec; + } + auto spec = std::make_shared(); + spec->type = KVCacheSpecType::OpaqueState; + spec->dtype = config.dtype; + spec->store_dtype = config.dtype; + spec->state_dim = static_cast(group_kv_block_stride_bytes[gid]); + spec->entries_per_block = 1; + spec->seq_size_per_block = static_cast(config.seq_size_per_block); + spec->tag = group_tags[gid]; + return spec; + }; + + auto add_tag = [&](size_t layer, const std::string& tag, int gid) { + (void)tag; + layers_by_group[static_cast(gid)].push_back(static_cast(layer)); + }; + + for (size_t layer = 0; layer < config.layer_all_num; ++layer) { + const bool is_csa = layer >= 2 && layer % 2 == 0; + const bool is_hca = use_flash ? (layer >= 2 && layer % 2 == 1) : (!is_csa); + if (is_csa) { + add_tag(layer, "csa_kv", 0); + add_tag(layer, "indexer_kv", 2); + add_tag(layer, "indexer_state", 3); + add_tag(layer, "csa_state", 4); + } else if (is_hca) { + add_tag(layer, "hca_kv", 1); + add_tag(layer, "hca_state", 5); + } + add_tag(layer, "swa_kv", 6); + } + + std::vector specs; + specs.reserve(kDsv4PoolNum); + for (size_t gid = 0; gid < kDsv4PoolNum; ++gid) { + specs.push_back(make_spec(gid)); + } + config.fromGroupedSpecs(specs, layers_by_group, group_types, group_tags); + config.setGroupPolicies(group_policies); + config.setGroupBlockLayout(group_block_nums, group_kv_block_stride_bytes, group_kv_scale_stride_bytes); + return config; +} + +char copyTag(size_t index) { + return static_cast(33 + (index % 90)); +} + +size_t sumBlockInfosBytes(const std::vector& infos) { + size_t total = 0; + for (const auto& b : infos) { + if (b.addr && b.size_bytes > 0) { + total += b.size_bytes; + } + } + return total; +} + +void setBlockBytes(const BlockInfo& b, size_t byte_offset, size_t byte_len, char c) { + ASSERT_NE(b.addr, nullptr); + ASSERT_LE(byte_offset + byte_len, b.size_bytes); + auto* addr = static_cast(b.addr) + byte_offset; 
+ if (b.is_cuda) { + const auto rc = cudaMemset(addr, c, byte_len); + ASSERT_EQ(rc, cudaSuccess) << cudaGetErrorString(rc); + const auto sync_rc = cudaDeviceSynchronize(); + ASSERT_EQ(sync_rc, cudaSuccess) << cudaGetErrorString(sync_rc); + } else { + memset(addr, c, byte_len); + } +} + +void verifyBlockBytesEq(const BlockInfo& b, size_t byte_offset, size_t byte_len, char expected) { + ASSERT_NE(b.addr, nullptr); + ASSERT_LE(byte_offset + byte_len, b.size_bytes); + auto* addr = static_cast(b.addr) + byte_offset; + + std::vector data(byte_len, 0); + if (b.is_cuda) { + const auto rc = cudaMemcpy(data.data(), addr, byte_len, cudaMemcpyDeviceToHost); + ASSERT_EQ(rc, cudaSuccess) << cudaGetErrorString(rc); + } else { + memcpy(data.data(), addr, byte_len); + } + size_t mismatch = 0; + for (; mismatch < byte_len; ++mismatch) { + if (data[mismatch] != static_cast(expected)) { + break; + } + } + ASSERT_EQ(mismatch, byte_len) << "mismatch at byte offset " << mismatch << " expect '" << expected << "' got 0x" + << std::hex << static_cast(data[mismatch]) << std::dec; +} + +void setBlockInfosContent(const std::vector& infos, char c) { + for (const auto& b : infos) { + if (b.addr && b.size_bytes > 0) { + setBlockBytes(b, /*byte_offset=*/0, b.size_bytes, c); + } + } +} + +void verifyBlockInfosContent(const std::vector& infos, char c) { + for (const auto& b : infos) { + if (b.addr && b.size_bytes > 0) { + verifyBlockBytesEq(b, /*byte_offset=*/0, b.size_bytes, c); + } + } +} + +class FakeTypedKVCacheAllocator: public KVCacheAllocator { +public: + explicit FakeTypedKVCacheAllocator(const CacheConfig& config, + size_t payload_gap_bytes = 0, + std::set host_groups = {}): + KVCacheAllocator(config, AllocationType::DEVICE), + host_groups_(std::move(host_groups)), + payload_gap_bytes_(payload_gap_bytes) { + const auto cuda_options = torch::TensorOptions().dtype(torch::kUInt8).device(torch::kCUDA); + const auto host_options = torch::TensorOptions().dtype(torch::kUInt8).device(torch::kCPU); 
+ const auto layer_group_ids = config.layerGroupIdsSnapshot(); + const auto kv_strides = config.groupKvBlockStrideBytesSnapshot(); + const auto scale_strides = config.groupKvScaleStrideBytesSnapshot(); + for (int layer = 0; layer < static_cast(config.layer_all_num); ++layer) { + if (static_cast(layer) >= layer_group_ids.size()) { + continue; + } + const auto& layer_groups = layer_group_ids[static_cast(layer)]; + for (const int gid : layer_groups) { + if (gid < 0 || static_cast(gid) >= kv_strides.size()) { + continue; + } + const size_t stride = kv_strides[static_cast(gid)] + + (static_cast(gid) < scale_strides.size() ? + scale_strides[static_cast(gid)] : + 0); + if (stride == 0) { + continue; + } + const bool host_group = host_groups_.count(gid) > 0; + auto tensor = torch::empty({static_cast(config.block_num), static_cast(stride)}, + host_group ? host_options : cuda_options); + if (host_group) { + tensor = tensor.pin_memory(); + } + tensors_[key(layer, gid)] = std::move(tensor); + strides_[key(layer, gid)] = stride; + } + } + } + + void free(const FreeInfo&) override {} + void insertIntoCache(const InsertInfo&) override {} + + BlockAddrInfo convertIndexToAddr(int layer_id, int block_id) const override { + return convertIndexToAddr(layer_id, 0, block_id); + } + + BlockAddrInfo convertIndexToAddr(int layer_id, int group_id, int block_id) const override { + const auto buffers = convertIndexToBuffer(layer_id, group_id, block_id); + return buffers.empty() ? 
BlockAddrInfo{} : BlockAddrInfo{buffers[0].addr, nullptr}; + } + + std::vector convertIndexToBuffer(int layer_id, int block_id) const override { + return convertIndexToBuffer(layer_id, 0, block_id); + } + + std::vector convertIndexToBuffer(int layer_id, int block_id, int, int) const override { + return convertIndexToBuffer(layer_id, block_id); + } + + std::vector + convertIndexToBuffer(int layer_id, int group_id, int block_id) const override { + const auto k = key(layer_id, group_id); + const auto tensor_it = tensors_.find(k); + const auto stride_it = strides_.find(k); + if (tensor_it == tensors_.end() || stride_it == strides_.end() || block_id < 0 + || static_cast(block_id) >= config_.block_num) { + return {}; + } + const auto& tensor = tensor_it->second; + const auto stride = stride_it->second; + auto* addr = static_cast(tensor.data_ptr()) + static_cast(block_id) * stride; + const auto payload_size = payload_gap_bytes_ < stride ? stride - payload_gap_bytes_ : stride; + return {BlockInfo{ + /*is_cuda=*/tensor.is_cuda(), + /*device_index=*/tensor.is_cuda() ? 
static_cast(tensor.get_device()) : -1, + /*scalar_type=*/static_cast(tensor.scalar_type()), + /*addr=*/addr, + /*size_bytes=*/payload_size, + }}; + } + + std::vector + convertIndexToBuffer(int layer_id, int group_id, int block_id, int, int) const override { + return convertIndexToBuffer(layer_id, group_id, block_id); + } + + std::shared_ptr incrKVCacheRef(const KVCacheResource&, const CacheKeysType&, bool) override { + return nullptr; + } + + CacheLayerLayout allLayerCacheBase() const override { + return {}; + } + + bool + updateKVBlock(const BatchKVCacheResourcePtr&, const std::vector&, bool, std::vector&) override { + return false; + } + + int seqSizePerBlock() const override { + return static_cast(config_.seq_size_per_block); + } + + int singleBatchNeedBlocks(const BatchKVCacheResourcePtr&, int, int) const override { + return 0; + } + +private: + static std::pair key(int layer_id, int group_id) { + return {layer_id, group_id}; + } + + bool doInit() override { + return true; + } + + MallocResult incrMalloc(const MallocInfo&) override { + return {false, 0}; + } + + MallocResult initMallocForCommonLen(const MallocInfo&) override { + return {false, 0}; + } + + int getNeedBlocks(const MallocInfo&) const override { + return 0; + } + + void decrKVCacheRef(const KVCacheResource&, bool) override {} + + std::map, torch::Tensor> tensors_; + std::map, size_t> strides_; + std::set host_groups_; + size_t payload_gap_bytes_ = 0; +}; + +} // namespace + +void runDsv4TypedStagedCopyRoundTrip(const std::set& host_groups) { + const auto set_device_rc = cudaSetDevice(0); + ASSERT_EQ(set_device_rc, cudaSuccess) << cudaGetErrorString(set_device_rc); + + auto config = makeCompactDsv4TypedMemoryCopyConfig(/*use_flash=*/true); + + KVCacheConfig kv_config; + kv_config.memory_cache_size_mb = 64; + kv_config.memory_cache_sync_timeout_ms = 1000; + kv_config.enable_prefix_tree_memory_cache = false; + + auto allocator = std::make_shared(config, /*payload_gap_bytes=*/8, host_groups); + + 
std::vector server_addrs = {"127.0.0.1:1"}; + auto connector = std::make_shared(config, kv_config, allocator, server_addrs); + ASSERT_TRUE(connector->init()); + auto memory_pool = connector->isDualPool() ? connector->complete_pool_ : connector->block_pool_; + ASSERT_NE(memory_pool, nullptr); + + const auto slots = connector->layerTagSlots(); + ASSERT_TRUE(connector->hasTypedLayerTagSlots(slots)); + ASSERT_GT(slots.size(), config.layer_all_num); + + auto mem_blocks = memory_pool->malloc(2); + ASSERT_EQ(mem_blocks.size(), 2u); + const std::vector request_mem_blocks{static_cast(mem_blocks[1]), + static_cast(mem_blocks[0])}; + + MemoryOperationRequestPB req; + std::vector> gpu_block_sets(request_mem_blocks.size(), + std::vector(slots.size(), NULL_BLOCK_IDX)); + BlockIdxType next_gpu_block = 1; + for (auto& gpu_blocks : gpu_block_sets) { + for (auto& gpu_block : gpu_blocks) { + gpu_block = next_gpu_block++; + } + } + ASSERT_LT(next_gpu_block, static_cast(config.block_num)); + ASSERT_EQ(gpu_block_sets.size(), request_mem_blocks.size()); + for (size_t block_idx = 0; block_idx < request_mem_blocks.size(); ++block_idx) { + auto* item = req.add_copy_items(); + item->set_mem_block(request_mem_blocks[block_idx]); + item->set_is_complete(true); + ASSERT_EQ(gpu_block_sets[block_idx].size(), slots.size()); + for (const auto block : gpu_block_sets[block_idx]) { + item->add_gpu_blocks(block); + } + } + + for (size_t block_idx = 0; block_idx < request_mem_blocks.size(); ++block_idx) { + const auto mem_bufs = memory_pool->convertIndexToBuffer(0, request_mem_blocks[block_idx]); + ASSERT_EQ(mem_bufs.size(), 1u); + const auto& mem_buffer = mem_bufs[0]; + ASSERT_NE(mem_buffer.addr, nullptr); + setBlockBytes(mem_buffer, /*byte_offset=*/0, mem_buffer.size_bytes, '#'); + + size_t byte_off = 0; + for (size_t i = 0; i < slots.size(); ++i) { + const auto& slot = slots[i]; + const char tag = copyTag(block_idx * slots.size() + i); + const auto gpu_bufs = + 
allocator->convertIndexToBuffer(slot.layer_id, slot.group_id, gpu_block_sets[block_idx][i]); + ASSERT_GT(sumBlockInfosBytes(gpu_bufs), 0u); + ASSERT_LE(sumBlockInfosBytes(gpu_bufs), slot.stride_bytes); + setBlockInfosContent(gpu_bufs, tag); + setBlockBytes(mem_buffer, byte_off, sumBlockInfosBytes(gpu_bufs), 0); + byte_off += slot.stride_bytes; + } + } + + ASSERT_TRUE(connector->tryCopyCacheWithStagedMemoryCopy(req, KVCacheMemoryConnector::CopyDirection::D2H, slots)); + + for (size_t block_idx = 0; block_idx < request_mem_blocks.size(); ++block_idx) { + const auto mem_bufs = memory_pool->convertIndexToBuffer(0, request_mem_blocks[block_idx]); + ASSERT_EQ(mem_bufs.size(), 1u); + const auto& mem_buffer = mem_bufs[0]; + + size_t byte_off = 0; + for (size_t i = 0; i < slots.size(); ++i) { + const auto& slot = slots[i]; + const auto gpu_bufs = + allocator->convertIndexToBuffer(slot.layer_id, slot.group_id, gpu_block_sets[block_idx][i]); + verifyBlockBytesEq( + mem_buffer, byte_off, sumBlockInfosBytes(gpu_bufs), copyTag(block_idx * slots.size() + i)); + if (slot.stride_bytes > sumBlockInfosBytes(gpu_bufs)) { + verifyBlockBytesEq(mem_buffer, + byte_off + sumBlockInfosBytes(gpu_bufs), + slot.stride_bytes - sumBlockInfosBytes(gpu_bufs), + '#'); + } + byte_off += slot.stride_bytes; + } + } + + for (size_t block_idx = 0; block_idx < request_mem_blocks.size(); ++block_idx) { + const auto mem_bufs = memory_pool->convertIndexToBuffer(0, request_mem_blocks[block_idx]); + ASSERT_EQ(mem_bufs.size(), 1u); + const auto& mem_buffer = mem_bufs[0]; + + size_t byte_off = 0; + for (size_t i = 0; i < slots.size(); ++i) { + const auto& slot = slots[i]; + const char tag = copyTag(1000 + block_idx * slots.size() + i); + const auto gpu_bufs = + allocator->convertIndexToBuffer(slot.layer_id, slot.group_id, gpu_block_sets[block_idx][i]); + setBlockInfosContent(gpu_bufs, 0); + setBlockBytes(mem_buffer, byte_off, sumBlockInfosBytes(gpu_bufs), tag); + byte_off += slot.stride_bytes; + } + } + + 
ASSERT_TRUE(connector->tryCopyCacheWithStagedMemoryCopy(req, KVCacheMemoryConnector::CopyDirection::H2D, slots)); + + for (size_t block_idx = 0; block_idx < request_mem_blocks.size(); ++block_idx) { + for (size_t i = 0; i < slots.size(); ++i) { + const auto& slot = slots[i]; + const auto gpu_bufs = + allocator->convertIndexToBuffer(slot.layer_id, slot.group_id, gpu_block_sets[block_idx][i]); + verifyBlockInfosContent(gpu_bufs, copyTag(1000 + block_idx * slots.size() + i)); + } + } +} + +TEST(KVCacheBatchedMemoryCopyTest, Dsv4TypedLayoutUsesStagedCopyForD2HAndH2D) { + runDsv4TypedStagedCopyRoundTrip({}); +} + +} // namespace rtp_llm::test + +int main(int argc, char** argv) { + rtp_llm::initLogger(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} + diff --git a/rtp_llm/cpp/cache/connector/memory/test/KVCacheMemoryConnectorTest.cc b/rtp_llm/cpp/cache/connector/memory/test/KVCacheMemoryConnectorTest.cc index ea2a74b2ed..d771373244 100644 --- a/rtp_llm/cpp/cache/connector/memory/test/KVCacheMemoryConnectorTest.cc +++ b/rtp_llm/cpp/cache/connector/memory/test/KVCacheMemoryConnectorTest.cc @@ -2,7 +2,13 @@ #include #include +#include +#include +#include #include +#include +#include +#include #include #include @@ -15,9 +21,11 @@ #include "rtp_llm/cpp/cache/connector/memory/MemoryAsyncContext.h" #include "rtp_llm/cpp/cache/connector/memory/MemoryBlockCache.h" #include "rtp_llm/cpp/cache/connector/memory/test/mock/TestRpcService.h" -#include "rtp_llm/cpp/cache/KVCacheAllocator.h" -#include "rtp_llm/cpp/cache/MLAKVCacheSpec.h" -#include "rtp_llm/cpp/cache/SingleTypeKVCacheAllocator.h" +#include "rtp_llm/cpp/cache/allocator/KVCacheAllocator.h" +#include "rtp_llm/cpp/cache/spec/MLAKVCacheSpec.h" +#include "rtp_llm/cpp/cache/spec/OpaqueKVCacheSpec.h" +#include "rtp_llm/cpp/cache/allocator/SingleTypeKVCacheAllocator.h" +#include "rtp_llm/cpp/cache/allocator/HybridTypeKVCacheAllocator.h" #include "rtp_llm/models_py/bindings/cuda/cuda_host_utils.h" #include 
"rtp_llm/models_py/bindings/core/ExecOps.h" #include "rtp_llm/cpp/model_rpc/proto/model_rpc_service.pb.h" @@ -25,6 +33,7 @@ #include "rtp_llm/cpp/config/ConfigModules.h" #include "rtp_llm/cpp/config/EplbConfig.h" #include "rtp_llm/cpp/config/ModelConfig.h" +#include "rtp_llm/cpp/config/StaticConfig.h" namespace rtp_llm::test { @@ -58,6 +67,38 @@ struct CrashHandlerInstaller { static CrashHandlerInstaller g_crash_handler_installer; +void initResourceGroupsForConfig(KVCacheResource& resource, const CacheConfig& config) { + resource.initGroups(config.groupNums(), + static_cast(config.layer_all_num), + config.layerGroupIdsSnapshot(), + /*kernel_blocks_per_kv_block=*/1, + config.groupTypesSnapshot()); +} + +void setGroupStridesForConfig(CacheConfig& config, + const std::vector& kv_block_stride_bytes, + const std::vector& kv_scale_stride_bytes) { + std::vector block_nums = config.groupBlockNumsSnapshot(); + if (block_nums.empty()) { + block_nums.assign(static_cast(config.groupNums()), config.block_num); + } + config.setGroupBlockLayout(block_nums, kv_block_stride_bytes, kv_scale_stride_bytes); +} + +void makeConfigUseZeroStrideSpec(CacheConfig& config) { + auto spec = std::make_shared(); + spec->type = KVCacheSpecType::OpaqueState; + spec->dtype = config.dtype; + spec->store_dtype = config.dtype; + spec->state_dim = 0; + spec->entries_per_block = 1; + spec->seq_size_per_block = static_cast(config.seq_size_per_block); + std::vector layer_ids(static_cast(config.layer_all_num)); + std::iota(layer_ids.begin(), layer_ids.end(), 0); + config.fromGroupedSpecs({spec}, {layer_ids}, {CacheGroupType::FULL}, {"default"}); + setGroupStridesForConfig(config, {0}, {0}); +} + } // namespace // Test-local helper struct. Business code no longer exposes a LayerBlock type. 
@@ -99,6 +140,8 @@ class TestReadMeta: public rtp_llm::Meta { class KVCacheMemoryConnectorTest: public ::testing::Test { protected: void SetUp() override { + old_core_dump_on_exception_ = StaticConfig::user_ft_core_dump_on_exception; + StaticConfig::user_ft_core_dump_on_exception = false; createDevice(); cache_config_ = createMockCacheConfig(); @@ -113,7 +156,9 @@ class KVCacheMemoryConnectorTest: public ::testing::Test { ASSERT_TRUE(connector_->init()); } - void TearDown() override {} + void TearDown() override { + StaticConfig::user_ft_core_dump_on_exception = old_core_dump_on_exception_; + } CacheConfig cache_config_; KVCacheConfig kv_cache_config_; @@ -121,6 +166,7 @@ class KVCacheMemoryConnectorTest: public ::testing::Test { std::shared_ptr connector_; std::vector> servers_; std::vector server_addrs_; + bool old_core_dump_on_exception_{false}; private: void createDevice() const { @@ -145,14 +191,12 @@ class KVCacheMemoryConnectorTest: public ::testing::Test { kv_cache_config_.memory_cache_sync_timeout_ms = kTestMemoryCacheSyncTimeout; auto mha_spec = std::make_shared(); - mha_spec->layer_num = layer_num; // mha_spec->block_nums = block_num; mha_spec->local_head_num_kv = 8; mha_spec->size_per_head = 128; mha_spec->seq_size_per_block = seq_size_per_block; mha_spec->dtype = mha_dtype; mha_spec->type = KVCacheSpecType::MultiHeadAttention; - config.cache_specs.push_back(mha_spec); // Keep CacheConfig sizes consistent with current business definition (see CacheConfig.h): // - kv_block_stride_bytes / kv_scale_stride_bytes are "per-layer" strides for one logical block // - kv_block_size_bytes / kv_scale_size_bytes are "all layers" totals for one logical block @@ -172,10 +216,7 @@ class KVCacheMemoryConnectorTest: public ::testing::Test { for (int i = 0; i < layer_num; ++i) { layer_ids[i] = i; } - config.layer_ids.push_back(layer_ids); - // SingleTypeKVCacheAllocator::init() expects global_layer_ids[0] to exist. 
- // In these unit tests we only have one "model group", so keep it consistent with layer_ids. - config.global_layer_ids.push_back(layer_ids); + config.fromGroupedSpecs({mha_spec}, {layer_ids}, {CacheGroupType::FULL}, {"default"}); return config; } @@ -217,6 +258,7 @@ class KVCacheMemoryConnectorTest: public ::testing::Test { auto* addr = static_cast(b.addr) + byte_offset; if (b.is_cuda) { check_cuda_value(cudaMemset(addr, c, byte_len)); + check_cuda_value(cudaDeviceSynchronize()); } else { memset(addr, c, byte_len); } @@ -395,6 +437,8 @@ class KVCacheMemoryConnectorTest: public ::testing::Test { item->add_gpu_blocks(blocks[layer]); } item->set_mem_block(static_cast(mem_block_index)); + item->set_is_complete(true); + item->set_backing_type(MemoryOperationRequestPB::MEMORY); } LayerBlockIds makeLayerBlockIds(const std::vector>& per_layer_block_indices, size_t cache_keys_num) const { @@ -418,9 +462,18 @@ class KVCacheMemoryConnectorTest: public ::testing::Test { makeCacheResource(const CacheKeysType& cache_keys, const std::vector>& per_layer_block_indices, size_t reuse_len = 0) const { - auto res = std::make_shared(); - res->cache_keys = cache_keys; + auto res = std::make_shared(); + res->initGroups(1, + static_cast(cache_config_.layer_num), + std::vector>(cache_config_.layer_num, std::vector{0}), + /*kernel_blocks_per_kv_block=*/1, + {}); + res->cache_keys = cache_keys; res->layer_block_ids = makeLayerBlockIds(per_layer_block_indices, cache_keys.size()); + res->layer_group_block_ids.assign(cache_config_.layer_num, std::vector>(1, nullptr)); + for (size_t layer = 0; layer < cache_config_.layer_num; ++layer) { + res->layer_group_block_ids[layer][0] = res->layer_block_ids[layer]; + } // reuse_len in these tests means "GPU already-reused prefix length". // KVCacheResource::reuseBlockNum() is derived from (device + memory + remote), // so set device reuse here to make asyncMatch/asyncRead semantics consistent. 
@@ -556,10 +609,14 @@ TEST_F(KVCacheMemoryConnectorTest, init_ReturnFalse_WhenMemoryCacheSyncTimeoutMs } TEST_F(KVCacheMemoryConnectorTest, init_ReturnFalse_WhenBlockSizeBytesZero) { - // NOTE: business code no longer validates `block_size_bytes` for memory cache block size. - // `init()` validates `layer_to_block_stride_bytes` instead. auto cfg = cache_config_; cfg.layer_to_block_stride_bytes.clear(); + makeConfigUseZeroStrideSpec(cfg); + cfg.kv_block_stride_bytes = 0; + cfg.kv_scale_stride_bytes = 0; + cfg.kv_block_size_bytes = 0; + cfg.kv_scale_size_bytes = 0; + cfg.block_size_bytes = 0; auto kv_cfg = kv_cache_config_; kv_cfg.memory_cache_size_mb = 64; @@ -573,6 +630,9 @@ TEST_F(KVCacheMemoryConnectorTest, init_ReturnFalse_WhenPoolTooSmallForBlockSize auto cfg = cache_config_; // Make sure pool_size_mb * 1MB / total_stride_bytes == 0 -> createBlockPool() should fail with CHECK. cfg.layer_to_block_stride_bytes.assign(static_cast(cfg.layer_num), 1024 * 1024); // 1MB per layer + setGroupStridesForConfig(cfg, + std::vector(static_cast(cfg.groupNums()), 1024 * 1024), + std::vector(static_cast(cfg.groupNums()), 0)); auto kv_cfg = kv_cache_config_; kv_cfg.memory_cache_size_mb = 1; // 1MB @@ -602,10 +662,14 @@ TEST_F(KVCacheMemoryConnectorTest, initBlockPool_Throw_WhenMemoryCacheSizeMbZero } TEST_F(KVCacheMemoryConnectorTest, initBlockPool_Throw_WhenBlockSizeBytesZero) { - // NOTE: business code no longer validates `block_size_bytes` for memory cache block size. - // `initBlockPool()` validates `layer_to_block_stride_bytes` instead. 
auto cfg = cache_config_; cfg.layer_to_block_stride_bytes.clear(); + makeConfigUseZeroStrideSpec(cfg); + cfg.kv_block_stride_bytes = 0; + cfg.kv_scale_stride_bytes = 0; + cfg.kv_block_size_bytes = 0; + cfg.kv_scale_size_bytes = 0; + cfg.block_size_bytes = 0; auto kv_cfg = kv_cache_config_; kv_cfg.memory_cache_size_mb = 64; @@ -620,6 +684,9 @@ TEST_F(KVCacheMemoryConnectorTest, initBlockPool_Throw_WhenCreateBlockPoolFails) // Force createBlockPool() to compute block_num=0: // block_num = pool_size_mb * 1MB / total_stride_bytes. cfg.layer_to_block_stride_bytes.assign(static_cast(cfg.layer_num), 1024 * 1024); // 1MB per layer + setGroupStridesForConfig(cfg, + std::vector(static_cast(cfg.groupNums()), 1024 * 1024), + std::vector(static_cast(cfg.groupNums()), 0)); auto kv_cfg = kv_cache_config_; kv_cfg.memory_cache_size_mb = 1; // 1MB @@ -640,6 +707,54 @@ TEST_F(KVCacheMemoryConnectorTest, initBlockPool_ReturnTrue_AndRegistersPool) { ASSERT_NE(pool, nullptr); } +TEST_F(KVCacheMemoryConnectorTest, buildCopyPlanForWrite_UsesLayerAndGroupSlots) { + auto cfg = cache_config_; + cfg.layer_num = 1; + cfg.layer_all_num = 1; + auto spec = cfg.specForGroup(0); + cfg.fromGroupedSpecs({spec, spec}, {{0}, {0}}, {CacheGroupType::FULL, CacheGroupType::FULL}, {"csa_kv", "swa_kv"}); + setGroupStridesForConfig(cfg, {16, 32}, {0, 0}); + cfg.layer_to_block_stride_bytes = {999}; + + auto kv_cfg = kv_cache_config_; + kv_cfg.memory_cache_size_mb = 64; + kv_cfg.memory_cache_sync_timeout_ms = 1000; + auto conn = std::make_shared(cfg, kv_cfg, allocator_, server_addrs_); + conn->block_cache_ = std::make_shared(); + ASSERT_NO_THROW(conn->initBlockPool()); + + auto slots = conn->layerTagSlots(); + ASSERT_EQ(slots.size(), 2u); + EXPECT_EQ(slots[0].layer_id, 0); + EXPECT_EQ(slots[0].tag, "csa_kv"); + EXPECT_EQ(slots[0].group_id, 0); + EXPECT_EQ(slots[0].stride_bytes, 16u); + EXPECT_EQ(slots[1].layer_id, 0); + EXPECT_EQ(slots[1].tag, "swa_kv"); + EXPECT_EQ(slots[1].group_id, 1); + 
EXPECT_EQ(slots[1].stride_bytes, 32u); + + auto resource = std::make_shared(); + resource->cacheKeys() = {101, 102, 103}; + initResourceGroupsForConfig(*resource, cfg); + resource->mutableBlockIds(/*group_id=*/0).assign({11, 12, 13}); + resource->mutableBlockIds(/*group_id=*/1).assign({21, NULL_BLOCK_IDX, 23}); + + bool no_need_write = true; + auto plan = conn->buildCopyPlanForWrite( + resource->cacheKeys(), resource->layerGroupBlocks(), slots, /*start_index=*/0, /*write_num=*/3, no_need_write); + + ASSERT_NE(plan, nullptr); + EXPECT_FALSE(no_need_write); + ASSERT_EQ(plan->copy_infos.size(), 3u); + EXPECT_TRUE(plan->copy_infos[0].is_complete); + EXPECT_FALSE(plan->copy_infos[1].is_complete); + EXPECT_TRUE(plan->copy_infos[2].is_complete); + EXPECT_EQ(plan->copy_infos[0].gpu_blocks, (std::vector{11, 21})); + EXPECT_EQ(plan->copy_infos[1].gpu_blocks, (std::vector{12, NULL_BLOCK_IDX})); + EXPECT_EQ(plan->copy_infos[2].gpu_blocks, (std::vector{13, 23})); +} + TEST_F(KVCacheMemoryConnectorTest, asyncMatch_ReturnNull_WhenGpuReuseLenGEKeysSize) { const size_t N = 3; CacheKeysType cache_keys{70001, 70002, 70003}; @@ -855,6 +970,47 @@ TEST_F(KVCacheMemoryConnectorTest, asyncRead_ReturnNull_WhenPlanEmpty) { EXPECT_EQ(ctx, nullptr); } +TEST_F(KVCacheMemoryConnectorTest, asyncRead_Success_WhenCacheEntryRemovedAfterMatch) { + // asyncMatch should pin the matched memory blocks so asyncRead can still use them + // even if another request consumes and removes the cache entries before read starts. 
+ CacheKeysType cache_keys{21001, 21002, 21003}; + + const size_t mem_size = memoryCacheBlockBytes(); + ASSERT_GT(mem_size, 0u); + auto pool = ensureBlockPool(mem_size); + ASSERT_NE(pool, nullptr); + + auto block_indices = putItemsToCache(cache_keys, mem_size); + ASSERT_EQ(block_indices.size(), cache_keys.size()); + + std::vector> lbs_vec{ + {1, 2, 3}, + {1, 2, 3}, + {1, 2, 3}, + {1, 2, 3}, + }; + auto res = makeCacheResource(cache_keys, lbs_vec); + auto meta = std::make_shared(/*enable_memory_cache=*/true, /*enable_remote_cache=*/false, ""); + + auto match_ctx = connector_->asyncMatch(res, meta); + ASSERT_NE(match_ctx, nullptr); + const int start_read_block_index = static_cast(res->reuseBlockNum()); + const int read_block_num = static_cast(match_ctx->matchedBlockCount()) - start_read_block_index; + ASSERT_GT(read_block_num, 0); + + for (int i = start_read_block_index; i < start_read_block_index + read_block_num; ++i) { + auto removed = connector_->block_cache_->remove(cache_keys[i]); + ASSERT_TRUE(removed.has_value()); + pool->blockCacheFree({removed->block_index}); + } + + auto ctx = connector_->asyncRead(res, meta, match_ctx, start_read_block_index, read_block_num); + ASSERT_NE(ctx, nullptr); + ASSERT_TRUE(waitUntilDone(ctx)); + EXPECT_TRUE(ctx->success()); + EXPECT_EQ(res->memoryReuseBlockNum(), static_cast(read_block_num)); +} + TEST_F(KVCacheMemoryConnectorTest, asyncRead_Success_IncrementsReuseLen_ByMatchedPrefix) { // 初始 reuse_len=1, 内存全部命中 => mem_match_len=3,最终 reuse_len=3 CacheKeysType cache_keys{40001, 40002, 40003}; @@ -866,10 +1022,10 @@ TEST_F(KVCacheMemoryConnectorTest, asyncRead_Success_IncrementsReuseLen_ByMatche ASSERT_EQ(block_indices.size(), cache_keys.size()); std::vector> lbs_vec{ - {101, 102, 103}, // layer0 - {201, 202, 203}, // layer1 - {301, 302, 303}, // layer2 - {401, 402, 403}, // layer3 + {1, 2, 3}, // layer0 + {1, 2, 3}, // layer1 + {1, 2, 3}, // layer2 + {1, 2, 3}, // layer3 }; auto res = makeCacheResource(cache_keys, lbs_vec, 1); @@ 
-901,10 +1057,10 @@ TEST_F(KVCacheMemoryConnectorTest, asyncRead_Success_RemovesLoadedBlocksFromMemo ASSERT_LT(pool->freeBlocksNum(), free_before); std::vector> lbs_vec{ - {111, 112, 113}, - {211, 212, 213}, - {311, 312, 313}, - {411, 412, 413}, + {1, 2, 3}, + {1, 2, 3}, + {1, 2, 3}, + {1, 2, 3}, }; auto res = makeCacheResource(cache_keys, lbs_vec, /*reuse_len=*/1); @@ -938,10 +1094,10 @@ TEST_F(KVCacheMemoryConnectorTest, asyncRead_Success_DoesNotRemoveUpgradedBlock) ASSERT_EQ(block_indices.size(), cache_keys.size()); std::vector> lbs_vec{ - {111, 112, 113}, - {211, 212, 213}, - {311, 312, 313}, - {411, 412, 413}, + {1, 2, 3}, + {1, 2, 3}, + {1, 2, 3}, + {1, 2, 3}, }; auto res = makeCacheResource(cache_keys, lbs_vec, /*reuse_len=*/1); @@ -952,7 +1108,7 @@ TEST_F(KVCacheMemoryConnectorTest, asyncRead_Success_DoesNotRemoveUpgradedBlock) const int read_num = static_cast(match_ctx->matchedBlockCount()) - reuse_num; ASSERT_GT(read_num, 0); - // Start async read — buildCopyPlanForRead captures old block indices in the copy plan. + // asyncMatch captured old block indices in the read copy plan; asyncRead consumes that plan. 
auto ctx = connector_->asyncRead(res, meta, match_ctx, reuse_num, read_num); ASSERT_NE(ctx, nullptr); @@ -1010,7 +1166,7 @@ TEST_F(KVCacheMemoryConnectorTest, asyncRead_FailureOnMemResponse_NoReuseLenIncr auto block_indices = putItemsToCache(cache_keys, mem_size); ASSERT_EQ(block_indices.size(), cache_keys.size()); - std::vector> lbs_vec{{11, 12}, {21, 22}, {31, 32}, {41, 42}}; + std::vector> lbs_vec{{1, 2}, {3, 4}, {5, 6}, {7, 8}}; auto res = makeCacheResource(cache_keys, lbs_vec); auto meta = std::make_shared(/*enable_memory_cache=*/true, /*enable_remote_cache=*/false, ""); @@ -1060,7 +1216,7 @@ TEST_F(KVCacheMemoryConnectorTest, asyncRead_FailureOnRpcStatus_NoReuseLenIncrem auto block_indices = putItemsToCache(cache_keys, mem_size); ASSERT_EQ(block_indices.size(), cache_keys.size()); - std::vector> lbs_vec{{31, 32}, {41, 42}, {51, 52}, {61, 62}}; + std::vector> lbs_vec{{1, 2}, {3, 4}, {5, 6}, {7, 8}}; auto res = makeCacheResource(cache_keys, lbs_vec); auto meta = std::make_shared(/*enable_memory_cache=*/true, /*enable_remote_cache=*/false, ""); @@ -1131,6 +1287,8 @@ TEST_F(KVCacheMemoryConnectorTest, asyncWrite_InvalidInputs_ReturnNullOrThrow) { // empty layer_block_ids auto res_empty_lbs = makeCacheResource(/*cache_keys=*/{1}, /*lbs=*/{{1}}); res_empty_lbs->layer_block_ids.clear(); + res_empty_lbs->layer_group_block_ids.clear(); + res_empty_lbs->group_block_ids.clear(); auto ctx2 = connector_->asyncWrite(res_empty_lbs, meta); EXPECT_EQ(ctx2, nullptr); } @@ -1602,6 +1760,15 @@ TEST_F(KVCacheMemoryConnectorTest, copyCache_ReturnFalse_CountMismatch) { EXPECT_THROW((void)connector_->copyCache(req, resp), rtp_llm::RTPException); } +TEST_F(KVCacheMemoryConnectorTest, copyCache_ReturnFalse_EmptyCopyItems) { + MemoryOperationRequestPB req; + req.set_copy_direction(MemoryOperationRequestPB::H2D); + + MemoryOperationResponsePB resp; + EXPECT_FALSE(connector_->copyCache(req, resp)); + EXPECT_FALSE(resp.success()); +} + TEST_F(KVCacheMemoryConnectorTest, 
copyCache_ReturnFalse_InvalidMemBlock) { const int layer_id = 0; const int gpu_block_idx = 1; @@ -1701,7 +1868,6 @@ TEST_F(KVCacheMemoryConnectorTest, copyCache_ReturnTrue_H2D_SplitKvScale_NoBlock auto mla_spec = std::make_shared(); mla_spec->type = rtp_llm::KVCacheSpecType::MultiHeadLatentAttention; - mla_spec->layer_num = static_cast(kLayerNum); mla_spec->local_head_num_kv = 1; mla_spec->seq_size_per_block = kSeqPerBlock; mla_spec->kv_lora_rank = 512; @@ -1715,7 +1881,6 @@ TEST_F(KVCacheMemoryConnectorTest, copyCache_ReturnTrue_H2D_SplitKvScale_NoBlock cache_config_.use_mla = true; cache_config_.is_sparse = false; cache_config_.dtype = mla_spec->dtype; - cache_config_.cache_specs = {mla_spec}; cache_config_.kv_block_stride_bytes = kKvBytesPerTok * kSeqPerBlock; cache_config_.kv_scale_stride_bytes = kScaleBytesPerTok * kSeqPerBlock; cache_config_.kv_block_size_bytes = static_cast(kLayerNum) * cache_config_.kv_block_stride_bytes; @@ -1728,10 +1893,8 @@ TEST_F(KVCacheMemoryConnectorTest, copyCache_ReturnTrue_H2D_SplitKvScale_NoBlock for (int i = 0; i < kLayerNum; ++i) { layer_ids[i] = i; } - cache_config_.layer_ids.clear(); - cache_config_.global_layer_ids.clear(); - cache_config_.layer_ids.push_back(layer_ids); - cache_config_.global_layer_ids.push_back(layer_ids); + cache_config_.fromGroupedSpecs({mla_spec}, {layer_ids}, {CacheGroupType::FULL}, {"default"}); + setGroupStridesForConfig(cache_config_, {cache_config_.kv_block_stride_bytes}, {cache_config_.kv_scale_stride_bytes}); ASSERT_EQ(mla_spec->block_size_bytes(), cache_config_.kv_block_stride_bytes); @@ -1853,6 +2016,8 @@ TEST_F(KVCacheMemoryConnectorTest, copyCache_ReturnTrue_D2H_SingleLayer) { item->add_gpu_blocks(l == layer_id ? 
gpu_block_idx : NULL_BLOCK_IDX); } item->set_mem_block(mem_block_index); + item->set_is_complete(true); + item->set_backing_type(MemoryOperationRequestPB::MEMORY); req.set_copy_direction(MemoryOperationRequestPB::D2H); MemoryOperationResponsePB resp; @@ -1968,6 +2133,412 @@ TEST_F(KVCacheMemoryConnectorTest, copyCache_D2H_MultiLayer_ValidatesByteOffsets } } +// ============================== Dual-pool tests ============================== + +class KVCacheMemoryConnectorDualPoolTest: public ::testing::Test { +protected: + void SetUp() override { + old_core_dump_on_exception_ = StaticConfig::user_ft_core_dump_on_exception; + StaticConfig::user_ft_core_dump_on_exception = false; + createDevice(); + startRpcServer(4); + } + + void TearDown() override { + StaticConfig::user_ft_core_dump_on_exception = old_core_dump_on_exception_; + } + + bool old_core_dump_on_exception_{false}; + + CacheConfig + createHybridCacheConfig(int layer_num = 4, int block_num = 10, int seq_size_per_block = 8, int linear_step = 4) { + constexpr int kTestMemoryCacheSizeMb = 64; + constexpr int kTestMemoryCacheSyncTimeout = 1000; + + CacheConfig config; + config.layer_num = layer_num; + config.layer_all_num = layer_num; + config.block_num = block_num; + config.seq_size_per_block = seq_size_per_block; + config.linear_step = linear_step; + config.group_layer_num = layer_num; + kv_cache_config_.memory_cache_size_mb = kTestMemoryCacheSizeMb; + kv_cache_config_.memory_cache_sync_timeout_ms = kTestMemoryCacheSyncTimeout; + + auto full_spec = std::make_shared(); + full_spec->local_head_num_kv = 4; + full_spec->size_per_head = 64; + full_spec->seq_size_per_block = seq_size_per_block; + full_spec->dtype = rtp_llm::DataType::TYPE_FP16; + + auto swa_spec = std::make_shared(); + swa_spec->local_head_num_kv = 4; + swa_spec->size_per_head = 64; + swa_spec->seq_size_per_block = seq_size_per_block; + swa_spec->dtype = rtp_llm::DataType::TYPE_FP16; + + const size_t full_stride = full_spec->block_size_bytes(); + 
const size_t swa_stride = swa_spec->block_size_bytes(); + + config.dtype = full_spec->dtype; + config.kv_block_stride_bytes = std::max(full_stride, swa_stride); + config.kv_scale_stride_bytes = 0; + config.kv_block_size_bytes = static_cast(layer_num) * full_stride; + config.kv_scale_size_bytes = 0; + config.block_size_bytes = config.kv_block_size_bytes; + + std::vector full_layer_ids(layer_num); + std::vector swa_layer_ids(layer_num); + for (int i = 0; i < layer_num; ++i) { + full_layer_ids[i] = i; + swa_layer_ids[i] = i; + } + config.fromGroupedSpecs({full_spec, swa_spec}, + {full_layer_ids, swa_layer_ids}, + {CacheGroupType::FULL, CacheGroupType::SWA}, + {"default", "swa_kv"}); + setGroupStridesForConfig(config, {full_stride, swa_stride}, {0, 0}); + config.layer_to_block_stride_bytes.assign(layer_num, static_cast(full_stride)); + + config.use_independent_block_pools = true; + + return config; + } + + std::shared_ptr createConnector(const CacheConfig& cfg) { + auto conn = std::make_shared(cfg, kv_cache_config_, allocator_, server_addrs_); + EXPECT_TRUE(conn->init()); + return conn; + } + + std::shared_ptr makeHybridResource(const CacheConfig& cfg, + const CacheKeysType& cache_keys, + const std::vector>& full_blocks, + const std::vector>& swa_blocks, + size_t reuse_len = 0) const { + auto res = std::make_shared(); + const size_t layer_num = static_cast(cfg.layer_all_num); + + initResourceGroupsForConfig(*res, cfg); + + res->resizeBlocks(static_cast(cache_keys.size()), NULL_BLOCK_IDX); + + for (size_t l = 0; l < layer_num; ++l) { + if (l < full_blocks.size()) { + BlockIndicesType padded(cache_keys.size(), NULL_BLOCK_IDX); + for (size_t k = 0; k < std::min(cache_keys.size(), full_blocks[l].size()); ++k) { + padded[k] = full_blocks[l][k]; + } + res->mutableBlockIds(static_cast(l), 0).assign(padded); + } + if (l < swa_blocks.size()) { + BlockIndicesType padded(cache_keys.size(), NULL_BLOCK_IDX); + for (size_t k = 0; k < std::min(cache_keys.size(), 
swa_blocks[l].size()); ++k) { + padded[k] = swa_blocks[l][k]; + } + res->mutableBlockIds(static_cast(l), 1).assign(padded); + } + } + + res->cacheKeys() = cache_keys; + res->setDeviceReuseBlockNum(reuse_len); + res->setLastBlockAligned(true); + return res; + } + + bool waitUntilDone(const std::shared_ptr& ctx, int timeout_ms = 3000) const { + if (!ctx) { + return false; + } + const auto deadline = std::chrono::steady_clock::now() + std::chrono::milliseconds(timeout_ms); + while (std::chrono::steady_clock::now() < deadline) { + if (ctx->done()) { + return true; + } + std::this_thread::sleep_for(std::chrono::milliseconds(1)); + } + return ctx->done(); + } + + KVCacheConfig kv_cache_config_; + std::shared_ptr allocator_; + std::vector> servers_; + std::vector server_addrs_; + +private: + void createDevice() const { + initRuntime(/*device_id=*/0, + /*trace_memory=*/false, + /*enable_comm_overlap=*/false, + MlaOpsType::AUTO); + } + void startRpcServer(int server_num) { + for (int i = 0; i < server_num; ++i) { + auto service = std::make_unique(); + auto server = std::make_unique(std::move(service)); + ASSERT_TRUE(server->start()); + server_addrs_.push_back("127.0.0.1:" + std::to_string(server->listenPort())); + servers_.push_back(std::move(server)); + } + } +}; + +TEST_F(KVCacheMemoryConnectorDualPoolTest, Init_CreatesDualPools) { + auto cfg = createHybridCacheConfig(); + allocator_ = std::make_shared(cfg, AllocationType::DEVICE); + ASSERT_TRUE(allocator_->init()); + auto conn = createConnector(cfg); + + EXPECT_TRUE(conn->isDualPool()); + EXPECT_NE(conn->complete_pool_, nullptr); + EXPECT_NE(conn->incomplete_pool_, nullptr); + EXPECT_EQ(conn->block_pool_, nullptr); + EXPECT_NE(conn->block_cache_, nullptr); + EXPECT_GT(conn->complete_block_size_, 0u); + EXPECT_GT(conn->incomplete_block_size_, 0u); + EXPECT_GT(conn->complete_block_size_, conn->incomplete_block_size_); +} + +TEST_F(KVCacheMemoryConnectorDualPoolTest, Init_PureFullUsesSinglePool) { + // Pure FULL config: no 
typed slots, should use single pool + CacheConfig config; + config.layer_num = 4; + config.layer_all_num = 4; + config.block_num = 10; + config.seq_size_per_block = 8; + kv_cache_config_.memory_cache_size_mb = 64; + kv_cache_config_.memory_cache_sync_timeout_ms = 1000; + + auto spec = std::make_shared(); + spec->local_head_num_kv = 8; + spec->size_per_head = 128; + spec->seq_size_per_block = 8; + spec->dtype = rtp_llm::DataType::TYPE_FP16; + config.dtype = spec->dtype; + config.kv_block_stride_bytes = spec->block_size_bytes(); + config.kv_scale_stride_bytes = spec->scale_block_size_bytes(); + config.kv_block_size_bytes = 4UL * config.kv_block_stride_bytes; + config.kv_scale_size_bytes = 4UL * config.kv_scale_stride_bytes; + config.block_size_bytes = config.kv_block_size_bytes + config.kv_scale_size_bytes; + const size_t per_layer = config.kv_block_stride_bytes + config.kv_scale_stride_bytes; + config.layer_to_block_stride_bytes.assign(4, static_cast(per_layer)); + std::vector ids = {0, 1, 2, 3}; + config.fromGroupedSpecs({spec}, {ids}, {CacheGroupType::FULL}, {"default"}); + + allocator_ = std::make_shared(config, AllocationType::DEVICE); + ASSERT_TRUE(allocator_->init()); + auto conn = createConnector(config); + + EXPECT_FALSE(conn->isDualPool()); + EXPECT_NE(conn->block_pool_, nullptr); + EXPECT_NE(conn->block_cache_, nullptr); + EXPECT_EQ(conn->complete_pool_, nullptr); + EXPECT_EQ(conn->incomplete_pool_, nullptr); +} + +TEST_F(KVCacheMemoryConnectorDualPoolTest, AsyncMatch_AdvancesOnlyOnCompleteHit) { + auto cfg = createHybridCacheConfig(/*layer_num=*/2, /*block_num=*/10, /*seq_size_per_block=*/8, /*linear_step=*/4); + allocator_ = std::make_shared(cfg, AllocationType::DEVICE); + ASSERT_TRUE(allocator_->init()); + auto conn = createConnector(cfg); + ASSERT_TRUE(conn->isDualPool()); + + // 4 cache keys (3 real + 1 tail dummy that won't be matched) + CacheKeysType cache_keys{90001, 90002, 90003, 90999}; + // FULL blocks: all valid (non-null) for both layers + 
std::vector> full_blocks{{1, 1, 1, 1}, {2, 2, 2, 2}}; + // SWA blocks: key0 NULL (incomplete), key1 valid (complete), key2 NULL (incomplete) + std::vector> swa_blocks{{NULL_BLOCK_IDX, 1, NULL_BLOCK_IDX, 1}, + {NULL_BLOCK_IDX, 2, NULL_BLOCK_IDX, 2}}; + auto res = makeHybridResource(cfg, cache_keys, full_blocks, swa_blocks); + + // Populate caches directly + { + // key0: incomplete (SWA NULL) → incomplete cache + auto inc_blks = conn->incomplete_pool_->malloc(2); + ASSERT_EQ(inc_blks.size(), 2u); + MemoryDiskBlockCache::CacheItem item0; + item0.cache_key = cache_keys[0]; + item0.backing_type = CacheBackingType::MEMORY; + item0.block_index = static_cast(inc_blks[0]); + item0.is_complete = false; + conn->block_cache_->putCommitted(item0); + conn->incomplete_pool_->blockCacheReference({static_cast(inc_blks[0])}); + conn->incomplete_pool_->requestFree({inc_blks[0]}); + + // key2: incomplete → incomplete cache + MemoryDiskBlockCache::CacheItem item2; + item2.cache_key = cache_keys[2]; + item2.backing_type = CacheBackingType::MEMORY; + item2.block_index = static_cast(inc_blks[1]); + item2.is_complete = false; + conn->block_cache_->putCommitted(item2); + conn->incomplete_pool_->blockCacheReference({static_cast(inc_blks[1])}); + conn->incomplete_pool_->requestFree({inc_blks[1]}); + + // key1: complete → complete cache + auto comp_blks = conn->complete_pool_->malloc(1); + ASSERT_EQ(comp_blks.size(), 1u); + MemoryDiskBlockCache::CacheItem item1; + item1.cache_key = cache_keys[1]; + item1.backing_type = CacheBackingType::MEMORY; + item1.block_index = static_cast(comp_blks[0]); + item1.is_complete = true; + conn->block_cache_->putCommitted(item1); + conn->complete_pool_->blockCacheReference({static_cast(comp_blks[0])}); + conn->complete_pool_->requestFree({comp_blks[0]}); + } + + auto meta = std::make_shared(true); + auto match_ctx = conn->asyncMatch(res, meta); + ASSERT_NE(match_ctx, nullptr); + // key0: incomplete hit → scan continues, matched_num stays 0 + // key1: complete 
hit + all GPU valid → matched_num = 2 + // key2: incomplete hit → scan continues, matched_num stays 2 + // Result: matched_num = 2 + EXPECT_EQ(match_ctx->matchedBlockCount(), 2u); +} + +TEST_F(KVCacheMemoryConnectorDualPoolTest, AsyncMatch_StopsOnDoubleMiss) { + auto cfg = createHybridCacheConfig(/*layer_num=*/2, /*block_num=*/10, /*seq_size_per_block=*/8, /*linear_step=*/4); + allocator_ = std::make_shared(cfg, AllocationType::DEVICE); + ASSERT_TRUE(allocator_->init()); + auto conn = createConnector(cfg); + ASSERT_TRUE(conn->isDualPool()); + + CacheKeysType cache_keys{92001, 92002, 92003, 92999}; + std::vector> full_blocks{{1, 1, 1, 1}, {2, 2, 2, 2}}; + std::vector> swa_blocks{{1, 1, 1, 1}, {2, 2, 2, 2}}; + auto res = makeHybridResource(cfg, cache_keys, full_blocks, swa_blocks); + + // Put key0 as complete, skip key1 (gap), key2 as complete + auto blks = conn->complete_pool_->malloc(2); + ASSERT_EQ(blks.size(), 2u); + MemoryDiskBlockCache::CacheItem item0; + item0.cache_key = cache_keys[0]; + item0.backing_type = CacheBackingType::MEMORY; + item0.block_index = static_cast(blks[0]); + item0.is_complete = true; + conn->block_cache_->putCommitted(item0); + conn->complete_pool_->blockCacheReference({static_cast(blks[0])}); + conn->complete_pool_->requestFree({blks[0]}); + + MemoryDiskBlockCache::CacheItem item2; + item2.cache_key = cache_keys[2]; + item2.backing_type = CacheBackingType::MEMORY; + item2.block_index = static_cast(blks[1]); + item2.is_complete = true; + conn->block_cache_->putCommitted(item2); + conn->complete_pool_->blockCacheReference({static_cast(blks[1])}); + conn->complete_pool_->requestFree({blks[1]}); + + auto meta = std::make_shared(true); + auto match_ctx = conn->asyncMatch(res, meta); + ASSERT_NE(match_ctx, nullptr); + // key0 hit → matched=1, key1 miss in both caches → break + EXPECT_EQ(match_ctx->matchedBlockCount(), 1u); +} + +TEST_F(KVCacheMemoryConnectorDualPoolTest, PoolSizing_JointCalculation) { + auto cfg = 
createHybridCacheConfig(/*layer_num=*/2, /*block_num=*/10, /*seq_size_per_block=*/8, /*linear_step=*/4); + allocator_ = std::make_shared(cfg, AllocationType::DEVICE); + ASSERT_TRUE(allocator_->init()); + auto conn = createConnector(cfg); + ASSERT_TRUE(conn->isDualPool()); + + // With linear_step=4: incomplete_num = complete_num * 3. + // totalBlocksNum() reports allocatable blocks and excludes reserved block 0. + const auto complete_total = conn->complete_pool_->totalBlocksNum(); + const auto incomplete_total = conn->incomplete_pool_->totalBlocksNum(); + EXPECT_GT(complete_total, 0u); + EXPECT_GT(incomplete_total, 0u); + EXPECT_EQ(incomplete_total, (complete_total + 1) * 3 - 1); +} + +TEST_F(KVCacheMemoryConnectorDualPoolTest, BuildCopyPlanForWrite_SkipsIncompleteWhenIncompletePoolDisabled) { + auto cfg = createHybridCacheConfig(/*layer_num=*/2, /*block_num=*/10, /*seq_size_per_block=*/8, /*linear_step=*/1); + allocator_ = std::make_shared(cfg, AllocationType::DEVICE); + ASSERT_TRUE(allocator_->init()); + auto conn = createConnector(cfg); + ASSERT_TRUE(conn->isDualPool()); + ASSERT_NE(conn->complete_pool_, nullptr); + ASSERT_EQ(conn->incomplete_pool_, nullptr); + + CacheKeysType cache_keys{93001, 93002, 93003}; + std::vector> full_blocks{{1, 1, 1}, {2, 2, 2}}; + std::vector> swa_blocks{{NULL_BLOCK_IDX, 1, NULL_BLOCK_IDX}, + {NULL_BLOCK_IDX, 2, NULL_BLOCK_IDX}}; + auto res = makeHybridResource(cfg, cache_keys, full_blocks, swa_blocks); + auto slots = conn->layerTagSlots(); + + bool no_need_write = true; + auto plan = conn->buildCopyPlanForWrite( + res->cacheKeys(), res->layerGroupBlocks(), slots, /*start_index=*/0, /*write_num=*/3, no_need_write); + + ASSERT_NE(plan, nullptr); + EXPECT_FALSE(no_need_write); + ASSERT_EQ(plan->copy_infos.size(), 1u); + EXPECT_EQ(plan->copy_infos[0].cache_key, cache_keys[1]); + EXPECT_TRUE(plan->copy_infos[0].is_complete); +} + +TEST_F(KVCacheMemoryConnectorDualPoolTest, CacheKeys_MergesBothCaches) { + auto cfg = 
createHybridCacheConfig(/*layer_num=*/2); + allocator_ = std::make_shared(cfg, AllocationType::DEVICE); + ASSERT_TRUE(allocator_->init()); + auto conn = createConnector(cfg); + ASSERT_TRUE(conn->isDualPool()); + + // Put complete/incomplete items into the unified backing cache. + auto comp_blks = conn->complete_pool_->malloc(1); + ASSERT_EQ(comp_blks.size(), 1u); + MemoryDiskBlockCache::CacheItem item1; + item1.cache_key = 100; + item1.backing_type = CacheBackingType::MEMORY; + item1.block_index = static_cast(comp_blks[0]); + item1.is_complete = true; + conn->block_cache_->putCommitted(item1); + conn->complete_pool_->blockCacheReference({static_cast(comp_blks[0])}); + conn->complete_pool_->requestFree({comp_blks[0]}); + + auto inc_blks = conn->incomplete_pool_->malloc(1); + ASSERT_EQ(inc_blks.size(), 1u); + MemoryDiskBlockCache::CacheItem item2; + item2.cache_key = 200; + item2.backing_type = CacheBackingType::MEMORY; + item2.block_index = static_cast(inc_blks[0]); + item2.is_complete = false; + conn->block_cache_->putCommitted(item2); + conn->incomplete_pool_->blockCacheReference({static_cast(inc_blks[0])}); + conn->incomplete_pool_->requestFree({inc_blks[0]}); + + auto keys = conn->cacheKeys(); + EXPECT_EQ(keys.size(), 2u); + bool has_100 = std::find(keys.begin(), keys.end(), 100) != keys.end(); + bool has_200 = std::find(keys.begin(), keys.end(), 200) != keys.end(); + EXPECT_TRUE(has_100); + EXPECT_TRUE(has_200); +} + +TEST_F(KVCacheMemoryConnectorDualPoolTest, Init_IncompletePoolTracksCompletePoolByStep) { + const int linear_step = 4; + const int layer_num = 4; + const int block_num = 10; + const int spb = 8; + + auto cfg = createHybridCacheConfig(layer_num, block_num, spb, linear_step); + allocator_ = std::make_shared(cfg, AllocationType::DEVICE); + ASSERT_TRUE(allocator_->init()); + auto conn = createConnector(cfg); + ASSERT_TRUE(conn->isDualPool()); + + const size_t incomplete = conn->incomplete_pool_->totalBlocksNum(); + const size_t complete = 
conn->complete_pool_->totalBlocksNum(); + // BlockPool reserves block 0 in each pool, while initBlockPool sizes the + // incomplete pool from the complete pool's configured block_num. + EXPECT_EQ(incomplete, (complete + 1) * static_cast(linear_step - 1) - 1); +} + } // namespace rtp_llm::test int main(int argc, char** argv) { diff --git a/rtp_llm/cpp/cache/connector/memory/test/MemoryAsyncContextTest.cc b/rtp_llm/cpp/cache/connector/memory/test/MemoryAsyncContextTest.cc index 3dd7bdf630..fad3b27b7f 100644 --- a/rtp_llm/cpp/cache/connector/memory/test/MemoryAsyncContextTest.cc +++ b/rtp_llm/cpp/cache/connector/memory/test/MemoryAsyncContextTest.cc @@ -131,9 +131,10 @@ TEST_F(MemoryAsyncContextTest, waitDone_ReturnVoid_WhenBroadcastResultNullAndCal EXPECT_TRUE(ctx->done()); } -TEST_F(MemoryAsyncContextTest, waitDone_ReturnsImmediately_WhenBroadcastResultNotSet_ThenCallbackOnce) { +TEST_F(MemoryAsyncContextTest, waitDone_BlocksUntilBroadcastResultSet_ThenCallbackOnce) { std::atomic callback_cnt{0}; std::atomic last_ok{true}; + std::atomic wait_returned{false}; auto cb = [&](bool ok) { callback_cnt.fetch_add(1); last_ok.store(ok); @@ -142,17 +143,24 @@ TEST_F(MemoryAsyncContextTest, waitDone_ReturnsImmediately_WhenBroadcastResultNo auto ctx = std::make_shared(cb); EXPECT_FALSE(ctx->done()); - std::thread t([&]() { ctx->waitDone(); }); + std::thread t([&]() { + ctx->waitDone(); + wait_returned.store(true); + }); std::this_thread::sleep_for(std::chrono::milliseconds(10)); - // 如果 broadcast_result_ 还没设置,waitDone() 不会阻塞,而是按失败处理并回调一次。 - EXPECT_TRUE(ctx->done()); - EXPECT_EQ(callback_cnt.load(), 1); - EXPECT_FALSE(last_ok.load()); + // waitDone() must not finalize before startCopyAsync() publishes its BroadcastResult. + EXPECT_FALSE(wait_returned.load()); + EXPECT_FALSE(ctx->done()); + EXPECT_EQ(callback_cnt.load(), 0); - ctx->setBroadcastResult(nullptr); + // Empty worker contexts => BroadcastResult::waitDone() returns immediately and marks success. 
+ auto result = std::make_shared(std::vector>{}); + ctx->setBroadcastResult(result); t.join(); EXPECT_TRUE(ctx->done()); + EXPECT_TRUE(wait_returned.load()); EXPECT_EQ(callback_cnt.load(), 1); + EXPECT_TRUE(last_ok.load()); } TEST_F(MemoryAsyncContextTest, waitDone_ReturnVoid_WhenBroadcastResultNonNullAndCallbackReceivesSuccess) { @@ -218,10 +226,40 @@ TEST_F(MemoryAsyncContextTest, waitDone_IsIdempotent_CallbackOnlyOnce) { EXPECT_TRUE(last_ok); } +TEST_F(MemoryAsyncContextTest, waitDone_ConcurrentCallersFinalizeOnce) { + std::atomic callback_cnt{0}; + std::atomic last_ok{false}; + auto cb = [&](bool ok) { + callback_cnt.fetch_add(1); + last_ok.store(ok); + }; + + auto result = std::make_shared(std::vector>{}); + auto ctx = std::make_shared(cb); + + std::vector waiters; + for (size_t i = 0; i < 8; ++i) { + waiters.emplace_back([&]() { ctx->waitDone(); }); + } + std::this_thread::sleep_for(std::chrono::milliseconds(10)); + EXPECT_FALSE(ctx->done()); + EXPECT_EQ(callback_cnt.load(), 0); + + ctx->setBroadcastResult(result); + for (auto& waiter : waiters) { + waiter.join(); + } + + EXPECT_TRUE(ctx->done()); + EXPECT_TRUE(ctx->success()); + EXPECT_EQ(callback_cnt.load(), 1); + EXPECT_TRUE(last_ok.load()); +} + } // namespace rtp_llm::test int main(int argc, char** argv) { rtp_llm::initLogger(); ::testing::InitGoogleTest(&argc, argv); return RUN_ALL_TESTS(); -} \ No newline at end of file +} diff --git a/rtp_llm/cpp/cache/connector/memory/test/MemoryDiskBlockCacheTest.cc b/rtp_llm/cpp/cache/connector/memory/test/MemoryDiskBlockCacheTest.cc new file mode 100644 index 0000000000..e95a59ed9c --- /dev/null +++ b/rtp_llm/cpp/cache/connector/memory/test/MemoryDiskBlockCacheTest.cc @@ -0,0 +1,179 @@ +#include "gtest/gtest.h" + +#include "rtp_llm/cpp/cache/connector/memory/MemoryDiskBlockCache.h" + +namespace rtp_llm::test { +namespace { + +MemoryDiskBlockCache::CacheItem memoryItem(CacheKeyType key, BlockIdxType block, bool complete = true) { + MemoryDiskBlockCache::CacheItem 
item; + item.cache_key = key; + item.backing_type = CacheBackingType::MEMORY; + item.block_index = block; + item.disk_slot = -1; + item.is_complete = complete; + return item; +} + +MemoryDiskBlockCache::CacheItem diskItem(CacheKeyType key, int32_t slot, bool complete = true) { + MemoryDiskBlockCache::CacheItem item; + item.cache_key = key; + item.backing_type = CacheBackingType::DISK; + item.block_index = NULL_BLOCK_IDX; + item.disk_slot = slot; + item.is_complete = complete; + return item; +} + +} // namespace + +TEST(MemoryDiskBlockCacheTest, ContainsAndMatchMemoryAndDisk) { + MemoryDiskBlockCache cache; + ASSERT_TRUE(cache.putCommitted(memoryItem(1, 10)).first); + ASSERT_TRUE(cache.putCommitted(diskItem(2, 20)).first); + + EXPECT_TRUE(cache.contains(1)); + EXPECT_TRUE(cache.contains(2)); + + auto mem = cache.match(1); + EXPECT_EQ(mem.backing_type, CacheBackingType::MEMORY); + EXPECT_EQ(mem.matched_index, 10); + + auto disk = cache.match(2); + EXPECT_EQ(disk.backing_type, CacheBackingType::DISK); + EXPECT_EQ(disk.disk_slot, 20); +} + +TEST(MemoryDiskBlockCacheTest, SharedAccessSeqEvictsOldestAcrossBackings) { + MemoryDiskBlockCache cache; + ASSERT_TRUE(cache.putCommitted(memoryItem(1, 10)).first); + ASSERT_TRUE(cache.putCommitted(diskItem(2, 20)).first); + + ASSERT_FALSE(isNullBlockIdx(cache.match(1).matched_index)); + + auto evicted = cache.popOldestEvictable(); + ASSERT_TRUE(evicted.has_value()); + EXPECT_EQ(evicted->cache_key, 2); + EXPECT_EQ(evicted->backing_type, CacheBackingType::DISK); +} + +TEST(MemoryDiskBlockCacheTest, KindAwareEvictionOnlyPopsRequestedKind) { + MemoryDiskBlockCache cache; + ASSERT_TRUE(cache.putCommitted(memoryItem(1, 10, false)).first); + ASSERT_TRUE(cache.putCommitted(diskItem(2, 20, false)).first); + ASSERT_TRUE(cache.putCommitted(memoryItem(3, 30, true)).first); + + auto evicted = cache.popOldestEvictable(CacheBlockKind::COMPLETE); + ASSERT_TRUE(evicted.has_value()); + EXPECT_EQ(evicted->cache_key, 3); + 
EXPECT_TRUE(evicted->is_complete); + + evicted = cache.popOldestEvictable(CacheBlockKind::INCOMPLETE); + ASSERT_TRUE(evicted.has_value()); + EXPECT_EQ(evicted->cache_key, 1); + EXPECT_FALSE(evicted->is_complete); +} + +TEST(MemoryDiskBlockCacheTest, KindAwareEvictionChoosesOldestAcrossMemoryAndDiskForSameKind) { + MemoryDiskBlockCache cache; + ASSERT_TRUE(cache.putCommitted(memoryItem(1, 10, true)).first); + ASSERT_TRUE(cache.putCommitted(diskItem(2, 20, true)).first); + ASSERT_FALSE(isNullBlockIdx(cache.match(1).matched_index)); + + auto evicted = cache.popOldestEvictable(CacheBlockKind::COMPLETE); + ASSERT_TRUE(evicted.has_value()); + EXPECT_EQ(evicted->cache_key, 2); + EXPECT_EQ(evicted->backing_type, CacheBackingType::DISK); +} + +TEST(MemoryDiskBlockCacheTest, ContainsDoesNotUpdateRecency) { + MemoryDiskBlockCache cache; + ASSERT_TRUE(cache.putCommitted(memoryItem(1, 10)).first); + ASSERT_TRUE(cache.putCommitted(diskItem(2, 20)).first); + ASSERT_TRUE(cache.contains(1)); + + auto evicted = cache.popOldestEvictable(); + ASSERT_TRUE(evicted.has_value()); + EXPECT_EQ(evicted->cache_key, 1); +} + +TEST(MemoryDiskBlockCacheTest, PartialToCompleteCanUpgradeAcrossBacking) { + MemoryDiskBlockCache cache; + ASSERT_TRUE(cache.putCommitted(memoryItem(1, 10, false)).first); + + auto [ok, popped] = cache.putCommitted(diskItem(1, 20, true)); + ASSERT_TRUE(ok); + ASSERT_TRUE(popped.has_value()); + EXPECT_EQ(popped->backing_type, CacheBackingType::MEMORY); + EXPECT_EQ(popped->block_index, 10); + + auto match = cache.match(1); + EXPECT_EQ(match.backing_type, CacheBackingType::DISK); + EXPECT_EQ(match.disk_slot, 20); + EXPECT_TRUE(match.is_complete); +} + +TEST(MemoryDiskBlockCacheTest, PartialToCompleteDoesNotReplaceInFlightItem) { + MemoryDiskBlockCache cache; + ASSERT_TRUE(cache.putCommitted(memoryItem(1, 10, false)).first); + + auto in_flight = cache.matchAndMarkInFlight(1); + EXPECT_EQ(in_flight.backing_type, CacheBackingType::MEMORY); + EXPECT_EQ(in_flight.matched_index, 
10); + + auto [ok, popped] = cache.putCommitted(diskItem(1, 20, true)); + EXPECT_FALSE(ok); + EXPECT_FALSE(popped.has_value()); + + auto match = cache.match(1); + EXPECT_EQ(match.backing_type, CacheBackingType::MEMORY); + EXPECT_EQ(match.matched_index, 10); + EXPECT_FALSE(match.is_complete); +} + +TEST(MemoryDiskBlockCacheTest, InFlightEntryIsNotEvictable) { + MemoryDiskBlockCache cache; + ASSERT_TRUE(cache.putCommitted(memoryItem(1, 10)).first); + ASSERT_TRUE(cache.putCommitted(diskItem(2, 20)).first); + ASSERT_TRUE(cache.markInFlight(1, CacheBackingType::MEMORY, 10, -1)); + + auto evicted = cache.popOldestEvictable(); + ASSERT_TRUE(evicted.has_value()); + EXPECT_EQ(evicted->cache_key, 2); + + cache.releaseInFlight(1, CacheBackingType::MEMORY, 10, -1); + evicted = cache.popOldestEvictable(); + ASSERT_TRUE(evicted.has_value()); + EXPECT_EQ(evicted->cache_key, 1); +} + +TEST(MemoryDiskBlockCacheTest, MatchAndMarkInFlightPreventsEviction) { + MemoryDiskBlockCache cache; + ASSERT_TRUE(cache.putCommitted(memoryItem(1, 10)).first); + ASSERT_TRUE(cache.putCommitted(diskItem(2, 20)).first); + + auto match = cache.matchAndMarkInFlight(1); + EXPECT_EQ(match.backing_type, CacheBackingType::MEMORY); + EXPECT_EQ(match.matched_index, 10); + + auto evicted = cache.popOldestEvictable(); + ASSERT_TRUE(evicted.has_value()); + EXPECT_EQ(evicted->cache_key, 2); + + cache.releaseInFlight(1, CacheBackingType::MEMORY, 10, -1); + evicted = cache.popOldestEvictable(); + ASSERT_TRUE(evicted.has_value()); + EXPECT_EQ(evicted->cache_key, 1); +} + +TEST(MemoryDiskBlockCacheTest, RemoveIfMatchChecksBackingAndSlot) { + MemoryDiskBlockCache cache; + ASSERT_TRUE(cache.putCommitted(diskItem(2, 20)).first); + + EXPECT_FALSE(cache.removeIfMatch(2, CacheBackingType::DISK, NULL_BLOCK_IDX, 21).has_value()); + auto removed = cache.removeIfMatch(2, CacheBackingType::DISK, NULL_BLOCK_IDX, 20); + ASSERT_TRUE(removed.has_value()); + EXPECT_FALSE(cache.contains(2)); +} + +} // namespace rtp_llm::test diff 
--git a/rtp_llm/cpp/cache/connector/memory/test/PrefixTreeMemoryBlockCacheTest.cc b/rtp_llm/cpp/cache/connector/memory/test/PrefixTreeMemoryBlockCacheTest.cc new file mode 100644 index 0000000000..8ee579204a --- /dev/null +++ b/rtp_llm/cpp/cache/connector/memory/test/PrefixTreeMemoryBlockCacheTest.cc @@ -0,0 +1,691 @@ +#include "gtest/gtest.h" + +#include +#include + +#include "rtp_llm/cpp/cache/connector/memory/PrefixTreeMemoryBlockCache.h" + +namespace rtp_llm::test { +namespace { + +BlockDependency rootDep(uint32_t ordinal = 0) { + BlockDependency dep; + dep.ordinal = ordinal; + return dep; +} + +BlockDependency childDep(CacheKeyType parent, uint32_t ordinal) { + BlockDependency dep; + dep.has_parent = true; + dep.parent_key = parent; + dep.ordinal = ordinal; + return dep; +} + +PrefixTreeMemoryBlockCache::CacheItem item(CacheKeyType key, + CacheBlockKind kind, + BlockIdxType block, + std::vector slot_valid_mask = {}, + bool is_resident = false) { + PrefixTreeMemoryBlockCache::CacheItem item; + item.cache_key = key; + item.kind = kind; + item.backing_type = CacheBackingType::MEMORY; + item.block_index = block; + item.disk_slot = -1; + item.block_size = 1024; + item.is_resident = is_resident; + item.slot_valid_mask = std::move(slot_valid_mask); + return item; +} + +PrefixTreeMemoryBlockCache::CacheItem diskItem(CacheKeyType key, + CacheBlockKind kind, + int32_t disk_slot, + std::vector slot_valid_mask = {}) { + auto result = item(key, kind, NULL_BLOCK_IDX, std::move(slot_valid_mask)); + result.backing_type = CacheBackingType::DISK; + result.block_index = NULL_BLOCK_IDX; + result.disk_slot = disk_slot; + return result; +} + +} // namespace + +TEST(PrefixTreeMemoryBlockCacheTest, ContainsAndMatchAreKindAware) { + PrefixTreeMemoryBlockCache cache; + ASSERT_TRUE(cache.putCommitted(1, rootDep(), item(1, CacheBlockKind::COMPRESSED_KV, 11)).first); + ASSERT_TRUE(cache.putCommitted(1, rootDep(), item(1, CacheBlockKind::STATE_SWA_KV, 12)).first); + + 
EXPECT_TRUE(cache.contains(1, CacheBlockKind::COMPRESSED_KV)); + EXPECT_TRUE(cache.contains(1, CacheBlockKind::STATE_SWA_KV)); + + auto compressed = cache.match(1, CacheBlockKind::COMPRESSED_KV); + ASSERT_TRUE(compressed.found); + EXPECT_EQ(compressed.block_index, 11); + + auto state = cache.match(1, CacheBlockKind::STATE_SWA_KV); + ASSERT_TRUE(state.found); + EXPECT_EQ(state.block_index, 12); +} + +TEST(PrefixTreeMemoryBlockCacheTest, DuplicateKindDoesNotBlockMissingOtherKind) { + PrefixTreeMemoryBlockCache cache; + ASSERT_TRUE(cache.putCommitted(1, rootDep(), item(1, CacheBlockKind::COMPRESSED_KV, 11)).first); + + auto duplicate = cache.putCommitted(1, rootDep(), item(1, CacheBlockKind::COMPRESSED_KV, 13)); + EXPECT_FALSE(duplicate.first); + EXPECT_FALSE(duplicate.second.has_value()); + + auto missing_kind = cache.putCommitted(1, rootDep(), item(1, CacheBlockKind::STATE_SWA_KV, 12)); + EXPECT_TRUE(missing_kind.first); + EXPECT_FALSE(missing_kind.second.has_value()); + EXPECT_EQ(cache.match(1, CacheBlockKind::COMPRESSED_KV).block_index, 11); + EXPECT_EQ(cache.match(1, CacheBlockKind::STATE_SWA_KV).block_index, 12); +} + +TEST(PrefixTreeMemoryBlockCacheTest, SlotMaskMustCoverRequestedSlots) { + PrefixTreeMemoryBlockCache cache; + ASSERT_TRUE(cache.putCommitted(1, + rootDep(), + item(1, + CacheBlockKind::STATE_SWA_KV, + 11, + /*slot_valid_mask=*/std::vector{0, 1, 0})) + .first); + + EXPECT_TRUE(cache.contains(1, CacheBlockKind::STATE_SWA_KV, std::vector{0, 1, 0})); + EXPECT_TRUE(cache.match(1, CacheBlockKind::STATE_SWA_KV, std::vector{0, 1, 0}).found); + EXPECT_FALSE(cache.contains(1, CacheBlockKind::STATE_SWA_KV, std::vector{1, 1, 0})); + EXPECT_FALSE(cache.match(1, CacheBlockKind::STATE_SWA_KV, std::vector{1, 1, 0}).found); +} + +TEST(PrefixTreeMemoryBlockCacheTest, WiderSlotMaskReplacesNarrowerBacking) { + PrefixTreeMemoryBlockCache cache; + ASSERT_TRUE(cache.putCommitted(1, + rootDep(), + item(1, + CacheBlockKind::STATE_SWA_KV, + 11, + 
/*slot_valid_mask=*/std::vector{0, 1, 0})) + .first); + + auto replacement = cache.putCommitted(1, + rootDep(), + item(1, + CacheBlockKind::STATE_SWA_KV, + 12, + /*slot_valid_mask=*/std::vector{1, 1, 0})); + ASSERT_TRUE(replacement.first); + ASSERT_TRUE(replacement.second.has_value()); + EXPECT_EQ(replacement.second->block_index, 11); + + auto matched = cache.match(1, CacheBlockKind::STATE_SWA_KV, std::vector{1, 1, 0}); + ASSERT_TRUE(matched.found); + EXPECT_EQ(matched.block_index, 12); +} + +TEST(PrefixTreeMemoryBlockCacheTest, NonCoveringSlotMaskDoesNotReplaceBacking) { + PrefixTreeMemoryBlockCache cache; + ASSERT_TRUE(cache.putCommitted(1, + rootDep(), + item(1, + CacheBlockKind::STATE_SWA_KV, + 11, + /*slot_valid_mask=*/std::vector{0, 1, 0})) + .first); + + auto replacement = cache.putCommitted(1, + rootDep(), + item(1, + CacheBlockKind::STATE_SWA_KV, + 12, + /*slot_valid_mask=*/std::vector{1, 0, 0})); + EXPECT_FALSE(replacement.first); + EXPECT_FALSE(replacement.second.has_value()); + + EXPECT_TRUE(cache.contains(1, CacheBlockKind::STATE_SWA_KV, std::vector{0, 1, 0})); + EXPECT_FALSE(cache.contains(1, CacheBlockKind::STATE_SWA_KV, std::vector{1, 1, 0})); + EXPECT_EQ(cache.match(1, CacheBlockKind::STATE_SWA_KV, std::vector{0, 1, 0}).block_index, 11); +} + +TEST(PrefixTreeMemoryBlockCacheTest, SameSlotMaskDuplicateDoesNotReplaceBacking) { + PrefixTreeMemoryBlockCache cache; + ASSERT_TRUE(cache.putCommitted(1, + rootDep(), + item(1, + CacheBlockKind::STATE_SWA_KV, + 11, + /*slot_valid_mask=*/std::vector{0, 1, 0})) + .first); + + auto duplicate = cache.putCommitted(1, + rootDep(), + item(1, + CacheBlockKind::STATE_SWA_KV, + 12, + /*slot_valid_mask=*/std::vector{0, 1, 0})); + EXPECT_FALSE(duplicate.first); + EXPECT_FALSE(duplicate.second.has_value()); + EXPECT_EQ(cache.match(1, CacheBlockKind::STATE_SWA_KV, std::vector{0, 1, 0}).block_index, 11); +} + +TEST(PrefixTreeMemoryBlockCacheTest, MarkInFlightRejectsNonCoveringMask) { + PrefixTreeMemoryBlockCache cache; + 
ASSERT_TRUE(cache.putCommitted(1, + rootDep(), + item(1, + CacheBlockKind::STATE_SWA_KV, + 11, + /*slot_valid_mask=*/std::vector{0, 1, 0})) + .first); + + auto in_flight = cache.matchAndMarkInFlight(1, CacheBlockKind::STATE_SWA_KV, std::vector{1, 1, 0}); + EXPECT_FALSE(in_flight.found); + + auto evicted = cache.popOldestEvictable(CacheBlockKind::STATE_SWA_KV); + ASSERT_TRUE(evicted.has_value()); + EXPECT_EQ(evicted->block_index, 11); +} + +TEST(PrefixTreeMemoryBlockCacheTest, InFlightCanBeReplacedByCoveringBacking) { + PrefixTreeMemoryBlockCache cache; + ASSERT_TRUE(cache.putCommitted(1, + rootDep(), + item(1, + CacheBlockKind::STATE_SWA_KV, + 11, + /*slot_valid_mask=*/std::vector{0, 1, 0})) + .first); + + auto in_flight = cache.matchAndMarkInFlight(1, CacheBlockKind::STATE_SWA_KV, std::vector{0, 1, 0}); + ASSERT_TRUE(in_flight.found); + + auto replacement = cache.putCommitted(1, + rootDep(), + item(1, + CacheBlockKind::STATE_SWA_KV, + 12, + /*slot_valid_mask=*/std::vector{1, 1, 0})); + ASSERT_TRUE(replacement.first); + EXPECT_FALSE(replacement.second.has_value()); + + auto matched = cache.match(1, CacheBlockKind::STATE_SWA_KV, std::vector{1, 1, 0}); + ASSERT_TRUE(matched.found); + EXPECT_EQ(matched.block_index, 12); + + auto retired = cache.releaseInFlight(1, + CacheBlockKind::STATE_SWA_KV, + CacheBackingType::MEMORY, + in_flight.block_index, + in_flight.disk_slot, + in_flight.generation); + ASSERT_TRUE(retired.has_value()); + EXPECT_EQ(retired->block_index, 11); + matched = cache.match(1, CacheBlockKind::STATE_SWA_KV, std::vector{1, 1, 0}); + ASSERT_TRUE(matched.found); + EXPECT_EQ(matched.block_index, 12); +} + +TEST(PrefixTreeMemoryBlockCacheTest, RetiredItemRequiresAllInFlightReleases) { + PrefixTreeMemoryBlockCache cache; + ASSERT_TRUE(cache.putCommitted(1, + rootDep(), + item(1, + CacheBlockKind::STATE_SWA_KV, + 11, + /*slot_valid_mask=*/std::vector{0, 1, 0})) + .first); + + auto first = cache.matchAndMarkInFlight(1, CacheBlockKind::STATE_SWA_KV, 
std::vector{0, 1, 0}); + auto second = cache.matchAndMarkInFlight(1, CacheBlockKind::STATE_SWA_KV, std::vector{0, 1, 0}); + ASSERT_TRUE(first.found); + ASSERT_TRUE(second.found); + + auto replacement = cache.putCommitted(1, + rootDep(), + item(1, + CacheBlockKind::STATE_SWA_KV, + 12, + /*slot_valid_mask=*/std::vector{1, 1, 0})); + ASSERT_TRUE(replacement.first); + EXPECT_FALSE(replacement.second.has_value()); + + auto retired = cache.releaseInFlight(1, + CacheBlockKind::STATE_SWA_KV, + CacheBackingType::MEMORY, + first.block_index, + first.disk_slot, + first.generation); + EXPECT_FALSE(retired.has_value()); + + retired = cache.releaseInFlight(1, + CacheBlockKind::STATE_SWA_KV, + CacheBackingType::MEMORY, + second.block_index, + second.disk_slot, + second.generation); + ASSERT_TRUE(retired.has_value()); + EXPECT_EQ(retired->block_index, 11); + EXPECT_EQ(cache.match(1, CacheBlockKind::STATE_SWA_KV, std::vector{1, 1, 0}).block_index, 12); +} + +TEST(PrefixTreeMemoryBlockCacheTest, MultipleRetiredItemsReleaseOutOfOrder) { + PrefixTreeMemoryBlockCache cache; + ASSERT_TRUE(cache.putCommitted(1, + rootDep(), + item(1, + CacheBlockKind::STATE_SWA_KV, + 11, + /*slot_valid_mask=*/std::vector{1, 0, 0})) + .first); + auto old_in_flight = cache.matchAndMarkInFlight(1, CacheBlockKind::STATE_SWA_KV, std::vector{1, 0, 0}); + ASSERT_TRUE(old_in_flight.found); + + ASSERT_TRUE(cache.putCommitted(1, + rootDep(), + item(1, + CacheBlockKind::STATE_SWA_KV, + 12, + /*slot_valid_mask=*/std::vector{1, 1, 0})) + .first); + auto middle_in_flight = cache.matchAndMarkInFlight(1, CacheBlockKind::STATE_SWA_KV, std::vector{1, 1, 0}); + ASSERT_TRUE(middle_in_flight.found); + + ASSERT_TRUE(cache.putCommitted(1, + rootDep(), + item(1, + CacheBlockKind::STATE_SWA_KV, + 13, + /*slot_valid_mask=*/std::vector{1, 1, 1})) + .first); + + auto retired = cache.releaseInFlight(1, + CacheBlockKind::STATE_SWA_KV, + CacheBackingType::MEMORY, + middle_in_flight.block_index, + middle_in_flight.disk_slot, + 
middle_in_flight.generation); + ASSERT_TRUE(retired.has_value()); + EXPECT_EQ(retired->block_index, 12); + + retired = cache.releaseInFlight(1, + CacheBlockKind::STATE_SWA_KV, + CacheBackingType::MEMORY, + old_in_flight.block_index, + old_in_flight.disk_slot, + old_in_flight.generation); + ASSERT_TRUE(retired.has_value()); + EXPECT_EQ(retired->block_index, 11); + EXPECT_EQ(cache.match(1, CacheBlockKind::STATE_SWA_KV, std::vector{1, 1, 1}).block_index, 13); +} + +TEST(PrefixTreeMemoryBlockCacheTest, EvictionIsPerKindAndStopsAtBranchPoint) { + PrefixTreeMemoryBlockCache cache; + ASSERT_TRUE(cache.putCommitted(1, rootDep(0), item(1, CacheBlockKind::COMPRESSED_KV, 11)).first); + ASSERT_TRUE(cache.putCommitted(2, childDep(1, 1), item(2, CacheBlockKind::COMPRESSED_KV, 12)).first); + ASSERT_TRUE(cache.putCommitted(3, childDep(1, 2), item(3, CacheBlockKind::COMPRESSED_KV, 13)).first); + + auto evicted = cache.popOldestEvictable(CacheBlockKind::COMPRESSED_KV); + ASSERT_TRUE(evicted.has_value()); + EXPECT_EQ(evicted->cache_key, 2); + EXPECT_FALSE(cache.contains(2, CacheBlockKind::COMPRESSED_KV)); + EXPECT_TRUE(cache.contains(1, CacheBlockKind::COMPRESSED_KV)); + EXPECT_TRUE(cache.contains(3, CacheBlockKind::COMPRESSED_KV)); +} + +TEST(PrefixTreeMemoryBlockCacheTest, PrefixTreeLinksChildInsertedBeforeParent) { + PrefixTreeMemoryBlockCache cache; + ASSERT_TRUE(cache.putCommitted(2, childDep(1, 1), item(2, CacheBlockKind::COMPRESSED_KV, 12)).first); + ASSERT_TRUE(cache.putCommitted(1, rootDep(0), item(1, CacheBlockKind::COMPRESSED_KV, 11)).first); + + auto evicted = cache.popOldestEvictable(CacheBlockKind::COMPRESSED_KV); + ASSERT_TRUE(evicted.has_value()); + EXPECT_EQ(evicted->cache_key, 2); + EXPECT_FALSE(cache.contains(2, CacheBlockKind::COMPRESSED_KV)); + EXPECT_TRUE(cache.contains(1, CacheBlockKind::COMPRESSED_KV)); + + evicted = cache.popOldestEvictable(CacheBlockKind::COMPRESSED_KV); + ASSERT_TRUE(evicted.has_value()); + EXPECT_EQ(evicted->cache_key, 1); + 
EXPECT_FALSE(cache.contains(1, CacheBlockKind::COMPRESSED_KV)); +} + +TEST(PrefixTreeMemoryBlockCacheTest, ReparentMovesSubtreeRefFromOldToNewParent) { + PrefixTreeMemoryBlockCache cache; + ASSERT_TRUE(cache.putCommitted(1, rootDep(0), item(1, CacheBlockKind::COMPRESSED_KV, 11)).first); + ASSERT_TRUE(cache.putCommitted(3, rootDep(0), item(3, CacheBlockKind::COMPRESSED_KV, 13)).first); + ASSERT_TRUE(cache.putCommitted(2, childDep(1, 1), item(2, CacheBlockKind::COMPRESSED_KV, 12)).first); + + auto reparent = cache.putCommitted(2, childDep(3, 1), item(2, CacheBlockKind::COMPRESSED_KV, 14)); + EXPECT_FALSE(reparent.first); + EXPECT_FALSE(reparent.second.has_value()); + + auto evicted = cache.popOldestEvictable(CacheBlockKind::COMPRESSED_KV); + ASSERT_TRUE(evicted.has_value()); + EXPECT_EQ(evicted->cache_key, 1); + + evicted = cache.popOldestEvictable(CacheBlockKind::COMPRESSED_KV); + ASSERT_TRUE(evicted.has_value()); + EXPECT_EQ(evicted->cache_key, 2); + + evicted = cache.popOldestEvictable(CacheBlockKind::COMPRESSED_KV); + ASSERT_TRUE(evicted.has_value()); + EXPECT_EQ(evicted->cache_key, 3); +} + +TEST(PrefixTreeMemoryBlockCacheTest, MultipleOrphanChildrenAttachOnParentInsert) { + PrefixTreeMemoryBlockCache cache; + ASSERT_TRUE(cache.putCommitted(2, childDep(1, 1), item(2, CacheBlockKind::COMPRESSED_KV, 12)).first); + ASSERT_TRUE(cache.putCommitted(3, childDep(1, 2), item(3, CacheBlockKind::COMPRESSED_KV, 13)).first); + ASSERT_TRUE(cache.putCommitted(1, rootDep(0), item(1, CacheBlockKind::COMPRESSED_KV, 11)).first); + + auto evicted = cache.popOldestEvictable(CacheBlockKind::COMPRESSED_KV); + ASSERT_TRUE(evicted.has_value()); + EXPECT_EQ(evicted->cache_key, 2); + EXPECT_TRUE(cache.contains(1, CacheBlockKind::COMPRESSED_KV)); + EXPECT_TRUE(cache.contains(3, CacheBlockKind::COMPRESSED_KV)); + + evicted = cache.popOldestEvictable(CacheBlockKind::COMPRESSED_KV); + ASSERT_TRUE(evicted.has_value()); + EXPECT_EQ(evicted->cache_key, 3); + + evicted = 
cache.popOldestEvictable(CacheBlockKind::COMPRESSED_KV); + ASSERT_TRUE(evicted.has_value()); + EXPECT_EQ(evicted->cache_key, 1); +} + +TEST(PrefixTreeMemoryBlockCacheTest, ReparentPendingOrphanMovesPendingEntry) { + PrefixTreeMemoryBlockCache cache; + ASSERT_TRUE(cache.putCommitted(2, childDep(1, 1), item(2, CacheBlockKind::COMPRESSED_KV, 12)).first); + auto reparent = cache.putCommitted(2, childDep(3, 1), item(2, CacheBlockKind::COMPRESSED_KV, 14)); + EXPECT_FALSE(reparent.first); + ASSERT_TRUE(cache.putCommitted(3, rootDep(0), item(3, CacheBlockKind::COMPRESSED_KV, 13)).first); + ASSERT_TRUE(cache.putCommitted(1, rootDep(0), item(1, CacheBlockKind::COMPRESSED_KV, 11)).first); + + auto evicted = cache.popOldestEvictable(CacheBlockKind::COMPRESSED_KV); + ASSERT_TRUE(evicted.has_value()); + EXPECT_EQ(evicted->cache_key, 2); + + evicted = cache.popOldestEvictable(CacheBlockKind::COMPRESSED_KV); + ASSERT_TRUE(evicted.has_value()); + EXPECT_EQ(evicted->cache_key, 3); + + evicted = cache.popOldestEvictable(CacheBlockKind::COMPRESSED_KV); + ASSERT_TRUE(evicted.has_value()); + EXPECT_EQ(evicted->cache_key, 1); +} + +TEST(PrefixTreeMemoryBlockCacheTest, BranchParentBecomesEvictableAfterAllChildrenGone) { + PrefixTreeMemoryBlockCache cache; + ASSERT_TRUE(cache.putCommitted(1, rootDep(0), item(1, CacheBlockKind::COMPRESSED_KV, 11)).first); + ASSERT_TRUE(cache.putCommitted(2, childDep(1, 1), item(2, CacheBlockKind::COMPRESSED_KV, 12)).first); + ASSERT_TRUE(cache.putCommitted(3, childDep(1, 2), item(3, CacheBlockKind::COMPRESSED_KV, 13)).first); + + auto evicted = cache.popOldestEvictable(CacheBlockKind::COMPRESSED_KV); + ASSERT_TRUE(evicted.has_value()); + EXPECT_EQ(evicted->cache_key, 2); + + evicted = cache.popOldestEvictable(CacheBlockKind::COMPRESSED_KV); + ASSERT_TRUE(evicted.has_value()); + EXPECT_EQ(evicted->cache_key, 3); + + evicted = cache.popOldestEvictable(CacheBlockKind::COMPRESSED_KV); + ASSERT_TRUE(evicted.has_value()); + EXPECT_EQ(evicted->cache_key, 1); +} + 
+TEST(PrefixTreeMemoryBlockCacheTest, ResidentItemIsMatchableButNeverEvictable) { + PrefixTreeMemoryBlockCache cache; + ASSERT_TRUE(cache.putCommitted(1, + rootDep(0), + item(1, + CacheBlockKind::COMPRESSED_KV, + 11, + /*slot_valid_mask=*/{}, + /*is_resident=*/true)) + .first); + + EXPECT_TRUE(cache.match(1, CacheBlockKind::COMPRESSED_KV).found); + EXPECT_FALSE(cache.popOldestEvictable(CacheBlockKind::COMPRESSED_KV).has_value()); +} + +TEST(PrefixTreeMemoryBlockCacheTest, ParentDetachPreservesChildLeafAccounting) { + PrefixTreeMemoryBlockCache cache; + ASSERT_TRUE(cache.putCommitted(1, rootDep(0), item(1, CacheBlockKind::COMPRESSED_KV, 11)).first); + ASSERT_TRUE(cache.putCommitted(2, childDep(1, 1), item(2, CacheBlockKind::COMPRESSED_KV, 12)).first); + + auto parent = cache.matchAndMarkInFlight(1, CacheBlockKind::COMPRESSED_KV); + ASSERT_TRUE(parent.found); + EXPECT_FALSE(cache.detachIfMatch(1, + CacheBlockKind::COMPRESSED_KV, + CacheBackingType::MEMORY, + parent.block_index, + parent.disk_slot, + parent.generation) + .has_value()); + auto retired_parent = cache.releaseInFlight(1, + CacheBlockKind::COMPRESSED_KV, + CacheBackingType::MEMORY, + parent.block_index, + parent.disk_slot, + parent.generation); + ASSERT_TRUE(retired_parent.has_value()); + + ASSERT_TRUE(cache.putCommitted(1, rootDep(0), item(1, CacheBlockKind::COMPRESSED_KV, 13)).first); + + auto evicted = cache.popOldestEvictable(CacheBlockKind::COMPRESSED_KV); + ASSERT_TRUE(evicted.has_value()); + EXPECT_EQ(evicted->cache_key, 2); + EXPECT_TRUE(cache.contains(1, CacheBlockKind::COMPRESSED_KV)); + EXPECT_FALSE(cache.contains(2, CacheBlockKind::COMPRESSED_KV)); +} + +TEST(PrefixTreeMemoryBlockCacheTest, KindLeafAccountingIsIndependent) { + PrefixTreeMemoryBlockCache cache; + ASSERT_TRUE(cache.putCommitted(1, rootDep(0), item(1, CacheBlockKind::COMPRESSED_KV, 11)).first); + ASSERT_TRUE(cache.putCommitted(2, childDep(1, 1), item(2, CacheBlockKind::COMPRESSED_KV, 12)).first); + 
ASSERT_TRUE(cache.putCommitted(1, rootDep(0), item(1, CacheBlockKind::STATE_SWA_KV, 21)).first); + + auto evicted = cache.popOldestEvictable(CacheBlockKind::STATE_SWA_KV); + ASSERT_TRUE(evicted.has_value()); + EXPECT_EQ(evicted->cache_key, 1); + EXPECT_FALSE(cache.contains(1, CacheBlockKind::STATE_SWA_KV)); + EXPECT_TRUE(cache.contains(1, CacheBlockKind::COMPRESSED_KV)); + EXPECT_TRUE(cache.contains(2, CacheBlockKind::COMPRESSED_KV)); +} + +TEST(PrefixTreeMemoryBlockCacheTest, DetachThenReplaceDoesNotReturnDetachedBackingAgain) { + PrefixTreeMemoryBlockCache cache; + ASSERT_TRUE(cache.putCommitted(1, rootDep(), item(1, CacheBlockKind::COMPRESSED_KV, 11)).first); + + auto in_flight = cache.matchAndMarkInFlight(1, CacheBlockKind::COMPRESSED_KV); + ASSERT_TRUE(in_flight.found); + ASSERT_EQ(in_flight.block_index, 11); + + auto detached = cache.detachIfMatch(1, + CacheBlockKind::COMPRESSED_KV, + CacheBackingType::MEMORY, + in_flight.block_index, + in_flight.disk_slot, + in_flight.generation); + EXPECT_FALSE(detached.has_value()); + + auto replacement = cache.putCommitted(1, rootDep(), item(1, CacheBlockKind::COMPRESSED_KV, 12)); + EXPECT_TRUE(replacement.first); + EXPECT_FALSE(replacement.second.has_value()); + + auto retired = cache.releaseInFlight(1, + CacheBlockKind::COMPRESSED_KV, + CacheBackingType::MEMORY, + in_flight.block_index, + in_flight.disk_slot, + in_flight.generation); + ASSERT_TRUE(retired.has_value()); + EXPECT_EQ(retired->block_index, 11); + auto matched = cache.match(1, CacheBlockKind::COMPRESSED_KV); + ASSERT_TRUE(matched.found); + EXPECT_EQ(matched.block_index, 12); +} + +TEST(PrefixTreeMemoryBlockCacheTest, DetachPrunesEmptyLeafButKeepsStructuralParent) { + PrefixTreeMemoryBlockCache cache; + ASSERT_TRUE(cache.putCommitted(1, rootDep(0), item(1, CacheBlockKind::COMPRESSED_KV, 11)).first); + ASSERT_TRUE(cache.putCommitted(2, childDep(1, 1), item(2, CacheBlockKind::COMPRESSED_KV, 12)).first); + + auto child = cache.matchAndMarkInFlight(2, 
CacheBlockKind::COMPRESSED_KV); + ASSERT_TRUE(child.found); + auto detached_child = cache.detachIfMatch(2, + CacheBlockKind::COMPRESSED_KV, + CacheBackingType::MEMORY, + child.block_index, + child.disk_slot, + child.generation); + EXPECT_FALSE(detached_child.has_value()); + detached_child = cache.releaseInFlight(2, + CacheBlockKind::COMPRESSED_KV, + CacheBackingType::MEMORY, + child.block_index, + child.disk_slot, + child.generation); + ASSERT_TRUE(detached_child.has_value()); + + EXPECT_FALSE(cache.contains(2, CacheBlockKind::COMPRESSED_KV)); + EXPECT_TRUE(cache.contains(1, CacheBlockKind::COMPRESSED_KV)); + EXPECT_EQ(cache.cacheKeys(), (CacheKeysType{1})); + auto status_keys = cache.cacheKeysUnorderedForStatus(); + std::sort(status_keys.begin(), status_keys.end()); + EXPECT_EQ(status_keys, (CacheKeysType{1})); +} + +TEST(PrefixTreeMemoryBlockCacheTest, StatusCacheKeysAreUnorderedAndDeduplicated) { + PrefixTreeMemoryBlockCache cache; + ASSERT_TRUE(cache.putCommitted(1, rootDep(0), item(1, CacheBlockKind::COMPRESSED_KV, 11)).first); + ASSERT_TRUE(cache.putCommitted(1, rootDep(0), item(1, CacheBlockKind::STATE_SWA_KV, 12)).first); + ASSERT_TRUE(cache.putCommitted(2, childDep(1, 1), item(2, CacheBlockKind::COMPRESSED_KV, 21)).first); + ASSERT_TRUE(cache.putCommitted(3, childDep(2, 2), item(3, CacheBlockKind::STATE_SWA_KV, 31)).first); + + auto status_keys = cache.cacheKeysUnorderedForStatus(); + std::sort(status_keys.begin(), status_keys.end()); + + EXPECT_EQ(status_keys, (CacheKeysType{1, 2, 3})); + EXPECT_EQ(status_keys.size(), cache.cacheKeys().size()); +} + +TEST(PrefixTreeMemoryBlockCacheTest, ParentBecomesEvictableAfterChildDetachEvenAfterTouchWhileNonLeaf) { + PrefixTreeMemoryBlockCache cache; + ASSERT_TRUE(cache.putCommitted(1, rootDep(0), item(1, CacheBlockKind::COMPRESSED_KV, 11)).first); + ASSERT_TRUE(cache.putCommitted(2, childDep(1, 1), item(2, CacheBlockKind::COMPRESSED_KV, 12)).first); + + ASSERT_TRUE(cache.match(1, 
CacheBlockKind::COMPRESSED_KV).found); + + auto child = cache.matchAndMarkInFlight(2, CacheBlockKind::COMPRESSED_KV); + ASSERT_TRUE(child.found); + auto detached_child = cache.detachIfMatch(2, + CacheBlockKind::COMPRESSED_KV, + CacheBackingType::MEMORY, + child.block_index, + child.disk_slot, + child.generation); + EXPECT_FALSE(detached_child.has_value()); + detached_child = cache.releaseInFlight(2, + CacheBlockKind::COMPRESSED_KV, + CacheBackingType::MEMORY, + child.block_index, + child.disk_slot, + child.generation); + ASSERT_TRUE(detached_child.has_value()); + + auto evicted = cache.popOldestEvictable(CacheBlockKind::COMPRESSED_KV); + ASSERT_TRUE(evicted.has_value()); + EXPECT_EQ(evicted->cache_key, 1); +} + +TEST(PrefixTreeMemoryBlockCacheTest, InFlightReleaseRestoresEvictability) { + PrefixTreeMemoryBlockCache cache; + ASSERT_TRUE(cache.putCommitted(1, rootDep(), item(1, CacheBlockKind::COMPRESSED_KV, 11)).first); + + auto in_flight = cache.matchAndMarkInFlight(1, CacheBlockKind::COMPRESSED_KV); + ASSERT_TRUE(in_flight.found); + EXPECT_FALSE(cache.popOldestEvictable(CacheBlockKind::COMPRESSED_KV).has_value()); + + cache.releaseInFlight(1, + CacheBlockKind::COMPRESSED_KV, + CacheBackingType::MEMORY, + in_flight.block_index, + in_flight.disk_slot, + in_flight.generation); + auto evicted = cache.popOldestEvictable(CacheBlockKind::COMPRESSED_KV); + ASSERT_TRUE(evicted.has_value()); + EXPECT_EQ(evicted->cache_key, 1); +} + +TEST(PrefixTreeMemoryBlockCacheTest, DiskBackingMatchesAndEvictsByBacking) { + PrefixTreeMemoryBlockCache cache; + ASSERT_TRUE(cache.putCommitted(1, rootDep(), diskItem(1, CacheBlockKind::COMPRESSED_KV, 7)).first); + ASSERT_TRUE(cache.putCommitted(2, rootDep(), item(2, CacheBlockKind::COMPRESSED_KV, 22)).first); + + auto matched = cache.matchAndMarkInFlight(1, CacheBlockKind::COMPRESSED_KV); + ASSERT_TRUE(matched.found); + EXPECT_EQ(matched.backing_type, CacheBackingType::DISK); + EXPECT_EQ(matched.disk_slot, 7); + 
EXPECT_FALSE(cache.popOldestEvictable(CacheBlockKind::COMPRESSED_KV, CacheBackingType::DISK).has_value()); + + auto released = cache.releaseInFlight(1, + CacheBlockKind::COMPRESSED_KV, + CacheBackingType::DISK, + matched.block_index, + matched.disk_slot, + matched.generation); + EXPECT_FALSE(released.has_value()); + + auto evicted_disk = cache.popOldestEvictable(CacheBlockKind::COMPRESSED_KV, CacheBackingType::DISK); + ASSERT_TRUE(evicted_disk.has_value()); + EXPECT_EQ(evicted_disk->cache_key, 1); + EXPECT_EQ(evicted_disk->disk_slot, 7); + + auto evicted_mem = cache.popOldestEvictable(CacheBlockKind::COMPRESSED_KV, CacheBackingType::MEMORY); + ASSERT_TRUE(evicted_mem.has_value()); + EXPECT_EQ(evicted_mem->cache_key, 2); + EXPECT_EQ(evicted_mem->block_index, 22); +} + +TEST(PrefixTreeMemoryBlockCacheTest, StateIndependentEvictionDropsDeepestNonTailState) { + PrefixTreeMemoryBlockCache cache; + ASSERT_TRUE(cache.putCommitted(1, rootDep(), item(1, CacheBlockKind::COMPRESSED_KV, 11)).first); + ASSERT_TRUE(cache.putCommitted(1, rootDep(), item(1, CacheBlockKind::STATE_SWA_KV, 101)).first); + ASSERT_TRUE(cache.putCommitted(2, childDep(1, 1), item(2, CacheBlockKind::COMPRESSED_KV, 12)).first); + ASSERT_TRUE(cache.putCommitted(2, childDep(1, 1), item(2, CacheBlockKind::STATE_SWA_KV, 102)).first); + ASSERT_TRUE(cache.putCommitted(3, childDep(2, 2), item(3, CacheBlockKind::COMPRESSED_KV, 13)).first); + ASSERT_TRUE(cache.putCommitted(3, childDep(2, 2), item(3, CacheBlockKind::STATE_SWA_KV, 103)).first); + + auto evicted = cache.popOldestStateOrChainEvictable(CacheBackingType::MEMORY); + + ASSERT_EQ(evicted.size(), 1u); + EXPECT_EQ(evicted[0].cache_key, 2); + EXPECT_EQ(evicted[0].kind, CacheBlockKind::STATE_SWA_KV); + EXPECT_EQ(evicted[0].block_index, 102); + EXPECT_TRUE(cache.contains(2, CacheBlockKind::COMPRESSED_KV)); + EXPECT_FALSE(cache.contains(2, CacheBlockKind::STATE_SWA_KV)); + EXPECT_TRUE(cache.contains(3, CacheBlockKind::STATE_SWA_KV)); +} + 
+TEST(PrefixTreeMemoryBlockCacheTest, StateIndependentEvictionFallsBackToWholeChain) { + PrefixTreeMemoryBlockCache cache; + ASSERT_TRUE(cache.putCommitted(1, rootDep(), item(1, CacheBlockKind::COMPRESSED_KV, 11)).first); + ASSERT_TRUE(cache.putCommitted(2, childDep(1, 1), item(2, CacheBlockKind::COMPRESSED_KV, 12)).first); + ASSERT_TRUE(cache.putCommitted(2, childDep(1, 1), item(2, CacheBlockKind::STATE_SWA_KV, 102)).first); + + auto evicted = cache.popOldestStateOrChainEvictable(CacheBackingType::MEMORY); + + ASSERT_EQ(evicted.size(), 3u); + EXPECT_EQ(evicted[0].cache_key, 2); + EXPECT_EQ(evicted[0].kind, CacheBlockKind::COMPRESSED_KV); + EXPECT_EQ(evicted[1].cache_key, 2); + EXPECT_EQ(evicted[1].kind, CacheBlockKind::STATE_SWA_KV); + EXPECT_EQ(evicted[2].cache_key, 1); + EXPECT_EQ(evicted[2].kind, CacheBlockKind::COMPRESSED_KV); + EXPECT_EQ(cache.size(), 0u); +} + +} // namespace rtp_llm::test diff --git a/rtp_llm/cpp/cache/connector/memory/test/mock/MockKVCacheMemoryConnector.h b/rtp_llm/cpp/cache/connector/memory/test/mock/MockKVCacheMemoryConnector.h index a092c752e9..f33d587f67 100644 --- a/rtp_llm/cpp/cache/connector/memory/test/mock/MockKVCacheMemoryConnector.h +++ b/rtp_llm/cpp/cache/connector/memory/test/mock/MockKVCacheMemoryConnector.h @@ -8,6 +8,14 @@ namespace rtp_llm { class MockKVCacheMemoryConnector: public KVCacheMemoryConnector { public: + MockKVCacheMemoryConnector(const CacheConfig& cache_config, + const KVCacheConfig& kv_cache_config, + const ParallelismConfig& parallelism_config, + const std::shared_ptr& allocator, + const std::vector& worker_addrs, + const kmonitor::MetricsReporterPtr& metrics_reporter): + KVCacheMemoryConnector( + cache_config, kv_cache_config, parallelism_config, allocator, worker_addrs, metrics_reporter) {} MockKVCacheMemoryConnector(const CacheConfig& cache_config, const KVCacheConfig& kv_cache_config, const std::shared_ptr& allocator, diff --git a/rtp_llm/cpp/cache/connector/p2p/LayerBlockConverterImpl.h 
b/rtp_llm/cpp/cache/connector/p2p/LayerBlockConverterImpl.h index 360bf1c0bd..84b597a343 100644 --- a/rtp_llm/cpp/cache/connector/p2p/LayerBlockConverterImpl.h +++ b/rtp_llm/cpp/cache/connector/p2p/LayerBlockConverterImpl.h @@ -3,7 +3,7 @@ #include #include "rtp_llm/cpp/cache/connector/p2p/LayerBlockConverter.h" -#include "rtp_llm/cpp/cache/KVCacheAllocator.h" +#include "rtp_llm/cpp/cache/allocator/KVCacheAllocator.h" #include "rtp_llm/cpp/cache/BlockInfo.h" namespace rtp_llm { diff --git a/rtp_llm/cpp/cache/connector/p2p/test/P2PBroadcastClientTest.cc b/rtp_llm/cpp/cache/connector/p2p/test/P2PBroadcastClientTest.cc index 9c6d6a3084..aa8f4cacce 100644 --- a/rtp_llm/cpp/cache/connector/p2p/test/P2PBroadcastClientTest.cc +++ b/rtp_llm/cpp/cache/connector/p2p/test/P2PBroadcastClientTest.cc @@ -100,14 +100,9 @@ TEST_F(P2PBroadcastClientTest, Broadcast_ReturnNotNull_AllRequestsSuccess) { } TEST_F(P2PBroadcastClientTest, Broadcast_ReturnNotNull_Timeout) { - // 设置服务器延迟响应 - for (auto& server : servers_) { - server->service()->setSleepMillis(200); - } - std::string unique_key = "test_broadcast_timeout"; int64_t request_id = 1002; - int64_t deadline_ms = currentTimeMs() + 10; // 很短的超时时间 + int64_t deadline_ms = currentTimeMs() - 1; std::vector> layer_cache_buffers; layer_cache_buffers.push_back(createLayerCacheBuffer(0, 2)); @@ -115,18 +110,13 @@ TEST_F(P2PBroadcastClientTest, Broadcast_ReturnNotNull_Timeout) { std::vector> decode_transfer_servers; decode_transfer_servers.push_back({"127.0.0.1", 12345}); - // 执行 broadcast auto result = client_->broadcast(request_id, layer_cache_buffers, decode_transfer_servers, unique_key, deadline_ms, P2PConnectorBroadcastType::READ); - - ASSERT_NE(result, nullptr); - - // broadcast gRPC 超时时 BroadcastManager::waitDone 抛 RTPException - EXPECT_THROW(waitDone(result, 500), RTPException); + EXPECT_EQ(result, nullptr); } TEST_F(P2PBroadcastClientTest, Broadcast_ReturnNotNull_PartialResponseFailed) { diff --git 
a/rtp_llm/cpp/cache/connector/p2p/test/P2PConnectorSchedulerTest.cc b/rtp_llm/cpp/cache/connector/p2p/test/P2PConnectorSchedulerTest.cc index 3a2ffe6842..c4c398632d 100644 --- a/rtp_llm/cpp/cache/connector/p2p/test/P2PConnectorSchedulerTest.cc +++ b/rtp_llm/cpp/cache/connector/p2p/test/P2PConnectorSchedulerTest.cc @@ -51,11 +51,11 @@ class P2PConnectorSchedulerTest: public ::testing::Test { // 创建有效的 KVCacheResource(使用 initGroups + groupBlocks/blocks/cacheKeys 公开 API) KVCacheResourcePtr createValidKVCacheResource(int num_layers = 2, int blocks_per_layer = 2) { auto resource = std::make_shared(); - std::vector layer_to_group(num_layers); + std::vector> layer_group_ids(num_layers); for (int i = 0; i < num_layers; ++i) { - layer_to_group[i] = i; + layer_group_ids[i] = {i}; } - resource->initGroups(num_layers, num_layers, layer_to_group); + resource->initGroups(num_layers, num_layers, layer_group_ids); for (int layer_id = 0; layer_id < num_layers; ++layer_id) { for (int i = 0; i < blocks_per_layer; ++i) { @@ -188,23 +188,18 @@ TEST_F(P2PConnectorSchedulerTest, HandleRead_ReturnError_BroadcastPartialFailed) } } -// 测试: broadcast worker 慢于 gRPC deadline,checkDone 路径抛 RTPException -TEST_F(P2PConnectorSchedulerTest, HandleRead_ThrowException_BroadcastTimeout) { - for (auto& server : tp_broadcast_servers_) { - server->service()->setSleepMillis(500); // 延迟 500ms - break; - } - +// 测试: broadcast 已超过 deadline,返回超时错误 +TEST_F(P2PConnectorSchedulerTest, HandleRead_ReturnError_BroadcastTimeout) { auto valid_resource = createValidKVCacheResource(2, 2); std::vector> decode_transfer_servers; decode_transfer_servers.push_back({"127.0.0.1", 12345}); - auto deadline_ms = currentTimeMs() + 50; + auto deadline_ms = currentTimeMs() - 1; - EXPECT_THROW( - scheduler_->sendKVCache(valid_resource, "test_broadcast_timeout", 1004, decode_transfer_servers, deadline_ms), - RTPException); + auto error_info = + scheduler_->sendKVCache(valid_resource, "test_broadcast_timeout", 1004, 
decode_transfer_servers, deadline_ms); + EXPECT_TRUE(error_info.hasError()); } // 测试: handleRead 被 client 取消, 返回失败 @@ -420,21 +415,14 @@ TEST_F(P2PConnectorSchedulerTest, AsyncRead_ReturnFalse_PrefillTimeout) { EXPECT_EQ(prefill_server_->service()->getStartLoadCallCount(), 1); } -// 测试: broadcast worker 慢于 gRPC deadline,checkDone 抛 RTPException -TEST_F(P2PConnectorSchedulerTest, AsyncRead_ThrowException_BroadcastTimeout) { - tp_broadcast_servers_[0]->service()->setSleepMillis(500); - - scheduler_->stopChecker(); - +// 测试: async read 已超过 deadline,返回超时错误 +TEST_F(P2PConnectorSchedulerTest, AsyncRead_ReturnError_BroadcastTimeout) { auto resource = createValidKVCacheResource(2, 2); - auto meta = createMockMeta(2008, "test_async_read_broadcast_timeout", currentTimeMs() + 50); + auto meta = createMockMeta(2008, "test_async_read_broadcast_timeout", currentTimeMs() - 1); auto result = scheduler_->asyncRead(resource, meta, {0, -1}); - ASSERT_TRUE(result.ok()); - auto async_context = result.context; - ASSERT_NE(async_context, nullptr); - - EXPECT_THROW(waitAsyncContextDone(async_context, 500, /*check_done=*/true), RTPException); + EXPECT_FALSE(result.ok()); + EXPECT_EQ(result.context, nullptr); } // 测试: asyncread prefill 失败, 取消broadcast @@ -509,22 +497,16 @@ TEST_F(P2PConnectorSchedulerTest, AsyncRead_CancelPrefill_WhenBroadcastFailed) { // 服务端可能已经开始处理请求,所以这里不验证取消是否成功 } -// Prefill:worker 极慢导致 gRPC DEADLINE_EXCEEDED 时抛 RTPException(与 BroadcastManager 行为一致) -TEST_F(P2PConnectorSchedulerTest, SendKVCache_ThrowException_WhenBroadcastExceedsDeadline) { - for (auto& server : tp_broadcast_servers_) { - server->service()->setSleepMillis(120000); - server->service()->setP2PResponseSuccess(true); - } - +// Prefill:已超过 deadline 时返回 broadcast 失败错误 +TEST_F(P2PConnectorSchedulerTest, SendKVCache_ReturnError_WhenBroadcastExceedsDeadline) { auto valid_resource = createValidKVCacheResource(2, 2); std::vector> decode_transfer_servers; decode_transfer_servers.push_back({"127.0.0.1", 12345}); - 
const int64_t deadline_ms = currentTimeMs() + 80; - EXPECT_THROW( - scheduler_->sendKVCache( - valid_resource, "test_prefill_broadcast_past_deadline", 4006, decode_transfer_servers, deadline_ms), - RTPException); + const int64_t deadline_ms = currentTimeMs() - 1; + auto error_info = scheduler_->sendKVCache( + valid_resource, "test_prefill_broadcast_past_deadline", 4006, decode_transfer_servers, deadline_ms); + EXPECT_TRUE(error_info.hasError()); } // StartLoad 返回 TRANSFER_NOT_DONE 且 hold_ms>0:checkDone 进入保留窗口,done 仍为 false 且 needCancel 为 false;hold diff --git a/rtp_llm/cpp/cache/connector/p2p/test/P2PConnectorTest.cc b/rtp_llm/cpp/cache/connector/p2p/test/P2PConnectorTest.cc index 252456f600..fb663efe76 100644 --- a/rtp_llm/cpp/cache/connector/p2p/test/P2PConnectorTest.cc +++ b/rtp_llm/cpp/cache/connector/p2p/test/P2PConnectorTest.cc @@ -79,11 +79,11 @@ class P2PConnectorTest: public ::testing::Test { // 创建有效的 KVCacheResource(使用 initGroups + groupBlocks/blocks/cacheKeys 公开 API) KVCacheResourcePtr createValidKVCacheResource(int num_layers = 2, int blocks_per_layer = 2) { auto resource = std::make_shared(); - std::vector layer_to_group(num_layers); + std::vector> layer_group_ids(num_layers); for (int i = 0; i < num_layers; ++i) { - layer_to_group[i] = i; + layer_group_ids[i] = {i}; } - resource->initGroups(num_layers, num_layers, layer_to_group); + resource->initGroups(num_layers, num_layers, layer_group_ids); for (int layer_id = 0; layer_id < num_layers; ++layer_id) { for (int i = 0; i < blocks_per_layer; ++i) { diff --git a/rtp_llm/cpp/cache/connector/p2p/test/P2PConnectorWorkerTest.cc b/rtp_llm/cpp/cache/connector/p2p/test/P2PConnectorWorkerTest.cc index e50d151bac..4e7849df85 100644 --- a/rtp_llm/cpp/cache/connector/p2p/test/P2PConnectorWorkerTest.cc +++ b/rtp_llm/cpp/cache/connector/p2p/test/P2PConnectorWorkerTest.cc @@ -17,7 +17,7 @@ #include "rtp_llm/cpp/cache/connector/p2p/ComputedLayerCacheBuffer.h" #include "rtp_llm/cpp/utils/ErrorCode.h" #include 
"rtp_llm/cpp/utils/TimeUtil.h" -#include "rtp_llm/cpp/cache/KVCacheAllocator.h" +#include "rtp_llm/cpp/cache/allocator/KVCacheAllocator.h" #include "rtp_llm/cpp/cache/BatchKVCacheResource.h" namespace rtp_llm { @@ -279,11 +279,11 @@ class P2PConnectorWorkerTest: public ::testing::Test { KVCacheResourcePtr createKVCacheResource(int layer_id, int num_blocks = 2) { auto resource = std::make_shared(); int layer_num = static_cast(worker_config_.layer_all_num); - std::vector layer_to_group(layer_num); + std::vector> layer_group_ids(layer_num); for (int i = 0; i < layer_num; ++i) { - layer_to_group[i] = i; + layer_group_ids[i] = {i}; } - resource->initGroups(layer_num, layer_num, layer_to_group); + resource->initGroups(layer_num, layer_num, layer_group_ids); for (int i = 0; i < layer_num; ++i) { if (i == layer_id) { @@ -1006,11 +1006,11 @@ class LayerCacheBufferUtilTest: public ::testing::Test { protected: KVCacheResourcePtr createResource(int num_layers, int blocks_per_layer) { auto resource = std::make_shared(); - std::vector layer_to_group(num_layers); + std::vector> layer_group_ids(num_layers); for (int i = 0; i < num_layers; ++i) { - layer_to_group[i] = i; + layer_group_ids[i] = {i}; } - resource->initGroups(num_layers, num_layers, layer_to_group); + resource->initGroups(num_layers, num_layers, layer_group_ids); for (int layer = 0; layer < num_layers; ++layer) { for (int i = 0; i < blocks_per_layer; ++i) { resource->mutableBlockIds(layer).add({i}); diff --git a/rtp_llm/cpp/cache/connector/remote_connector/GroupPolicy.cc b/rtp_llm/cpp/cache/connector/remote_connector/GroupPolicy.cc index a500c2419f..f445f36f9f 100644 --- a/rtp_llm/cpp/cache/connector/remote_connector/GroupPolicy.cc +++ b/rtp_llm/cpp/cache/connector/remote_connector/GroupPolicy.cc @@ -5,7 +5,7 @@ #include "autil/EnvUtil.h" #include "rtp_llm/cpp/cache/connector/remote_connector/GroupPolicy.h" #include "rtp_llm/cpp/cache/Types.h" -#include "rtp_llm/cpp/cache/KVCacheAllocator.h" +#include 
"rtp_llm/cpp/cache/allocator/KVCacheAllocator.h" #include "rtp_llm/cpp/utils/Logger.h" namespace rtp_llm { @@ -70,27 +70,32 @@ bool DefaultLayerGroupPolicy::init() { } const auto layer_layout = allocator_->allLayerCacheBase(); uint64_t group_name_bithash = 1; - const auto& layer_to_groups = layer_layout.layer_to_groups; - for (int layer = 0; layer < static_cast(layer_to_groups.size()); ++layer) { - const int group_idx = layer_to_groups.at(layer); - bool is_full_group = false; - if (full_group_ids_.find(group_idx) != full_group_ids_.end()) { - is_full_group = true; + const auto& layer_group_ids = layer_layout.layer_to_group_ids; + for (int layer = 0; layer < static_cast(layer_group_ids.size()); ++layer) { + if (layer_group_ids.at(layer).empty()) { + RTP_LLM_LOG_ERROR("layer [%d] has no cache group id", layer); + return false; } - if (!is_full_group) { - if (other_group_ids_.find(group_idx) == other_group_ids_.end()) { - RTP_LLM_LOG_ERROR("not find valid group id, [%d]", group_idx); - return false; + for (const int group_idx : layer_group_ids.at(layer)) { + bool is_full_group = false; + if (full_group_ids_.find(group_idx) != full_group_ids_.end()) { + is_full_group = true; } + if (!is_full_group) { + if (other_group_ids_.find(group_idx) == other_group_ids_.end()) { + RTP_LLM_LOG_ERROR("not find valid group id, [%d]", group_idx); + return false; + } + } + if (groups_.count(group_idx) == 0) { + std::string group_name = is_full_group ? ("F" + std::to_string(group_idx)) : + (GetOtherGroupPrefixName() + std::to_string(group_idx)); + groups_[group_idx] = Group{is_full_group, group_name_bithash, group_name}; + group_to_layer_ids_[group_idx] = {}; + group_name_bithash <<= 1; + } + group_to_layer_ids_.at(group_idx).push_back(layer); } - if (groups_.count(group_idx) == 0) { - std::string group_name = is_full_group ? 
("F" + std::to_string(group_idx)) : - (GetOtherGroupPrefixName() + std::to_string(group_idx)); - groups_[group_idx] = Group{is_full_group, group_name_bithash, group_name}; - group_to_layer_ids_[group_idx] = {}; - group_name_bithash <<= 1; - } - group_to_layer_ids_.at(group_idx).push_back(layer); } if (groups_.size() > 64) { RTP_LLM_LOG_ERROR("not support bigger than 64 groups"); @@ -158,17 +163,20 @@ bool DefaultLayerGroupPolicy::genBlockBuffers(const std::vector& gr iovs.reserve(layer_ids.size() * 2); for (size_t j = 0; j < layer_ids.size(); ++j) { // if support scale, block_infos: {kv_info, scale_info} - const auto& block_infos = allocator_->convertIndexToBuffer(layer_ids[j], block_ids[i]); + const auto& block_infos = allocator_->convertIndexToBuffer(layer_ids[j], group_ids[i], block_ids[i]); if (block_infos.empty()) { - RTP_LLM_LOG_WARNING( - "convertIndexToBuffer returned empty for layer_id [%d] block_id[%d]", layer_ids[j], block_ids[i]); + RTP_LLM_LOG_WARNING("convertIndexToBuffer returned empty for layer_id [%d] group_id [%d] block_id[%d]", + layer_ids[j], + group_ids[i], + block_ids[i]); } for (size_t idx = 0; idx < block_infos.size(); ++idx) { CHECK_BLOCK_INFO_VALID( block_infos[idx], - "convertIndexToBuffer failed layer_id [%d] block_id[%d], block_info.addr or block_info.size_bytes is invalid", - j, - i); + "convertIndexToBuffer failed layer_id [%d] group_id [%d] block_id[%d], block_info.addr or block_info.size_bytes is invalid", + layer_ids[j], + group_ids[i], + block_ids[i]); push_iov(iovs, block_infos[idx]); } } diff --git a/rtp_llm/cpp/cache/connector/remote_connector/RemoteConnector.cc b/rtp_llm/cpp/cache/connector/remote_connector/RemoteConnector.cc index 254b3db08e..4aa724449f 100644 --- a/rtp_llm/cpp/cache/connector/remote_connector/RemoteConnector.cc +++ b/rtp_llm/cpp/cache/connector/remote_connector/RemoteConnector.cc @@ -7,7 +7,7 @@ #include "rtp_llm/cpp/utils/AssertUtils.h" #include "rtp_llm/cpp/utils/Logger.h" #include 
"rtp_llm/cpp/utils/TimeUtil.h" -#include "rtp_llm/cpp/cache/KVCacheAllocator.h" +#include "rtp_llm/cpp/cache/allocator/KVCacheAllocator.h" #include "rtp_llm/models_py/bindings/cuda/cuda_host_utils.h" #include "rtp_llm/cpp/metrics/RtpLLMMetrics.h" #include "rtp_llm/cpp/cache/connector/Meta.h" @@ -181,18 +181,20 @@ RemoteConnector::RemoteConnector(const CacheConfig& cache register_buffer_size}; init_params_ = std::make_shared(std::move(init_params)); std::vector full_group_ids, linear_group_ids; - if (cache_config.linear_group_num == 0) { - full_group_ids.push_back(0); + for (int32_t group_id = 0; group_id < cache_config.groupNums(); group_id++) { + if (cache_config.typeForGroup(static_cast(group_id)) == CacheGroupType::FULL) { + full_group_ids.push_back(group_id); + } else { + linear_group_ids.push_back(group_id); + } + } + if (linear_group_ids.empty()) { + if (full_group_ids.empty()) { + full_group_ids.push_back(0); + } group_policy_ = std::make_unique(allocator, full_group_ids, linear_group_ids); } else { - for (int32_t group_id = 0; static_cast(group_id) < cache_config.group_types.size(); group_id++) { - if (cache_config.group_types[group_id] == CacheGroupType::FULL) { - full_group_ids.push_back(group_id); - } else { - linear_group_ids.push_back(group_id); - } - } group_policy_ = std::make_unique( allocator, full_group_ids, linear_group_ids, std::max(1, cache_config.linear_step)); } diff --git a/rtp_llm/cpp/cache/connector/remote_connector/test/BUILD b/rtp_llm/cpp/cache/connector/remote_connector/test/BUILD index 0318ea9af8..35db1b1fde 100644 --- a/rtp_llm/cpp/cache/connector/remote_connector/test/BUILD +++ b/rtp_llm/cpp/cache/connector/remote_connector/test/BUILD @@ -5,6 +5,18 @@ test_copts = [ "-fno-access-control", ] + copts() +cc_import( + name = "cuda13_torch_nvshmem", + shared_library = "@pip_gpu_cuda13_torch_torch//:site-packages/torch/lib/libtorch_nvshmem.so", +) + +cuda13_torch_link_deps = select({ + "@//:using_cuda13_x86": [ + ":cuda13_torch_nvshmem", + 
], + "//conditions:default": [], +}) + test_deps = [ "//rtp_llm/models_py/bindings/cuda/ops:cuda_impl", "//rtp_llm/models_py/bindings/core:exec_ops_test_lib", @@ -13,7 +25,7 @@ test_deps = [ "@com_google_googletest//:gtest_main", "@local_config_cuda//cuda:cuda_headers", "@local_config_cuda//cuda:cudart", -] + torch_deps() +] + torch_deps() + cuda13_torch_link_deps test_copts = [ "-fno-access-control", diff --git a/rtp_llm/cpp/cache/connector/remote_connector/test/GroupPolicyTest.cc b/rtp_llm/cpp/cache/connector/remote_connector/test/GroupPolicyTest.cc index 64ba760189..f5e8aa54ea 100644 --- a/rtp_llm/cpp/cache/connector/remote_connector/test/GroupPolicyTest.cc +++ b/rtp_llm/cpp/cache/connector/remote_connector/test/GroupPolicyTest.cc @@ -2,7 +2,7 @@ #include #include "rtp_llm/cpp/utils/Logger.h" -#include "rtp_llm/cpp/cache/KVCacheAllocator.h" +#include "rtp_llm/cpp/cache/allocator/KVCacheAllocator.h" #include "rtp_llm/cpp/cache/connector/remote_connector/GroupPolicy.h" using namespace rtp_llm; @@ -33,12 +33,12 @@ class FakeKVCacheAllocator: public KVCacheAllocator { KVCacheAllocator(config) { for (int32_t full_group_id : full_group_ids) { for (int i = 0; i < per_group_layer_num; i++) { - fake_layout_.layer_to_groups.push_back(full_group_id); + fake_layout_.layer_to_group_ids.push_back({full_group_id}); } } for (int32_t other_group_id : other_group_ids) { for (int i = 0; i < per_group_layer_num; i++) { - fake_layout_.layer_to_groups.push_back(other_group_id); + fake_layout_.layer_to_group_ids.push_back({other_group_id}); } } } diff --git a/rtp_llm/cpp/cache/connector/remote_connector/test/RemoteConnectorInternalTest.cc b/rtp_llm/cpp/cache/connector/remote_connector/test/RemoteConnectorInternalTest.cc index ac3565361e..bdebbeb8ef 100644 --- a/rtp_llm/cpp/cache/connector/remote_connector/test/RemoteConnectorInternalTest.cc +++ b/rtp_llm/cpp/cache/connector/remote_connector/test/RemoteConnectorInternalTest.cc @@ -1,8 +1,10 @@ #include #include +#include + #include 
"rtp_llm/cpp/cache/connector/remote_connector/RemoteConnector.h" -#include "rtp_llm/cpp/cache/KVCacheAllocator.h" +#include "rtp_llm/cpp/cache/allocator/KVCacheAllocator.h" #include "rtp_llm/cpp/utils/Logger.h" #include "rtp_llm/models_py/bindings/core/ExecOps.h" #include "autil/EnvUtil.h" @@ -29,12 +31,12 @@ class FakeKVCacheAllocator: public KVCacheAllocator { KVCacheAllocator(config) { for (int32_t full_group_id : full_group_ids) { for (int i = 0; i < per_group_layer_num; i++) { - fake_layout_.layer_to_groups.push_back(full_group_id); + fake_layout_.layer_to_group_ids.push_back({full_group_id}); } } for (int32_t other_group_id : other_group_ids) { for (int i = 0; i < per_group_layer_num; i++) { - fake_layout_.layer_to_groups.push_back(other_group_id); + fake_layout_.layer_to_group_ids.push_back({other_group_id}); } } } @@ -128,22 +130,23 @@ class RemoteConnectorInternalTest: public ::testing::Test { void SetUp() override { rtp_llm::initLogger(); auto mha_spec = std::make_shared(); - mha_spec->layer_num = layer_num_; mha_spec->local_head_num_kv = 8; mha_spec->size_per_head = 128; mha_spec->seq_size_per_block = 8; mha_spec->dtype = rtp_llm::DataType::TYPE_FP16; mha_spec->type = KVCacheSpecType::MultiHeadAttention; cache_config_.block_num = 8; - cache_config_.cache_specs.push_back(mha_spec); - byte_size_per_block_ = static_cast(mha_spec->block_size_bytes() * mha_spec->layer_num); + byte_size_per_block_ = static_cast(mha_spec->block_size_bytes() * layer_num_); cache_config_.block_size_bytes = byte_size_per_block_; cache_config_.dtype = rtp_llm::DataType::TYPE_FP16; - cache_config_.group_types.push_back(CacheGroupType::FULL); - cache_config_.group_types.push_back(CacheGroupType::LINEAR); - cache_config_.group_types.push_back(CacheGroupType::LINEAR); - cache_config_.full_group_num = 1; - cache_config_.linear_group_num = 2; + cache_config_.layer_num = layer_num_; + cache_config_.layer_all_num = layer_num_; + std::vector layers(layer_num_); + std::iota(layers.begin(), 
layers.end(), 0); + cache_config_.fromGroupedSpecs({mha_spec, mha_spec, mha_spec}, + {layers, layers, layers}, + {CacheGroupType::FULL, CacheGroupType::LINEAR, CacheGroupType::LINEAR}, + {"F0", "L1", "L2"}); } void TearDown() override {} diff --git a/rtp_llm/cpp/cache/connector/remote_connector/test/RemoteConnectorMockFullLinearTest.cc b/rtp_llm/cpp/cache/connector/remote_connector/test/RemoteConnectorMockFullLinearTest.cc index 4f8d6d18cc..153c8d0a7f 100644 --- a/rtp_llm/cpp/cache/connector/remote_connector/test/RemoteConnectorMockFullLinearTest.cc +++ b/rtp_llm/cpp/cache/connector/remote_connector/test/RemoteConnectorMockFullLinearTest.cc @@ -1,7 +1,7 @@ -#include "rtp_llm/cpp/cache/KVCacheAllocator.h" +#include "rtp_llm/cpp/cache/allocator/KVCacheAllocator.h" #include "rtp_llm/cpp/cache/connector/remote_connector/test/RemoteConnectorMockTestBase.h" #include "rtp_llm/cpp/cache/connector/Meta.h" -#include "rtp_llm/cpp/cache/HybridTypeKVCacheAllocator.h" +#include "rtp_llm/cpp/cache/allocator/HybridTypeKVCacheAllocator.h" #include "rtp_llm/cpp/utils/AssertUtils.h" #include "rtp_llm/cpp/config/StaticConfig.h" @@ -94,16 +94,13 @@ class RemoteConnectorMockFullLinearTest: public RemoteConnectorMockTestBase { } void initHybridLayerCacheConfig(int layer_num = 4, int block_num = 10, int seq_size_per_block = 8) { - cache_config_.linear_group_num = other_group_ids_.size(); - cache_config_.full_group_num = full_group_ids_.size(); - size_t all_group_num = cache_config_.linear_group_num + cache_config_.full_group_num; + size_t all_group_num = other_group_ids_.size() + full_group_ids_.size(); cache_config_.layer_num = all_group_num * layer_num; cache_config_.layer_all_num = all_group_num * layer_num; cache_config_.group_layer_num = layer_num; int unique_layer_id = 0; auto full_spec = std::make_shared(); - full_spec->layer_num = layer_num; full_spec->local_head_num_kv = 8; full_spec->size_per_head = 128; full_spec->seq_size_per_block = seq_size_per_block; @@ -113,7 +110,6 @@ 
class RemoteConnectorMockFullLinearTest: public RemoteConnectorMockTestBase { auto linear_spec = std::make_shared(); linear_spec->type = KVCacheSpecType::LinearAttention; linear_spec->dtype = rtp_llm::DataType::TYPE_FP16; - linear_spec->layer_num = layer_num; linear_spec->local_num_k_heads = 1; linear_spec->local_num_v_heads = 1; linear_spec->head_k_dim = 1; @@ -122,38 +118,37 @@ class RemoteConnectorMockFullLinearTest: public RemoteConnectorMockTestBase { linear_spec->local_head_num_kv = 1; linear_spec->seq_size_per_block = seq_size_per_block; - for (int i = 0; i < cache_config_.full_group_num; i++) { - cache_config_.global_layer_ids.push_back({}); - cache_config_.layer_ids.push_back({}); - cache_config_.group_types.push_back(CacheGroupType::FULL); - cache_config_.cache_specs.push_back(full_spec); - cache_config_.full_groups.push_back({}); + std::vector specs; + std::vector> layers_by_group; + std::vector types; + std::vector tags; + + for (size_t i = 0; i < full_group_ids_.size(); i++) { + specs.push_back(full_spec); + layers_by_group.emplace_back(); + types.push_back(CacheGroupType::FULL); + tags.push_back("full_" + std::to_string(i)); for (int j = 0; j < layer_num; j++) { - cache_config_.layer_to_group_id.push_back(full_group_ids_[i]); - cache_config_.global_layer_ids.back().push_back(unique_layer_id); - cache_config_.layer_ids.back().push_back(unique_layer_id); + layers_by_group.back().push_back(unique_layer_id); unique_layer_id++; } } - for (int i = 0; i < cache_config_.linear_group_num; i++) { - cache_config_.global_layer_ids.push_back({}); - cache_config_.layer_ids.push_back({}); - cache_config_.group_types.push_back(CacheGroupType::LINEAR); - cache_config_.cache_specs.push_back(linear_spec); - cache_config_.linear_groups.push_back({}); + for (size_t i = 0; i < other_group_ids_.size(); i++) { + specs.push_back(linear_spec); + layers_by_group.emplace_back(); + types.push_back(CacheGroupType::LINEAR); + tags.push_back("linear_" + std::to_string(i)); for (int 
j = 0; j < layer_num; j++) { - cache_config_.layer_to_group_id.push_back(other_group_ids_[i]); - cache_config_.global_layer_ids.back().push_back(unique_layer_id); - cache_config_.layer_ids.back().push_back(unique_layer_id); + layers_by_group.back().push_back(unique_layer_id); unique_layer_id++; } } - cache_config_.layer_ids = cache_config_.global_layer_ids; cache_config_.block_num = block_num; cache_config_.seq_size_per_block = seq_size_per_block; cache_config_.dtype = rtp_llm::DataType::TYPE_FP16; + cache_config_.fromGroupedSpecs(specs, layers_by_group, types, tags); const size_t full_kv_block_stride_bytes = full_spec->block_size_bytes(); const size_t linear_kv_block_stride_bytes = linear_spec->block_size_bytes(); @@ -1065,4 +1060,4 @@ TEST_F(RemoteConnectorMockFullLinearTest, test_threadpool_ec) { } } // namespace test -} // namespace rtp_llm \ No newline at end of file +} // namespace rtp_llm diff --git a/rtp_llm/cpp/cache/connector/remote_connector/test/RemoteConnectorMockOnlyFullTest.cc b/rtp_llm/cpp/cache/connector/remote_connector/test/RemoteConnectorMockOnlyFullTest.cc index 742b364d03..bbd7f11907 100644 --- a/rtp_llm/cpp/cache/connector/remote_connector/test/RemoteConnectorMockOnlyFullTest.cc +++ b/rtp_llm/cpp/cache/connector/remote_connector/test/RemoteConnectorMockOnlyFullTest.cc @@ -1,4 +1,4 @@ -#include "rtp_llm/cpp/cache/SingleTypeKVCacheAllocator.h" +#include "rtp_llm/cpp/cache/allocator/SingleTypeKVCacheAllocator.h" #include "rtp_llm/cpp/cache/connector/Meta.h" #include "rtp_llm/cpp/cache/connector/remote_connector/test/RemoteConnectorMockTestBase.h" @@ -98,15 +98,12 @@ class RemoteConnectorMockOnlyFullTest: public RemoteConnectorMockTestBase { cache_config_.seq_size_per_block = seq_size_per_block; auto mha_spec = std::make_shared(); - mha_spec->layer_num = layer_num; mha_spec->local_head_num_kv = 8; mha_spec->size_per_head = 128; mha_spec->seq_size_per_block = seq_size_per_block; mha_spec->dtype = rtp_llm::DataType::TYPE_FP16; mha_spec->type = 
KVCacheSpecType::MultiHeadAttention; cache_config_.dtype = rtp_llm::DataType::TYPE_FP16; - cache_config_.cache_specs.push_back(mha_spec); - ; cache_config_.kv_block_stride_bytes = mha_spec->block_size_bytes(); // one-layer KV bytes for one logical block cache_config_.kv_scale_stride_bytes = 0; cache_config_.kv_block_size_bytes = static_cast(layer_num) * cache_config_.kv_block_stride_bytes; @@ -116,8 +113,7 @@ class RemoteConnectorMockOnlyFullTest: public RemoteConnectorMockTestBase { for (int i = 0; i < layer_num; ++i) { layer_ids[i] = i; } - cache_config_.layer_ids.push_back(layer_ids); - cache_config_.global_layer_ids.push_back(layer_ids); + cache_config_.fromGroupedSpecs({mha_spec}, {layer_ids}, {CacheGroupType::FULL}, {"default"}); } }; @@ -149,7 +145,7 @@ TEST_F(RemoteConnectorMockOnlyFullTest, test_async_match_and_async_read_with_gpu { // 没有其他connector UriStrVec expected_uris = genUris({1, 2, 3}); - BlockBuffersExpect block_buffers_expect = {3, kFakeLayerNum, cache_config_.cache_specs[0]->block_size_bytes()}; + BlockBuffersExpect block_buffers_expect = {3, kFakeLayerNum, cache_config_.specForGroup(0)->block_size_bytes()}; std::vector expect_block_ids({"1", "2", "3"}); EXPECT_CALL(*transfer_client_, LoadKvCaches(Eq(expected_uris), @@ -175,7 +171,7 @@ TEST_F(RemoteConnectorMockOnlyFullTest, test_async_match_and_async_read_with_gpu { // 其他connector也命中了部分 UriStrVec expected_uris = genUris({2, 3}); - BlockBuffersExpect block_buffers_expect = {2, kFakeLayerNum, cache_config_.cache_specs[0]->block_size_bytes()}; + BlockBuffersExpect block_buffers_expect = {2, kFakeLayerNum, cache_config_.specForGroup(0)->block_size_bytes()}; std::vector expect_block_ids({"2", "3"}); EXPECT_CALL(*transfer_client_, LoadKvCaches(Eq(expected_uris), @@ -240,7 +236,7 @@ TEST_F(RemoteConnectorMockOnlyFullTest, test_async_match_and_async_read_with_gpu { // 没有其他connector UriStrVec expected_uris = genUris({2, 3}); - BlockBuffersExpect block_buffers_expect = {2, kFakeLayerNum, 
cache_config_.cache_specs[0]->block_size_bytes()}; + BlockBuffersExpect block_buffers_expect = {2, kFakeLayerNum, cache_config_.specForGroup(0)->block_size_bytes()}; std::vector expect_block_ids({"2", "3"}); EXPECT_CALL(*transfer_client_, LoadKvCaches(Eq(expected_uris), @@ -263,7 +259,7 @@ TEST_F(RemoteConnectorMockOnlyFullTest, test_async_match_and_async_read_with_gpu { // 有其他connector UriStrVec expected_uris = genUris({3}); - BlockBuffersExpect block_buffers_expect = {1, kFakeLayerNum, cache_config_.cache_specs[0]->block_size_bytes()}; + BlockBuffersExpect block_buffers_expect = {1, kFakeLayerNum, cache_config_.specForGroup(0)->block_size_bytes()}; std::vector expect_block_ids({"3"}); EXPECT_CALL(*transfer_client_, LoadKvCaches(Eq(expected_uris), @@ -324,7 +320,7 @@ TEST_F(RemoteConnectorMockOnlyFullTest, test_write_success_broadcast_success_act UriStrVec expected_uris = genUris({1, 2, 3}); UriStrVec actual_uris = genUris({1, 2, 3}, {}, "actual_"); - BlockBuffersExpect block_buffers_expect = {3, kFakeLayerNum, cache_config_.cache_specs[0]->block_size_bytes()}; + BlockBuffersExpect block_buffers_expect = {3, kFakeLayerNum, cache_config_.specForGroup(0)->block_size_bytes()}; std::vector expect_block_ids({"1", "2", "3"}); EXPECT_CALL(*transfer_client_, SaveKvCaches(Eq(expected_uris), @@ -370,7 +366,7 @@ TEST_F(RemoteConnectorMockOnlyFullTest, UriStrVec expected_uris = genUris({2, 3}); UriStrVec actual_uris = genUris({2, 3}, {}, "actual_"); - BlockBuffersExpect block_buffers_expect = {2, kFakeLayerNum, cache_config_.cache_specs[0]->block_size_bytes()}; + BlockBuffersExpect block_buffers_expect = {2, kFakeLayerNum, cache_config_.specForGroup(0)->block_size_bytes()}; std::vector expect_block_ids({"2", "3"}); EXPECT_CALL(*transfer_client_, SaveKvCaches(Eq(expected_uris), @@ -417,7 +413,7 @@ TEST_F(RemoteConnectorMockOnlyFullTest, UriStrVec expected_uris = genUris({2, 4}); UriStrVec actual_uris = genUris({2, 4}, {}, "actual_"); - BlockBuffersExpect block_buffers_expect = 
{2, kFakeLayerNum, cache_config_.cache_specs[0]->block_size_bytes()}; + BlockBuffersExpect block_buffers_expect = {2, kFakeLayerNum, cache_config_.specForGroup(0)->block_size_bytes()}; std::vector expect_block_ids({"2", "4"}); EXPECT_CALL(*transfer_client_, SaveKvCaches(Eq(expected_uris), @@ -489,7 +485,7 @@ TEST_F(RemoteConnectorMockOnlyFullTest, test_write_success_broadcast_success_act UriStrVec expected_uris = genUris({1, 2, 3}); - BlockBuffersExpect block_buffers_expect = {3, kFakeLayerNum, cache_config_.cache_specs[0]->block_size_bytes()}; + BlockBuffersExpect block_buffers_expect = {3, kFakeLayerNum, cache_config_.specForGroup(0)->block_size_bytes()}; std::vector expect_block_ids({"1", "2", "3"}); EXPECT_CALL(*transfer_client_, SaveKvCaches(Eq(expected_uris), @@ -511,4 +507,4 @@ TEST_F(RemoteConnectorMockOnlyFullTest, test_write_success_broadcast_success_act } } // namespace test -} // namespace rtp_llm \ No newline at end of file +} // namespace rtp_llm diff --git a/rtp_llm/cpp/cache/connector/test/BUILD b/rtp_llm/cpp/cache/connector/test/BUILD index b2aa5d8535..0bfdba38f9 100644 --- a/rtp_llm/cpp/cache/connector/test/BUILD +++ b/rtp_llm/cpp/cache/connector/test/BUILD @@ -7,6 +7,18 @@ test_copts = [ "-fno-access-control", ] + copts() +cc_import( + name = "cuda13_torch_nvshmem", + shared_library = "@pip_gpu_cuda13_torch_torch//:site-packages/torch/lib/libtorch_nvshmem.so", +) + +cuda13_torch_link_deps = select({ + "@//:using_cuda13_x86": [ + ":cuda13_torch_nvshmem", + ], + "//conditions:default": [], +}) + test_deps = [ "//rtp_llm/models_py/bindings/cuda/ops:cuda_impl", "//rtp_llm/cpp/config:config_modules", @@ -16,7 +28,7 @@ test_deps = [ "@com_google_googletest//:gtest_main", "@local_config_cuda//cuda:cuda_headers", "@local_config_cuda//cuda:cudart", -] + torch_deps() +] + torch_deps() + cuda13_torch_link_deps cc_test( name = "coordinator_test", diff --git a/rtp_llm/cpp/cache/connector/test/KVCacheConnectorCoordinatorTest.cc 
b/rtp_llm/cpp/cache/connector/test/KVCacheConnectorCoordinatorTest.cc index 60094c47e2..962c976d43 100644 --- a/rtp_llm/cpp/cache/connector/test/KVCacheConnectorCoordinatorTest.cc +++ b/rtp_llm/cpp/cache/connector/test/KVCacheConnectorCoordinatorTest.cc @@ -1,8 +1,13 @@ #include #include +#include +#include + #include "rtp_llm/cpp/cache/BlockPool.h" #include "rtp_llm/cpp/cache/BlockPoolConfigHelper.h" +#include "rtp_llm/cpp/cache/spec/MHAKVCacheSpec.h" +#include "rtp_llm/cpp/cache/spec/OpaqueKVCacheSpec.h" #include "rtp_llm/cpp/cache/connector/KVCacheConnectorCoordinator.h" #include "rtp_llm/cpp/cache/connector/memory/test/mock/MockKVCacheMemoryConnector.h" #include "rtp_llm/cpp/cache/connector/test/mock/MockAsyncContext.h" @@ -14,12 +19,37 @@ #include "rtp_llm/models_py/bindings/core/ExecOps.h" #include "rtp_llm/cpp/config/ModelConfig.h" #include "rtp_llm/cpp/config/EplbConfig.h" +#include "rtp_llm/cpp/config/StaticConfig.h" namespace rtp_llm { namespace test { namespace { +void initSingleGroupConfig(CacheConfig& config) { + auto spec = std::make_shared(); + spec->type = KVCacheSpecType::MultiHeadAttention; + spec->dtype = config.dtype; + spec->local_head_num_kv = 1; + spec->size_per_head = 1; + spec->seq_size_per_block = static_cast(std::max(1, config.seq_size_per_block)); + std::vector layers(static_cast(config.layer_num)); + std::iota(layers.begin(), layers.end(), 0); + config.fromGroupedSpecs({spec}, {layers}, {CacheGroupType::FULL}, {"default"}); +} + +void initTwoGroupCpConfig(CacheConfig& config) { + auto full_spec = std::make_shared(); + full_spec->type = KVCacheSpecType::MultiHeadAttention; + full_spec->dtype = config.dtype; + full_spec->local_head_num_kv = 1; + full_spec->size_per_head = 1; + full_spec->seq_size_per_block = static_cast(std::max(1, config.seq_size_per_block)); + + auto swa_spec = std::make_shared("swa", 1, 1, config.dtype, full_spec->seq_size_per_block); + config.fromGroupedSpecs({full_spec, swa_spec}, {{0}, {1}}, {CacheGroupType::FULL, 
CacheGroupType::SWA}, {"full", "swa"}); +} + class TestMeta final: public Meta { public: explicit TestMeta(bool enable_memory_cache, bool enable_remote_cache, std::string trace_id): @@ -55,6 +85,8 @@ class TestMeta final: public Meta { class KVCacheConnectorCoordinatorTest: public ::testing::Test { protected: void SetUp() override { + old_core_dump_on_exception_ = StaticConfig::user_ft_core_dump_on_exception; + StaticConfig::user_ft_core_dump_on_exception = false; rtp_llm::initLogger(); cache_config_.layer_num = 1; @@ -62,7 +94,7 @@ class KVCacheConnectorCoordinatorTest: public ::testing::Test { cache_config_.block_num = 10; cache_config_.block_size_bytes = 1024; cache_config_.dtype = rtp_llm::TYPE_FP16; - cache_config_.layer_to_group_id.assign(static_cast(cache_config_.layer_all_num), 0); + initSingleGroupConfig(cache_config_); kv_cache_config_.memory_cache_size_mb = 100; kv_cache_config_.memory_cache_sync_timeout_ms = 1000; @@ -73,7 +105,7 @@ class KVCacheConnectorCoordinatorTest: public ::testing::Test { // Those methods assume allocator_->block_pool_ is non-null. In UT we use a mock allocator, so set a // minimal BlockPool here to avoid crashes/hangs in tests that exercise coordinator paths. { - // NOTE: use the 4-arg overload to avoid requiring cache_config_.cache_specs in unit tests. + // NOTE: use the 4-arg overload because the mock allocator only needs physical block layout here. const size_t block_stride_bytes = cache_config_.block_size_bytes / static_cast(std::max(1u, cache_config_.layer_all_num)); auto pool_config = BlockPoolConfigHelper::createConfig( @@ -108,6 +140,7 @@ class KVCacheConnectorCoordinatorTest: public ::testing::Test { } void TearDown() override { + StaticConfig::user_ft_core_dump_on_exception = old_core_dump_on_exception_; if (coordinator_) { // Ensure all internal contexts/connectors are released before gmock leak checker runs at program exit. 
coordinator_->stop_.store(true); @@ -195,6 +228,7 @@ class KVCacheConnectorCoordinatorTest: public ::testing::Test { std::shared_ptr allocator_; std::shared_ptr coordinator_; + bool old_core_dump_on_exception_{false}; }; TEST_F(KVCacheConnectorCoordinatorTest, Init_ReturnFalse_WhenMemoryConfigInvalid) { @@ -205,7 +239,7 @@ TEST_F(KVCacheConnectorCoordinatorTest, Init_ReturnFalse_WhenMemoryConfigInvalid cache_config.layer_all_num = 1; cache_config.block_num = 1; cache_config.block_size_bytes = 1; - cache_config.layer_to_group_id.assign(static_cast(cache_config.layer_all_num), 0); + initSingleGroupConfig(cache_config); kv_cache_config.enable_memory_cache = true; kv_cache_config.reuse_cache = true; // coordinator init only enables memory connector when reuse_cache is true @@ -227,7 +261,7 @@ TEST_F(KVCacheConnectorCoordinatorTest, Init_ReturnTrue_WhenMemorySkipped_AndSto cache_config.layer_all_num = 1; cache_config.block_num = 1; cache_config.block_size_bytes = 1; - cache_config.layer_to_group_id.assign(static_cast(cache_config.layer_all_num), 0); + initSingleGroupConfig(cache_config); kv_cache_config.enable_memory_cache = false; // skip memory connector in init @@ -248,7 +282,7 @@ TEST_F(KVCacheConnectorCoordinatorTest, Init_ReturnFalse_WhenMemoryEnabledButSiz cache_config.layer_all_num = 1; cache_config.block_num = 1; cache_config.block_size_bytes = 1; - cache_config.layer_to_group_id.assign(static_cast(cache_config.layer_all_num), 0); + initSingleGroupConfig(cache_config); kv_cache_config.enable_memory_cache = true; kv_cache_config.reuse_cache = true; @@ -274,7 +308,7 @@ TEST_F(KVCacheConnectorCoordinatorTest, Init_ReturnTrue_WhenMemoryEnabled_HappyP // Keep block size reasonably large so block_num doesn't explode in createBlockPool(). 
cache_config.block_size_bytes = 1024; cache_config.dtype = rtp_llm::TYPE_FP16; - cache_config.layer_to_group_id.assign(static_cast(cache_config.layer_all_num), 0); + initSingleGroupConfig(cache_config); // Memory connector requires per-layer block stride bytes. cache_config.layer_to_block_stride_bytes.assign(static_cast(cache_config.layer_num), cache_config.block_size_bytes); @@ -314,7 +348,7 @@ TEST_F(KVCacheConnectorCoordinatorTest, AsyncRead_ReturnNull_WhenStop) { cache_config.layer_all_num = 1; cache_config.block_num = 1; cache_config.block_size_bytes = 1; - cache_config.layer_to_group_id.assign(static_cast(cache_config.layer_all_num), 0); + initSingleGroupConfig(cache_config); auto allocator = std::make_shared>(cache_config); auto coordinator = std::make_shared(cache_config, @@ -345,7 +379,7 @@ TEST_F(KVCacheConnectorCoordinatorTest, AsyncRead_ReturnNull_WhenCacheKeysEmpty) coordinator_->allocator_ = allocator_; KVCacheResource resource; - resource.initGroups(1, cache_config_.layer_all_num, cache_config_.layer_to_group_id); + resource.initGroups(1, cache_config_.layer_all_num, cache_config_.layerGroupIdsSnapshot()); // leave cacheKeys empty to hit the early return auto rw_ctx = std::make_shared>(); std::shared_ptr meta = @@ -380,7 +414,7 @@ TEST_F(KVCacheConnectorCoordinatorTest, AsyncRead_ReturnNull_WhenIncrKVCacheRefR } KVCacheResource resource; - resource.initGroups(1, cache_config_.layer_all_num, cache_config_.layer_to_group_id); + resource.initGroups(1, cache_config_.layer_all_num, cache_config_.layerGroupIdsSnapshot()); resource.cacheKeys() = CacheKeysType{1, 2, 3}; auto rw_ctx = std::make_shared>(); @@ -410,7 +444,7 @@ TEST_F(KVCacheConnectorCoordinatorTest, AsyncRead_ReturnNull_WhenNoMatchContexts // and will be processed/cleaned up by the coordinator update loop if enabled. // Use a plain shared_ptr here to avoid custom-deleter side effects in this no-connector path. 
auto resource = std::make_shared(); - resource->initGroups(1, cache_config_.layer_all_num, cache_config_.layer_to_group_id); + resource->initGroups(1, cache_config_.layer_all_num, cache_config_.layerGroupIdsSnapshot()); // Don't let gmock keep a ref to `resource` until program exit. // gmock actions are stored as const; use a shared holder to release the ref after first call. auto resource_holder = std::make_shared>(resource); @@ -491,7 +525,7 @@ TEST_F(KVCacheConnectorCoordinatorTest, AsyncWrite_ReturnNull_WhenStop) { cache_config.layer_all_num = 1; cache_config.block_num = 1; cache_config.block_size_bytes = 1; - cache_config.layer_to_group_id.assign(static_cast(cache_config.layer_all_num), 0); + initSingleGroupConfig(cache_config); auto allocator = std::make_shared>(cache_config); auto coordinator = std::make_shared(cache_config, @@ -522,7 +556,7 @@ TEST_F(KVCacheConnectorCoordinatorTest, AsyncWrite_ReturnNull_WhenCacheKeysEmpty coordinator_->allocator_ = allocator_; KVCacheResource resource; - resource.initGroups(1, cache_config_.layer_all_num, cache_config_.layer_to_group_id); + resource.initGroups(1, cache_config_.layer_all_num, cache_config_.layerGroupIdsSnapshot()); // leave cacheKeys empty auto rw_ctx = std::make_shared>(); std::shared_ptr meta = @@ -537,6 +571,319 @@ TEST_F(KVCacheConnectorCoordinatorTest, AsyncWrite_ReturnNull_WhenCacheKeysEmpty EXPECT_EQ(ctx, nullptr); } +TEST_F(KVCacheConnectorCoordinatorTest, AsyncWrite_CPShardedKeepsNonFullGroupsInLogicalCoordinates) { + CacheConfig cp_cache_config = cache_config_; + cp_cache_config.layer_num = 2; + cp_cache_config.layer_all_num = 2; + initTwoGroupCpConfig(cp_cache_config); + + ParallelismConfig parallelism_config; + parallelism_config.tp_size = 2; + parallelism_config.prefill_cp_config.kv_cache_sharded = true; + + auto coordinator = std::make_shared(cp_cache_config, + kv_cache_config_, + runtime_config_, + parallelism_config, + SpeculativeExecutionConfig{}, + allocator_); + 
coordinator->connectors_.clear(); + + KVCacheResource resource; + resource.initGroups(/*group_num=*/2, + /*layer_num=*/static_cast(cp_cache_config.layer_all_num), + cp_cache_config.layerGroupIdsSnapshot(), + cp_cache_config.kernelBlocksPerKvBlock(), + cp_cache_config.groupTypesSnapshot()); + resource.cacheKeys() = CacheKeysType{10, 11, 12, 13}; + resource.setLastBlockAligned(false); + resource.mutableBlockIds(/*gid=*/0).assign(BlockIndicesType{100, 101}); // FULL: compact local blocks + resource.mutableBlockIds(/*gid=*/1).assign(BlockIndicesType{200, 201, 202, 203}); // SWA: full logical slots + + EXPECT_CALL(*allocator_, incrKVCacheRef(testing::_, testing::_, testing::Eq(true))) + .WillOnce( + testing::Invoke([](const KVCacheResource& ref_resource, const CacheKeysType& ref_keys, bool is_connector) { + (void)is_connector; + EXPECT_THAT(ref_keys, testing::ElementsAre(11, 13)); + EXPECT_THAT(ref_resource.cacheKeys(), testing::ElementsAre(11, 13)); + EXPECT_FALSE(ref_resource.lastBlockAligned()); + EXPECT_THAT(ref_resource.blocks(/*gid=*/0), testing::ElementsAre(100, 101)); + EXPECT_THAT(ref_resource.blocks(/*gid=*/1), testing::ElementsAre(201, 203)); + return std::make_shared(); + })); + + auto rw_ctx = std::make_shared>(); + ON_CALL(*rw_ctx, kvCacheResource()).WillByDefault(testing::ReturnRef(resource)); + std::shared_ptr meta = + std::make_shared(/*enable_memory_cache=*/true, /*enable_remote_cache=*/false, ""); + ON_CALL(*rw_ctx, meta()).WillByDefault(testing::ReturnRef(meta)); + + auto async_ctx = coordinator->asyncWrite(rw_ctx); + ASSERT_NE(async_ctx, nullptr); + + { + std::lock_guard lock(coordinator->update_mutex_); + coordinator->fused_async_write_context_list_.clear(); + } + async_ctx.reset(); + coordinator.reset(); +} + +TEST_F(KVCacheConnectorCoordinatorTest, AsyncWrite_CPShardedSkipsRemapForCanonicalEvictedResource) { + CacheConfig cp_cache_config = cache_config_; + cp_cache_config.layer_num = 2; + cp_cache_config.layer_all_num = 2; + 
initTwoGroupCpConfig(cp_cache_config); + + ParallelismConfig parallelism_config; + parallelism_config.tp_size = 2; + parallelism_config.prefill_cp_config.kv_cache_sharded = true; + + auto coordinator = std::make_shared(cp_cache_config, + kv_cache_config_, + runtime_config_, + parallelism_config, + SpeculativeExecutionConfig{}, + allocator_); + coordinator->connectors_.clear(); + + KVCacheResource resource; + resource.initGroups(/*group_num=*/2, + /*layer_num=*/static_cast(cp_cache_config.layer_all_num), + cp_cache_config.layerGroupIdsSnapshot(), + cp_cache_config.kernelBlocksPerKvBlock(), + cp_cache_config.groupTypesSnapshot()); + resource.setCacheKeys(CacheKeysType{11, 13}); + resource.setCacheKeysAreCpCanonical(true); + BlockDependency root_dep; + root_dep.ordinal = 0; + BlockDependency child_dep; + child_dep.has_parent = true; + child_dep.parent_key = 11; + child_dep.ordinal = 1; + resource.setBlockDependencies(BlockDependenciesType{root_dep, child_dep}); + resource.setLastBlockAligned(true); + resource.mutableBlockIds(/*gid=*/0).assign(BlockIndicesType{100, 101}); + resource.mutableBlockIds(/*gid=*/1).assign(BlockIndicesType{201, 203}); + + EXPECT_CALL(*allocator_, incrKVCacheRef(testing::_, testing::_, testing::Eq(true))) + .WillOnce( + testing::Invoke([](const KVCacheResource& ref_resource, const CacheKeysType& ref_keys, bool is_connector) { + (void)is_connector; + EXPECT_THAT(ref_keys, testing::ElementsAre(11, 13)); + EXPECT_THAT(ref_resource.cacheKeys(), testing::ElementsAre(11, 13)); + EXPECT_TRUE(ref_resource.cacheKeysAreCpCanonical()); + EXPECT_EQ(ref_resource.blockDependencies().size(), 2u); + if (ref_resource.blockDependencies().size() == 2u) { + EXPECT_FALSE(ref_resource.blockDependencies()[0].has_parent); + EXPECT_EQ(ref_resource.blockDependencies()[0].ordinal, 0u); + EXPECT_TRUE(ref_resource.blockDependencies()[1].has_parent); + EXPECT_EQ(ref_resource.blockDependencies()[1].parent_key, 11); + EXPECT_EQ(ref_resource.blockDependencies()[1].ordinal, 
1u); + } + EXPECT_THAT(ref_resource.blocks(/*gid=*/0), testing::ElementsAre(100, 101)); + EXPECT_THAT(ref_resource.blocks(/*gid=*/1), testing::ElementsAre(201, 203)); + return std::make_shared(); + })); + + auto rw_ctx = std::make_shared>(); + ON_CALL(*rw_ctx, kvCacheResource()).WillByDefault(testing::ReturnRef(resource)); + std::shared_ptr meta = + std::make_shared(/*enable_memory_cache=*/true, /*enable_remote_cache=*/false, ""); + ON_CALL(*rw_ctx, meta()).WillByDefault(testing::ReturnRef(meta)); + + auto async_ctx = coordinator->asyncWrite(rw_ctx); + ASSERT_NE(async_ctx, nullptr); + + { + std::lock_guard lock(coordinator->update_mutex_); + coordinator->fused_async_write_context_list_.clear(); + } + async_ctx.reset(); + coordinator.reset(); +} + +TEST_F(KVCacheConnectorCoordinatorTest, AsyncWrite_CPShardedKeepsCompactFixedGroupsInCanonicalCoordinates) { + CacheConfig cp_cache_config = cache_config_; + cp_cache_config.layer_num = 2; + cp_cache_config.layer_all_num = 2; + cp_cache_config.seq_size_per_block = 128; + initTwoGroupCpConfig(cp_cache_config); + cp_cache_config.group_seq_size_per_block = {128, 256}; + + ParallelismConfig parallelism_config; + parallelism_config.tp_size = 2; + parallelism_config.prefill_cp_config.kv_cache_sharded = true; + + auto coordinator = std::make_shared(cp_cache_config, + kv_cache_config_, + runtime_config_, + parallelism_config, + SpeculativeExecutionConfig{}, + allocator_); + coordinator->connectors_.clear(); + + KVCacheResource resource; + resource.initGroups(/*group_num=*/2, + /*layer_num=*/static_cast(cp_cache_config.layer_all_num), + cp_cache_config.layerGroupIdsSnapshot(), + cp_cache_config.kernelBlocksPerKvBlock(), + cp_cache_config.groupTypesSnapshot()); + resource.cacheKeys() = CacheKeysType{10, 11, 12, 13}; + resource.setLastBlockAligned(false); + resource.mutableBlockIds(/*gid=*/0).assign(BlockIndicesType{100, 101}); + resource.mutableBlockIds(/*gid=*/1).assign(BlockIndicesType{200, 201}); + + EXPECT_CALL(*allocator_, 
incrKVCacheRef(testing::_, testing::_, testing::Eq(true))) + .WillOnce( + testing::Invoke([](const KVCacheResource& ref_resource, const CacheKeysType& ref_keys, bool is_connector) { + (void)is_connector; + EXPECT_THAT(ref_keys, testing::ElementsAre(11, 13)); + EXPECT_THAT(ref_resource.cacheKeys(), testing::ElementsAre(11, 13)); + EXPECT_FALSE(ref_resource.lastBlockAligned()); + EXPECT_THAT(ref_resource.blocks(/*gid=*/0), testing::ElementsAre(100, 101)); + EXPECT_THAT(ref_resource.blocks(/*gid=*/1), testing::ElementsAre(200, 201)); + return std::make_shared(); + })); + + auto rw_ctx = std::make_shared>(); + ON_CALL(*rw_ctx, kvCacheResource()).WillByDefault(testing::ReturnRef(resource)); + std::shared_ptr meta = + std::make_shared(/*enable_memory_cache=*/true, /*enable_remote_cache=*/false, ""); + ON_CALL(*rw_ctx, meta()).WillByDefault(testing::ReturnRef(meta)); + + auto async_ctx = coordinator->asyncWrite(rw_ctx); + ASSERT_NE(async_ctx, nullptr); + + { + std::lock_guard lock(coordinator->update_mutex_); + coordinator->fused_async_write_context_list_.clear(); + } + async_ctx.reset(); + coordinator.reset(); +} + +TEST_F(KVCacheConnectorCoordinatorTest, AsyncWrite_DecodePrefillCpRemapsFullAndCompactFixedGroups) { + CacheConfig cp_cache_config = cache_config_; + cp_cache_config.layer_num = 2; + cp_cache_config.layer_all_num = 2; + cp_cache_config.seq_size_per_block = 128; + initTwoGroupCpConfig(cp_cache_config); + cp_cache_config.group_seq_size_per_block = {128, 256}; + + ParallelismConfig parallelism_config; + parallelism_config.role_type = RoleType::DECODE; + parallelism_config.tp_size = 1; + parallelism_config.prefill_cp_config.method = CPRotateMethod::PREFILL_CP; + parallelism_config.prefill_cp_config.kv_cache_sharded = true; + parallelism_config.prefill_cp_config.prefill_cp_size = 2; + + auto coordinator = std::make_shared(cp_cache_config, + kv_cache_config_, + runtime_config_, + parallelism_config, + SpeculativeExecutionConfig{}, + allocator_); + 
coordinator->connectors_.clear(); + + KVCacheResource resource; + resource.initGroups(/*group_num=*/2, + /*layer_num=*/static_cast(cp_cache_config.layer_all_num), + cp_cache_config.layerGroupIdsSnapshot(), + cp_cache_config.kernelBlocksPerKvBlock(), + cp_cache_config.groupTypesSnapshot()); + resource.cacheKeys() = CacheKeysType{10, 11, 12, 13, 14}; + resource.setLastBlockAligned(false); + resource.mutableBlockIds(/*gid=*/0).assign(BlockIndicesType{100, 101, 102, 103, 104}); + resource.mutableBlockIds(/*gid=*/1).assign(BlockIndicesType{200, 201, 202}); + + EXPECT_CALL(*allocator_, incrKVCacheRef(testing::_, testing::_, testing::Eq(true))) + .WillOnce( + testing::Invoke([](const KVCacheResource& ref_resource, const CacheKeysType& ref_keys, bool is_connector) { + (void)is_connector; + EXPECT_THAT(ref_keys, testing::ElementsAre(11, 13, 14)); + EXPECT_THAT(ref_resource.cacheKeys(), testing::ElementsAre(11, 13, 14)); + EXPECT_FALSE(ref_resource.lastBlockAligned()); + EXPECT_THAT(ref_resource.blocks(/*gid=*/0), testing::ElementsAre(101, 103)); + EXPECT_THAT(ref_resource.blocks(/*gid=*/1), testing::ElementsAre(200, 201)); + return std::make_shared(); + })); + + auto rw_ctx = std::make_shared>(); + ON_CALL(*rw_ctx, kvCacheResource()).WillByDefault(testing::ReturnRef(resource)); + std::shared_ptr meta = + std::make_shared(/*enable_memory_cache=*/true, /*enable_remote_cache=*/false, ""); + ON_CALL(*rw_ctx, meta()).WillByDefault(testing::ReturnRef(meta)); + + auto async_ctx = coordinator->asyncWrite(rw_ctx); + ASSERT_NE(async_ctx, nullptr); + + { + std::lock_guard lock(coordinator->update_mutex_); + coordinator->fused_async_write_context_list_.clear(); + } + async_ctx.reset(); + coordinator.reset(); +} + +TEST_F(KVCacheConnectorCoordinatorTest, AsyncWrite_CPShardedAppendsDummyTailWhenPartialIsNotLastRank) { + CacheConfig cp_cache_config = cache_config_; + cp_cache_config.layer_num = 2; + cp_cache_config.layer_all_num = 2; + initTwoGroupCpConfig(cp_cache_config); + + 
ParallelismConfig parallelism_config; + parallelism_config.tp_size = 2; + parallelism_config.prefill_cp_config.kv_cache_sharded = true; + + auto coordinator = std::make_shared(cp_cache_config, + kv_cache_config_, + runtime_config_, + parallelism_config, + SpeculativeExecutionConfig{}, + allocator_); + coordinator->connectors_.clear(); + + KVCacheResource resource; + resource.initGroups(/*group_num=*/2, + /*layer_num=*/static_cast(cp_cache_config.layer_all_num), + cp_cache_config.layerGroupIdsSnapshot(), + cp_cache_config.kernelBlocksPerKvBlock(), + cp_cache_config.groupTypesSnapshot()); + resource.cacheKeys() = CacheKeysType{10, 11, 12, 13, 14}; + resource.setLastBlockAligned(false); + resource.mutableBlockIds(/*gid=*/0).assign(BlockIndicesType{100, 101, 102}); + resource.mutableBlockIds(/*gid=*/1).assign(BlockIndicesType{200, 201, 202, 203, 204}); + + EXPECT_CALL(*allocator_, incrKVCacheRef(testing::_, testing::_, testing::Eq(true))) + .WillOnce( + testing::Invoke([](const KVCacheResource& ref_resource, const CacheKeysType& ref_keys, bool is_connector) { + (void)is_connector; + EXPECT_THAT(ref_keys, testing::ElementsAre(11, 13, 14)); + EXPECT_THAT(ref_resource.cacheKeys(), testing::ElementsAre(11, 13, 14)); + EXPECT_FALSE(ref_resource.lastBlockAligned()); + EXPECT_THAT(ref_resource.blocks(/*gid=*/0), testing::ElementsAre(100, 101)); + EXPECT_THAT(ref_resource.blocks(/*gid=*/1), testing::ElementsAre(201, 203)); + return std::make_shared(); + })); + + auto rw_ctx = std::make_shared>(); + ON_CALL(*rw_ctx, kvCacheResource()).WillByDefault(testing::ReturnRef(resource)); + std::shared_ptr meta = + std::make_shared(/*enable_memory_cache=*/true, /*enable_remote_cache=*/false, ""); + ON_CALL(*rw_ctx, meta()).WillByDefault(testing::ReturnRef(meta)); + + auto async_ctx = coordinator->asyncWrite(rw_ctx); + ASSERT_NE(async_ctx, nullptr); + + { + std::lock_guard lock(coordinator->update_mutex_); + coordinator->fused_async_write_context_list_.clear(); + } + async_ctx.reset(); + 
coordinator.reset(); +} + TEST_F(KVCacheConnectorCoordinatorTest, AsyncWrite_ReturnNull_WhenIncrKVCacheRefReturnsNull) { auto mock_connector = std::make_shared(); coordinator_->connectors_ = {mock_connector}; @@ -544,7 +891,7 @@ TEST_F(KVCacheConnectorCoordinatorTest, AsyncWrite_ReturnNull_WhenIncrKVCacheRef // Build a connector context with non-empty cache keys. auto ctx_resource = std::make_shared(); - ctx_resource->initGroups(1, cache_config_.layer_all_num, cache_config_.layer_to_group_id); + ctx_resource->initGroups(1, cache_config_.layer_all_num, cache_config_.layerGroupIdsSnapshot()); ctx_resource->cacheKeys() = CacheKeysType{1, 2, 3}; auto rw_ctx = std::make_shared>(); std::shared_ptr meta = @@ -569,7 +916,7 @@ TEST_F(KVCacheConnectorCoordinatorTest, AsyncWrite_ReturnFusedContext_WhenMemory coordinator_->allocator_ = allocator_; KVCacheResource resource; - resource.initGroups(1, cache_config_.layer_all_num, cache_config_.layer_to_group_id); + resource.initGroups(1, cache_config_.layer_all_num, cache_config_.layerGroupIdsSnapshot()); resource.cacheKeys() = CacheKeysType{1, 2, 3}; auto selected_resource = makeResourceWithAutoDecr(); @@ -608,7 +955,7 @@ TEST_F(KVCacheConnectorCoordinatorTest, AsyncWrite_ReturnFusedContext_WhenConnec coordinator_->allocator_ = allocator_; KVCacheResource resource; - resource.initGroups(1, cache_config_.layer_all_num, cache_config_.layer_to_group_id); + resource.initGroups(1, cache_config_.layer_all_num, cache_config_.layerGroupIdsSnapshot()); resource.cacheKeys() = CacheKeysType{1, 2, 3}; auto selected_resource = makeResourceWithAutoDecr(); @@ -648,7 +995,7 @@ TEST_F(KVCacheConnectorCoordinatorTest, AsyncWrite_ReturnFusedContext_WhenNoConn coordinator_->allocator_ = allocator_; KVCacheResource resource; - resource.initGroups(1, cache_config_.layer_all_num, cache_config_.layer_to_group_id); + resource.initGroups(1, cache_config_.layer_all_num, cache_config_.layerGroupIdsSnapshot()); resource.cacheKeys() = CacheKeysType{1, 2, 3}; 
auto selected_resource = makeResourceWithAutoDecr(); @@ -876,4 +1223,4 @@ TEST_F(KVCacheConnectorCoordinatorTest, AsyncReadAfterMatch_Throws_WhenSizeMisma } } // namespace test -} // namespace rtp_llm \ No newline at end of file +} // namespace rtp_llm diff --git a/rtp_llm/cpp/cache/FullKVCacheGroup.cc b/rtp_llm/cpp/cache/group/FullKVCacheGroup.cc similarity index 54% rename from rtp_llm/cpp/cache/FullKVCacheGroup.cc rename to rtp_llm/cpp/cache/group/FullKVCacheGroup.cc index 6e8791ac69..31ae7532a5 100644 --- a/rtp_llm/cpp/cache/FullKVCacheGroup.cc +++ b/rtp_llm/cpp/cache/group/FullKVCacheGroup.cc @@ -1,4 +1,4 @@ -#include "rtp_llm/cpp/cache/FullKVCacheGroup.h" +#include "rtp_llm/cpp/cache/group/FullKVCacheGroup.h" #include "rtp_llm/cpp/utils/Logger.h" namespace rtp_llm { @@ -9,13 +9,12 @@ int FullKVCacheGroup::needBlocksNum(int seq_len, int current_blocks, int reserve NeedBlocksInfo FullKVCacheGroup::getNeedBlocks( int common_seq_len, int seq_len, int reserve_step, int reuse_blocks_len, bool reuse_enabled) const { - (void)reuse_blocks_len; - (void)reuse_enabled; NeedBlocksInfo info; - const int common_slots = needBlocksNum(common_seq_len, /*current_blocks=*/0); - const int total_slots = needBlocksNum(seq_len, /*current_blocks=*/0, reserve_step); - info.common_blocks = std::max(common_slots, 0); - info.extra_blocks = std::max(total_slots - common_slots, 0); + const int common_slots = needBlocksNum(common_seq_len, /*current_blocks=*/0); + const int total_slots = needBlocksNum(seq_len, /*current_blocks=*/0, reserve_step); + const int reused_common_slots = reuse_enabled ? 
std::min(std::max(reuse_blocks_len, 0), common_slots) : 0; + info.common_blocks = std::max(common_slots - reused_common_slots, 0); + info.extra_blocks = std::max(total_slots - common_slots, 0); return info; } @@ -42,20 +41,24 @@ bool FullKVCacheGroup::malloc(BlockIds& block_ids, int seq_len, bool enable_reus return true; } -MatchResult FullKVCacheGroup::match(const CacheKeysType& cache_keys) { +MatchResult FullKVCacheGroup::matchPrefix(const CacheKeysType& cache_keys) const { MatchResult final_result; - for (const auto& cache_key : cache_keys) { - auto result = block_cache_->match(cache_key, group_id_); - if (isNullBlockIdx(result.matched_index)) { + if (!shared_cache_) { + return final_result; + } + + for (size_t i = 0; i < cache_keys.size(); ++i) { + const auto cache_key = cache_keys[i]; + auto block_idx = shared_cache_->matchGroup(cache_key, group_id_); + if (isNullBlockIdx(block_idx)) { break; } final_result.reuse_blocks++; - final_result.block_indices.push_back(result.matched_index); + final_result.block_indices.push_back(block_idx); } final_result.reuse_length = final_result.reuse_blocks * seqSizePerBlock(); - return final_result; } @@ -73,34 +76,6 @@ void FullKVCacheGroup::reference(BlockIds& block_ids, const BlockIndicesType& ne block_pool_->requestReference(new_block_indices); } -void FullKVCacheGroup::insertIntoCache(const CacheKeysType& cache_keys, - const BlockIndicesType& block_indices, - bool is_resident) { - if (cache_keys.empty()) { - return; - } - - if (cache_keys.size() != block_indices.size()) { - RTP_LLM_LOG_ERROR( - "Cache keys size (%zu) doesn't match block indices size (%zu)", cache_keys.size(), block_indices.size()); - return; - } - - const int last_index = cache_keys.size() - 1; - for (int i = last_index; i >= 0; --i) { - BlockCache::CacheItem item; - item.cache_key = cache_keys[i]; - item.group_id = group_id_; - item.block_index = block_indices[i]; - item.is_resident = is_resident; - if (block_cache_->put(item)) { - 
block_pool_->blockCacheReference(block_indices[i]); - } - } - - RTP_LLM_LOG_DEBUG("Inserted %zu blocks into cache", block_indices.size()); -} - void FullKVCacheGroup::removeSkippedBlocks(BlockIds& /*block_ids*/, bool /*enable_reuse_cache*/, int /*reserve_step*/) { } diff --git a/rtp_llm/cpp/cache/FullKVCacheGroup.h b/rtp_llm/cpp/cache/group/FullKVCacheGroup.h similarity index 52% rename from rtp_llm/cpp/cache/FullKVCacheGroup.h rename to rtp_llm/cpp/cache/group/FullKVCacheGroup.h index 13bb862766..f7331159bf 100644 --- a/rtp_llm/cpp/cache/FullKVCacheGroup.h +++ b/rtp_llm/cpp/cache/group/FullKVCacheGroup.h @@ -2,23 +2,24 @@ #include -#include "rtp_llm/cpp/cache/KVCacheGroup.h" +#include "rtp_llm/cpp/cache/group/KVCacheGroup.h" namespace rtp_llm { class FullKVCacheGroup: public KVCacheGroup { public: FullKVCacheGroup(const LayerIdsType& layer_ids, - std::shared_ptr kvcache_spec, - BlockPoolPtr block_pool, - int group_id): - KVCacheGroup(layer_ids, kvcache_spec, block_pool, group_id) {} + std::shared_ptr kvcache_spec, + BlockPoolPtr block_pool, + int group_id, + SharedBlockCache* shared_cache = nullptr, + const kmonitor::MetricsReporterPtr& metrics_reporter = nullptr, + CacheGroupPolicy policy = CacheGroupPolicy{}): + KVCacheGroup(layer_ids, kvcache_spec, block_pool, group_id, policy, shared_cache, metrics_reporter) {} - bool malloc(BlockIds& block_indices, int seq_len, bool enable_reuse_cache = false, int reserve_step = 0) override; - MatchResult match(const CacheKeysType& cache_keys) override; + bool malloc(BlockIds& block_ids, int seq_len, bool enable_reuse_cache = false, int reserve_step = 0) override; + MatchResult matchPrefix(const CacheKeysType& cache_keys) const override; void free(const BlockIndicesType& block_indices) override; - void - insertIntoCache(const CacheKeysType& cache_keys, const BlockIndicesType& block_indices, bool is_resident) override; void removeSkippedBlocks(BlockIds& block_ids, bool enable_reuse_cache = false, int reserve_step = 0) 
override; int needBlocksNum(int seq_len, int current_blocks = 0, int reserve_step = 0) const override; NeedBlocksInfo getNeedBlocks(int common_seq_len, diff --git a/rtp_llm/cpp/cache/KVCacheGroup.cc b/rtp_llm/cpp/cache/group/KVCacheGroup.cc similarity index 56% rename from rtp_llm/cpp/cache/KVCacheGroup.cc rename to rtp_llm/cpp/cache/group/KVCacheGroup.cc index 5b9343890f..db18411f00 100644 --- a/rtp_llm/cpp/cache/KVCacheGroup.cc +++ b/rtp_llm/cpp/cache/group/KVCacheGroup.cc @@ -1,4 +1,5 @@ -#include "rtp_llm/cpp/cache/KVCacheGroup.h" +#include "rtp_llm/cpp/cache/group/KVCacheGroup.h" +#include "rtp_llm/cpp/metrics/RtpLLMMetrics.h" #include "rtp_llm/cpp/utils/Logger.h" namespace rtp_llm { @@ -41,28 +42,58 @@ bool KVCacheGroup::ensureFreeBlocks(int required_blocks) { return true; } - // blocks popped by block cache might be occupied by request - // it's necessary to checkout whether free blocks are enough while (true) { const auto free_blocks = block_pool_->freeBlocksNum(); if (free_blocks >= static_cast(required_blocks)) { break; } - const int need_evict = required_blocks - static_cast(free_blocks); - auto evicted_blocks = block_cache_->pop(need_evict); - if (evicted_blocks.empty()) { - RTP_LLM_LOG_WARNING("ensure free blocks failed, free blocks : %d, need evict blocks : %d", + if (!shared_cache_) { + RTP_LLM_LOG_WARNING( + "ensure free blocks failed, no shared cache, free blocks: %zu, need: %d", free_blocks, required_blocks); + return false; + } + + const size_t need_evict = static_cast(required_blocks) - free_blocks; + SharedBlockCache::EvictResult evict_result; + size_t freed = shared_cache_->evictAndFreeForGroup(group_id_, need_evict, &evict_result); + if (metrics_reporter_) { + for (const auto& [cache_key, lifetime_ms] : evict_result.evicted_lifetime_ms) { + RtpLLMCacheEvictionMetricsCollector collector; + collector.lifetime_ms = lifetime_ms; + kmonitor::MetricsTags tags("scope", "gpu"); + tags.AddTag("evict_policy", + 
evict_result.evicted_independent_group.count(cache_key) ? "independent" : "chain"); + tags.AddTag("backing", "device"); + metrics_reporter_->report(&tags, + &collector); + } + } + if (freed == 0) { + RTP_LLM_LOG_WARNING("ensure free blocks failed, free blocks: %zu, need evict blocks: %zu", block_pool_->freeBlocksNum(), need_evict); return false; } - block_pool_->blockCacheFree(evicted_blocks); } return true; } +MatchResult KVCacheGroup::match(const CacheKeysType& cache_keys) { + return matchPrefix(cache_keys); +} + +MatchResult KVCacheGroup::matchPrefix(const CacheKeysType& /*cache_keys*/) const { + RTP_LLM_FAIL("KVCacheGroup gid=%d does not support prefix matching", group_id_); + return {}; +} + +MatchResult KVCacheGroup::matchSingleKey(CacheKeyType /*cache_key*/) const { + RTP_LLM_FAIL("KVCacheGroup gid=%d does not support single-key matching", group_id_); + return {}; +} + size_t KVCacheGroup::freeBlocksNum() const { return block_pool_->freeBlocksNum(); } @@ -75,6 +106,26 @@ int KVCacheGroup::group_id() const { return group_id_; } +const CacheGroupPolicy& KVCacheGroup::policy() const { + return policy_; +} + +CacheReusePolicy KVCacheGroup::reusePolicy() const { + return policy_.reuse_policy; +} + +CacheEvictPolicy KVCacheGroup::evictPolicy() const { + return policy_.evict_policy; +} + +uint32_t KVCacheGroup::explicitBlockNum() const { + return policy_.explicit_block_num; +} + +size_t KVCacheGroup::activeTailBlocks() const { + return policy_.active_tail_blocks > 0 ? 
static_cast(policy_.active_tail_blocks) : 0; +} + std::unordered_map KVCacheGroup::allLayerCacheBase() const { return global_layer_to_kv_tensors; } @@ -109,4 +160,36 @@ void KVCacheGroup::reference(const BlockIndicesType& new_block_indices) { block_pool_->requestReference(new_block_indices); } +bool KVCacheGroup::isCpShardable() const { + return policy_.is_cp_shardable; +} + +bool KVCacheGroup::prefixReusable() const { + return policy_.prefix_reusable && policy_.reuse_policy == CacheReusePolicy::REUSABLE; +} + +bool KVCacheGroup::hasSparseSlots() const { + return policy_.has_sparse_slots; +} + +bool KVCacheGroup::hasKernelBlockSubdiv() const { + return policy_.has_kernel_block_subdiv; +} + +bool KVCacheGroup::transferTailBlocks() const { + return activeTailBlocks() > 0; +} + +bool KVCacheGroup::cpCompactTailBlocks() const { + return policy_.cp_compact_tail_blocks; +} + +bool KVCacheGroup::isReservable() const { + return policy_.is_reservable; +} + +bool KVCacheGroup::usesPinnedCpuBacking() const { + return policy_.uses_pinned_cpu_backing; +} + } // namespace rtp_llm diff --git a/rtp_llm/cpp/cache/KVCacheGroup.h b/rtp_llm/cpp/cache/group/KVCacheGroup.h similarity index 61% rename from rtp_llm/cpp/cache/KVCacheGroup.h rename to rtp_llm/cpp/cache/group/KVCacheGroup.h index 3274a40dc9..7004a523a3 100644 --- a/rtp_llm/cpp/cache/KVCacheGroup.h +++ b/rtp_llm/cpp/cache/group/KVCacheGroup.h @@ -7,12 +7,13 @@ #include +#include "kmonitor/client/MetricsReporter.h" #include "rtp_llm/cpp/cache/KVCacheResource.h" #include "rtp_llm/cpp/cache/Types.h" #include "rtp_llm/cpp/cache/BufferTypes.h" #include "rtp_llm/cpp/cache/CacheConfig.h" #include "rtp_llm/cpp/cache/BlockPool.h" -#include "rtp_llm/cpp/cache/BlockCache.h" +#include "rtp_llm/cpp/cache/SharedBlockCache.h" namespace rtp_llm { @@ -23,11 +24,19 @@ struct NeedBlocksInfo { class KVCacheGroup { public: - KVCacheGroup(const LayerIdsType& layer_ids, KVCacheSpecPtr kvcache_spec, BlockPoolPtr block_pool, int group_id): + 
KVCacheGroup(const LayerIdsType& layer_ids, + KVCacheSpecPtr kvcache_spec, + BlockPoolPtr block_pool, + int group_id, + CacheGroupPolicy policy = CacheGroupPolicy{}, + SharedBlockCache* shared_cache = nullptr, + const kmonitor::MetricsReporterPtr& metrics_reporter = nullptr): layer_ids_(layer_ids), kvcache_spec_(std::move(kvcache_spec)), block_pool_(block_pool), - block_cache_(block_pool_->blockCache()), + policy_(policy), + shared_cache_(shared_cache), + metrics_reporter_(metrics_reporter), group_id_(group_id), seq_size_per_block_(kvcache_spec_->seq_size_per_block) {} @@ -37,10 +46,10 @@ class KVCacheGroup { // Allocate blocks for `seq_len` tokens; appends new IDs to `block_ids` via BlockIds::add(). virtual bool malloc(BlockIds& block_ids, int seq_len, bool enable_reuse_cache = false, int reserve_step = 0) = 0; // TODO, match的时候热度不增加,最终匹配成功的时候再去增加热度。 - virtual MatchResult match(const CacheKeysType& cache_keys) = 0; - virtual void free(const BlockIndicesType& block_indices) = 0; - virtual void - insertIntoCache(const CacheKeysType& cache_keys, const BlockIndicesType& block_indices, bool is_resident) = 0; + virtual MatchResult match(const CacheKeysType& cache_keys); + virtual MatchResult matchPrefix(const CacheKeysType& cache_keys) const; + virtual MatchResult matchSingleKey(CacheKeyType cache_key) const; + virtual void free(const BlockIndicesType& block_indices) = 0; virtual void removeSkippedBlocks(BlockIds& block_ids, bool enable_reuse_cache = false, int reserve_step = 0) = 0; virtual int needBlocksNum(int seq_len, int current_blocks, int reserve_step = 0) const = 0; virtual NeedBlocksInfo getNeedBlocks( @@ -59,13 +68,29 @@ class KVCacheGroup { bool ensureFreeBlocks(int need_blocks); int seqSizePerBlock() const; int group_id() const; + const CacheGroupPolicy& policy() const; + CacheReusePolicy reusePolicy() const; + CacheEvictPolicy evictPolicy() const; + uint32_t explicitBlockNum() const; + size_t activeTailBlocks() const; + + virtual bool isCpShardable() const; 
+ virtual bool prefixReusable() const; + virtual bool hasSparseSlots() const; + virtual bool hasKernelBlockSubdiv() const; + virtual bool transferTailBlocks() const; + virtual bool cpCompactTailBlocks() const; + virtual bool isReservable() const; + virtual bool usesPinnedCpuBacking() const; protected: - LayerIdsType layer_ids_; - KVCacheSpecPtr kvcache_spec_; - BlockPoolPtr block_pool_; - BlockCachePtr block_cache_; - int group_id_ = 0; + LayerIdsType layer_ids_; + KVCacheSpecPtr kvcache_spec_; + BlockPoolPtr block_pool_; + CacheGroupPolicy policy_; + SharedBlockCache* shared_cache_ = nullptr; + kmonitor::MetricsReporterPtr metrics_reporter_ = nullptr; + int group_id_ = 0; int seq_size_per_block_; std::unordered_map global_layer_to_kv_tensors; diff --git a/rtp_llm/cpp/cache/group/LinearKVCacheGroup.cc b/rtp_llm/cpp/cache/group/LinearKVCacheGroup.cc new file mode 100644 index 0000000000..68a8d33c34 --- /dev/null +++ b/rtp_llm/cpp/cache/group/LinearKVCacheGroup.cc @@ -0,0 +1,219 @@ +#include "rtp_llm/cpp/cache/group/LinearKVCacheGroup.h" + +#include +#include + +#include "rtp_llm/cpp/utils/Logger.h" + +namespace rtp_llm { + +void LinearKVCacheGroup::filterValidBlocks(const BlockIndicesType& in, BlockIndicesType& out) const { + out.clear(); + out.reserve(in.size()); + for (auto b : in) { + if (!isNullBlockIdx(b)) { + out.push_back(b); + } + } +} + +int LinearKVCacheGroup::needBlocksNum(int seq_len, int current_blocks, int reserve_step) const { + int extra_blocks = reserve_step ? 
reserve_step - 1 : 0; + return std::max((seq_len + seq_size_per_block_ - 1) / seq_size_per_block_ + extra_blocks - current_blocks, 0); +} + +bool LinearKVCacheGroup::shouldMaterializeBlock(int pos, int seq_len, int reserve_step, bool enable_reuse_cache) const { + if (pos < 0) { + return false; + } + + const int step = std::max(1, linear_step_); + const int seq_slots = needBlocksNum(seq_len, 0, 0); + const int total_slots = needBlocksNum(seq_len, 0, reserve_step); + const bool is_seq_tail = (seq_slots > 0) && (pos >= std::max(0, seq_slots - 2)) && (pos < seq_slots); + const bool is_reserve = (reserve_step > 0) && (pos >= seq_slots) && (pos < total_slots); + const bool step_hit = (((pos + 1) % step) == 0); + return is_reserve || (enable_reuse_cache ? (step_hit || is_seq_tail) : is_seq_tail); +} + +NeedBlocksInfo LinearKVCacheGroup::getNeedBlocks( + int common_seq_len, int seq_len, int reserve_step, int reuse_blocks_len, bool reuse_enabled) const { + NeedBlocksInfo info; + + // common_slots: blocks for common_seq_len (no reserve) + const int common_slots = needBlocksNum(common_seq_len, 0); + // total_slots includes reserve_step - 1 extra linear slots when reserve_step is non-zero. + const int total_slots = needBlocksNum(seq_len, 0, reserve_step); + + auto common_required = [&](int pos) { return shouldMaterializeBlock(pos, common_seq_len, 0, reuse_enabled); }; + auto final_required = [&](int pos) { return shouldMaterializeBlock(pos, seq_len, reserve_step, reuse_enabled); }; + + for (int pos = 0; pos < common_slots; ++pos) { + if (common_required(pos)) { + info.common_blocks++; + } + } + for (int pos = 0; pos < total_slots; ++pos) { + if (final_required(pos) && !(pos < common_slots && common_required(pos))) { + info.extra_blocks++; + } + } + + // Linear reuse materializes only one prefix block: the matched tail at + // reuse_blocks_len - 1. Do not count that block as newly allocated. + const int reused_tail_pos = (reuse_enabled && reuse_blocks_len > 0) ? 
reuse_blocks_len - 1 : -1; + if (reused_tail_pos >= 0) { + if (reused_tail_pos < common_slots && common_required(reused_tail_pos)) { + info.common_blocks--; + } else if (reused_tail_pos < total_slots && final_required(reused_tail_pos)) { + info.extra_blocks--; + } + } + + info.common_blocks = std::max(info.common_blocks, 0); + info.extra_blocks = std::max(info.extra_blocks, 0); + return info; +} + +MatchResult LinearKVCacheGroup::matchSingleKey(CacheKeyType cache_key) const { + MatchResult result; + if (!shared_cache_) { + return result; + } + auto block_idx = shared_cache_->matchGroup(cache_key, group_id_); + if (!isNullBlockIdx(block_idx)) { + result.block_indices = {block_idx}; + } + return result; +} + +bool LinearKVCacheGroup::malloc(BlockIds& block_ids, int seq_len, bool enable_reuse_cache, int reserve_step) { + const int step = std::max(1, linear_step_); + const int current_blocks_len = static_cast(block_ids.blocksNum()); + const int seq_slots = needBlocksNum(seq_len, 0, 0); + const int total_slots = needBlocksNum(seq_len, 0, reserve_step); + const int new_blocks_len = std::max(total_slots - current_blocks_len, 0); + + auto should_materialize = [&](int pos) { + // Materialize tail and tail-1: causal_conv1d_update may read + // (seq_len - 2) / SBP when seq_len crosses a block boundary. + // Leaving tail-1 NULL can hit IMA on long prompts. + const bool is_seq_tail = (seq_slots > 0) && (pos >= std::max(0, seq_slots - 2)) && (pos < seq_slots); + const bool is_reserve = (reserve_step > 0) && (pos >= seq_slots) && (pos < total_slots); + const bool step_hit = (((pos + 1) % step) == 0); + return is_reserve || (enable_reuse_cache ? 
(step_hit || is_seq_tail) : is_seq_tail); + }; + + std::vector positions_to_backfill; + const auto& existing_blocks = block_ids.blocks(); + const int existing_scan = std::min(current_blocks_len, total_slots); + for (int i = 0; i < existing_scan; ++i) { + if (should_materialize(i) && isNullBlockIdx(existing_blocks[static_cast(i)])) { + positions_to_backfill.push_back(static_cast(i)); + } + } + + int need_alloc_blocks = 0; + need_alloc_blocks += static_cast(positions_to_backfill.size()); + for (int i = current_blocks_len; i < total_slots; i++) { + if (should_materialize(i)) { + need_alloc_blocks++; + } + } + + if (need_alloc_blocks > 0) { + const auto free_blocks_num = freeBlocksNum(); + if (free_blocks_num < static_cast(need_alloc_blocks)) { + if (!ensureFreeBlocks(need_alloc_blocks)) { + RTP_LLM_LOG_WARNING("Insufficient free blocks for LinearKVCacheGroup: need %d, have %zu", + need_alloc_blocks, + free_blocks_num); + return false; + } + } + } + + BlockIndicesType allocated_blocks; + if (need_alloc_blocks > 0) { + allocated_blocks = block_pool_->malloc(need_alloc_blocks); + if (allocated_blocks.size() != static_cast(need_alloc_blocks)) { + if (!allocated_blocks.empty()) { + block_pool_->requestFree(allocated_blocks); + } + return false; + } + } + + size_t allocated_idx = 0; + for (size_t pos : positions_to_backfill) { + block_ids.setAt(pos, allocated_blocks[allocated_idx++]); + } + + BlockIndicesType new_ids; + new_ids.reserve(static_cast(new_blocks_len)); + for (int i = current_blocks_len; i < total_slots; i++) { + if (should_materialize(i)) { + new_ids.push_back(allocated_blocks[allocated_idx++]); + } else { + new_ids.push_back(NULL_BLOCK_IDX); + } + } + if (!new_ids.empty()) { + block_ids.add(new_ids); + } + RTP_LLM_CHECK_WITH_INFO(allocated_idx == allocated_blocks.size(), + "linear kv allocation accounting mismatch, used=%zu allocated=%zu", + allocated_idx, + allocated_blocks.size()); + return true; +} + +void LinearKVCacheGroup::removeSkippedBlocks(BlockIds& 
block_ids, bool enable_reuse_cache, int reserve_step) { + const auto& block_indices = block_ids.blocks(); // const view for reading current state + if (block_indices.empty()) { + return; + } + const int step = std::max(1, linear_step_); + const int block_size = static_cast(block_indices.size()); + + BlockIndicesType blocks_to_free; + std::vector pos_to_remove; + // keep last 2 and every reserve_step + for (int i = block_size - 3 - reserve_step; i >= 0; i--) { + if (isNullBlockIdx(block_indices[i])) { + continue; + } + if (enable_reuse_cache && ((i + 1) % step) == 0) { + continue; + } + blocks_to_free.push_back(block_indices[i]); + pos_to_remove.push_back(static_cast(i)); + } + if (!blocks_to_free.empty()) { + block_pool_->requestFree(blocks_to_free); + block_ids.remove(pos_to_remove); // null-out by position, updates kernel slots incrementally + } +} + +void LinearKVCacheGroup::free(const BlockIndicesType& block_indices) { + if (block_indices.empty()) { + return; + } + BlockIndicesType valid; + filterValidBlocks(block_indices, valid); + if (valid.empty()) { + return; + } + block_pool_->requestFree(valid); +} + +void LinearKVCacheGroup::reference(BlockIds& block_ids, const BlockIndicesType& new_block_indices) { + block_ids.add(new_block_indices); + BlockIndicesType valid; + filterValidBlocks(new_block_indices, valid); + if (!valid.empty()) { + block_pool_->requestReference(valid); + } +} + +} // namespace rtp_llm diff --git a/rtp_llm/cpp/cache/LinearKVCacheGroup.h b/rtp_llm/cpp/cache/group/LinearKVCacheGroup.h similarity index 75% rename from rtp_llm/cpp/cache/LinearKVCacheGroup.h rename to rtp_llm/cpp/cache/group/LinearKVCacheGroup.h index f3daa1f197..0d8db388a1 100644 --- a/rtp_llm/cpp/cache/LinearKVCacheGroup.h +++ b/rtp_llm/cpp/cache/group/LinearKVCacheGroup.h @@ -4,7 +4,7 @@ #include #include -#include "rtp_llm/cpp/cache/KVCacheGroup.h" +#include "rtp_llm/cpp/cache/group/KVCacheGroup.h" namespace rtp_llm { @@ -14,15 +14,16 @@ class LinearKVCacheGroup: public 
KVCacheGroup { std::shared_ptr kvcache_spec, BlockPoolPtr block_pool, int group_id, - int linear_step = 0): - KVCacheGroup(layer_ids, kvcache_spec, block_pool, group_id), linear_step_(linear_step) {} + int linear_step = 0, + SharedBlockCache* shared_cache = nullptr, + const kmonitor::MetricsReporterPtr& metrics_reporter = nullptr, + CacheGroupPolicy policy = defaultCacheGroupPolicy(CacheGroupType::LINEAR)): + KVCacheGroup(layer_ids, kvcache_spec, block_pool, group_id, policy, shared_cache, metrics_reporter), + linear_step_(linear_step) {} - MatchResult match(const CacheKeysType& cache_keys) override; // Match a single cache key (used by Hybrid allocator to do right-to-left joint matching). - MatchResult matchSingleKey(CacheKeyType cache_key) const; + MatchResult matchSingleKey(CacheKeyType cache_key) const override; bool malloc(BlockIds& block_ids, int seq_len, bool enable_reuse_cache = false, int reserve_step = 0) override; - void - insertIntoCache(const CacheKeysType& cache_keys, const BlockIndicesType& block_indices, bool is_resident) override; void removeSkippedBlocks(BlockIds& block_ids, bool enable_reuse_cache = false, int reserve_step = 0) override; void free(const BlockIndicesType& block_indices) override; @@ -33,6 +34,7 @@ class LinearKVCacheGroup: public KVCacheGroup { int reserve_step, int reuse_blocks_len, bool reuse_enabled = false) const override; + bool shouldMaterializeBlock(int pos, int seq_len, int reserve_step, bool enable_reuse_cache) const; private: void filterValidBlocks(const BlockIndicesType& in, BlockIndicesType& out) const; diff --git a/rtp_llm/cpp/cache/group/SWAKVCacheGroup.cc b/rtp_llm/cpp/cache/group/SWAKVCacheGroup.cc new file mode 100644 index 0000000000..7be84d60e9 --- /dev/null +++ b/rtp_llm/cpp/cache/group/SWAKVCacheGroup.cc @@ -0,0 +1,241 @@ +#include "rtp_llm/cpp/cache/group/SWAKVCacheGroup.h" + +#include +#include +#include + +#include "rtp_llm/cpp/utils/Logger.h" + +namespace rtp_llm { + +namespace { + +bool 
isActiveTailBlock(int block_idx, int seq_slots, int active_tail_blocks) { + if (seq_slots <= 0 || block_idx >= seq_slots) { + return false; + } + return block_idx >= std::max(seq_slots - active_tail_blocks, 0); +} + +bool shouldAllocateBlock( + int block_idx, int seq_slots, int reserve_step, int step, bool enable_reuse_cache, int active_tail_blocks) { + const bool is_reserve = reserve_step > 0 && block_idx >= seq_slots; + const bool step_hit = ((block_idx + 1) % step) == 0; + return is_reserve || isActiveTailBlock(block_idx, seq_slots, active_tail_blocks) + || (enable_reuse_cache && step_hit); +} + +bool dsv4TrapInvalidKVAccessEnabled() { + const char* value = std::getenv("DSV4_TRAP_INVALID_KV_ACCESS"); + if (value == nullptr) { + return false; + } + const std::string flag(value); + return !flag.empty() && flag != "0" && flag != "false" && flag != "FALSE" && flag != "off" && flag != "OFF"; +} + +} // namespace + +bool SWAKVCacheGroup::shouldCheckSWATailBlockIds() const { + if (!dsv4TrapInvalidKVAccessEnabled()) { + return false; + } + return policy_.validate_tail_blocks; +} + +bool SWAKVCacheGroup::effectiveReuseCacheForAllocation(bool enable_reuse_cache) const { + return enable_reuse_cache && policy_.reuse_policy == CacheReusePolicy::REUSABLE; +} + +int SWAKVCacheGroup::activeTailBlockCount() const { + return std::max(1, policy_.active_tail_blocks); +} + +void SWAKVCacheGroup::checkSWATailBlockIds(const BlockIds& block_ids, const char* caller) const { + if (!shouldCheckSWATailBlockIds()) { + return; + } + + const auto& blocks = block_ids.blocks(); + if (blocks.empty()) { + return; + } + + const size_t block_num = blocks.size(); + RTP_LLM_CHECK_WITH_INFO(!isNullBlockIdx(blocks[block_num - 1]), + "%s invalid SWA block ids: tail block is NULL, block_num=%zu", + caller, + block_num); + if (activeTailBlockCount() >= 2 && block_num >= 2) { + RTP_LLM_CHECK_WITH_INFO(!isNullBlockIdx(blocks[block_num - 2]), + "%s invalid SWA block ids: tail-1 block is NULL, block_num=%zu", 
+ caller, + block_num); + } +} + +void SWAKVCacheGroup::filterValidBlocks(const BlockIndicesType& in, BlockIndicesType& out) const { + out.clear(); + out.reserve(in.size()); + for (auto b : in) { + if (!isNullBlockIdx(b)) { + out.push_back(b); + } + } +} + +int SWAKVCacheGroup::needBlocksNum(int seq_len, int current_blocks, int reserve_step) const { + return std::max((seq_len + reserve_step + seq_size_per_block_ - 1) / seq_size_per_block_ - current_blocks, 0); +} + +NeedBlocksInfo SWAKVCacheGroup::getNeedBlocks( + int common_seq_len, int seq_len, int reserve_step, int reuse_blocks_len, bool reuse_enabled) const { + (void)common_seq_len; + const int step = std::max(1, linear_step_); + const bool effective_reuse_enabled = effectiveReuseCacheForAllocation(reuse_enabled); + const int active_tail_blocks = activeTailBlockCount(); + + NeedBlocksInfo info; + + const int seq_slots = needBlocksNum(seq_len, 0); + const int total_slots = needBlocksNum(seq_len, 0, reserve_step); + + info.common_blocks = 0; + for (int i = reuse_blocks_len; i < seq_slots; ++i) { + if (shouldAllocateBlock(i, seq_slots, /*reserve_step=*/0, step, effective_reuse_enabled, active_tail_blocks)) { + ++info.extra_blocks; + } + } + info.extra_blocks += std::max(total_slots - std::max(seq_slots, reuse_blocks_len), 0); + + info.extra_blocks = std::max(info.extra_blocks, 0); + return info; +} + +MatchResult SWAKVCacheGroup::matchSingleKey(CacheKeyType cache_key) const { + MatchResult result; + if (!shared_cache_) { + return result; + } + auto block_idx = shared_cache_->matchGroup(cache_key, group_id_); + if (!isNullBlockIdx(block_idx)) { + result.block_indices = {block_idx}; + } + return result; +} + +bool SWAKVCacheGroup::malloc(BlockIds& block_ids, int seq_len, bool enable_reuse_cache, int reserve_step) { + const int step = std::max(1, linear_step_); + const bool effective_reuse_enabled = effectiveReuseCacheForAllocation(enable_reuse_cache); + const int active_tail_blocks = activeTailBlockCount(); + const 
int current_blocks_len = static_cast(block_ids.blocksNum()); + const int seq_slots = needBlocksNum(seq_len, 0, 0); + const int new_blocks_len = needBlocksNum(seq_len, current_blocks_len, reserve_step); + + if (new_blocks_len == 0) { + checkSWATailBlockIds(block_ids, "SWAKVCacheGroup::malloc"); + return true; + } + + int need_alloc_blocks = 0; + for (int i = current_blocks_len; i < current_blocks_len + new_blocks_len; i++) { + if (shouldAllocateBlock(i, seq_slots, reserve_step, step, effective_reuse_enabled, active_tail_blocks)) { + need_alloc_blocks++; + } + } + + if (need_alloc_blocks > 0) { + const auto free_blocks_num = freeBlocksNum(); + if (free_blocks_num < static_cast(need_alloc_blocks)) { + if (!ensureFreeBlocks(need_alloc_blocks)) { + RTP_LLM_LOG_WARNING("Insufficient free blocks for SWAKVCacheGroup: need %d, have %zu", + need_alloc_blocks, + free_blocks_num); + return false; + } + } + } + + BlockIndicesType allocated_blocks; + if (need_alloc_blocks > 0) { + allocated_blocks = block_pool_->malloc(need_alloc_blocks); + if (allocated_blocks.size() != static_cast(need_alloc_blocks)) { + if (!allocated_blocks.empty()) { + block_pool_->requestFree(allocated_blocks); + } + return false; + } + } + + BlockIndicesType new_ids; + new_ids.reserve(static_cast(new_blocks_len)); + size_t allocated_idx = 0; + for (int i = current_blocks_len; i < current_blocks_len + new_blocks_len; i++) { + const bool should_alloc = + shouldAllocateBlock(i, seq_slots, reserve_step, step, effective_reuse_enabled, active_tail_blocks); + if (should_alloc) { + new_ids.push_back(allocated_blocks[allocated_idx++]); + } else { + new_ids.push_back(NULL_BLOCK_IDX); + } + } + RTP_LLM_CHECK_WITH_INFO(allocated_idx == allocated_blocks.size(), + "swa kv allocation accounting mismatch, used=%zu allocated=%zu", + allocated_idx, + allocated_blocks.size()); + block_ids.add(new_ids); + checkSWATailBlockIds(block_ids, "SWAKVCacheGroup::malloc"); + return true; +} + +void 
SWAKVCacheGroup::removeSkippedBlocks(BlockIds& block_ids, bool enable_reuse_cache, int reserve_step) { + const auto& block_indices = block_ids.blocks(); + if (block_indices.empty()) { + checkSWATailBlockIds(block_ids, "SWAKVCacheGroup::removeSkippedBlocks"); + return; + } + const int step = std::max(1, linear_step_); + const bool effective_reuse_enabled = effectiveReuseCacheForAllocation(enable_reuse_cache); + const int active_tail_blocks = activeTailBlockCount(); + const int block_size = static_cast(block_indices.size()); + + BlockIndicesType blocks_to_free; + std::vector pos_to_remove; + for (int i = block_size - active_tail_blocks - 1 - reserve_step; i >= 0; i--) { + if (isNullBlockIdx(block_indices[i])) { + break; + } + if (effective_reuse_enabled && ((i + 1) % step) == 0) { + continue; + } + blocks_to_free.push_back(block_indices[i]); + pos_to_remove.push_back(static_cast(i)); + } + if (!blocks_to_free.empty()) { + block_pool_->requestFree(blocks_to_free); + block_ids.remove(pos_to_remove); + } + checkSWATailBlockIds(block_ids, "SWAKVCacheGroup::removeSkippedBlocks"); +} + +void SWAKVCacheGroup::free(const BlockIndicesType& block_indices) { + if (block_indices.empty()) { + return; + } + BlockIndicesType valid; + filterValidBlocks(block_indices, valid); + if (!valid.empty()) { + block_pool_->requestFree(valid); + } +} + +void SWAKVCacheGroup::reference(BlockIds& block_ids, const BlockIndicesType& new_block_indices) { + block_ids.add(new_block_indices); + BlockIndicesType valid; + filterValidBlocks(new_block_indices, valid); + if (!valid.empty()) { + block_pool_->requestReference(valid); + } +} + +} // namespace rtp_llm diff --git a/rtp_llm/cpp/cache/group/SWAKVCacheGroup.h b/rtp_llm/cpp/cache/group/SWAKVCacheGroup.h new file mode 100644 index 0000000000..202a85766b --- /dev/null +++ b/rtp_llm/cpp/cache/group/SWAKVCacheGroup.h @@ -0,0 +1,46 @@ +#pragma once + +#include + +#include "rtp_llm/cpp/cache/group/KVCacheGroup.h" + +namespace rtp_llm { + +class 
SWAKVCacheGroup: public KVCacheGroup { +public: + SWAKVCacheGroup(const LayerIdsType& layer_ids, + std::shared_ptr kvcache_spec, + BlockPoolPtr block_pool, + int group_id, + int linear_step = 0, + SharedBlockCache* shared_cache = nullptr, + const kmonitor::MetricsReporterPtr& metrics_reporter = nullptr, + CacheGroupPolicy policy = defaultCacheGroupPolicy(CacheGroupType::SWA)): + KVCacheGroup(layer_ids, kvcache_spec, block_pool, group_id, policy, shared_cache, metrics_reporter), + linear_step_(linear_step) {} + + MatchResult matchSingleKey(CacheKeyType cache_key) const override; + bool malloc(BlockIds& block_ids, int seq_len, bool enable_reuse_cache = false, int reserve_step = 0) override; + void removeSkippedBlocks(BlockIds& block_ids, bool enable_reuse_cache = false, int reserve_step = 0) override; + void free(const BlockIndicesType& block_indices) override; + void reference(BlockIds& block_ids, const BlockIndicesType& new_block_indices) override; + int needBlocksNum(int seq_len, int current_blocks, int reserve_step = 0) const override; + NeedBlocksInfo getNeedBlocks(int common_seq_len, + int seq_len, + int reserve_step, + int reuse_blocks_len, + bool reuse_enabled = false) const override; + +private: + void filterValidBlocks(const BlockIndicesType& in, BlockIndicesType& out) const; + int activeTailBlockCount() const; + bool effectiveReuseCacheForAllocation(bool enable_reuse_cache) const; + bool shouldCheckSWATailBlockIds() const; + void checkSWATailBlockIds(const BlockIds& block_ids, const char* caller) const; + + int linear_step_ = 0; +}; + +using SWAKVCacheGroupPtr = std::shared_ptr; + +} // namespace rtp_llm diff --git a/rtp_llm/cpp/cache/spec/CacheGroupType.h b/rtp_llm/cpp/cache/spec/CacheGroupType.h new file mode 100644 index 0000000000..9bbe3ccdff --- /dev/null +++ b/rtp_llm/cpp/cache/spec/CacheGroupType.h @@ -0,0 +1,81 @@ +#pragma once + +#include + +namespace rtp_llm { + +// Cache group type for hybrid KV-cache: +// - LINEAR: linear attention group (PD 
cache-store transfer keeps the last block) +// - FULL: full attention group (all blocks are needed for cache-store transfer) +// - SWA: sliding-window attention group (PD cache-store transfer keeps the last two blocks) +enum class CacheGroupType : int8_t { + LINEAR = 0, + FULL = 1, + SWA = 2, +}; + +enum class CacheReusePolicy : int8_t { + REUSABLE = 0, + NON_REUSABLE = 1, +}; + +enum class CacheEvictPolicy : int8_t { + CHAIN = 0, + INDEPENDENT = 1, + NONE = 2, +}; + +struct CacheGroupPolicy { + CacheReusePolicy reuse_policy = CacheReusePolicy::REUSABLE; + CacheEvictPolicy evict_policy = CacheEvictPolicy::CHAIN; + int active_tail_blocks = 2; + bool validate_tail_blocks = true; + uint32_t explicit_block_num = 0; + bool reserve_from_paged_budget = false; + bool prefix_reusable = true; + bool uses_pinned_cpu_backing = false; + bool is_cp_shardable = true; + bool has_sparse_slots = false; + bool has_kernel_block_subdiv = true; + bool cp_compact_tail_blocks = false; + bool is_reservable = true; + CacheGroupType group_type = CacheGroupType::FULL; +}; + +inline const char* cacheGroupTypeName(CacheGroupType group_type) { + switch (group_type) { + case CacheGroupType::LINEAR: + return "LINEAR"; + case CacheGroupType::FULL: + return "FULL"; + case CacheGroupType::SWA: + return "SWA"; + } + return "UNKNOWN"; +} + +inline const char* cacheEvictPolicyName(CacheEvictPolicy evict_policy) { + switch (evict_policy) { + case CacheEvictPolicy::CHAIN: + return "chain"; + case CacheEvictPolicy::INDEPENDENT: + return "independent"; + case CacheEvictPolicy::NONE: + return "none"; + } + return "unknown"; +} + +inline CacheGroupPolicy defaultCacheGroupPolicy(CacheGroupType group_type) { + CacheGroupPolicy policy; + policy.group_type = group_type; + policy.active_tail_blocks = group_type == CacheGroupType::LINEAR ? 1 : (group_type == CacheGroupType::SWA ? 
2 : 0); + policy.prefix_reusable = group_type == CacheGroupType::FULL; + policy.is_cp_shardable = group_type == CacheGroupType::FULL; + policy.has_sparse_slots = group_type != CacheGroupType::FULL; + policy.has_kernel_block_subdiv = group_type == CacheGroupType::FULL; + policy.cp_compact_tail_blocks = group_type == CacheGroupType::SWA; + return policy; +} + +} // namespace rtp_llm diff --git a/rtp_llm/cpp/cache/KVCacheSpec.h b/rtp_llm/cpp/cache/spec/KVCacheSpec.h similarity index 53% rename from rtp_llm/cpp/cache/KVCacheSpec.h rename to rtp_llm/cpp/cache/spec/KVCacheSpec.h index c8dec3a39a..e7df5ec9b1 100644 --- a/rtp_llm/cpp/cache/KVCacheSpec.h +++ b/rtp_llm/cpp/cache/spec/KVCacheSpec.h @@ -3,10 +3,11 @@ // This header includes all KVCacheSpec related classes // Split into separate files for better modularity -#include "rtp_llm/cpp/cache/KVCacheSpecBase.h" -#include "rtp_llm/cpp/cache/MHAKVCacheSpec.h" -#include "rtp_llm/cpp/cache/MLAKVCacheSpec.h" -#include "rtp_llm/cpp/cache/LinearKVCacheSpec.h" +#include "rtp_llm/cpp/cache/spec/KVCacheSpecBase.h" +#include "rtp_llm/cpp/cache/spec/MHAKVCacheSpec.h" +#include "rtp_llm/cpp/cache/spec/MLAKVCacheSpec.h" +#include "rtp_llm/cpp/cache/spec/LinearKVCacheSpec.h" +#include "rtp_llm/cpp/cache/spec/OpaqueKVCacheSpec.h" namespace rtp_llm { // All KVCacheSpec classes are now available through individual headers diff --git a/rtp_llm/cpp/cache/spec/KVCacheSpecBase.h b/rtp_llm/cpp/cache/spec/KVCacheSpecBase.h new file mode 100644 index 0000000000..a547ee8dac --- /dev/null +++ b/rtp_llm/cpp/cache/spec/KVCacheSpecBase.h @@ -0,0 +1,188 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +#include "rtp_llm/cpp/cache/BlockInfo.h" +#include "rtp_llm/cpp/cache/spec/CacheGroupType.h" +#include "rtp_llm/cpp/cache/Types.h" +#include "rtp_llm/cpp/config/ConfigModules.h" +#include "rtp_llm/cpp/utils/AssertUtils.h" +#include "rtp_llm/models_py/bindings/core/Types.h" +#include 
"rtp_llm/cpp/model_utils/AttentionConfig.h" + +namespace rtp_llm { + +// Physical signature used to determine whether two KVCacheSpec instances can +// share the same KVCacheGroup and BlockPool. Two specs with identical tags AND +// identical SpecPhysicalSignature are merged into a single group; different tags +// always produce different groups regardless of physical equality. +struct SpecPhysicalSignature { + size_t block_size_bytes = 0; + size_t scale_block_size_bytes = 0; + CacheGroupType lifecycle_type = CacheGroupType::FULL; + rtp_llm::DataType dtype = rtp_llm::DataType::TYPE_INVALID; + + bool operator==(const SpecPhysicalSignature& other) const { + return block_size_bytes == other.block_size_bytes + && scale_block_size_bytes == other.scale_block_size_bytes + && lifecycle_type == other.lifecycle_type + && dtype == other.dtype; + } + bool operator!=(const SpecPhysicalSignature& other) const { + return !(*this == other); + } +}; + +enum KVCacheSpecType { + MultiHeadAttention, // MHAKVCacheSpec: standard multi-head attention KV cache + MultiHeadLatentAttention, // MLAKVCacheSpec: MLA compressed latent KV cache + LinearAttention, // LinearKVCacheSpec: linear / SSM attention state cache + OpaqueKV, // Byte-addressed opaque paged KV pool + OpaqueState, // Fixed-allocation opaque state / SWA-like pool +}; + +enum class CPTransferPolicy { + NONE, + INTRA_BLOCK_SLICE, +}; + +inline const char* KVCacheSpecTypeToString(KVCacheSpecType t) { + switch (t) { + case KVCacheSpecType::MultiHeadAttention: + return "MultiHeadAttention"; + case KVCacheSpecType::MultiHeadLatentAttention: + return "MultiHeadLatentAttention"; + case KVCacheSpecType::LinearAttention: + return "LinearAttention"; + case KVCacheSpecType::OpaqueKV: + return "OpaqueKV"; + case KVCacheSpecType::OpaqueState: + return "OpaqueState"; + default: + return "Unknown"; + } +} + +struct KVCacheSpec; +using KVCacheSpecPtr = std::shared_ptr; +using LayerKVCacheSpecs = std::vector>; + +struct KVCacheSpec { + 
std::string tag; + std::vector layers; + uint32_t local_head_num_kv = 1; + uint32_t seq_size_per_block = 1; + bool is_state_cache = false; + bool skip_prefix_reuse = false; + + // Lifecycle governs the allocation strategy for this cache group. + // Each concrete spec subclass sets this in its constructor; do NOT set it + // manually from outside the spec class hierarchy. + // FULL - standard paged allocation (MHA, MLA, OpaqueKV) + // LINEAR - fixed-capacity ring buffer (LinearAttention / SSM state) + // SWA - fixed-size tail-allocation pool (DSV4 state / SWA_KV) + CacheGroupType lifecycle = CacheGroupType::FULL; + + KVCacheSpecType type = KVCacheSpecType::MultiHeadAttention; + rtp_llm::DataType dtype = rtp_llm::DataType::TYPE_INVALID; + + // Derived from lifecycle; true when this spec uses SWA-style fixed allocation. + bool isFixedCache() const { return lifecycle == CacheGroupType::SWA; } + + virtual size_t block_size() const = 0; + virtual size_t k_block_size() const = 0; + virtual size_t v_block_size() const = 0; + + virtual size_t block_size_bytes() const = 0; + virtual size_t k_block_size_bytes() const = 0; + virtual size_t v_block_size_bytes() const = 0; + + virtual size_t scale_block_size_bytes() const { + return 0; + } + virtual size_t k_scale_block_size_bytes() const { + return 0; + } + virtual size_t v_scale_block_size_bytes() const { + return 0; + } + + virtual KVCacheSpecPtr clone() const = 0; + + virtual CPTransferPolicy cpTransferPolicy() const { + return CPTransferPolicy::NONE; + } + + bool supportsCpSlice() const { + return cpTransferPolicy() == CPTransferPolicy::INTRA_BLOCK_SLICE; + } + + virtual std::vector cpSliceDestination(std::vector parts, + size_t cp_size, + size_t peer_idx) const { + (void)cp_size; + (void)peer_idx; + return parts; + } + + virtual std::vector sliceBlockForPeer(std::vector parts, + size_t cp_size, + size_t peer_idx) const { + return cpSliceDestination(std::move(parts), cp_size, peer_idx); + } + + std::string fingerprint() 
const { + std::ostringstream os; + os << "tag=" << tag << ";type=" << static_cast(type) << ";dtype=" << static_cast(dtype) + << ";local_head_num_kv=" << local_head_num_kv << ";seq_size_per_block=" << seq_size_per_block; + os << fingerprintExtra(); + return os.str(); + } + + virtual std::string debugString(size_t indent = 0) const = 0; + + // Returns the physical signature used for spec grouping. + // Two specs with the same (tag, physicalSignature()) are merged into one + // KVCacheGroup. lifecycle is a direct field — no switch needed. + // LinearKVCacheSpec overrides to encode its dual-dtype block layout. + virtual SpecPhysicalSignature physicalSignature() const { + return {block_size_bytes(), scale_block_size_bytes(), lifecycle, dtype}; + } + +protected: + virtual std::string fingerprintExtra() const { + return ""; + } + + // Helper method to generate common parts of debug string + std::string commonDebugString(size_t indent = 0) const { + const std::string indent_str = std::string(indent, ' '); + const std::string indent1 = indent_str + " "; + + std::ostringstream os; + os << indent1 << "tag=" << tag << "\n"; + os << indent1 << "type=" << KVCacheSpecTypeToString(type) << "(" << static_cast(type) << ")\n"; + os << indent1 << "dtype=" << static_cast(dtype) << "\n"; + os << indent1 << "layers.size=" << layers.size() << "\n"; + os << indent1 << "local_head_num_kv=" << local_head_num_kv << "\n"; + os << indent1 << "seq_size_per_block=" << seq_size_per_block << "\n"; + os << indent1 << "is_state_cache=" << (is_state_cache ? "true" : "false") << "\n"; + os << indent1 << "is_fixed_cache=" << (isFixedCache() ? "true" : "false") << "\n"; + os << indent1 << "skip_prefix_reuse=" << (skip_prefix_reuse ? 
"true" : "false") << "\n"; + os << indent1 << "block_size=" << block_size() << "\n"; + os << indent1 << "k_block_size=" << k_block_size() << "\n"; + os << indent1 << "v_block_size=" << v_block_size() << "\n"; + os << indent1 << "block_size_bytes=" << block_size_bytes() << "\n"; + os << indent1 << "k_block_size_bytes=" << k_block_size_bytes() << "\n"; + os << indent1 << "v_block_size_bytes=" << v_block_size_bytes() << "\n"; + return os.str(); + } +}; + +} // namespace rtp_llm diff --git a/rtp_llm/cpp/cache/spec/KVCacheSpecDesc.h b/rtp_llm/cpp/cache/spec/KVCacheSpecDesc.h new file mode 100644 index 0000000000..b2a957300e --- /dev/null +++ b/rtp_llm/cpp/cache/spec/KVCacheSpecDesc.h @@ -0,0 +1,211 @@ +#pragma once + +#include +#include +#include + +#include "rtp_llm/cpp/cache/spec/KVCacheSpecDescTypes.h" +#include "rtp_llm/cpp/cache/spec/KVCacheSpec.h" +#include "rtp_llm/cpp/config/ConfigModules.h" +#include "rtp_llm/cpp/utils/AssertUtils.h" + +namespace rtp_llm { + +struct SpecBuildContext { + DataType dtype = DataType::TYPE_INVALID; + uint32_t seq_size_per_block = 0; + uint32_t attn_tp_size = 1; // TP size for computing local head counts from global desc fields + uint32_t kernel_tokens_per_block = 0; + uint32_t gen_num_per_cycle = 0; + uint32_t cp_size = 1; + bool cp_prefill_sliced = false; +}; + +class SpecBuilder { +public: + static KVCacheSpecPtr build(const KVCacheSpecDesc& desc, const SpecBuildContext& ctx) { + RTP_LLM_CHECK_WITH_INFO(!desc.tag.empty(), "KVCacheSpecDesc tag must not be empty"); + auto spec = buildTyped(desc, ctx); + spec->tag = desc.tag; + spec->seq_size_per_block = effectiveSeqSizePerBlock(desc, ctx); + spec->dtype = dataTypeOr(desc.dtype, dataTypeOr(ctx.dtype, desc.store_dtype)); + return spec; + } + + static CacheGroupType groupType(const KVCacheSpecDesc& desc) { + switch (desc.cache_type) { + case CacheType::LINEAR: + return CacheGroupType::LINEAR; + case CacheType::FIXED_STATE: + return CacheGroupType::SWA; + case CacheType::MHA: + case 
CacheType::MLA: + case CacheType::COMPRESSED_KV: + return CacheGroupType::FULL; + } + return CacheGroupType::FULL; + } + +private: + static uint32_t valueOr(uint32_t value, uint32_t fallback) { + return value == 0 ? fallback : value; + } + + static DataType dataTypeOr(DataType value, DataType fallback) { + return value == DataType::TYPE_INVALID ? fallback : value; + } + + static uint32_t alignUpToMultiple(uint32_t value, uint32_t multiple) { + RTP_LLM_CHECK_WITH_INFO(multiple > 0, "align multiple must be > 0"); + return ((value + multiple - 1) / multiple) * multiple; + } + + static uint32_t effectiveSeqSizePerBlock(const KVCacheSpecDesc& desc, const SpecBuildContext& ctx) { + const auto ctx_seq_size = valueOr(ctx.seq_size_per_block, 1); + if (desc.extra.use_fixed_region_cp_tokens && ctx.cp_size > 1) { + return ctx_seq_size * ctx.cp_size; + } + return valueOr(desc.seq_size_per_block, ctx_seq_size); + } + + static uint32_t computeStateRingEntries(const KVCacheSpecDesc& desc, const SpecBuildContext& ctx) { + RTP_LLM_CHECK_WITH_INFO(desc.extra.state_ring_compression_ratio > 0, + "state ring desc tag=%s requires positive state_ring_compression_ratio", + desc.tag.c_str()); + const uint32_t window = + (1 + desc.extra.state_ring_overlap) * desc.extra.state_ring_compression_ratio; + const uint32_t raw = + window + (desc.extra.state_ring_add_gen_num_per_cycle ? 
ctx.gen_num_per_cycle : 0); + return (raw + 1) & ~1U; + } + + static uint32_t effectiveEntriesPerBlock(const KVCacheSpecDesc& desc, const SpecBuildContext& ctx) { + if (desc.extra.derive_entries_from_kernel_block) { + RTP_LLM_CHECK_WITH_INFO(desc.compression_ratio > 0, + "desc tag=%s derives entries from kernel block but has invalid compression_ratio=%u", + desc.tag.c_str(), + desc.compression_ratio); + RTP_LLM_CHECK_WITH_INFO(ctx.kernel_tokens_per_block > 0, + "desc tag=%s derives entries from kernel block but kernel_tokens_per_block is 0", + desc.tag.c_str()); + RTP_LLM_CHECK_WITH_INFO(ctx.kernel_tokens_per_block % desc.compression_ratio == 0, + "desc tag=%s compression_ratio=%u must divide kernel block %u", + desc.tag.c_str(), + desc.compression_ratio, + ctx.kernel_tokens_per_block); + return ctx.kernel_tokens_per_block / desc.compression_ratio; + } + + if (desc.extra.state_ring_compression_ratio > 0) { + uint32_t entries = computeStateRingEntries(desc, ctx); + if (ctx.cp_size > 1 && (desc.extra.cp_align_entries || desc.extra.cp_slice_entries)) { + entries = alignUpToMultiple(entries, ctx.cp_size); + if (desc.extra.cp_slice_entries && ctx.cp_prefill_sliced) { + entries /= ctx.cp_size; + } + } + return entries; + } + + return desc.entries_per_block; + } + + static size_t effectiveFixedStateBlockOverride(const KVCacheSpecDesc& desc, + uint32_t entries_per_block, + const SpecBuildContext& ctx) { + if (ctx.cp_size <= 1 || !ctx.cp_prefill_sliced || !desc.extra.cp_prefill_slice_block_bytes) { + return desc.block_size_bytes_override; + } + const size_t natural_bytes = static_cast(entries_per_block) * desc.entry_elems * getTypeSize(desc.store_dtype); + const size_t align = + desc.block_size_bytes_alignment > 0 ? 
+ std::lcm(desc.block_size_bytes_alignment, static_cast(ctx.cp_size)) : + static_cast(ctx.cp_size); + const size_t full_stride_bytes = ((natural_bytes + align - 1) / align) * align; + RTP_LLM_CHECK_WITH_INFO(full_stride_bytes % ctx.cp_size == 0, + "CP prefill byte slicing tag=%s full stride %zu must be divisible by cp_size %u", + desc.tag.c_str(), + full_stride_bytes, + ctx.cp_size); + return full_stride_bytes / ctx.cp_size; + } + + // Dispatch to per-type factory methods. + // Each factory method owns all type-specific field assignments, + // including local_head_num_kv derived from global desc fields and runtime TP size. + static KVCacheSpecPtr buildTyped(const KVCacheSpecDesc& desc, const SpecBuildContext& ctx) { + switch (desc.cache_type) { + case CacheType::MHA: return buildMHA(desc, ctx); + case CacheType::MLA: return buildMLA(desc); + case CacheType::LINEAR: return buildLinear(desc, ctx); + case CacheType::COMPRESSED_KV: return buildCompressedKV(desc, ctx); + case CacheType::FIXED_STATE: return buildFixedState(desc, ctx); + } + RTP_LLM_CHECK_WITH_INFO(false, "unknown CacheType=%d", static_cast(desc.cache_type)); + return nullptr; + } + + // MHA/GQA: local_head_num_kv = global_kv_heads / TP, with gcd fallback for non-divisible TP. + static KVCacheSpecPtr buildMHA(const KVCacheSpecDesc& desc, const SpecBuildContext& ctx) { + const uint32_t tp = std::max(1u, ctx.attn_tp_size); + auto spec = std::make_shared(); + spec->size_per_head = desc.size_per_head; + const uint32_t kv = valueOr(desc.num_kv_heads, 1); + spec->local_head_num_kv = (kv % tp == 0) ? kv / tp : kv / std::gcd(kv, tp); + return spec; + } + + // MLA: local_head_num_kv is always 1 — heads are not split across TP. 
+    // Builds the spec for an MLA descriptor: copies kv_lora_rank and rope_head_dim from
+    // `desc` and pins local_head_num_kv to 1. No TP size is consulted here.
+    static KVCacheSpecPtr buildMLA(const KVCacheSpecDesc& desc) {
+        // make_shared cannot deduce its target type, so the concrete spec type is spelled out.
+        auto spec               = std::make_shared<MLAKVCacheSpec>();
+        spec->kv_lora_rank      = desc.kv_lora_rank;
+        spec->rope_head_dim     = desc.rope_head_dim;
+        spec->local_head_num_kv = 1;
+        return spec;
+    }
+
+    // Linear Attention: all three local head fields derived from global counts / TP.
+    //
+    // desc: global (pre-TP) head counts, head dims, conv kernel dim and optional state dtypes.
+    // ctx:  only attn_tp_size is read.
+    // Returns a LinearKVCacheSpec; tag / seq_size_per_block / dtype are not set here.
+    static KVCacheSpecPtr buildLinear(const KVCacheSpecDesc& desc, const SpecBuildContext& ctx) {
+        // Clamp to 1 so an unset (0) attn_tp_size cannot cause a division by zero below.
+        const uint32_t tp = std::max(1u, ctx.attn_tp_size);
+
+        auto spec = std::make_shared<LinearKVCacheSpec>();
+
+        // NOTE(review): plain integer division - the result is truncated, and is 0 when the
+        // global head count is smaller than tp. Confirm callers guarantee divisibility.
+        spec->local_num_k_heads = desc.num_k_heads / tp;
+        spec->local_num_v_heads = desc.num_v_heads / tp;
+
+        spec->head_k_dim      = desc.head_k_dim;
+        spec->head_v_dim      = desc.head_v_dim;
+        spec->conv_kernel_dim = desc.conv_kernel_dim;
+
+        // State dtypes left as TYPE_INVALID in the descriptor fall back to BF16.
+        spec->ssm_state_dtype  = dataTypeOr(desc.ssm_state_dtype, DataType::TYPE_BF16);
+        spec->conv_state_dtype = dataTypeOr(desc.conv_state_dtype, DataType::TYPE_BF16);
+
+        // local_head_num_kv follows the value-head count: an unset (0) count is treated as 1,
+        // a count of 1 is kept as-is, and larger counts are divided by tp with a floor of 1.
+        const uint32_t v_heads  = valueOr(desc.num_v_heads, 1);
+        spec->local_head_num_kv = std::max(1u, (v_heads > 1u) ? v_heads / tp : v_heads);
+        return spec;
+    }
+
+    // COMPRESSED_KV / FIXED_STATE: no per-head TP split, local_head_num_kv = global value.
+ static KVCacheSpecPtr buildCompressedKV(const KVCacheSpecDesc& desc, const SpecBuildContext& ctx) { + auto spec = std::make_shared(); + spec->entry_elems = desc.entry_elems; + spec->entries_per_block = effectiveEntriesPerBlock(desc, ctx); + spec->compression_ratio = valueOr(desc.compression_ratio, 1); + spec->store_dtype = desc.store_dtype; + spec->block_size_bytes_alignment = desc.block_size_bytes_alignment; + spec->local_head_num_kv = valueOr(desc.num_kv_heads, 1); + return spec; + } + + static KVCacheSpecPtr buildFixedState(const KVCacheSpecDesc& desc, const SpecBuildContext& ctx) { + const auto entries_per_block = effectiveEntriesPerBlock(desc, ctx); + auto spec = std::make_shared(); + spec->state_dim = desc.entry_elems; + spec->entries_per_block = entries_per_block; + spec->store_dtype = desc.store_dtype; + spec->block_size_bytes_override = effectiveFixedStateBlockOverride(desc, entries_per_block, ctx); + spec->block_size_bytes_alignment = desc.block_size_bytes_alignment; + spec->block_size_alignment_min_entries = desc.block_size_alignment_min_entries; + spec->is_state_cache = desc.is_state_cache; + spec->skip_prefix_reuse = desc.skip_prefix_reuse; + spec->local_head_num_kv = valueOr(desc.num_kv_heads, 1); + return spec; + } +}; + +} // namespace rtp_llm diff --git a/rtp_llm/cpp/cache/spec/KVCacheSpecDescTypes.h b/rtp_llm/cpp/cache/spec/KVCacheSpecDescTypes.h new file mode 100644 index 0000000000..4bd554e3aa --- /dev/null +++ b/rtp_llm/cpp/cache/spec/KVCacheSpecDescTypes.h @@ -0,0 +1,88 @@ +#pragma once + +#include +#include +#include + +#include "rtp_llm/cpp/cache/spec/CacheGroupType.h" +#include "rtp_llm/models_py/bindings/core/Types.h" + +namespace rtp_llm { + +enum class CacheType : int8_t { + MHA = 0, + MLA = 1, + LINEAR = 2, + COMPRESSED_KV = 3, + FIXED_STATE = 4, +}; + +struct KVCacheSpecDescExtra { + uint32_t explicit_block_num = 0; + bool reserve_from_paged_budget = false; + + bool derive_entries_from_kernel_block = false; + uint32_t 
state_ring_compression_ratio = 0; + uint32_t state_ring_overlap = 0; + bool state_ring_add_gen_num_per_cycle = false; + bool cp_align_entries = false; + bool cp_slice_entries = false; + bool cp_prefill_slice_block_bytes = false; + bool use_fixed_region_cp_tokens = false; +}; + +struct KVCacheSpecDesc { + std::string tag; + CacheType cache_type = CacheType::MHA; + uint32_t num_kv_heads = 0; // global model KV head count (MHA: kv_head_num; MLA: 1) + uint32_t seq_size_per_block = 0; + DataType dtype = DataType::TYPE_INVALID; + + uint32_t size_per_head = 0; + uint32_t kv_lora_rank = 0; + uint32_t rope_head_dim = 0; + + uint32_t num_k_heads = 0; // Linear Attention: global key head count + uint32_t num_v_heads = 0; // Linear Attention: global value head count + uint32_t head_k_dim = 0; + uint32_t head_v_dim = 0; + uint32_t conv_kernel_dim = 0; + DataType ssm_state_dtype = DataType::TYPE_INVALID; + DataType conv_state_dtype = DataType::TYPE_INVALID; + + uint32_t entry_elems = 0; + uint32_t entries_per_block = 0; + DataType store_dtype = DataType::TYPE_INVALID; + uint32_t compression_ratio = 1; + size_t block_size_bytes_override = 0; + size_t block_size_bytes_alignment = 0; + uint32_t block_size_alignment_min_entries = 0; + bool is_state_cache = true; + bool skip_prefix_reuse = false; + + bool has_reuse_policy = false; + CacheReusePolicy reuse_policy = CacheReusePolicy::REUSABLE; + bool has_evict_policy = false; + CacheEvictPolicy evict_policy = CacheEvictPolicy::CHAIN; + bool has_active_tail_blocks = false; + int active_tail_blocks = 0; + bool has_validate_tail_blocks = false; + bool validate_tail_blocks = true; + KVCacheSpecDescExtra extra; + bool has_prefix_reusable = false; + bool prefix_reusable = true; + bool uses_pinned_cpu_backing = false; + bool has_is_cp_shardable = false; + bool is_cp_shardable = true; + bool has_sparse_slots = false; + bool sparse_slots = false; + bool has_kernel_block_subdiv = false; + bool kernel_block_subdiv = true; + bool 
has_cp_compact_tail_blocks = false; + bool cp_compact_tail_blocks = false; + bool has_is_reservable = false; + bool is_reservable = true; + +}; + +} // namespace rtp_llm diff --git a/rtp_llm/cpp/cache/LinearKVCacheSpec.h b/rtp_llm/cpp/cache/spec/LinearKVCacheSpec.h similarity index 81% rename from rtp_llm/cpp/cache/LinearKVCacheSpec.h rename to rtp_llm/cpp/cache/spec/LinearKVCacheSpec.h index 39a80bdd80..d62bfced44 100644 --- a/rtp_llm/cpp/cache/LinearKVCacheSpec.h +++ b/rtp_llm/cpp/cache/spec/LinearKVCacheSpec.h @@ -4,7 +4,7 @@ #include #include -#include "rtp_llm/cpp/cache/KVCacheSpecBase.h" +#include "rtp_llm/cpp/cache/spec/KVCacheSpecBase.h" #include "rtp_llm/cpp/config/ConfigModules.h" #include "rtp_llm/models_py/bindings/core/Types.h" #include "rtp_llm/cpp/model_utils/AttentionConfig.h" @@ -24,7 +24,10 @@ struct LinearKVCacheSpec: public KVCacheSpec { DataType ssm_state_dtype = DataType::TYPE_BF16; DataType conv_state_dtype = DataType::TYPE_BF16; - LinearKVCacheSpec() = default; + LinearKVCacheSpec() { + type = KVCacheSpecType::LinearAttention; + lifecycle = CacheGroupType::LINEAR; + } LinearKVCacheSpec(const AttentionConfigs& attn_config, const ParallelismConfig& parallelism_config, @@ -52,7 +55,7 @@ struct LinearKVCacheSpec: public KVCacheSpec { linear_config.linear_value_head_dim); type = KVCacheSpecType::LinearAttention; - layer_num = 1; // Will be set by caller + lifecycle = CacheGroupType::LINEAR; local_head_num_kv = static_cast(std::max( 1, (linear_config.linear_num_value_heads > 1) ? 
@@ -134,6 +137,35 @@ struct LinearKVCacheSpec: public KVCacheSpec { return {0, k_block_bytes, k_block_bytes, v_block_bytes}; } + KVCacheSpecPtr clone() const override { + return std::make_shared(*this); + } + +protected: + std::string fingerprintExtra() const override { + std::ostringstream os; + os << ";linear.local_num_k_heads=" << local_num_k_heads + << ";linear.local_num_v_heads=" << local_num_v_heads << ";linear.head_k_dim=" << head_k_dim + << ";linear.head_v_dim=" << head_v_dim << ";linear.conv_kernel_dim=" << conv_kernel_dim + << ";linear.ssm_state_dtype=" << static_cast(ssm_state_dtype) + << ";linear.conv_state_dtype=" << static_cast(conv_state_dtype); + return os.str(); + } + +public: + // Override physicalSignature() to capture the dual-dtype layout. + // LinearKVCacheSpec uses ssm_state_dtype for the K (SSM) segment and + // conv_state_dtype for the V (conv) segment. Since block_size_bytes() already + // encodes their combined element count, we also expose k_block_size_bytes() + // (= SSM segment bytes) via scale_block_size_bytes to distinguish specs that + // share total block bytes but have a different K/V dtype split. 
+ SpecPhysicalSignature physicalSignature() const override { + return {block_size_bytes(), + k_block_size_bytes(), // K segment bytes as secondary discriminator + lifecycle, // always LINEAR; use field for consistency with base class + ssm_state_dtype}; // primary dtype for the K (SSM) segment + } + std::string debugString(size_t indent = 0) const override { const std::string indent_str = std::string(indent, ' '); const std::string indent1 = indent_str + " "; @@ -154,4 +186,4 @@ struct LinearKVCacheSpec: public KVCacheSpec { } }; -} // namespace rtp_llm \ No newline at end of file +} // namespace rtp_llm diff --git a/rtp_llm/cpp/cache/MHAKVCacheSpec.h b/rtp_llm/cpp/cache/spec/MHAKVCacheSpec.h similarity index 92% rename from rtp_llm/cpp/cache/MHAKVCacheSpec.h rename to rtp_llm/cpp/cache/spec/MHAKVCacheSpec.h index 38dcc4fc8c..cf03f55b94 100644 --- a/rtp_llm/cpp/cache/MHAKVCacheSpec.h +++ b/rtp_llm/cpp/cache/spec/MHAKVCacheSpec.h @@ -1,10 +1,11 @@ #pragma once #include +#include #include #include -#include "rtp_llm/cpp/cache/KVCacheSpecBase.h" +#include "rtp_llm/cpp/cache/spec/KVCacheSpecBase.h" #include "rtp_llm/cpp/config/ConfigModules.h" #include "rtp_llm/models_py/bindings/core/Types.h" #include "rtp_llm/cpp/model_utils/AttentionConfig.h" @@ -14,12 +15,13 @@ namespace rtp_llm { struct MHAKVCacheSpec: public KVCacheSpec { uint32_t size_per_head; - MHAKVCacheSpec() = default; - - MHAKVCacheSpec(const AttentionConfigs& attn_config, const ParallelismConfig& parallelism_config) { - type = KVCacheSpecType::MultiHeadAttention; - layer_num = 1; // Will be set by caller + MHAKVCacheSpec() { + type = KVCacheSpecType::MultiHeadAttention; + lifecycle = CacheGroupType::FULL; + } + MHAKVCacheSpec(const AttentionConfigs& attn_config, const ParallelismConfig& parallelism_config) + : MHAKVCacheSpec() { // TODO(xinfei.sxf): 这里的head_num_kv分配逻辑需要和ModelConfig::getAttentionConfigs里保持一致,目前这里还是单独计算的 local_head_num_kv = static_cast( (attn_config.kv_head_num % 
parallelism_config.get_attn_tp_size() == 0) ? @@ -126,6 +128,18 @@ struct MHAKVCacheSpec: public KVCacheSpec { return {k_partition_off, k_partition_sz, v_partition_off, v_partition_sz}; } + KVCacheSpecPtr clone() const override { + return std::make_shared(*this); + } + +protected: + std::string fingerprintExtra() const override { + std::ostringstream os; + os << ";mha.size_per_head=" << size_per_head; + return os.str(); + } + +public: std::string debugString(size_t indent = 0) const override { const std::string indent_str = std::string(indent, ' '); const std::string indent1 = indent_str + " "; @@ -142,4 +156,4 @@ struct MHAKVCacheSpec: public KVCacheSpec { } }; -} // namespace rtp_llm \ No newline at end of file +} // namespace rtp_llm diff --git a/rtp_llm/cpp/cache/MLAKVCacheSpec.h b/rtp_llm/cpp/cache/spec/MLAKVCacheSpec.h similarity index 87% rename from rtp_llm/cpp/cache/MLAKVCacheSpec.h rename to rtp_llm/cpp/cache/spec/MLAKVCacheSpec.h index d89fa8309f..edba354858 100644 --- a/rtp_llm/cpp/cache/MLAKVCacheSpec.h +++ b/rtp_llm/cpp/cache/spec/MLAKVCacheSpec.h @@ -4,7 +4,7 @@ #include #include -#include "rtp_llm/cpp/cache/KVCacheSpecBase.h" +#include "rtp_llm/cpp/cache/spec/KVCacheSpecBase.h" #include "rtp_llm/cpp/config/ConfigModules.h" #include "rtp_llm/models_py/bindings/core/Types.h" #include "rtp_llm/cpp/model_utils/AttentionConfig.h" @@ -15,11 +15,13 @@ struct MLAKVCacheSpec: public KVCacheSpec { uint32_t kv_lora_rank; uint32_t rope_head_dim; - MLAKVCacheSpec() = default; + MLAKVCacheSpec() { + type = KVCacheSpecType::MultiHeadLatentAttention; + lifecycle = CacheGroupType::FULL; + } - MLAKVCacheSpec(const AttentionConfigs& attn_config, const ParallelismConfig& parallelism_config) { - type = KVCacheSpecType::MultiHeadLatentAttention; - layer_num = 1; // Will be set by caller + MLAKVCacheSpec(const AttentionConfigs& attn_config, const ParallelismConfig& parallelism_config) + : MLAKVCacheSpec() { local_head_num_kv = 1; // mla set local_head_num_kv to 1 
seq_size_per_block = static_cast(attn_config.tokens_per_block); kv_lora_rank = static_cast(attn_config.kv_lora_rank); @@ -81,6 +83,18 @@ struct MLAKVCacheSpec: public KVCacheSpec { return {0, k_block_bytes, k_block_bytes, v_block_bytes}; } + KVCacheSpecPtr clone() const override { + return std::make_shared(*this); + } + +protected: + std::string fingerprintExtra() const override { + std::ostringstream os; + os << ";mla.kv_lora_rank=" << kv_lora_rank << ";mla.rope_head_dim=" << rope_head_dim; + return os.str(); + } + +public: std::string debugString(size_t indent = 0) const override { const std::string indent_str = std::string(indent, ' '); const std::string indent1 = indent_str + " "; @@ -93,4 +107,4 @@ struct MLAKVCacheSpec: public KVCacheSpec { } }; -} // namespace rtp_llm \ No newline at end of file +} // namespace rtp_llm diff --git a/rtp_llm/cpp/cache/spec/OpaqueKVCacheSpec.h b/rtp_llm/cpp/cache/spec/OpaqueKVCacheSpec.h new file mode 100644 index 0000000000..0a2c2f6a27 --- /dev/null +++ b/rtp_llm/cpp/cache/spec/OpaqueKVCacheSpec.h @@ -0,0 +1,286 @@ +#pragma once + +#include + +#include "rtp_llm/cpp/cache/spec/KVCacheSpecBase.h" +#include "rtp_llm/cpp/utils/AssertUtils.h" + +namespace rtp_llm { + +struct OpaqueKVCacheSpec: public KVCacheSpec { + uint32_t entry_elems = 0; + uint32_t entries_per_block = 0; + DataType store_dtype = DataType::TYPE_INVALID; + size_t block_size_bytes_override = 0; + size_t block_size_bytes_alignment = 0; + uint32_t block_size_alignment_min_entries = 0; + + OpaqueKVCacheSpec() = default; + + OpaqueKVCacheSpec(KVCacheSpecType spec_type, + CacheGroupType lifecycle_type, + uint32_t entry_elements, + uint32_t block_entries, + DataType storage_dtype, + uint32_t seq_size_per_blk, + size_t block_size_bytes_override_value = 0, + size_t block_size_alignment = 0, + uint32_t block_alignment_min_entries = 0) { + type = spec_type; + lifecycle = lifecycle_type; + entry_elems = entry_elements; + entries_per_block = block_entries; + store_dtype = 
storage_dtype; + block_size_bytes_override = block_size_bytes_override_value; + block_size_bytes_alignment = block_size_alignment; + block_size_alignment_min_entries = block_alignment_min_entries; + + local_head_num_kv = 1; + seq_size_per_block = seq_size_per_blk; + dtype = store_dtype; + } + + size_t block_size() const override { + return static_cast(entries_per_block) * entry_elems; + } + + size_t k_block_size() const override { + return block_size() / 2; + } + + size_t v_block_size() const override { + return block_size() / 2; + } + + size_t natural_block_size_bytes() const { + return static_cast(entries_per_block) * entry_elems * getTypeSize(store_dtype); + } + + size_t block_size_bytes() const override { + if (block_size_bytes_override > 0) { + return block_size_bytes_override; + } + const size_t natural = natural_block_size_bytes(); + if (block_size_bytes_alignment > 0 && entries_per_block >= block_size_alignment_min_entries) { + return ((natural + block_size_bytes_alignment - 1) / block_size_bytes_alignment) + * block_size_bytes_alignment; + } + return natural; + } + + size_t k_block_size_bytes() const override { + return block_size_bytes() / 2; + } + + size_t v_block_size_bytes() const override { + return block_size_bytes() / 2; + } + + KVCacheSpecPtr clone() const override { + return std::make_shared(*this); + } + +protected: + std::string opaqueFingerprintExtra(const std::string& prefix) const { + std::ostringstream os; + os << ";" << prefix << ".entry_elems=" << entry_elems + << ";" << prefix << ".entries_per_block=" << entries_per_block + << ";" << prefix << ".store_dtype=" << static_cast(store_dtype) + << ";" << prefix << ".block_size_bytes_override=" << block_size_bytes_override + << ";" << prefix << ".block_size_bytes_alignment=" << block_size_bytes_alignment + << ";" << prefix << ".block_size_alignment_min_entries=" << block_size_alignment_min_entries; + return os.str(); + } + + std::string fingerprintExtra() const override { + return 
opaqueFingerprintExtra("opaque"); + } + +public: + std::string debugString(size_t indent = 0) const override { + std::ostringstream os; + os << std::string(indent, ' ') << "OpaqueKVCacheSpec{\n"; + os << commonDebugString(indent); + os << std::string(indent + 2, ' ') << "entry_elems=" << entry_elems << "\n"; + os << std::string(indent + 2, ' ') << "entries_per_block=" << entries_per_block << "\n"; + os << std::string(indent + 2, ' ') << "block_size_bytes_override=" << block_size_bytes_override << "\n"; + os << std::string(indent + 2, ' ') << "block_size_bytes_alignment=" << block_size_bytes_alignment << "\n"; + os << std::string(indent + 2, ' ') + << "block_size_alignment_min_entries=" << block_size_alignment_min_entries << "\n"; + os << std::string(indent, ' ') << "}\n"; + return os.str(); + } +}; + +struct CompressedKVCacheSpec: public OpaqueKVCacheSpec { + uint32_t compression_ratio = 1; + + CompressedKVCacheSpec() { + type = KVCacheSpecType::OpaqueKV; + lifecycle = CacheGroupType::FULL; + } + + CompressedKVCacheSpec(std::string cache_tag, + uint32_t entry_elements, + uint32_t block_entries, + DataType storage_dtype, + uint32_t seq_size_per_blk, + uint32_t cache_compression_ratio = 1, + size_t block_size_alignment = 0) + : CompressedKVCacheSpec() { + tag = std::move(cache_tag); + entry_elems = entry_elements; + entries_per_block = block_entries; + compression_ratio = cache_compression_ratio; + store_dtype = storage_dtype; + block_size_bytes_alignment = block_size_alignment; + + local_head_num_kv = 1; + seq_size_per_block = seq_size_per_blk; + dtype = store_dtype; + } + + KVCacheSpecPtr clone() const override { + return std::make_shared(*this); + } + +protected: + std::string fingerprintExtra() const override { + std::ostringstream os; + os << ";compressed_kv.compression_ratio=" << compression_ratio + << opaqueFingerprintExtra("compressed_kv"); + return os.str(); + } + +public: + std::string debugString(size_t indent = 0) const override { + std::ostringstream os; 
+ os << std::string(indent, ' ') << "CompressedKVCacheSpec{\n"; + os << commonDebugString(indent); + os << std::string(indent + 2, ' ') << "entry_elems=" << entry_elems << "\n"; + os << std::string(indent + 2, ' ') << "entries_per_block=" << entries_per_block << "\n"; + os << std::string(indent + 2, ' ') << "compression_ratio=" << compression_ratio << "\n"; + os << std::string(indent + 2, ' ') << "block_size_bytes_alignment=" << block_size_bytes_alignment << "\n"; + os << std::string(indent, ' ') << "}\n"; + return os.str(); + } +}; + +struct FixedStateCacheSpec: public OpaqueKVCacheSpec { + uint32_t& state_dim; + + FixedStateCacheSpec(): state_dim(entry_elems) { + type = KVCacheSpecType::OpaqueState; + lifecycle = CacheGroupType::SWA; + } + + FixedStateCacheSpec(const FixedStateCacheSpec& other): OpaqueKVCacheSpec(other), state_dim(entry_elems) {} + + FixedStateCacheSpec& operator=(const FixedStateCacheSpec& other) { + if (this != &other) { + OpaqueKVCacheSpec::operator=(other); + } + return *this; + } + + FixedStateCacheSpec(std::string cache_tag, + uint32_t state_elements, + uint32_t block_entries, + DataType storage_dtype, + uint32_t seq_size_per_blk, + size_t block_size_bytes_override_value = 0, + size_t block_size_alignment = 0, + uint32_t block_alignment_min_entries = 0, + bool state_cache = true, + bool skip_reuse = false) + : FixedStateCacheSpec() { + tag = std::move(cache_tag); + state_dim = state_elements; + entries_per_block = block_entries; + store_dtype = storage_dtype; + block_size_bytes_override = block_size_bytes_override_value; + block_size_bytes_alignment = block_size_alignment; + block_size_alignment_min_entries = block_alignment_min_entries; + + local_head_num_kv = 1; + seq_size_per_block = seq_size_per_blk; + dtype = store_dtype; + is_state_cache = state_cache; + skip_prefix_reuse = skip_reuse; + } + + std::vector sliceBlockForPeer(std::vector parts, + size_t cp_size, + size_t peer_idx) const override { + return 
cpSliceDestination(std::move(parts), cp_size, peer_idx); + } + + CPTransferPolicy cpTransferPolicy() const override { + return CPTransferPolicy::INTRA_BLOCK_SLICE; + } + + std::vector cpSliceDestination(std::vector parts, + size_t cp_size, + size_t peer_idx) const override { + if (cp_size <= 1) { + return parts; + } + RTP_LLM_CHECK_WITH_INFO(parts.size() == 1, + "FixedStateCacheSpec CP byte slicing expects one block part, got %zu", + parts.size()); + auto& block = parts[0]; + RTP_LLM_CHECK_WITH_INFO(block.addr != nullptr, "FixedStateCacheSpec CP byte slicing got null block addr"); + + size_t slice_bytes = 0; + if (block_size_bytes_override > 0 || block.size_bytes == block_size_bytes()) { + RTP_LLM_CHECK_WITH_INFO(block.size_bytes % cp_size == 0, + "FixedStateCacheSpec block bytes %zu not divisible by cp_size %zu", + block.size_bytes, + cp_size); + slice_bytes = block.size_bytes / cp_size; + } else { + RTP_LLM_CHECK_WITH_INFO(entries_per_block % cp_size == 0, + "FixedStateCacheSpec entries %u not divisible by cp_size %zu", + entries_per_block, + cp_size); + const size_t local_entries = entries_per_block / cp_size; + slice_bytes = local_entries * static_cast(state_dim) * getTypeSize(store_dtype); + } + + const size_t slice_offset = slice_bytes * peer_idx; + RTP_LLM_CHECK_WITH_INFO(slice_offset + slice_bytes <= block.size_bytes, + "FixedStateCacheSpec CP slice [%zu, %zu) exceeds block bytes %zu", + slice_offset, + slice_offset + slice_bytes, + block.size_bytes); + block.addr = static_cast(static_cast(block.addr) + slice_offset); + block.size_bytes = slice_bytes; + return parts; + } + + KVCacheSpecPtr clone() const override { + return std::make_shared(*this); + } + +protected: + std::string fingerprintExtra() const override { + return opaqueFingerprintExtra("fixed_state"); + } + +public: + std::string debugString(size_t indent = 0) const override { + std::ostringstream os; + os << std::string(indent, ' ') << "FixedStateCacheSpec{\n"; + os << commonDebugString(indent); 
+ os << std::string(indent + 2, ' ') << "state_dim=" << state_dim << "\n"; + os << std::string(indent + 2, ' ') << "entries_per_block=" << entries_per_block << "\n"; + os << std::string(indent + 2, ' ') << "block_size_bytes_override=" << block_size_bytes_override << "\n"; + os << std::string(indent + 2, ' ') << "block_size_bytes_alignment=" << block_size_bytes_alignment << "\n"; + os << std::string(indent + 2, ' ') + << "block_size_alignment_min_entries=" << block_size_alignment_min_entries << "\n"; + os << std::string(indent, ' ') << "}\n"; + return os.str(); + } +}; + +} // namespace rtp_llm diff --git a/rtp_llm/cpp/cache/test/BUILD b/rtp_llm/cpp/cache/test/BUILD index 5c4c954fb6..dcc71eb08a 100644 --- a/rtp_llm/cpp/cache/test/BUILD +++ b/rtp_llm/cpp/cache/test/BUILD @@ -7,6 +7,48 @@ test_copts = [ "-fno-access-control", ] + copts() +cc_import( + name = "cuda12_cudart", + shared_library = "@pip_gpu_cuda12_9_torch_nvidia_cuda_runtime_cu12//:site-packages/nvidia/cuda_runtime/lib/libcudart.so.12", +) + +cc_import( + name = "cuda12_cublas", + shared_library = "@pip_gpu_cuda12_9_torch_nvidia_cublas_cu12//:site-packages/nvidia/cublas/lib/libcublas.so.12", +) + +cc_import( + name = "cuda12_cublas_lt", + shared_library = "@pip_gpu_cuda12_9_torch_nvidia_cublas_cu12//:site-packages/nvidia/cublas/lib/libcublasLt.so.12", +) + +cc_import( + name = "cuda12_cufft", + shared_library = "@pip_gpu_cuda12_9_torch_nvidia_cufft_cu12//:site-packages/nvidia/cufft/lib/libcufft.so.11", +) + +cc_import( + name = "cuda12_cupti", + shared_library = "@pip_gpu_cuda12_9_torch_nvidia_cuda_cupti_cu12//:site-packages/nvidia/cuda_cupti/lib/libcupti.so.12", +) + +cc_import( + name = "cuda13_torch_nvshmem", + shared_library = "@pip_gpu_cuda13_torch_torch//:site-packages/torch/lib/libtorch_nvshmem.so", +) + +cuda12_torch_link_deps = select({ + "@//:using_cuda13_x86": [ + ":cuda12_cublas", + ":cuda12_cublas_lt", + ":cuda12_cudart", + ":cuda12_cufft", + ":cuda12_cupti", + ":cuda13_torch_nvshmem", + ], + 
"//conditions:default": [], +}) + test_deps = [ "//rtp_llm/models_py/bindings/cuda/ops:cuda_impl", "//rtp_llm/models_py/bindings/core:exec_ops_test_lib", @@ -19,7 +61,7 @@ test_deps = [ "@com_google_googletest//:gtest_main", "@local_config_cuda//cuda:cuda_headers", "@local_config_cuda//cuda:cudart", -] + torch_deps() +] + torch_deps() + cuda12_torch_link_deps block_cache_test_deps = [ "//rtp_llm/cpp/testing:device_test_utils", @@ -31,7 +73,7 @@ block_cache_test_deps = [ "@com_google_googletest//:gtest_main", "@local_config_cuda//cuda:cuda_headers", "@local_config_cuda//cuda:cudart", -] + torch_deps() +] + torch_deps() + cuda12_torch_link_deps cc_library( @@ -42,10 +84,41 @@ cc_library( copts = test_copts, deps = [ "//rtp_llm/cpp/cache:cache_types", + "//rtp_llm/cpp/config:model_config", ], visibility = ["//visibility:public"], ) +cc_test( + name = "cp_slot_mapper_test", + srcs = [ + "CPSlotMapperTest.cc", + ], + data = [], + copts = test_copts, + deps = [ + "//rtp_llm/cpp/cache:cp_slot_mapper", + "@com_google_googletest//:gtest", + "@com_google_googletest//:gtest_main", + ], + env = {}, +) + +cc_test( + name = "kv_cache_resource_local_cache_keys_test", + srcs = [ + "KVCacheResourceLocalCacheKeysTest.cc", + ], + data = [], + copts = test_copts, + deps = [ + "//rtp_llm/cpp/cache:batch_kv_cache_resource", + "@com_google_googletest//:gtest", + "@com_google_googletest//:gtest_main", + ], + env = {}, +) + cc_library( name = "block_pool_test_helper", hdrs = [ @@ -71,6 +144,18 @@ cc_test( exec_properties = {'gpu':'H20'}, ) +cc_test( + name = "shared_block_cache_test", + srcs = [ + "SharedBlockCacheTest.cc", + ], + data = [], + copts = test_copts, + deps = block_cache_test_deps, + env = {}, + exec_properties = {'gpu':'H20'}, +) + cc_test( name = "block_pool_test", srcs = [ @@ -142,6 +227,22 @@ cc_test( exec_properties = {'gpu':'H20'}, ) +cc_test( + name = "kv_cache_manager_cp_test", + srcs = [ + "KVCacheManagerCPSlotMapperTest.cc", + ], + data = [], + copts = test_copts, + 
deps = test_deps + [ + ":block_pool_test_helper", + "//rtp_llm/cpp/cache/connector/test/mock:connector_mock_files_lib", + "//rtp_llm/cpp/cache/connector/memory:memory_connector", + ], + env = {}, + exec_properties = {'gpu':'H20'}, +) + cc_test( name = "linear_kv_cache_group_test", srcs = [ @@ -170,6 +271,80 @@ cc_test( exec_properties = {'gpu':'H20'}, ) +cc_test( + name = "hybrid_kv_cache_allocator_cp_shard_test", + srcs = [ + "HybridKVCacheAllocatorCPShardTest.cc", + ], + data = [], + copts = test_copts, + deps = test_deps + [ + ":block_pool_test_helper", + ], + env = {}, + exec_properties = {'gpu':'H20'}, +) + +cc_test( + name = "hybrid_pool_kv_cache_allocator_test", + srcs = [ + "HybridPoolKVCacheAllocatorTest.cc", + ], + data = [], + copts = test_copts, + deps = test_deps + [ + ":block_pool_test_helper", + ], + env = {}, + exec_properties = {'gpu':'H20'}, +) + +cc_test( + name = "swa_kv_cache_group_test", + srcs = [ + "SWAKVCacheGroupTest.cc", + ], + data = [], + copts = test_copts, + deps = test_deps + [ + ":block_pool_test_helper", + ], + env = {}, + exec_properties = {'gpu':'H20'}, +) + +cc_test( + name = "swa_kv_cache_group_malloc_range_test", + srcs = [ + "SWAKVCacheGroupMallocRangeTest.cc", + ], + data = [], + copts = test_copts, + deps = [ + "//rtp_llm/cpp/cache:kv_cache_group", + "@com_google_googletest//:gtest", + "@com_google_googletest//:gtest_main", + ] + torch_deps() + cuda12_torch_link_deps, + env = { + "RTP_LLM_PIN_HOST_BLOCK_POOL": "0", + }, +) + +cc_test( + name = "dsv4_cache_test", + srcs = [ + "DSV4CacheTest.cc", + ], + data = [], + copts = test_copts, + deps = test_deps + [ + ":block_pool_test_helper", + "//rtp_llm/cpp/cache:kv_cache_transfer_planner", + ], + env = {}, + exec_properties = {'gpu':'H20'}, +) + cc_test( name = "kv_cache_resource_test", srcs = [ @@ -180,4 +355,4 @@ cc_test( deps = test_deps, env = {}, exec_properties = {'gpu':'H20'}, -) \ No newline at end of file +) diff --git a/rtp_llm/cpp/cache/test/BlockPoolTest.cc 
b/rtp_llm/cpp/cache/test/BlockPoolTest.cc index 46132b09af..0e03f69c77 100644 --- a/rtp_llm/cpp/cache/test/BlockPoolTest.cc +++ b/rtp_llm/cpp/cache/test/BlockPoolTest.cc @@ -8,10 +8,12 @@ #include "rtp_llm/cpp/utils/Logger.h" #include "rtp_llm/cpp/cache/BlockPool.h" #include "rtp_llm/cpp/cache/CacheConfig.h" -#include "rtp_llm/cpp/cache/CacheConfigCreator.h" +#include "rtp_llm/cpp/cache/config_creator/CacheConfigCreator.h" #include "rtp_llm/cpp/cache/BlockPoolConfigHelper.h" +#include "rtp_llm/cpp/config/StaticConfig.h" #include "rtp_llm/models_py/bindings/core/ExecOps.h" #include "rtp_llm/cpp/cache/test/BlockPoolTestHelper.h" +#include "rtp_llm/cpp/cache/test/CacheConfigTestUtils.h" namespace rtp_llm { namespace test { @@ -19,14 +21,18 @@ namespace test { class BlockPoolTest: public ::testing::Test { protected: void SetUp() override { + old_core_dump_on_exception_ = StaticConfig::user_ft_core_dump_on_exception; + StaticConfig::user_ft_core_dump_on_exception = false; createDevice(); } void TearDown() override { + StaticConfig::user_ft_core_dump_on_exception = old_core_dump_on_exception_; block_pool_.reset(); } std::shared_ptr block_pool_; + bool old_core_dump_on_exception_{false}; }; namespace { @@ -46,7 +52,7 @@ static rtp_llm::ModelConfig makeTestModelConfig(uint32_t num_layers) { m.attn_config.kv_lora_rank = 0; m.attn_config.rope_head_dim = 0; m.attn_config.head_num = 2; - // keep other fields default + setDefaultKvCacheSpec(m); return m; } @@ -95,27 +101,25 @@ TEST_F(BlockPoolTest, ConstructorAndInit) { } TEST_F(BlockPoolTest, MTPConvertIndexGlobalIdMapping) { - // Use createSpConfig logic so that global_layer_ids is filled for main + sub-model layers. + // Use createSpConfig logic so that group layer ids are filled for main + sub-model layers. 
// main(2 layers) + mtp1(1 layer) + mtp2(1 layer) auto cache_cfg = makeMtpCacheConfigByCreateSpConfig(/*main_layers=*/2, /*mtp_module_num=*/2, /*block_num=*/4); - ASSERT_FALSE(cache_cfg.global_layer_ids.empty()); - ASSERT_EQ(cache_cfg.global_layer_ids[0].size(), static_cast(cache_cfg.layer_all_num)); + ASSERT_GT(cache_cfg.groupNums(), 0); + ASSERT_EQ(cache_cfg.layerIdsForGroup(0).size(), static_cast(cache_cfg.layer_all_num)); ASSERT_EQ(cache_cfg.mtp_sub_configs.size(), 2u); ASSERT_NE(cache_cfg.mtp_sub_configs[0], nullptr); ASSERT_NE(cache_cfg.mtp_sub_configs[1], nullptr); ASSERT_EQ(cache_cfg.mtp_sub_configs[0]->groupNums(), 1); ASSERT_EQ(cache_cfg.mtp_sub_configs[1]->groupNums(), 1); - EXPECT_EQ(cache_cfg.mtp_sub_configs[0]->cache_specs[0]->block_size_bytes(), - cache_cfg.mtp_sub_configs[1]->cache_specs[0]->block_size_bytes()); - - ASSERT_FALSE(cache_cfg.mtp_sub_configs[0]->global_layer_ids.empty()); - ASSERT_FALSE(cache_cfg.mtp_sub_configs[1]->global_layer_ids.empty()); - ASSERT_EQ(cache_cfg.mtp_sub_configs[0]->global_layer_ids[0].size(), 1u); - ASSERT_EQ(cache_cfg.mtp_sub_configs[1]->global_layer_ids[0].size(), 1u); - EXPECT_EQ(cache_cfg.mtp_sub_configs[0]->global_layer_ids[0][0], 2); - EXPECT_EQ(cache_cfg.mtp_sub_configs[1]->global_layer_ids[0][0], 3); + EXPECT_EQ(cache_cfg.mtp_sub_configs[0]->specForGroup(0)->block_size_bytes(), + cache_cfg.mtp_sub_configs[1]->specForGroup(0)->block_size_bytes()); + + ASSERT_EQ(cache_cfg.mtp_sub_configs[0]->layerIdsForGroup(0).size(), 1u); + ASSERT_EQ(cache_cfg.mtp_sub_configs[1]->layerIdsForGroup(0).size(), 1u); + EXPECT_EQ(cache_cfg.mtp_sub_configs[0]->layerIdsForGroup(0)[0], 2); + EXPECT_EQ(cache_cfg.mtp_sub_configs[1]->layerIdsForGroup(0)[0], 3); auto pool_cfg = rtp_llm::BlockPoolConfigHelper::createConfig(cache_cfg); ASSERT_EQ(pool_cfg.memory_layouts.size(), 3u); diff --git a/rtp_llm/cpp/cache/test/BlockPoolTestHelper.h b/rtp_llm/cpp/cache/test/BlockPoolTestHelper.h index bd3f68e000..ff6438fb94 100644 --- 
a/rtp_llm/cpp/cache/test/BlockPoolTestHelper.h +++ b/rtp_llm/cpp/cache/test/BlockPoolTestHelper.h @@ -1,6 +1,7 @@ #pragma once #include +#include #include "rtp_llm/cpp/cache/CacheConfig.h" #include "rtp_llm/cpp/cache/BlockPoolConfigHelper.h" #include "rtp_llm/cpp/utils/AssertUtils.h" @@ -40,7 +41,6 @@ inline KVCacheSpecPtr createTestKvCacheSpec(uint32_t layer_num, auto spec = std::make_shared(); spec->type = KVCacheSpecType::MultiHeadAttention; spec->dtype = dtype; - spec->layer_num = layer_num; spec->local_head_num_kv = local_head_num_kv; spec->seq_size_per_block = seq_size_per_block; spec->size_per_head = static_cast(k_elems / denom); @@ -50,7 +50,6 @@ inline KVCacheSpecPtr createTestKvCacheSpec(uint32_t layer_num, auto spec = std::make_shared(); spec->type = KVCacheSpecType::MultiHeadLatentAttention; spec->dtype = dtype; - spec->layer_num = layer_num; spec->local_head_num_kv = local_head_num_kv; spec->seq_size_per_block = seq_size_per_block; spec->kv_lora_rank = static_cast(k_elems / denom); @@ -72,16 +71,19 @@ inline BlockPoolConfig createTestConfig(size_t k_block_stride_bytes = auto spec = createTestKvCacheSpec( kLayerNum, dtype, local_head_num_kv, seq_size_per_block, k_block_stride_bytes, v_block_stride_bytes); - // Create CacheConfig with the spec rtp_llm::CacheConfig cache_config; - cache_config.cache_specs = {spec}; cache_config.layer_num = kLayerNum; + cache_config.layer_all_num = kLayerNum; cache_config.block_num = kBlockNum; cache_config.dtype = dtype; cache_config.seq_size_per_block = seq_size_per_block; cache_config.kv_block_stride_bytes = k_block_stride_bytes + v_block_stride_bytes; cache_config.kv_scale_stride_bytes = k_scale_stride_bytes + v_scale_stride_bytes; + std::vector layer_ids(kLayerNum); + std::iota(layer_ids.begin(), layer_ids.end(), 0); + cache_config.fromGroupedSpecs({spec}, {layer_ids}, {CacheGroupType::FULL}, {"default"}); + return BlockPoolConfigHelper::createConfig(cache_config); } diff --git 
a/rtp_llm/cpp/cache/test/CPSlotMapperTest.cc b/rtp_llm/cpp/cache/test/CPSlotMapperTest.cc new file mode 100644 index 0000000000..d8d052a7ae --- /dev/null +++ b/rtp_llm/cpp/cache/test/CPSlotMapperTest.cc @@ -0,0 +1,124 @@ +#include +#include +#include "rtp_llm/cpp/cache/CPSlotMapper.h" + +namespace rtp_llm { +namespace test { + +class CPSlotMapperTest: public ::testing::Test {}; + +TEST_F(CPSlotMapperTest, DefaultConstructorIsNotSharded) { + CPSlotMapper mapper; + EXPECT_FALSE(mapper.isSharded()); // cp_size=1 → not sharded + EXPECT_EQ(mapper.cpRank(), 0); + EXPECT_EQ(mapper.cpSize(), 1); + EXPECT_EQ(mapper.blockSize(), 1); + EXPECT_EQ(mapper.virtualBlockSize(), 1); +} + +TEST_F(CPSlotMapperTest, SingleRankIsNotSharded) { + CPSlotMapper mapper(0, 1, 32); + EXPECT_FALSE(mapper.isSharded()); // cp_size=1 → not sharded +} + +TEST_F(CPSlotMapperTest, MultiRankIsSharded) { + CPSlotMapper mapper(0, 2, 32); + EXPECT_TRUE(mapper.isSharded()); // cp_size=2 → sharded + EXPECT_EQ(mapper.virtualBlockSize(), 64); // block_size * cp_size +} + +TEST_F(CPSlotMapperTest, RejectsInvalidGeometry) { + EXPECT_THROW(CPSlotMapper(0, 0, 32), std::invalid_argument); + EXPECT_THROW(CPSlotMapper(0, 2, 0), std::invalid_argument); + EXPECT_THROW(CPSlotMapper(-1, 2, 32), std::invalid_argument); + EXPECT_THROW(CPSlotMapper(2, 2, 32), std::invalid_argument); +} + +TEST_F(CPSlotMapperTest, LocalBlockCount) { + const int block_size = 4; + + // cp_size=2: localBlockCount = ceil(total_blocks / cp_size), same for all ranks + CPSlotMapper rank0(0, 2, block_size); + CPSlotMapper rank1(1, 2, block_size); + + // seq_len=0: 0 total blocks -> 0 + EXPECT_EQ(rank0.localBlockCount(0), 0); + EXPECT_EQ(rank1.localBlockCount(0), 0); + + // seq_len=4: 1 total block -> ceil(1/2)=1 + EXPECT_EQ(rank0.localBlockCount(4), 1); + EXPECT_EQ(rank1.localBlockCount(4), 1); + + // seq_len=8: 2 total blocks -> ceil(2/2)=1 + EXPECT_EQ(rank0.localBlockCount(8), 1); + EXPECT_EQ(rank1.localBlockCount(8), 1); + + // seq_len=12: 3 
total blocks -> ceil(3/2)=2 + EXPECT_EQ(rank0.localBlockCount(12), 2); + EXPECT_EQ(rank1.localBlockCount(12), 2); + + // seq_len=16: 4 total blocks -> ceil(4/2)=2 + EXPECT_EQ(rank0.localBlockCount(16), 2); + EXPECT_EQ(rank1.localBlockCount(16), 2); + + // seq_len=5: 2 total blocks -> ceil(2/2)=1 + EXPECT_EQ(rank0.localBlockCount(5), 1); + EXPECT_EQ(rank1.localBlockCount(5), 1); +} + +TEST_F(CPSlotMapperTest, LocalBlockCountFourRanks) { + // seq_len=55, block_size=8, cp_size=4 + // total_blocks = ceil(55/8) = 7, localBlockCount = ceil(7/4) = 2 + // All ranks get 2 — rank3 has 1 unused trailing block + const int block_size = 8; + const int cp_size = 4; + + for (int r = 0; r < cp_size; ++r) { + CPSlotMapper mapper(r, cp_size, block_size); + EXPECT_EQ(mapper.localBlockCount(55), 2) << "rank=" << r; + } +} + +TEST_F(CPSlotMapperTest, EffectiveSeqLenForAllocIsRankIndependent) { + const int block_size = 4; + CPSlotMapper rank0(0, 2, block_size); + CPSlotMapper rank1(1, 2, block_size); + + // effectiveSeqLenForAlloc = ceil(total_blocks / cp_size) * block_size + // This is rank-independent — always allocates max across all ranks. 
+ EXPECT_EQ(rank0.effectiveSeqLenForAlloc(0), 0); + EXPECT_EQ(rank0.effectiveSeqLenForAlloc(4), 4); // ceil(1/2)=1 block * 4 + EXPECT_EQ(rank0.effectiveSeqLenForAlloc(8), 4); // ceil(2/2)=1 block * 4 + EXPECT_EQ(rank0.effectiveSeqLenForAlloc(12), 8); // ceil(3/2)=2 blocks * 4 + EXPECT_EQ(rank0.effectiveSeqLenForAlloc(16), 8); // ceil(4/2)=2 blocks * 4 + + // Same results for rank1 — rank-independent + EXPECT_EQ(rank1.effectiveSeqLenForAlloc(0), 0); + EXPECT_EQ(rank1.effectiveSeqLenForAlloc(4), 4); + EXPECT_EQ(rank1.effectiveSeqLenForAlloc(8), 4); + EXPECT_EQ(rank1.effectiveSeqLenForAlloc(12), 8); + EXPECT_EQ(rank1.effectiveSeqLenForAlloc(16), 8); +} + +TEST_F(CPSlotMapperTest, EffectiveSeqLenFourRanks) { + // seq_len=55, block_size=8, cp_size=4 + // total_blocks=7, ceil(7/4)=2, effective=16 + // All ranks get the same value + const int block_size = 8; + const int cp_size = 4; + + for (int r = 0; r < cp_size; ++r) { + CPSlotMapper mapper(r, cp_size, block_size); + EXPECT_EQ(mapper.effectiveSeqLenForAlloc(55), 16) << "rank=" << r; + } +} + +TEST_F(CPSlotMapperTest, NonShardedPassthrough) { + CPSlotMapper mapper; // cp_size=1, block_size=1 + + EXPECT_EQ(mapper.localBlockCount(10), 10); + EXPECT_EQ(mapper.effectiveSeqLenForAlloc(10), 10); +} + +} // namespace test +} // namespace rtp_llm diff --git a/rtp_llm/cpp/cache/test/CacheConfigTestUtils.h b/rtp_llm/cpp/cache/test/CacheConfigTestUtils.h index 9ecd30ca0c..5565b40769 100644 --- a/rtp_llm/cpp/cache/test/CacheConfigTestUtils.h +++ b/rtp_llm/cpp/cache/test/CacheConfigTestUtils.h @@ -4,12 +4,257 @@ #include #include #include +#include +#include #include #include "rtp_llm/cpp/cache/CacheConfig.h" +#include "rtp_llm/cpp/cache/spec/LinearKVCacheSpec.h" +#include "rtp_llm/cpp/cache/spec/MHAKVCacheSpec.h" +#include "rtp_llm/cpp/cache/spec/MLAKVCacheSpec.h" +#include "rtp_llm/cpp/cache/spec/OpaqueKVCacheSpec.h" +#include "rtp_llm/cpp/config/ModelConfig.h" +#include "rtp_llm/cpp/utils/AssertUtils.h" namespace rtp_llm::test { 
+inline constexpr uint32_t DSV4_FP8_KV_ENTRY_BYTES = 584; +inline constexpr uint32_t DSV4_FP8_INDEXER_ENTRY_BYTES = 132; +inline constexpr size_t DSV4_FP8_MLA_BLOCK_ALIGNMENT_BYTES = 576; +inline constexpr uint32_t DSV4_SWA_WINDOW_ENTRIES = 128; + +inline size_t alignDsv4Fp8KvBlockBytes(size_t natural, size_t extra_multiple = 1) { + const size_t align = std::lcm(DSV4_FP8_MLA_BLOCK_ALIGNMENT_BYTES, std::max(extra_multiple, 1)); + return ((natural + align - 1) / align) * align; +} + +inline KVCacheSpecPtr makeDsv4Spec(const std::string& tag, + const std::string& kind, + uint32_t entry_elems, + DataType dtype, + uint32_t compression_ratio = 1) { + KVCacheSpecPtr spec; + if (kind == "compressed_kv") { + auto kv_spec = std::make_shared(); + kv_spec->entry_elems = entry_elems; + kv_spec->compression_ratio = compression_ratio; + kv_spec->store_dtype = dtype; + spec = kv_spec; + } else { + auto state_spec = std::make_shared(); + state_spec->state_dim = entry_elems; + state_spec->store_dtype = dtype; + spec = state_spec; + } + spec->tag = tag; + spec->dtype = dtype; + return spec; +} + +inline KVCacheSpecDesc dsv4DescForSpec(const KVCacheSpecPtr& spec) { + RTP_LLM_CHECK_WITH_INFO(spec != nullptr, "dsv4DescForSpec got null spec"); + KVCacheSpecDesc desc; + desc.tag = spec->tag; + desc.dtype = spec->dtype; + if (auto* compressed = dynamic_cast(spec.get())) { + desc.cache_type = CacheType::COMPRESSED_KV; + desc.is_state_cache = false; + desc.entry_elems = compressed->entry_elems; + desc.compression_ratio = compressed->compression_ratio; + desc.store_dtype = compressed->store_dtype; + desc.block_size_bytes_alignment = compressed->block_size_bytes_alignment; + desc.extra.derive_entries_from_kernel_block = true; + if (desc.block_size_bytes_alignment == 0 && desc.entry_elems == DSV4_FP8_KV_ENTRY_BYTES) { + desc.block_size_bytes_alignment = DSV4_FP8_MLA_BLOCK_ALIGNMENT_BYTES; + } + return desc; + } + + auto* fixed = dynamic_cast(spec.get()); + RTP_LLM_CHECK_WITH_INFO(fixed != 
nullptr, "DSV4 test spec tag=%s must be opaque", spec->tag.c_str()); + desc.cache_type = CacheType::FIXED_STATE; + desc.entry_elems = fixed->entry_elems; + desc.store_dtype = fixed->store_dtype; + desc.block_size_bytes_override = fixed->block_size_bytes_override; + desc.block_size_bytes_alignment = fixed->block_size_bytes_alignment; + desc.block_size_alignment_min_entries = fixed->block_size_alignment_min_entries; + if (desc.tag == "indexer_state" || desc.tag == "csa_state") { + desc.extra.state_ring_compression_ratio = 4; + desc.extra.state_ring_overlap = 1; + desc.extra.cp_align_entries = true; + desc.extra.cp_slice_entries = true; + } else if (desc.tag == "hca_state") { + desc.extra.state_ring_compression_ratio = 128; + desc.extra.cp_align_entries = true; + desc.extra.cp_slice_entries = true; + desc.extra.explicit_block_num = 256; + desc.skip_prefix_reuse = true; + desc.has_reuse_policy = true; + desc.reuse_policy = CacheReusePolicy::NON_REUSABLE; + desc.has_active_tail_blocks = true; + desc.active_tail_blocks = 1; + desc.has_validate_tail_blocks = true; + desc.validate_tail_blocks = false; + } else if (desc.tag == "swa_kv") { + desc.extra.state_ring_compression_ratio = DSV4_SWA_WINDOW_ENTRIES; + desc.extra.cp_align_entries = true; + desc.extra.cp_prefill_slice_block_bytes = true; + if (desc.block_size_bytes_alignment == 0 && desc.entry_elems == DSV4_FP8_KV_ENTRY_BYTES) { + desc.block_size_bytes_alignment = DSV4_FP8_MLA_BLOCK_ALIGNMENT_BYTES; + } + } + desc.extra.state_ring_add_gen_num_per_cycle = true; + desc.extra.use_fixed_region_cp_tokens = true; + desc.block_size_alignment_min_entries = + desc.block_size_alignment_min_entries == 0 ? 
DSV4_SWA_WINDOW_ENTRIES : desc.block_size_alignment_min_entries; + desc.is_state_cache = true; + desc.has_evict_policy = true; + desc.evict_policy = CacheEvictPolicy::INDEPENDENT; + return desc; +} + +inline void setDefaultKvCacheSpec(ModelConfig& model_config) { + KVCacheSpecDesc desc; + desc.tag = "default"; + desc.seq_size_per_block = static_cast(model_config.attn_config.tokens_per_block); + if (model_config.attn_config.use_mla && model_config.mla_ops_type != rtp_llm::MlaOpsType::MHA) { + desc.cache_type = CacheType::MLA; + desc.kv_lora_rank = static_cast(model_config.attn_config.kv_lora_rank); + desc.rope_head_dim = static_cast(model_config.attn_config.rope_head_dim); + desc.num_kv_heads = 1; + } else { + desc.cache_type = CacheType::MHA; + desc.size_per_head = static_cast(model_config.attn_config.size_per_head); + desc.num_kv_heads = static_cast(model_config.attn_config.kv_head_num); + } + model_config.kv_cache_spec_descs.assign(static_cast(model_config.num_layers), {desc}); +} + +inline void setHybridAttentionKvCacheSpecs(ModelConfig& model_config) { + std::vector full_layers; + std::vector swa_layers; + std::vector linear_layers; + const auto& types = model_config.hybrid_attention_config.hybrid_attention_types; + RTP_LLM_CHECK_WITH_INFO(types.size() == static_cast(model_config.num_layers), + "hybrid_attention_types size %zu != num_layers %ld", + types.size(), + model_config.num_layers); + for (int i = 0; i < static_cast(model_config.num_layers); ++i) { + switch (types[static_cast(i)]) { + case HybridAttentionType::LINEAR: + linear_layers.push_back(i); + break; + case HybridAttentionType::SLIDING_WINDOW: + swa_layers.push_back(i); + break; + case HybridAttentionType::NONE: + default: + full_layers.push_back(i); + break; + } + } + + KVCacheSpecDesc full_desc; + full_desc.tag = "full"; + full_desc.cache_type = CacheType::MHA; + full_desc.seq_size_per_block = static_cast(model_config.attn_config.tokens_per_block); + full_desc.size_per_head = 
static_cast(model_config.attn_config.size_per_head); + full_desc.num_kv_heads = static_cast(model_config.attn_config.kv_head_num); + + KVCacheSpecDesc swa_desc = full_desc; + swa_desc.tag = "swa"; + swa_desc.cache_type = CacheType::FIXED_STATE; + swa_desc.entry_elems = static_cast(model_config.attn_config.size_per_head) + * static_cast(model_config.attn_config.kv_head_num) * 2; + swa_desc.entries_per_block = static_cast(model_config.attn_config.sliding_window > 0 ? + model_config.attn_config.sliding_window : + model_config.attn_config.tokens_per_block); + swa_desc.store_dtype = DataType::TYPE_FP16; + + const auto& linear_config = model_config.linear_attention_config; + KVCacheSpecDesc linear_desc; + linear_desc.tag = "linear"; + linear_desc.cache_type = CacheType::LINEAR; + linear_desc.seq_size_per_block = static_cast(model_config.attn_config.tokens_per_block); + linear_desc.num_k_heads = static_cast(linear_config.linear_num_key_heads); + linear_desc.num_v_heads = static_cast(linear_config.linear_num_value_heads); + linear_desc.head_k_dim = static_cast(linear_config.linear_key_head_dim); + linear_desc.head_v_dim = static_cast(linear_config.linear_value_head_dim); + linear_desc.conv_kernel_dim = static_cast(linear_config.linear_conv_kernel_dim); + linear_desc.ssm_state_dtype = linear_config.ssm_state_dtype; + linear_desc.conv_state_dtype = linear_config.conv_state_dtype; + + model_config.kv_cache_spec_descs.assign(static_cast(model_config.num_layers), {}); + for (int layer_id : full_layers) { + model_config.kv_cache_spec_descs[static_cast(layer_id)] = {full_desc}; + } + for (int layer_id : swa_layers) { + model_config.kv_cache_spec_descs[static_cast(layer_id)] = {swa_desc}; + } + for (int layer_id : linear_layers) { + model_config.kv_cache_spec_descs[static_cast(layer_id)] = {linear_desc}; + } +} + +inline void setDsv4KvCacheSpecs(ModelConfig& model_config) { + const int layer_num = static_cast(model_config.num_layers); + + const bool fp8_kv = 
model_config.attn_config.kv_cache_dtype == KvCacheDataType::FP8; + const uint32_t kv_entry_elems = fp8_kv ? 584 : static_cast(model_config.attn_config.size_per_head) * 2; + const uint32_t indexer_entry_elems = fp8_kv ? 132 : static_cast(model_config.attn_config.indexer_head_dim) * 2; + const uint32_t head_dim = static_cast(model_config.attn_config.size_per_head); + const uint32_t indexer_head_dim = static_cast(model_config.attn_config.indexer_head_dim); + + auto csa_kv = makeDsv4Spec("csa_kv", "compressed_kv", kv_entry_elems, DataType::TYPE_UINT8, 4); + auto hca_kv = makeDsv4Spec("hca_kv", "compressed_kv", kv_entry_elems, DataType::TYPE_UINT8, 128); + auto indexer_kv = makeDsv4Spec("indexer_kv", "compressed_kv", indexer_entry_elems, DataType::TYPE_UINT8, 4); + auto indexer_state = makeDsv4Spec("indexer_state", "fixed_state", 4 * indexer_head_dim, DataType::TYPE_FP32); + auto csa_state = makeDsv4Spec("csa_state", "fixed_state", 4 * head_dim, DataType::TYPE_FP32); + auto hca_state = makeDsv4Spec("hca_state", "fixed_state", 2 * head_dim, DataType::TYPE_FP32); + auto swa_kv = makeDsv4Spec("swa_kv", "sliding_window_kv", kv_entry_elems, DataType::TYPE_UINT8); + + model_config.kv_cache_spec_descs.clear(); + model_config.kv_cache_spec_descs.resize(layer_num); + for (int i = 0; i < layer_num; ++i) { + const int ratio = i < static_cast(model_config.attn_config.layer_compress_ratios.size()) ? 
+ model_config.attn_config.layer_compress_ratios[static_cast(i)] : + 0; + std::vector specs; + if (ratio == 4) { + specs = {csa_kv, indexer_kv, indexer_state, csa_state, swa_kv}; + } else if (ratio == 128) { + specs = {hca_kv, hca_state, swa_kv}; + } else { + specs = {swa_kv}; + } + auto& descs = model_config.kv_cache_spec_descs[static_cast(i)]; + descs.reserve(specs.size()); + for (const auto& spec : specs) { + descs.push_back(dsv4DescForSpec(spec)); + } + } +} + +inline void refreshDsv4KvCacheSpecDescs(ModelConfig& model_config, + const ParallelismConfig& parallelism_config, + const KVCacheConfig& kv_cache_config, + int gen_num_per_cycle = 0) { + (void)parallelism_config; + (void)kv_cache_config; + (void)gen_num_per_cycle; + setDsv4KvCacheSpecs(model_config); +} + +inline void setDsv4ExplicitPoolBlocks(ModelConfig& model_config, const std::string& tag, uint32_t block_num) { + for (auto& descs : model_config.kv_cache_spec_descs) { + for (auto& desc : descs) { + if (desc.tag == tag) { + desc.extra.explicit_block_num = block_num; + } + } + } +} + // A tiny helper for unit tests to construct a minimal MultiHeadAttention KV cache config. 
// // NOTE: @@ -33,22 +278,16 @@ inline CacheConfig makeSimpleMhaCacheConfig(int layer_num, spec->type = KVCacheSpecType::MultiHeadAttention; spec->dtype = dtype; spec->seq_size_per_block = static_cast(tokens_per_block); - spec->layer_num = static_cast(layer_num); spec->local_head_num_kv = local_head_num_kv; spec->size_per_head = size_per_head; - config.cache_specs.push_back(spec); - std::vector layer_ids(layer_num); for (int i = 0; i < layer_num; ++i) { layer_ids[i] = i; } - config.layer_ids.push_back(layer_ids); - config.global_layer_ids.push_back(layer_ids); - config.layer_to_group_id.assign(layer_num, 0); - config.layer_attn_types.assign(layer_num, CacheGroupType::FULL); + config.fromGroupedSpecs({spec}, {layer_ids}, {CacheGroupType::FULL}, {"default"}); config.kv_block_stride_bytes = spec->block_size_bytes(); - config.kv_block_size_bytes = static_cast(spec->block_size_bytes() * spec->layer_num); + config.kv_block_size_bytes = static_cast(layer_num) * spec->block_size_bytes(); if (dtype == rtp_llm::TYPE_INT8 || dtype == rtp_llm::TYPE_FP8_E4M3) { const size_t kv_scale_kv_stride = static_cast(spec->local_head_num_kv) * tokens_per_block; @@ -100,14 +339,11 @@ inline CacheConfig makeSimpleHybridMhaCacheConfig(int layer_num, } const int group_cnt = layer_num / config.group_layer_num; - const int linear_groups = 1; - const int full_groups = group_cnt - 1; // Specs. 
auto linear_spec = std::make_shared(); linear_spec->type = KVCacheSpecType::LinearAttention; linear_spec->dtype = dtype; - linear_spec->layer_num = static_cast(config.group_layer_num); linear_spec->local_num_k_heads = 1; linear_spec->local_num_v_heads = 1; linear_spec->head_k_dim = 1; @@ -120,19 +356,17 @@ inline CacheConfig makeSimpleHybridMhaCacheConfig(int layer_num, full_spec->type = KVCacheSpecType::MultiHeadAttention; full_spec->dtype = dtype; full_spec->seq_size_per_block = static_cast(tokens_per_block); - full_spec->layer_num = static_cast(config.group_layer_num); full_spec->local_head_num_kv = local_head_num_kv; full_spec->size_per_head = size_per_head; - config.layer_ids.clear(); - config.global_layer_ids.clear(); - config.linear_groups.clear(); - config.full_groups.clear(); - config.cache_specs.clear(); - config.group_types.clear(); - - config.layer_to_group_id.assign(static_cast(layer_num), 0); - config.layer_attn_types.assign(static_cast(layer_num), CacheGroupType::FULL); + std::vector specs; + std::vector> layers_by_group; + std::vector types; + std::vector tags; + specs.reserve(static_cast(group_cnt)); + layers_by_group.reserve(static_cast(group_cnt)); + types.reserve(static_cast(group_cnt)); + tags.reserve(static_cast(group_cnt)); // Build groups: gid=0 linear, gid>=1 full. for (int gid = 0; gid < group_cnt; ++gid) { @@ -141,26 +375,19 @@ inline CacheConfig makeSimpleHybridMhaCacheConfig(int layer_num, for (int local = 0; local < config.group_layer_num; ++local) { const int layer_id = gid * config.group_layer_num + local; group_layers.push_back(layer_id); - config.layer_to_group_id[static_cast(layer_id)] = gid; - config.layer_attn_types[static_cast(layer_id)] = - (gid == 0) ? 
CacheGroupType::LINEAR : CacheGroupType::FULL; } - config.layer_ids.push_back(group_layers); - config.global_layer_ids.push_back(group_layers); + layers_by_group.push_back(group_layers); if (gid == 0) { - config.cache_specs.push_back(linear_spec); - config.group_types.push_back(CacheGroupType::LINEAR); - config.linear_groups.push_back(group_layers); + specs.push_back(linear_spec); + types.push_back(CacheGroupType::LINEAR); } else { - config.cache_specs.push_back(full_spec); - config.group_types.push_back(CacheGroupType::FULL); - config.full_groups.push_back(group_layers); + specs.push_back(full_spec); + types.push_back(CacheGroupType::FULL); } + tags.push_back("default"); } - - config.linear_group_num = linear_groups; - config.full_group_num = full_groups; + config.fromGroupedSpecs(specs, layers_by_group, types, tags); // Physical sizes for hybrid memory layout: one group (group_layer_num) worth of layers. config.kv_block_stride_bytes = std::max(full_spec->block_size_bytes(), linear_spec->block_size_bytes()); diff --git a/rtp_llm/cpp/cache/test/DSV4CacheTest.cc b/rtp_llm/cpp/cache/test/DSV4CacheTest.cc new file mode 100644 index 0000000000..e5ee99997a --- /dev/null +++ b/rtp_llm/cpp/cache/test/DSV4CacheTest.cc @@ -0,0 +1,2596 @@ +#include +#include +#include +#include +#include + +#include "rtp_llm/cpp/cache/config_creator/CacheConfigCreator.h" +#include "rtp_llm/cpp/cache/BlockPoolConfigHelper.h" +#include "rtp_llm/cpp/cache/CPSlotMapper.h" +#include "rtp_llm/cpp/cache/config_creator/HybridPoolConfigCreator.h" +#include "rtp_llm/cpp/cache/allocator/HybridPoolKVCacheAllocator.h" +#include "rtp_llm/cpp/cache/allocator/HybridTypeKVCacheAllocator.h" +#include "rtp_llm/cpp/cache/KVCacheTransferPlanner.h" +#include "rtp_llm/cpp/cache/spec/OpaqueKVCacheSpec.h" +#include "rtp_llm/cpp/cache/BatchKVCacheResource.h" +#include "rtp_llm/cpp/cache/SharedBlockCache.h" +#include "rtp_llm/cpp/cache/test/BlockPoolTestHelper.h" +#include 
"rtp_llm/cpp/cache/test/CacheConfigTestUtils.h" +#include "rtp_llm/cpp/engine_base/stream/CompleteTokenIds.h" +#include "rtp_llm/cpp/config/ConfigModules.h" +#include "rtp_llm/cpp/config/ModelConfig.h" +#include "rtp_llm/cpp/config/StaticConfig.h" +#include "rtp_llm/cpp/utils/Logger.h" + +namespace rtp_llm { +namespace test { + +namespace { + +constexpr int kDsv4PoolNum = 7; +constexpr uint32_t kDsv4TokensPerBlock = 128; +constexpr uint32_t kDsv4KvEntryBytes = 1024; +constexpr uint32_t kDsv4IndexerEntryBytes = 256; +constexpr uint32_t kDsv4Fp8KvEntryBytes = 584; +const std::vector kDsv4FlashFirstSeenTags = { + "swa_kv", "csa_kv", "indexer_kv", "indexer_state", "csa_state", "hca_kv", "hca_state"}; +const std::vector kDsv4ProFirstSeenTags = { + "hca_kv", "hca_state", "swa_kv", "csa_kv", "indexer_kv", "indexer_state", "csa_state"}; + +static size_t gidForTag(const CacheConfig& config, const std::string& tag) { + return static_cast(config.groupIdForTag(tag)); +} + +class DSV4CacheTestEnvironment: public ::testing::Environment { +public: + void SetUp() override { + old_core_dump_on_exception_ = StaticConfig::user_ft_core_dump_on_exception; + StaticConfig::user_ft_core_dump_on_exception = false; + } + + void TearDown() override { + StaticConfig::user_ft_core_dump_on_exception = old_core_dump_on_exception_; + } + +private: + bool old_core_dump_on_exception_{false}; +}; + +[[maybe_unused]] auto* const dsv4_cache_test_env = + ::testing::AddGlobalTestEnvironment(new DSV4CacheTestEnvironment()); + +} // namespace + +static KVCacheConfig makeDsv4KvCacheConfig() { + KVCacheConfig config; + config.seq_size_per_block = 128; + return config; +} + +static void setGroupBlockNumsForTest(CacheConfig& config, const std::vector& block_nums) { + std::vector kv_strides; + std::vector scale_strides; + kv_strides.reserve(static_cast(config.groupNums())); + scale_strides.reserve(static_cast(config.groupNums())); + for (size_t gid = 0; gid < static_cast(config.groupNums()); ++gid) { + 
kv_strides.push_back(config.kvBlockStrideBytesForGroup(gid)); + scale_strides.push_back(config.kvScaleStrideBytesForGroup(gid)); + } + config.setGroupBlockLayout(block_nums, kv_strides, scale_strides); +} + +static void initDsv4BatchGroups(BatchKVCacheResource& batch_res, const CacheConfig& config) { + batch_res.initGroups(config.groupNums(), + static_cast(config.layer_all_num), + config.layerGroupIdsSnapshot(), + config.kernelBlocksPerKvBlock(), + config.groupTypesSnapshot()); +} + +static ModelConfig makeProModelConfig() { + ModelConfig mc; + mc.num_layers = 61; + mc.hidden_size = 7168; + mc.attn_config.head_num = 128; + mc.attn_config.kv_head_num = 1; + mc.attn_config.size_per_head = 512; + mc.attn_config.rope_head_dim = 64; + mc.attn_config.sliding_window = 128; + mc.attn_config.indexer_head_dim = 128; + mc.attn_config.indexer_head_num = 64; + mc.attn_config.indexer_topk = 1024; + mc.attn_config.o_groups = 16; + mc.attn_config.o_lora_rank = 1024; + std::vector ratios; + ratios.push_back(128); + ratios.push_back(128); + for (int i = 2; i < 61; i++) { + ratios.push_back((i % 2 == 0) ? 4 : 128); + } + mc.attn_config.layer_compress_ratios = ratios; + mc.hybrid_attention_config.enable_hybrid_attention = true; + mc.hybrid_attention_config.enable_independent_kv_cache_pools = true; + setDsv4KvCacheSpecs(mc); + return mc; +} + +static ModelConfig makeFlashModelConfig() { + ModelConfig mc; + mc.num_layers = 43; + mc.hidden_size = 4096; + mc.attn_config.head_num = 64; + mc.attn_config.kv_head_num = 1; + mc.attn_config.size_per_head = 512; + mc.attn_config.rope_head_dim = 64; + mc.attn_config.sliding_window = 128; + mc.attn_config.indexer_head_dim = 128; + mc.attn_config.indexer_head_num = 64; + mc.attn_config.indexer_topk = 512; + mc.attn_config.o_groups = 8; + mc.attn_config.o_lora_rank = 1024; + std::vector ratios = {0, 0}; + for (int i = 2; i < 43; i++) { + ratios.push_back((i % 2 == 0) ? 
4 : 128); + } + mc.attn_config.layer_compress_ratios = ratios; + mc.hybrid_attention_config.enable_hybrid_attention = true; + mc.hybrid_attention_config.enable_independent_kv_cache_pools = true; + setDsv4KvCacheSpecs(mc); + return mc; +} + +static ModelConfig makeFlashMtpModelConfig() { + ModelConfig mc = makeFlashModelConfig(); + mc.num_layers = 1; + mc.attn_config.layer_compress_ratios = {0}; + setDsv4KvCacheSpecs(mc); + return mc; +} + +static ModelConfig makeHybridAttentionModelConfig(bool independent_pool) { + ModelConfig mc; + mc.num_layers = 4; + mc.hidden_size = 128; + mc.attn_config.head_num = 4; + mc.attn_config.kv_head_num = 2; + mc.attn_config.size_per_head = independent_pool ? 16 : 32; + mc.attn_config.tokens_per_block = 8; + mc.hybrid_attention_config.enable_hybrid_attention = true; + mc.hybrid_attention_config.enable_independent_kv_cache_pools = independent_pool; + mc.hybrid_attention_config.hybrid_attention_types = { + HybridAttentionType::LINEAR, HybridAttentionType::NONE, HybridAttentionType::LINEAR, HybridAttentionType::NONE}; + mc.linear_attention_config.linear_conv_kernel_dim = 4; + mc.linear_attention_config.linear_key_head_dim = 16; + mc.linear_attention_config.linear_value_head_dim = 16; + mc.linear_attention_config.linear_num_key_heads = 2; + mc.linear_attention_config.linear_num_value_heads = 2; + setHybridAttentionKvCacheSpecs(mc); + return mc; +} + +// ============================================================ +// Layer classification +// ============================================================ + +TEST(HybridPoolConfigCreatorTest, ProLayerClassification) { + ParallelismConfig pc; + auto config = CacheConfigCreator::createBasicConfig(makeProModelConfig(), pc, makeDsv4KvCacheConfig(), false, 0); + EXPECT_EQ(config.layer_num, 61u); + EXPECT_EQ(config.groupTagsSnapshot(), kDsv4ProFirstSeenTags); + EXPECT_EQ(config.layerIdsForGroup(gidForTag(config, "csa_kv")).size(), 30u); + EXPECT_EQ(config.layerIdsForGroup(gidForTag(config, 
"hca_kv")).size(), 31u); + EXPECT_EQ(config.layerIdsForGroup(gidForTag(config, "swa_kv")).size(), 61u); +} + +TEST(HybridPoolConfigCreatorTest, FlashLayerClassification) { + ParallelismConfig pc; + auto config = CacheConfigCreator::createBasicConfig(makeFlashModelConfig(), pc, makeDsv4KvCacheConfig(), false, 0); + EXPECT_EQ(config.layer_num, 43u); + EXPECT_EQ(config.groupTagsSnapshot(), kDsv4FlashFirstSeenTags); + EXPECT_EQ(config.layerIdsForGroup(gidForTag(config, "csa_kv")).size(), 21u); + EXPECT_EQ(config.layerIdsForGroup(gidForTag(config, "hca_kv")).size(), 20u); + EXPECT_EQ(config.layerIdsForGroup(gidForTag(config, "swa_kv")).size(), 43u); +} + +TEST(HybridPoolConfigCreatorTest, MtpSwaOnlyLayerIsNotStripped) { + ParallelismConfig pc; + auto config = + CacheConfigCreator::createBasicConfig(makeFlashMtpModelConfig(), pc, makeDsv4KvCacheConfig(), true, 0); + + EXPECT_EQ(config.layer_num, 1u); + EXPECT_EQ(config.block_size_bytes, 1u); + ASSERT_EQ(static_cast(config.groupNums()), 1u); + ASSERT_EQ(config.layerIdsForGroup(gidForTag(config, "swa_kv")), std::vector({0})); + ASSERT_EQ(config.layerGroupIdsSnapshot().size(), 1u); + EXPECT_EQ(config.layerGroupIdsSnapshot()[0], std::vector({0})); + EXPECT_EQ(config.tagForGroup(0), "swa_kv"); + EXPECT_EQ(config.groupIdForLayerTag(0, "swa_kv"), 0); +} + +TEST(HybridPoolConfigCreatorTest, Dsv4SpecOrderControlsFirstSeenGroupOrder) { + auto mc = makeFlashModelConfig(); + for (auto& layer_descs : mc.kv_cache_spec_descs) { + std::reverse(layer_descs.begin(), layer_descs.end()); + } + + ParallelismConfig pc; + auto config = CacheConfigCreator::createBasicConfig(mc, pc, makeDsv4KvCacheConfig(), false, 0); + + const std::vector expected_tags = { + "swa_kv", "csa_state", "indexer_state", "indexer_kv", "csa_kv", "hca_state", "hca_kv"}; + EXPECT_EQ(config.groupTagsSnapshot(), expected_tags); + + ASSERT_EQ(static_cast(config.groupNums()), expected_tags.size()); + ASSERT_EQ(static_cast(config.groupNums()), expected_tags.size()); + for 
(size_t gid = 0; gid < expected_tags.size(); ++gid) { + ASSERT_NE(config.specForGroup(gid), nullptr); + EXPECT_EQ(config.specForGroup(gid)->tag, expected_tags[gid]) << "gid=" << gid; + EXPECT_EQ(config.specForGroup(gid)->layers, config.layerIdsForGroup(gid)) << "gid=" << gid; + } + + EXPECT_EQ(config.groupIdForLayerTag(2, "csa_kv"), config.groupIdForTag("csa_kv")); + EXPECT_EQ(config.groupIdForLayerTag(3, "hca_kv"), config.groupIdForTag("hca_kv")); + EXPECT_EQ(config.groupIdForLayerTag(0, "swa_kv"), config.groupIdForTag("swa_kv")); +} + +static GroupBase makeTestGroup(const KVCacheSpecPtr& spec, CacheGroupType type, std::vector layer_ids) { + GroupBase group; + group.spec = spec; + group.policy = defaultCacheGroupPolicy(type); + group.layer_ids = std::move(layer_ids); + return group; +} + +TEST(CacheConfigTest, SetTopologyInstallsTagAndGroupTopology) { + CacheConfig config; + config.layer_num = 3; + config.layer_all_num = 3; + + auto swa_spec = std::make_shared(); + swa_spec->tag = "swa"; + swa_spec->state_dim = 1; + swa_spec->entries_per_block = 1; + swa_spec->store_dtype = DataType::TYPE_UINT8; + swa_spec->dtype = DataType::TYPE_UINT8; + auto csa_spec = std::make_shared(); + csa_spec->tag = "csa"; + csa_spec->entry_elems = 1; + csa_spec->entries_per_block = 1; + csa_spec->compression_ratio = 1; + csa_spec->store_dtype = DataType::TYPE_UINT8; + csa_spec->dtype = DataType::TYPE_UINT8; + + std::vector layers(3); + layers[0].group_ids = {0}; + layers[0].tag_to_gid["swa"] = 0; + layers[1].group_ids = {0, 1}; + layers[1].tag_to_gid["swa"] = 0; + layers[1].tag_to_gid["csa"] = 1; + layers[2].group_ids = {0}; + layers[2].tag_to_gid["swa"] = 0; + + config.setTopology({makeTestGroup(swa_spec, CacheGroupType::SWA, {0, 1, 2}), + makeTestGroup(csa_spec, CacheGroupType::FULL, {1})}, + std::move(layers)); + + EXPECT_EQ(config.groupTagsSnapshot(), std::vector({"swa", "csa"})); + EXPECT_EQ(config.groupIdForLayerTag(1, "swa"), 0); + EXPECT_EQ(config.groupIdForLayerTag(1, "csa"), 
1); + EXPECT_THROW((void)config.groupIdFor(1), std::exception); + EXPECT_EQ(config.layerGroupIdsSnapshot()[1], std::vector({0, 1})); +} + +TEST(CacheConfigTest, SetTopologyRejectsMissingLayer) { + CacheConfig config; + config.layer_num = 2; + config.layer_all_num = 2; + + auto spec = std::make_shared(); + spec->tag = "default"; + std::vector layers(2); + layers[0].group_ids = {0}; + layers[0].tag_to_gid["default"] = 0; + EXPECT_THROW(config.setTopology({makeTestGroup(spec, CacheGroupType::FULL, {0})}, std::move(layers)), + std::exception); +} + +TEST(CacheConfigTest, SetTopologyRejectsEmptyTag) { + CacheConfig config; + config.layer_num = 1; + config.layer_all_num = 1; + + auto spec = std::make_shared(); + std::vector layers(1); + layers[0].group_ids = {0}; + EXPECT_THROW(config.setTopology({makeTestGroup(spec, CacheGroupType::FULL, {0})}, std::move(layers)), + std::exception); +} + +TEST(CacheConfigTest, SetTopologyAllowsDifferentLayerTags) { + CacheConfig config; + config.layer_num = 1; + config.layer_all_num = 1; + + auto spec0 = std::make_shared(); + spec0->tag = "full"; + auto spec1 = std::make_shared(); + spec1->tag = "linear"; + + std::vector layers(1); + layers[0].group_ids = {0, 1}; + layers[0].tag_to_gid["full"] = 0; + layers[0].tag_to_gid["linear"] = 1; + EXPECT_NO_THROW(config.setTopology({makeTestGroup(spec0, CacheGroupType::FULL, {0}), + makeTestGroup(spec1, CacheGroupType::LINEAR, {0})}, + std::move(layers))); + EXPECT_EQ(config.layerGroupIdsSnapshot()[0].size(), 2u); +} + +TEST(HybridPoolConfigCreatorTest, Dsv4ModelProvidedAlignmentPropagatesToCacheSpecs) { + auto mc = makeFlashModelConfig(); + for (auto& layer_descs : mc.kv_cache_spec_descs) { + for (auto& desc : layer_descs) { + if (desc.tag == "csa_kv") { + desc.block_size_bytes_alignment = 1024; + } else if (desc.tag == "swa_kv") { + desc.block_size_bytes_alignment = 2048; + desc.block_size_alignment_min_entries = 256; + } + } + } + + ParallelismConfig pc; + auto config = 
CacheConfigCreator::createBasicConfig(mc, pc, makeDsv4KvCacheConfig(), false, 0); + + auto* csa_kv = dynamic_cast(config.specForGroup(gidForTag(config, "csa_kv")).get()); + auto* swa_kv = dynamic_cast(config.specForGroup(gidForTag(config, "swa_kv")).get()); + ASSERT_NE(csa_kv, nullptr); + ASSERT_NE(swa_kv, nullptr); + EXPECT_EQ(csa_kv->block_size_bytes_alignment, 1024u); + EXPECT_EQ(swa_kv->block_size_bytes_alignment, 2048u); + EXPECT_EQ(swa_kv->block_size_alignment_min_entries, 256u); +} + +TEST(HybridPoolConfigCreatorTest, Dsv4TagRoutesAreConsistent) { + ParallelismConfig pc; + auto config = + CacheConfigCreator::createBasicConfig(makeFlashModelConfig(), pc, makeDsv4KvCacheConfig(), false, 0); + + auto expect_route = [&](int layer_id, const std::string& tag, int expected_gid) { + EXPECT_EQ(config.groupIdForLayerTag(layer_id, tag), expected_gid) << "layer=" << layer_id << " tag=" << tag; + }; + + // Flash DSV4 test config uses layers 2,4,... as CSA and 3,5,... as HCA; 0/1 are SWA-only. 
+ expect_route(2, "csa_kv", config.groupIdForTag("csa_kv")); + expect_route(2, "indexer_kv", config.groupIdForTag("indexer_kv")); + expect_route(2, "indexer_state", config.groupIdForTag("indexer_state")); + expect_route(2, "csa_state", config.groupIdForTag("csa_state")); + expect_route(2, "swa_kv", config.groupIdForTag("swa_kv")); + + expect_route(3, "hca_kv", config.groupIdForTag("hca_kv")); + expect_route(3, "hca_state", config.groupIdForTag("hca_state")); + expect_route(3, "swa_kv", config.groupIdForTag("swa_kv")); + + expect_route(0, "swa_kv", config.groupIdForTag("swa_kv")); + EXPECT_THROW(config.groupIdForLayerTag(0, "csa_kv"), std::exception); + EXPECT_THROW(config.groupIdForLayerTag(0, "hca_kv"), std::exception); + + auto mtp_config = + CacheConfigCreator::createBasicConfig(makeFlashMtpModelConfig(), pc, makeDsv4KvCacheConfig(), true, 0); + ASSERT_EQ(mtp_config.groupIdForLayerTag(0, "swa_kv"), 0); +} + +TEST(HybridPoolConfigCreatorTest, Dsv4GroupPoliciesMatchLegacyBehavior) { + ParallelismConfig pc; + auto config = + CacheConfigCreator::createBasicConfig(makeFlashModelConfig(), pc, makeDsv4KvCacheConfig(), false, 0); + + ASSERT_EQ(config.groupPoliciesSnapshot().size(), static_cast(config.groupNums())); + auto expect_policy = [&](const std::string& tag, + CacheReusePolicy reuse_policy, + CacheEvictPolicy evict_policy, + int active_tail_blocks) { + const auto group_tags = config.groupTagsSnapshot(); + auto it = std::find(group_tags.begin(), group_tags.end(), tag); + ASSERT_NE(it, group_tags.end()) << tag; + const auto gid = static_cast(std::distance(group_tags.begin(), it)); + EXPECT_EQ(config.policyForGroup(gid).reuse_policy, reuse_policy) << tag; + EXPECT_EQ(config.policyForGroup(gid).evict_policy, evict_policy) << tag; + EXPECT_EQ(config.policyForGroup(gid).active_tail_blocks, active_tail_blocks) << tag; + }; + + expect_policy("hca_state", CacheReusePolicy::NON_REUSABLE, CacheEvictPolicy::INDEPENDENT, 1); + expect_policy("swa_kv", 
CacheReusePolicy::REUSABLE, CacheEvictPolicy::INDEPENDENT, 2); + expect_policy("csa_state", CacheReusePolicy::REUSABLE, CacheEvictPolicy::INDEPENDENT, 2); + expect_policy("csa_kv", CacheReusePolicy::REUSABLE, CacheEvictPolicy::CHAIN, 0); + expect_policy("hca_kv", CacheReusePolicy::REUSABLE, CacheEvictPolicy::CHAIN, 0); + expect_policy("indexer_kv", CacheReusePolicy::REUSABLE, CacheEvictPolicy::CHAIN, 0); +} + +TEST(HybridPoolConfigCreatorTest, Dsv4SpecsMissingFailsFastWithoutRatioFallback) { + auto mc = makeFlashModelConfig(); + mc.kv_cache_spec_descs.clear(); + + ParallelismConfig pc; + EXPECT_THROW((void)CacheConfigCreator::createBasicConfig(mc, pc, makeDsv4KvCacheConfig(), false, 0), + std::exception); +} + +// ============================================================ +// Pool specs +// ============================================================ + +TEST(HybridPoolConfigCreatorTest, ProPoolSpecs) { + ParallelismConfig pc; + auto config = CacheConfigCreator::createBasicConfig(makeProModelConfig(), pc, makeDsv4KvCacheConfig(), false, 0); + + EXPECT_EQ(config.layerIdsForGroup(gidForTag(config, "csa_kv")).size(), 30u); + EXPECT_EQ(config.specForGroup(gidForTag(config, "csa_kv"))->block_size_bytes(), 32u * kDsv4KvEntryBytes); + EXPECT_EQ(config.typeForGroup(gidForTag(config, "csa_kv")), CacheGroupType::FULL); + + EXPECT_EQ(config.layerIdsForGroup(gidForTag(config, "hca_kv")).size(), 31u); + EXPECT_EQ(config.specForGroup(gidForTag(config, "hca_kv"))->block_size_bytes(), 1u * kDsv4KvEntryBytes); + + EXPECT_EQ(config.layerIdsForGroup(gidForTag(config, "indexer_kv")).size(), 30u); + EXPECT_EQ(config.specForGroup(gidForTag(config, "indexer_kv"))->block_size_bytes(), 32u * kDsv4IndexerEntryBytes); + + EXPECT_EQ(config.layerIdsForGroup(gidForTag(config, "indexer_state")).size(), 30u); + EXPECT_EQ(config.specForGroup(gidForTag(config, "indexer_state"))->block_size_bytes(), 8u * 512u * 4u); + + EXPECT_EQ(config.layerIdsForGroup(gidForTag(config, "csa_state")).size(), 30u); 
+ EXPECT_EQ(config.specForGroup(gidForTag(config, "csa_state"))->block_size_bytes(), 8u * 2048u * 4u); + + EXPECT_EQ(config.layerIdsForGroup(gidForTag(config, "hca_state")).size(), 31u); + EXPECT_EQ(config.specForGroup(gidForTag(config, "hca_state"))->block_size_bytes(), 128u * 1024u * 4u); + + EXPECT_EQ(config.layerIdsForGroup(gidForTag(config, "swa_kv")).size(), 61u); + EXPECT_EQ(config.specForGroup(gidForTag(config, "swa_kv"))->block_size_bytes(), kDsv4TokensPerBlock * kDsv4KvEntryBytes); +} + +TEST(HybridPoolConfigCreatorTest, FlashPoolSpecs) { + ParallelismConfig pc; + auto config = CacheConfigCreator::createBasicConfig(makeFlashModelConfig(), pc, makeDsv4KvCacheConfig(), false, 0); + EXPECT_EQ(config.layerIdsForGroup(gidForTag(config, "csa_kv")).size(), 21u); + EXPECT_EQ(config.layerIdsForGroup(gidForTag(config, "hca_kv")).size(), 20u); + EXPECT_EQ(config.layerIdsForGroup(gidForTag(config, "swa_kv")).size(), 43u); +} + +// ============================================================ +// Block size bytes +// ============================================================ + +TEST(HybridPoolConfigCreatorTest, BlockSizeBytes) { + ParallelismConfig pc; + auto config = CacheConfigCreator::createBasicConfig(makeProModelConfig(), pc, makeDsv4KvCacheConfig(), false, 0); + EXPECT_EQ(config.specForGroup(gidForTag(config, "csa_kv"))->block_size_bytes(), 32u * kDsv4KvEntryBytes); + EXPECT_EQ(config.specForGroup(gidForTag(config, "hca_kv"))->block_size_bytes(), 1u * kDsv4KvEntryBytes); + EXPECT_EQ(config.specForGroup(gidForTag(config, "indexer_kv"))->block_size_bytes(), 32u * kDsv4IndexerEntryBytes); + EXPECT_EQ(config.specForGroup(gidForTag(config, "indexer_state"))->block_size_bytes(), 8u * 512u * 4u); + EXPECT_EQ(config.specForGroup(gidForTag(config, "csa_state"))->block_size_bytes(), 8u * 2048u * 4u); + EXPECT_EQ(config.specForGroup(gidForTag(config, "hca_state"))->block_size_bytes(), 128u * 1024u * 4u); + EXPECT_EQ(config.specForGroup(gidForTag(config, 
"swa_kv"))->block_size_bytes(), kDsv4TokensPerBlock * kDsv4KvEntryBytes); +} + +TEST(HybridPoolConfigCreatorTest, Fp8BlockSizeBytesUsePaddedPhysicalStride) { + ParallelismConfig pc; + auto mc = makeProModelConfig(); + mc.attn_config.kv_cache_dtype = KvCacheDataType::FP8; + setDsv4KvCacheSpecs(mc); + auto config = CacheConfigCreator::createBasicConfig(mc, pc, makeDsv4KvCacheConfig(), false, 0); + + ASSERT_EQ(static_cast(config.groupNums()), 7u); + ASSERT_EQ(config.groupKvBlockStrideBytesSnapshot().size(), 7u); + + EXPECT_EQ(config.specForGroup(gidForTag(config, "csa_kv"))->block_size_bytes(), 19008u); + EXPECT_EQ(config.specForGroup(gidForTag(config, "hca_kv"))->block_size_bytes(), 1152u); + EXPECT_EQ(config.specForGroup(gidForTag(config, "indexer_kv"))->block_size_bytes(), 32u * 132u); + EXPECT_EQ(config.specForGroup(gidForTag(config, "swa_kv"))->block_size_bytes(), 74880u); + + EXPECT_EQ(config.kvBlockStrideBytesForGroup(gidForTag(config, "csa_kv")), + config.specForGroup(gidForTag(config, "csa_kv"))->block_size_bytes()); + EXPECT_EQ(config.kvBlockStrideBytesForGroup(gidForTag(config, "hca_kv")), + config.specForGroup(gidForTag(config, "hca_kv"))->block_size_bytes()); + EXPECT_EQ(config.kvBlockStrideBytesForGroup(gidForTag(config, "swa_kv")), + config.specForGroup(gidForTag(config, "swa_kv"))->block_size_bytes()); +} + +TEST(HybridPoolConfigCreatorTest, DecoupledPhysicalAndKernelBlockSizeUsesPerGroupBpk) { + ParallelismConfig pc; + auto mc = makeProModelConfig(); + KVCacheConfig kv_cache_config; + kv_cache_config.seq_size_per_block = 16384; + kv_cache_config.kernel_seq_size_per_block = 128; + auto config = CacheConfigCreator::createBasicConfig(mc, pc, kv_cache_config, false, 0); + + ASSERT_EQ(static_cast(config.groupNums()), 7u); + ASSERT_EQ(config.group_seq_size_per_block.size(), 7u); + + EXPECT_EQ(config.seq_size_per_block, 16384u); + EXPECT_EQ(config.kernel_seq_size_per_block, 128u); + EXPECT_EQ(config.kernelBlocksPerKvBlock(), 128u); + for (size_t gid = 0; gid 
< config.group_seq_size_per_block.size(); ++gid) { + EXPECT_EQ(config.group_seq_size_per_block[gid], 16384u); + } + + auto* csa_kv = dynamic_cast(config.specForGroup(gidForTag(config, "csa_kv")).get()); + auto* hca_kv = dynamic_cast(config.specForGroup(gidForTag(config, "hca_kv")).get()); + auto* idx_kv = dynamic_cast(config.specForGroup(gidForTag(config, "indexer_kv")).get()); + auto* swa_kv = dynamic_cast(config.specForGroup(gidForTag(config, "swa_kv")).get()); + ASSERT_NE(csa_kv, nullptr); + ASSERT_NE(hca_kv, nullptr); + ASSERT_NE(idx_kv, nullptr); + ASSERT_NE(swa_kv, nullptr); + EXPECT_EQ(csa_kv->compression_ratio, 4u); + EXPECT_EQ(hca_kv->compression_ratio, 128u); + EXPECT_EQ(idx_kv->compression_ratio, 4u); + EXPECT_EQ(csa_kv->entries_per_block, 32u); + EXPECT_EQ(hca_kv->entries_per_block, 1u); + EXPECT_EQ(idx_kv->entries_per_block, 32u); + EXPECT_EQ(swa_kv->entries_per_block, 128u); + + EXPECT_EQ(config.kvBlockStrideBytesForGroup(gidForTag(config, "csa_kv")), + config.specForGroup(gidForTag(config, "csa_kv"))->block_size_bytes() * 128u); + EXPECT_EQ(config.kvBlockStrideBytesForGroup(gidForTag(config, "hca_kv")), + config.specForGroup(gidForTag(config, "hca_kv"))->block_size_bytes() * 128u); + EXPECT_EQ(config.kvBlockStrideBytesForGroup(gidForTag(config, "indexer_kv")), + config.specForGroup(gidForTag(config, "indexer_kv"))->block_size_bytes() * 128u); + EXPECT_EQ(config.kvBlockStrideBytesForGroup(gidForTag(config, "swa_kv")), + config.specForGroup(gidForTag(config, "swa_kv"))->block_size_bytes()); + + auto full_pool = BlockPoolConfigHelper::createConfigForGroup(config, gidForTag(config, "csa_kv")); + auto swa_pool = BlockPoolConfigHelper::createConfigForGroup(config, gidForTag(config, "swa_kv")); + ASSERT_EQ(full_pool.memory_layouts.size(), 1u); + ASSERT_EQ(swa_pool.memory_layouts.size(), 1u); + EXPECT_EQ(full_pool.memory_layouts[0].kernel_blocks_per_kv_block, 128u); + EXPECT_EQ(swa_pool.memory_layouts[0].kernel_blocks_per_kv_block, 1u); +} + 
+TEST(HybridPoolConfigCreatorTest, PrefillCpShardedSlicesFixedAndSwaPhysicalBlocks) { + ParallelismConfig pc; + pc.role_type = RoleType::PREFILL; + pc.tp_size = 4; + pc.prefill_cp_config.kv_cache_sharded = true; + + auto mc = makeProModelConfig(); + mc.attn_config.kv_cache_dtype = KvCacheDataType::FP8; + setDsv4KvCacheSpecs(mc); + auto config = CacheConfigCreator::createBasicConfig(mc, pc, makeDsv4KvCacheConfig(), false, 0); + + ASSERT_EQ(static_cast(config.groupNums()), 7u); + ASSERT_EQ(config.groupKvBlockStrideBytesSnapshot().size(), 7u); + + EXPECT_EQ(config.specForGroup(gidForTag(config, "csa_kv"))->block_size_bytes(), 19008u); + EXPECT_EQ(config.specForGroup(gidForTag(config, "hca_kv"))->block_size_bytes(), 1152u); + EXPECT_EQ(config.specForGroup(gidForTag(config, "indexer_kv"))->block_size_bytes(), 32u * 132u); + EXPECT_EQ(config.specForGroup(gidForTag(config, "indexer_state"))->block_size_bytes(), 2u * 512u * 4u); + EXPECT_EQ(config.specForGroup(gidForTag(config, "csa_state"))->block_size_bytes(), 2u * 2048u * 4u); + EXPECT_EQ(config.specForGroup(gidForTag(config, "hca_state"))->block_size_bytes(), 32u * 1024u * 4u); + + // SWA_KV keeps full logical ring entries for byte-sliced CP layout, but + // each prefill rank stores only one aligned byte slice of the full block. 
+ EXPECT_EQ(config.specForGroup(gidForTag(config, "swa_kv"))->block_size_bytes(), 18720u); + for (const auto& tag : {"indexer_state", "csa_state", "hca_state", "swa_kv"}) { + const auto gid = gidForTag(config, tag); + EXPECT_EQ(config.kvBlockStrideBytesForGroup(gid), config.specForGroup(gid)->block_size_bytes()); + EXPECT_EQ(config.group_seq_size_per_block[gid], kDsv4TokensPerBlock * 4u) << "tag=" << tag; + } + + pc.role_type = RoleType::DECODE; + auto decode_config = CacheConfigCreator::createBasicConfig(mc, pc, makeDsv4KvCacheConfig(), false, 0); + EXPECT_EQ(decode_config.specForGroup(gidForTag(decode_config, "indexer_state"))->block_size_bytes(), 8u * 512u * 4u); + EXPECT_EQ(decode_config.specForGroup(gidForTag(decode_config, "csa_state"))->block_size_bytes(), 8u * 2048u * 4u); + EXPECT_EQ(decode_config.specForGroup(gidForTag(decode_config, "hca_state"))->block_size_bytes(), 128u * 1024u * 4u); + EXPECT_EQ(decode_config.specForGroup(gidForTag(decode_config, "swa_kv"))->block_size_bytes(), 74880u); +} + +TEST(KVCacheTransferPlannerTest, CpCompactSwaUsesCanonicalTailRows) { + auto plan = buildCacheStoreBlockPlan(/*total_logical_blocks=*/8, + /*reuse_block_size=*/0, + /*use_hybrid=*/true, + CacheGroupType::SWA, + /*cp_rank=*/0, + /*cp_size=*/4); + ASSERT_EQ(plan.size(), 2u); + EXPECT_EQ(plan[0].key_index, 3); + EXPECT_EQ(plan[0].offset_index, 0); + EXPECT_EQ(plan[1].key_index, 7); + EXPECT_EQ(plan[1].offset_index, 1); +} + +TEST(KVCacheTransferPlannerTest, CpCompactSwaKeepsPartialTailRows) { + { + auto plan = buildCacheStoreBlockPlan(/*total_logical_blocks=*/1, + /*reuse_block_size=*/0, + /*use_hybrid=*/true, + CacheGroupType::SWA, + /*cp_rank=*/0, + /*cp_size=*/2); + ASSERT_EQ(plan.size(), 1u); + EXPECT_EQ(plan[0].key_index, 0); + EXPECT_EQ(plan[0].offset_index, 0); + } + { + auto plan = buildCacheStoreBlockPlan(/*total_logical_blocks=*/11, + /*reuse_block_size=*/0, + /*use_hybrid=*/true, + CacheGroupType::SWA, + /*cp_rank=*/0, + /*cp_size=*/2); + 
ASSERT_EQ(plan.size(), 2u); + EXPECT_EQ(plan[0].key_index, 9); + EXPECT_EQ(plan[0].offset_index, 4); + EXPECT_EQ(plan[1].key_index, 10); + EXPECT_EQ(plan[1].offset_index, 5); + } +} + +// ============================================================ +// CacheConfig output +// ============================================================ + +TEST(HybridPoolConfigCreatorTest, CreateCacheConfig) { + auto mc = makeProModelConfig(); + ParallelismConfig pc; + auto config = CacheConfigCreator::createBasicConfig(mc, pc, makeDsv4KvCacheConfig(), false, 0); + + // 7 groups -> groupNums() > 1 -> HybridTypeKVCacheAllocator path + EXPECT_EQ(config.groupNums(), 7); + EXPECT_EQ(static_cast(config.groupNums()), 7u); + EXPECT_EQ(static_cast(config.groupNums()), 7u); + EXPECT_EQ(static_cast(config.groupNums()), 7u); + EXPECT_EQ(config.layer_num, 61u); + EXPECT_TRUE(config.is_sparse); + EXPECT_FALSE(config.use_mla); +} + +TEST(HybridPoolConfigCreatorTest, FlashCacheConfig) { + auto mc = makeFlashModelConfig(); + ParallelismConfig pc; + auto config = CacheConfigCreator::createBasicConfig(mc, pc, makeDsv4KvCacheConfig(), false, 0); + + EXPECT_EQ(config.groupNums(), 7); + EXPECT_EQ(config.layer_num, 43u); + EXPECT_EQ(config.layerIdsForGroup(gidForTag(config, "swa_kv")).size(), 43u); + EXPECT_EQ(config.layerIdsForGroup(gidForTag(config, "csa_kv")).size(), 21u); +} + +TEST(HybridPoolConfigCreatorTest, HybridAttentionIndependentPoolUsesHybridPoolConfig) { + ParallelismConfig pc; + auto config = + CacheConfigCreator::createBasicConfig(makeHybridAttentionModelConfig(true), pc, KVCacheConfig{}, false, 0); + + EXPECT_TRUE(config.use_independent_block_pools); + ASSERT_EQ(config.groupNums(), 2); + const auto group_types = config.groupTypesSnapshot(); + EXPECT_EQ(std::count(group_types.begin(), group_types.end(), CacheGroupType::FULL), 1); + EXPECT_EQ(std::count(group_types.begin(), group_types.end(), CacheGroupType::LINEAR), 1); + ASSERT_EQ(static_cast(config.groupNums()), 2u); + 
EXPECT_LT(config.specForGroup(gidForTag(config, "full"))->block_size_bytes(), + config.specForGroup(gidForTag(config, "linear"))->block_size_bytes()); + EXPECT_EQ(config.groupBlockNumsSnapshot().size(), 2u); + EXPECT_EQ(config.groupTagsSnapshot(), std::vector({"linear", "full"})); +} + +TEST(HybridPoolConfigCreatorTest, HybridAttentionIndependentPoolSplitsFullAndSwaSpecs) { + auto mc = makeHybridAttentionModelConfig(true); + mc.hybrid_attention_config.hybrid_attention_types = {HybridAttentionType::NONE, + HybridAttentionType::SLIDING_WINDOW, + HybridAttentionType::LINEAR, + HybridAttentionType::SLIDING_WINDOW}; + setHybridAttentionKvCacheSpecs(mc); + + ParallelismConfig pc; + auto config = CacheConfigCreator::createBasicConfig(mc, pc, KVCacheConfig{}, false, 0); + + ASSERT_EQ(config.groupNums(), 3); + EXPECT_EQ(config.groupTypesSnapshot(), + std::vector({CacheGroupType::FULL, CacheGroupType::SWA, CacheGroupType::LINEAR})); + EXPECT_EQ(config.groupTagsSnapshot(), std::vector({"full", "swa", "linear"})); + ASSERT_EQ(static_cast(config.groupNums()), 3u); + EXPECT_NE(config.specForGroup(0).get(), config.specForGroup(1).get()); + EXPECT_EQ(config.layerIdsForGroup(0), std::vector({0})); + EXPECT_EQ(config.layerIdsForGroup(1), std::vector({1, 3})); + EXPECT_EQ(config.layerIdsForGroup(2), std::vector({2})); + EXPECT_EQ(config.layerIdsForGroup(0).size(), 1u); + EXPECT_EQ(config.layerIdsForGroup(1).size(), 2u); + EXPECT_EQ(config.layerIdsForGroup(2).size(), 1u); + EXPECT_EQ(config.groupIdForLayerTag(1, "swa"), 1); + EXPECT_EQ(config.groupIdForLayerTag(2, "linear"), 2); +} + +TEST(HybridPoolConfigCreatorTest, HybridAttentionWithoutIndependentPoolKeepsSharedHybridConfig) { + ParallelismConfig pc; + auto config = + CacheConfigCreator::createBasicConfig(makeHybridAttentionModelConfig(false), pc, KVCacheConfig{}, false, 0); + + EXPECT_FALSE(config.use_independent_block_pools); + ASSERT_EQ(config.groupNums(), 2); + EXPECT_TRUE(config.groupBlockNumsSnapshot().empty()); +} + 
+TEST(HybridConfigCreatorTest, HybridAttentionTypesMustCoverAllLayers) { + auto mc = makeHybridAttentionModelConfig(false); + mc.hybrid_attention_config.hybrid_attention_types.pop_back(); + + ParallelismConfig pc; + EXPECT_THROW((void)CacheConfigCreator::createBasicConfig(mc, pc, KVCacheConfig{}, false, 0), + std::exception); +} + +// ============================================================ +// Generic opaque cache specs +// ============================================================ + +TEST(GenericOpaqueCacheSpecTest, KVSpecFromPoolSpec) { + CompressedKVCacheSpec spec("csa_kv", + kDsv4Fp8KvEntryBytes, + 64, + DataType::TYPE_UINT8, + kDsv4TokensPerBlock, + 1, + DSV4_FP8_MLA_BLOCK_ALIGNMENT_BYTES); + + EXPECT_EQ(spec.block_size(), 64u * kDsv4Fp8KvEntryBytes); + EXPECT_EQ(spec.natural_block_size_bytes(), 64u * kDsv4Fp8KvEntryBytes * 1u); // uint8 = 1 byte + EXPECT_EQ(spec.block_size_bytes(), 37440u); + EXPECT_EQ(spec.tag, "csa_kv"); + EXPECT_EQ(spec.entry_elems, kDsv4Fp8KvEntryBytes); + EXPECT_EQ(spec.entries_per_block, 64u); + + CompressedKVCacheSpec hca_spec("hca_kv", + kDsv4Fp8KvEntryBytes, + 2, + DataType::TYPE_UINT8, + kDsv4TokensPerBlock, + 1, + DSV4_FP8_MLA_BLOCK_ALIGNMENT_BYTES); + EXPECT_EQ(hca_spec.natural_block_size_bytes(), 2u * kDsv4Fp8KvEntryBytes); + EXPECT_EQ(hca_spec.block_size_bytes(), 1728u); +} + +TEST(GenericOpaqueCacheSpecTest, CompressedKVSpecReportsGenericKindsAndLayout) { + CompressedKVCacheSpec spec("compressed", + kDsv4Fp8KvEntryBytes, + 64, + DataType::TYPE_UINT8, + kDsv4TokensPerBlock, + 4, + DSV4_FP8_MLA_BLOCK_ALIGNMENT_BYTES); + + EXPECT_EQ(spec.type, KVCacheSpecType::OpaqueKV); + EXPECT_EQ(spec.lifecycle, CacheGroupType::FULL); + EXPECT_EQ(spec.block_size(), 64u * kDsv4Fp8KvEntryBytes); + EXPECT_EQ(spec.natural_block_size_bytes(), 64u * kDsv4Fp8KvEntryBytes); + EXPECT_EQ(spec.block_size_bytes(), 37440u); + EXPECT_EQ(spec.compression_ratio, 4u); + EXPECT_EQ(spec.cpTransferPolicy(), CPTransferPolicy::NONE); + 
EXPECT_FALSE(spec.supportsCpSlice()); +} + +TEST(GenericOpaqueCacheSpecTest, FixedStateSpecReportsGenericKindsAndSlicesByEntries) { + FixedStateCacheSpec spec("tail_state", 32, 8, DataType::TYPE_FP32, kDsv4TokensPerBlock); + char storage[8 * 32 * 4] = {}; + BlockInfo block; + block.addr = storage; + block.size_bytes = sizeof(storage); + + auto sliced = spec.sliceBlockForPeer({block}, 4, 2); + ASSERT_EQ(sliced.size(), 1u); + EXPECT_EQ(spec.type, KVCacheSpecType::OpaqueState); + EXPECT_EQ(spec.lifecycle, CacheGroupType::SWA); + EXPECT_EQ(spec.cpTransferPolicy(), CPTransferPolicy::INTRA_BLOCK_SLICE); + EXPECT_TRUE(spec.supportsCpSlice()); + EXPECT_EQ(sliced[0].addr, storage + 2 * 2 * 32 * 4); + EXPECT_EQ(sliced[0].size_bytes, 2u * 32u * 4u); +} + +TEST(GenericOpaqueCacheSpecTest, FixedStateSpecSlicesOverrideByBytes) { + FixedStateCacheSpec spec("tail_bytes", + kDsv4Fp8KvEntryBytes, + kDsv4TokensPerBlock, + DataType::TYPE_UINT8, + kDsv4TokensPerBlock, + 74880); + char storage[74880] = {}; + BlockInfo block; + block.addr = storage; + block.size_bytes = sizeof(storage); + + auto sliced = spec.sliceBlockForPeer({block}, 4, 3); + ASSERT_EQ(sliced.size(), 1u); + EXPECT_EQ(sliced[0].addr, storage + 3 * (sizeof(storage) / 4)); + EXPECT_EQ(sliced[0].size_bytes, sizeof(storage) / 4); + + auto cp_sliced = spec.cpSliceDestination({block}, 4, 3); + ASSERT_EQ(cp_sliced.size(), 1u); + EXPECT_EQ(cp_sliced[0].addr, sliced[0].addr); + EXPECT_EQ(cp_sliced[0].size_bytes, sliced[0].size_bytes); +} + +TEST(GenericOpaqueCacheSpecTest, FixedStateSpecSlicesAlignedBlockByPhysicalBytes) { + FixedStateCacheSpec spec("aligned_tail", + kDsv4Fp8KvEntryBytes, + 132, + DataType::TYPE_UINT8, + kDsv4TokensPerBlock, + 0, + DSV4_FP8_MLA_BLOCK_ALIGNMENT_BYTES, + DSV4_SWA_WINDOW_ENTRIES); + ASSERT_EQ(spec.natural_block_size_bytes(), 77088u); + ASSERT_EQ(spec.block_size_bytes(), 77184u); + char storage[77184] = {}; + BlockInfo block; + block.addr = storage; + block.size_bytes = sizeof(storage); + + auto 
sliced = spec.sliceBlockForPeer({block}, 2, 1); + ASSERT_EQ(sliced.size(), 1u); + EXPECT_EQ(sliced[0].addr, storage + 38592); + EXPECT_EQ(sliced[0].size_bytes, 38592u); +} + +TEST(GenericOpaqueCacheSpecTest, SWAFp8StateSpecUsesPaddedPhysicalBlockSize) { + FixedStateCacheSpec spec("swa_kv", + kDsv4Fp8KvEntryBytes, + kDsv4TokensPerBlock, + DataType::TYPE_UINT8, + kDsv4TokensPerBlock, + 0, + DSV4_FP8_MLA_BLOCK_ALIGNMENT_BYTES, + DSV4_SWA_WINDOW_ENTRIES); + + EXPECT_EQ(spec.block_size(), kDsv4TokensPerBlock * kDsv4Fp8KvEntryBytes); + EXPECT_EQ(spec.natural_block_size_bytes(), kDsv4TokensPerBlock * kDsv4Fp8KvEntryBytes); + EXPECT_EQ(spec.block_size_bytes(), 74880u); + EXPECT_EQ(spec.tag, "swa_kv"); +} + +TEST(GenericOpaqueCacheSpecTest, StateSpecFloat32) { + FixedStateCacheSpec spec("csa_state", 2048, 8, DataType::TYPE_FP32, kDsv4TokensPerBlock); + + EXPECT_EQ(spec.block_size(), 8u * 2048u); + EXPECT_EQ(spec.block_size_bytes(), 8u * 2048u * 4u); // float32 = 4 bytes + EXPECT_EQ(spec.tag, "csa_state"); + EXPECT_EQ(spec.state_dim, 2048u); +} + +TEST(GenericOpaqueCacheSpecTest, IndexerKVSpec) { + CompressedKVCacheSpec spec("indexer_kv", 132, 64, DataType::TYPE_UINT8, kDsv4TokensPerBlock); + + EXPECT_EQ(spec.block_size(), 64u * 132u); + EXPECT_EQ(spec.block_size_bytes(), 64u * 132u); + EXPECT_EQ(spec.tag, "indexer_kv"); +} + +TEST(GenericOpaqueCacheSpecTest, HCAStateSpec) { + FixedStateCacheSpec spec("hca_state", 1024, 128, DataType::TYPE_FP32, kDsv4TokensPerBlock); + + EXPECT_EQ(spec.block_size_bytes(), 128u * 1024u * 4u); + EXPECT_EQ(spec.tag, "hca_state"); +} + +// ============================================================ +// Pool 0/1/2 shared properties: same tokens_per_block, same num_blocks +// ============================================================ + +TEST(HybridPoolConfigCreatorTest, PagedPoolsShareTokensPerBlock) { + // Pro config + { + ParallelismConfig pc; + auto config = + CacheConfigCreator::createBasicConfig(makeProModelConfig(), pc, 
makeDsv4KvCacheConfig(), false, 0); + for (const auto& tag : {"csa_kv", "hca_kv", "indexer_kv", "swa_kv"}) { + EXPECT_EQ(config.group_seq_size_per_block[gidForTag(config, tag)], kDsv4TokensPerBlock) << tag; + } + } + // Flash config + { + ParallelismConfig pc; + auto config = + CacheConfigCreator::createBasicConfig(makeFlashModelConfig(), pc, makeDsv4KvCacheConfig(), false, 0); + for (const auto& tag : {"csa_kv", "hca_kv", "indexer_kv"}) { + EXPECT_EQ(config.group_seq_size_per_block[gidForTag(config, tag)], kDsv4TokensPerBlock) << tag; + } + } +} + +TEST(HybridPoolConfigCreatorTest, AllPagedPoolsShareBlockNum) { + auto mc = makeProModelConfig(); + ParallelismConfig pc; + auto config = CacheConfigCreator::createBasicConfig(mc, pc, makeDsv4KvCacheConfig(), false, 0); + config.block_num = 100; + + // Paged groups derive their block count from the global block_num; explicit + // independent groups may override it with per-group fixed block counts. + EXPECT_EQ(config.groupNums(), 7); + for (int i = 0; i < 7; i++) { + EXPECT_GT(config.specForGroup(i)->block_size_bytes(), 0u) << "pool " << i; + } +} + +TEST(HybridPoolConfigCreatorTest, DSV4StateSwaPoolsFollowGlobalBlocks) { + auto mc = makeProModelConfig(); + ParallelismConfig pc; + RuntimeConfig runtime_config; + KVCacheConfig kv_cache_config; + kv_cache_config.seq_size_per_block = 128; + kv_cache_config.test_block_num = 100; + setDsv4ExplicitPoolBlocks(mc, "hca_state", 0); + runtime_config.max_generate_batch_size = 5; + runtime_config.fifo_scheduler_config.max_context_batch_size = 3; + + auto config = CacheConfigCreator::createConfig(mc, pc, runtime_config, kv_cache_config); + + ASSERT_EQ(config.groupBlockNumsSnapshot().size(), static_cast(kDsv4PoolNum)); + for (int gid = 0; gid < kDsv4PoolNum; ++gid) { + EXPECT_EQ(config.blockNumForGroup(gid), 100u) << "gid=" << gid; + } + EXPECT_EQ(config.explicitly_sized_pool_reserve_bytes, 0u); +} + +TEST(HybridPoolConfigCreatorTest, DSV4HcaStatePoolBlocksOverridesOnlyHcaState) { + 
auto mc = makeProModelConfig(); + ParallelismConfig pc; + RuntimeConfig runtime_config; + KVCacheConfig kv_cache_config; + kv_cache_config.seq_size_per_block = 128; + kv_cache_config.test_block_num = 100; + setDsv4ExplicitPoolBlocks(mc, "hca_state", 350); + runtime_config.max_generate_batch_size = 5; + runtime_config.fifo_scheduler_config.max_context_batch_size = 3; + + auto config = CacheConfigCreator::createConfig(mc, pc, runtime_config, kv_cache_config); + + ASSERT_EQ(config.groupBlockNumsSnapshot().size(), static_cast(kDsv4PoolNum)); + const auto hca_state_gid = gidForTag(config, "hca_state"); + for (size_t gid = 0; gid < static_cast(config.groupNums()); ++gid) { + const uint32_t expected = gid == hca_state_gid ? 350u : 100u; + EXPECT_EQ(config.blockNumForGroup(gid), expected) << "gid=" << gid; + } + + const size_t expected_reserve = 350u * config.blockSizeBytesForGroup(hca_state_gid); + EXPECT_EQ(config.explicitly_sized_pool_reserve_bytes, expected_reserve); + ASSERT_EQ(config.groupPoliciesSnapshot().size(), static_cast(kDsv4PoolNum)); + EXPECT_EQ(config.policyForGroup(hca_state_gid).explicit_block_num, 350u); + for (size_t gid = 0; gid < config.groupPoliciesSnapshot().size(); ++gid) { + if (gid != hca_state_gid) { + EXPECT_EQ(config.policyForGroup(gid).explicit_block_num, 0u) << "gid=" << gid; + } + } +} + +TEST(CacheConfigTest, DSV4KernelSeqSizeAllowsDecoupledPhysicalBlocks) { + auto mc = makeProModelConfig(); + ParallelismConfig pc; + RuntimeConfig runtime_config; + runtime_config.max_generate_batch_size = 2; + runtime_config.fifo_scheduler_config.max_context_batch_size = 1; + + auto create_config = [&](int seq_size_per_block, int kernel_seq_size_per_block) { + KVCacheConfig kv_cache_config; + kv_cache_config.seq_size_per_block = seq_size_per_block; + kv_cache_config.kernel_seq_size_per_block = kernel_seq_size_per_block; + kv_cache_config.test_block_num = 100; + return CacheConfigCreator::createConfig(mc, pc, runtime_config, kv_cache_config); + }; + + auto 
old_valid = create_config(128, 128); + EXPECT_EQ(old_valid.seq_size_per_block, 128u); + EXPECT_EQ(old_valid.kernel_seq_size_per_block, 128u); + EXPECT_EQ(old_valid.kernelBlocksPerKvBlock(), 1u); + + auto decoupled = create_config(16384, 128); + EXPECT_EQ(decoupled.seq_size_per_block, 16384u); + EXPECT_EQ(decoupled.kernel_seq_size_per_block, 128u); + EXPECT_EQ(decoupled.kernelBlocksPerKvBlock(), 128u); +} + +TEST(CacheConfigTest, DSV4KernelSeqSizeRejectsInvalidPhysicalKernelShape) { + auto mc = makeProModelConfig(); + ParallelismConfig pc; + RuntimeConfig runtime_config; + runtime_config.max_generate_batch_size = 2; + runtime_config.fifo_scheduler_config.max_context_batch_size = 1; + + auto create_config = [&](int seq_size_per_block, int kernel_seq_size_per_block) { + KVCacheConfig kv_cache_config; + kv_cache_config.seq_size_per_block = seq_size_per_block; + kv_cache_config.kernel_seq_size_per_block = kernel_seq_size_per_block; + kv_cache_config.test_block_num = 100; + return CacheConfigCreator::createConfig(mc, pc, runtime_config, kv_cache_config); + }; + + EXPECT_THROW((void)create_config(16384, 64), std::exception); + EXPECT_THROW((void)create_config(16384, 384), std::exception); +} + +TEST(HybridPoolConfigCreatorTest, DSV4HcaStatePoolBlocksIndependentOfMaxConcurrency) { + for (uint32_t max_concurrency : {1u, 2u, 8u}) { + auto mc = makeProModelConfig(); + ParallelismConfig pc; + RuntimeConfig runtime_config; + KVCacheConfig kv_cache_config; + kv_cache_config.seq_size_per_block = 128; + kv_cache_config.test_block_num = 100; + setDsv4ExplicitPoolBlocks(mc, "hca_state", 256); + runtime_config.max_generate_batch_size = max_concurrency; + runtime_config.fifo_scheduler_config.max_context_batch_size = 1; + + auto config = CacheConfigCreator::createConfig(mc, pc, runtime_config, kv_cache_config); + + ASSERT_EQ(config.groupBlockNumsSnapshot().size(), static_cast(kDsv4PoolNum)); + const auto hca_state_gid = gidForTag(config, "hca_state"); + for (int gid = 0; gid < 
kDsv4PoolNum; ++gid) { + const uint32_t expected = static_cast(gid) == hca_state_gid ? 256u : 100u; + EXPECT_EQ(config.blockNumForGroup(gid), expected) + << "gid=" << gid << " max_concurrency=" << max_concurrency; + } + } +} + +TEST(HybridPoolConfigCreatorTest, DSV4HcaStatePoolBlocksCanBeOverriddenByConfig) { + auto mc = makeProModelConfig(); + ParallelismConfig pc; + RuntimeConfig runtime_config; + KVCacheConfig kv_cache_config; + kv_cache_config.seq_size_per_block = 128; + kv_cache_config.test_block_num = 100; + setDsv4ExplicitPoolBlocks(mc, "hca_state", 6); + runtime_config.max_generate_batch_size = 2; + runtime_config.fifo_scheduler_config.max_context_batch_size = 1; + + auto config = CacheConfigCreator::createConfig(mc, pc, runtime_config, kv_cache_config); + + ASSERT_EQ(config.groupBlockNumsSnapshot().size(), static_cast(kDsv4PoolNum)); + const auto hca_state_gid = gidForTag(config, "hca_state"); + for (int gid = 0; gid < kDsv4PoolNum; ++gid) { + const uint32_t expected = static_cast(gid) == hca_state_gid ? 
6u : 100u; + EXPECT_EQ(config.blockNumForGroup(gid), expected) << "gid=" << gid; + } +} + +TEST(CacheConfigTest, ModelSpecCloneKeepsExistingConfigStable) { + ModelConfig model_config; + model_config.num_layers = 2; + model_config.attn_config.kv_head_num = 4; + model_config.attn_config.size_per_head = 16; + model_config.attn_config.tokens_per_block = 8; + setDefaultKvCacheSpec(model_config); + + ParallelismConfig pc_tp1; + pc_tp1.tp_size = 1; + auto config_tp1 = CacheConfigCreator::createBasicConfig(model_config, pc_tp1, KVCacheConfig{}, false, 0); + ASSERT_EQ(static_cast(config_tp1.groupNums()), 1u); + EXPECT_EQ(config_tp1.specForGroup(0)->local_head_num_kv, 4u); + + ParallelismConfig pc_tp2; + pc_tp2.tp_size = 2; + auto config_tp2 = CacheConfigCreator::createBasicConfig(model_config, pc_tp2, KVCacheConfig{}, false, 0); + ASSERT_EQ(static_cast(config_tp2.groupNums()), 1u); + EXPECT_EQ(config_tp2.specForGroup(0)->local_head_num_kv, 2u); + + EXPECT_EQ(config_tp1.specForGroup(0)->local_head_num_kv, 4u); + EXPECT_NE(config_tp1.specForGroup(0).get(), config_tp2.specForGroup(0).get()); +} + +TEST(CacheConfigTest, SpecBuilderDerivesHybridPoolRuntimeFieldsFromContext) { + SpecBuildContext ctx; + ctx.dtype = DataType::TYPE_BF16; + ctx.seq_size_per_block = 128; + ctx.attn_tp_size = 1; + ctx.kernel_tokens_per_block = 128; + ctx.gen_num_per_cycle = 3; + ctx.cp_size = 2; + ctx.cp_prefill_sliced = true; + + KVCacheSpecDesc compressed_desc; + compressed_desc.tag = "compressed"; + compressed_desc.cache_type = CacheType::COMPRESSED_KV; + compressed_desc.entry_elems = 16; + compressed_desc.compression_ratio = 4; + compressed_desc.store_dtype = DataType::TYPE_UINT8; + compressed_desc.extra.derive_entries_from_kernel_block = true; + compressed_desc.extra.use_fixed_region_cp_tokens = true; + + auto compressed = std::dynamic_pointer_cast(SpecBuilder::build(compressed_desc, ctx)); + ASSERT_NE(compressed, nullptr); + EXPECT_EQ(compressed->entries_per_block, 32u); + 
EXPECT_EQ(compressed->seq_size_per_block, 256u); + EXPECT_EQ(compressed->dtype, DataType::TYPE_BF16); + + KVCacheSpecDesc state_desc; + state_desc.tag = "state"; + state_desc.cache_type = CacheType::FIXED_STATE; + state_desc.entry_elems = 32; + state_desc.store_dtype = DataType::TYPE_FP32; + state_desc.block_size_bytes_alignment = 64; + state_desc.extra.state_ring_compression_ratio = 4; + state_desc.extra.state_ring_overlap = 1; + state_desc.extra.state_ring_add_gen_num_per_cycle = true; + state_desc.extra.cp_align_entries = true; + state_desc.extra.cp_slice_entries = true; + state_desc.extra.cp_prefill_slice_block_bytes = true; + state_desc.extra.use_fixed_region_cp_tokens = true; + + auto prefill_state = std::dynamic_pointer_cast(SpecBuilder::build(state_desc, ctx)); + ASSERT_NE(prefill_state, nullptr); + EXPECT_EQ(prefill_state->entries_per_block, 6u); + EXPECT_EQ(prefill_state->block_size_bytes_override, 384u); + EXPECT_EQ(prefill_state->seq_size_per_block, 256u); + + ctx.cp_prefill_sliced = false; + auto decode_state = std::dynamic_pointer_cast(SpecBuilder::build(state_desc, ctx)); + ASSERT_NE(decode_state, nullptr); + EXPECT_EQ(decode_state->entries_per_block, 12u); + EXPECT_EQ(decode_state->block_size_bytes_override, 0u); + EXPECT_EQ(decode_state->seq_size_per_block, 256u); +} + +TEST(CacheConfigTest, FinalizeBlockNumsIsNoopForSingleAndSharedHybridConfig) { + RuntimeConfig runtime_config; + runtime_config.max_generate_batch_size = 8; + runtime_config.fifo_scheduler_config.max_context_batch_size = 4; + + ParallelismConfig pc; + ModelConfig single_model_config; + single_model_config.num_layers = 1; + single_model_config.attn_config.kv_head_num = 1; + single_model_config.attn_config.size_per_head = 1; + single_model_config.attn_config.tokens_per_block = 1; + setDefaultKvCacheSpec(single_model_config); + auto single_config = CacheConfigCreator::createBasicConfig(single_model_config, pc, KVCacheConfig{}, false, 0); + single_config.finalizeBlockNums(123, 
runtime_config); + EXPECT_TRUE(single_config.groupBlockNumsSnapshot().empty()); + EXPECT_EQ(single_config.explicitly_sized_pool_reserve_bytes, 0u); + + auto hybrid_config = + CacheConfigCreator::createBasicConfig(makeHybridAttentionModelConfig(false), pc, KVCacheConfig{}, false, 0); + hybrid_config.finalizeBlockNums(123, runtime_config); + EXPECT_FALSE(hybrid_config.use_independent_block_pools); + EXPECT_TRUE(hybrid_config.groupBlockNumsSnapshot().empty()); + EXPECT_EQ(hybrid_config.explicitly_sized_pool_reserve_bytes, 0u); +} + +TEST(CacheConfigTest, FinalizeBlockNumsAppliesToIndependentPools) { + RuntimeConfig runtime_config; + runtime_config.max_generate_batch_size = 5; + runtime_config.fifo_scheduler_config.max_context_batch_size = 3; + + ParallelismConfig pc; + auto config = CacheConfigCreator::createBasicConfig(makeProModelConfig(), pc, makeDsv4KvCacheConfig(), false, 0); + config.finalizeBlockNums(100, runtime_config); + + ASSERT_EQ(config.groupBlockNumsSnapshot().size(), static_cast(kDsv4PoolNum)); + const auto hca_state_gid = gidForTag(config, "hca_state"); + for (int gid = 0; gid < kDsv4PoolNum; ++gid) { + const uint32_t expected = static_cast(gid) == hca_state_gid ? 
256u : 100u; + EXPECT_EQ(config.blockNumForGroup(gid), expected) << "gid=" << gid; + } + EXPECT_EQ(config.explicitly_sized_pool_reserve_bytes, 256u * config.blockSizeBytesForGroup(hca_state_gid)); +} + +TEST(CacheConfigTest, HcaStateReserveDeductedFromPagedBudget) { + auto mc = makeProModelConfig(); + ParallelismConfig pc; + RuntimeConfig runtime_config; + runtime_config.max_generate_batch_size = 4; + runtime_config.fifo_scheduler_config.max_context_batch_size = 2; + + const uint32_t small_hca_state_pool = 32; + const uint32_t large_hca_state_pool = 256; + + KVCacheConfig kv_cache_config_with; + kv_cache_config_with.seq_size_per_block = 128; + kv_cache_config_with.kv_cache_mem_mb = 65536; + setDsv4ExplicitPoolBlocks(mc, "hca_state", small_hca_state_pool); + auto config_with = CacheConfigCreator::createConfig(mc, pc, runtime_config, kv_cache_config_with); + + KVCacheConfig kv_cache_config_without; + kv_cache_config_without.seq_size_per_block = 128; + kv_cache_config_without.kv_cache_mem_mb = 65536; + setDsv4ExplicitPoolBlocks(mc, "hca_state", large_hca_state_pool); + auto config_without = CacheConfigCreator::createConfig(mc, pc, runtime_config, kv_cache_config_without); + + // More HCA_STATE blocks reserve more HBM and leave fewer blocks for the global pools. 
+ EXPECT_GT(config_with.block_num, config_without.block_num); + EXPECT_EQ(config_with.blockNumForGroup(gidForTag(config_with, "hca_kv")), + static_cast(config_with.block_num)); + EXPECT_EQ(config_without.blockNumForGroup(gidForTag(config_without, "hca_kv")), + static_cast(config_without.block_num)); + EXPECT_EQ(config_with.blockNumForGroup(gidForTag(config_with, "hca_state")), small_hca_state_pool); + EXPECT_EQ(config_without.blockNumForGroup(gidForTag(config_without, "hca_state")), large_hca_state_pool); + const size_t expected_reserve = + static_cast(small_hca_state_pool) * config_with.blockSizeBytesForGroup(gidForTag(config_with, "hca_state")); + EXPECT_EQ(config_with.explicitly_sized_pool_reserve_bytes, expected_reserve); +} + +TEST(CacheConfigTest, DSV4ExplicitHcaStatePoolBlocksIgnoreLinearStep) { + RuntimeConfig runtime_config; + runtime_config.max_generate_batch_size = 4; + runtime_config.fifo_scheduler_config.max_context_batch_size = 2; + + ParallelismConfig pc; + KVCacheConfig kv_cache_config = makeDsv4KvCacheConfig(); + auto config = CacheConfigCreator::createBasicConfig(makeProModelConfig(), pc, kv_cache_config, false, 0); + config.linear_step = 4; + config.finalizeBlockNums(100, runtime_config); + + // FULL groups: unaffected by step, get global_block_num + const auto hca_state_gid = gidForTag(config, "hca_state"); + for (size_t gid = 0; gid < static_cast(config.groupNums()); ++gid) { + const uint32_t expected = gid == hca_state_gid ? 
256u : 100u; + EXPECT_EQ(config.blockNumForGroup(gid), expected) << "gid=" << gid; + } + const size_t expected_reserve = 256u * config.blockSizeBytesForGroup(hca_state_gid); + EXPECT_EQ(config.explicitly_sized_pool_reserve_bytes, expected_reserve); +} + +TEST(CacheConfigTest, DSV4StateSwaPoolsWithoutExplicitBlocksUseGlobalBlocks) { + RuntimeConfig runtime_config; + runtime_config.max_generate_batch_size = 4; + runtime_config.fifo_scheduler_config.max_context_batch_size = 2; + + ParallelismConfig pc; + KVCacheConfig kv_cache_config; + kv_cache_config.seq_size_per_block = 128; + kv_cache_config.test_block_num = 100; + kv_cache_config.linear_step = 4; + auto mc = makeProModelConfig(); + setDsv4ExplicitPoolBlocks(mc, "hca_state", 0); + + auto config = CacheConfigCreator::createConfig(mc, pc, runtime_config, kv_cache_config); + + ASSERT_EQ(config.groupBlockNumsSnapshot().size(), static_cast(kDsv4PoolNum)); + for (int gid = 0; gid < kDsv4PoolNum; ++gid) { + EXPECT_EQ(config.blockNumForGroup(gid), 100u) << "gid=" << gid; + } + EXPECT_EQ(config.explicitly_sized_pool_reserve_bytes, 0u); +} + +TEST(CacheConfigTest, DSV4MtpKeepsProposeLayerInSwaPool) { + auto score_model_config = makeFlashModelConfig(); + auto propose_model_config = makeFlashMtpModelConfig(); + score_model_config.attn_config.kv_cache_dtype = KvCacheDataType::FP8; + propose_model_config.attn_config.kv_cache_dtype = KvCacheDataType::FP8; + + ParallelismConfig parallelism_config; + RuntimeConfig runtime_config; + runtime_config.max_generate_batch_size = 2; + runtime_config.fifo_scheduler_config.max_context_batch_size = 1; + + KVCacheConfig kv_cache_config; + kv_cache_config.seq_size_per_block = 16384; + kv_cache_config.kernel_seq_size_per_block = 128; + kv_cache_config.test_block_num = 100; + + SpeculativeExecutionConfig sp_config; + sp_config.type = SP_TYPE_MTP; + sp_config.gen_num_per_cycle = 2; + + auto config = CacheConfigCreator::createSpConfig(score_model_config, + propose_model_config, + 
parallelism_config, + runtime_config, + kv_cache_config, + sp_config, + std::nullopt, + true, + false); + + ASSERT_EQ(config.layer_num, 43u); + ASSERT_EQ(config.layer_all_num, 45u); + ASSERT_EQ(config.mtp_sub_configs.size(), 2u); + ASSERT_NE(config.mtp_sub_configs[0], nullptr); + ASSERT_NE(config.mtp_sub_configs[1], nullptr); + + const auto swa_gid = gidForTag(config, "swa_kv"); + EXPECT_EQ(config.layerGroupIdsSnapshot()[43], std::vector({static_cast(swa_gid)})); + EXPECT_EQ(config.layerGroupIdsSnapshot()[44], std::vector({static_cast(swa_gid)})); + EXPECT_EQ(config.groupIdForLayerTag(43, "swa_kv"), static_cast(swa_gid)); + EXPECT_EQ(config.groupIdForLayerTag(44, "swa_kv"), static_cast(swa_gid)); + + EXPECT_EQ(config.layerIdsForGroup(swa_gid).size(), 45u); + + // MTP sub-configs preserve the target/global group namespace. Current + // MTP execution passes block tables by gid without a draft-local remap, so + // unused target groups stay as empty placeholders and the real SWA layer + // keeps the same gid as the target config. 
+ EXPECT_EQ(config.mtp_sub_configs[0]->groupTagsSnapshot(), config.groupTagsSnapshot()); + EXPECT_EQ(config.mtp_sub_configs[1]->groupTagsSnapshot(), config.groupTagsSnapshot()); + EXPECT_EQ(config.mtp_sub_configs[0]->groupIdForLayerTag(0, "swa_kv"), static_cast(swa_gid)); + EXPECT_EQ(config.mtp_sub_configs[1]->groupIdForLayerTag(0, "swa_kv"), static_cast(swa_gid)); + EXPECT_EQ(config.mtp_sub_configs[0]->layerIdsForGroup(swa_gid), std::vector({43})); + EXPECT_EQ(config.mtp_sub_configs[1]->layerIdsForGroup(swa_gid), std::vector({44})); + for (size_t gid = 0; gid < static_cast(config.groupNums()); ++gid) { + if (gid == swa_gid) { + continue; + } + EXPECT_TRUE(config.mtp_sub_configs[0]->layerIdsForGroup(gid).empty()) << config.tagForGroup(gid); + EXPECT_TRUE(config.mtp_sub_configs[1]->layerIdsForGroup(gid).empty()) << config.tagForGroup(gid); + } + EXPECT_EQ(config.seq_size_per_block, 16384u); + EXPECT_EQ(config.kernel_seq_size_per_block, 128u); + EXPECT_EQ(config.kernelBlocksPerKvBlock(), 128u); + EXPECT_EQ(config.mtp_sub_configs[0]->seq_size_per_block, 16384u); + EXPECT_EQ(config.mtp_sub_configs[0]->kernel_seq_size_per_block, 128u); + + EXPECT_EQ(config.explicitly_sized_pool_reserve_bytes, + 256u * config.blockSizeBytesForGroup(gidForTag(config, "hca_state"))); +} + +TEST(HybridPoolConfigCreatorTest, MtpGenNum2RingEntriesMatch) { + // gen_num_per_cycle=2 -> CSA/INDEXER R=10, HCA R=130, SWA R=130. + // Formula: R = ceil_even((1 + overlap) * ratio + gen_num_per_cycle). + // SWA_KV is sized like the HCA state ring (window 128, overlap 0). 
+ auto mc = makeFlashModelConfig(); + ParallelismConfig pc; + auto config = + CacheConfigCreator::createBasicConfig(mc, pc, makeDsv4KvCacheConfig(), false, /*gen_num_per_cycle=*/2); + + ASSERT_EQ(static_cast(config.groupNums()), 7u); + // Pool 3: INDEXER_STATE (ratio=4, overlap=1) → R=10 + auto* indexer_state = dynamic_cast(config.specForGroup(gidForTag(config, "indexer_state")).get()); + ASSERT_NE(indexer_state, nullptr); + EXPECT_EQ(indexer_state->entries_per_block, 10u); + // Pool 4: CSA_STATE (ratio=4, overlap=1) → R=10 + auto* csa_state = dynamic_cast(config.specForGroup(gidForTag(config, "csa_state")).get()); + ASSERT_NE(csa_state, nullptr); + EXPECT_EQ(csa_state->entries_per_block, 10u); + // Pool 5: HCA_STATE (ratio=128, overlap=0) → R=130 + auto* hca_state = dynamic_cast(config.specForGroup(gidForTag(config, "hca_state")).get()); + ASSERT_NE(hca_state, nullptr); + EXPECT_EQ(hca_state->entries_per_block, 130u); + // Pool 6: SWA_KV (window=128, overlap=0) → R=130, same as HCA_STATE + auto* swa_kv = dynamic_cast(config.specForGroup(gidForTag(config, "swa_kv")).get()); + ASSERT_NE(swa_kv, nullptr); + EXPECT_EQ(swa_kv->tag, "swa_kv"); + EXPECT_EQ(swa_kv->entries_per_block, 130u); +} + +TEST(HybridPoolConfigCreatorTest, PrefillCp8MtpGenNum2PadsStateRingBeforeSlicing) { + auto mc = makeFlashModelConfig(); + ParallelismConfig pc; + pc.role_type = RoleType::PREFILL; + pc.tp_size = 8; + pc.prefill_cp_config.kv_cache_sharded = true; + + auto config = CacheConfigCreator::createBasicConfig(mc, pc, makeDsv4KvCacheConfig(), false, 2); + + ASSERT_EQ(static_cast(config.groupNums()), 7u); + auto* indexer_state = dynamic_cast(config.specForGroup(gidForTag(config, "indexer_state")).get()); + auto* csa_state = dynamic_cast(config.specForGroup(gidForTag(config, "csa_state")).get()); + auto* hca_state = dynamic_cast(config.specForGroup(gidForTag(config, "hca_state")).get()); + auto* swa_kv = dynamic_cast(config.specForGroup(gidForTag(config, "swa_kv")).get()); + 
ASSERT_NE(indexer_state, nullptr); + ASSERT_NE(csa_state, nullptr); + ASSERT_NE(hca_state, nullptr); + ASSERT_NE(swa_kv, nullptr); + + // gen_num_per_cycle=2 gives raw INDEXER/CSA R=10, HCA/SWA R=130. + // Fixed state pools are CP-sliced by entries; SWA_KV keeps full logical + // entries and slices its packed bytes instead. + EXPECT_EQ(indexer_state->entries_per_block, 2u); + EXPECT_EQ(csa_state->entries_per_block, 2u); + EXPECT_EQ(hca_state->entries_per_block, 17u); + EXPECT_EQ(swa_kv->entries_per_block, 136u); +} + +TEST(HybridPoolConfigCreatorTest, DecodePrefillCp8MtpGenNum2ExpandsFixedAndSwaSlices) { + constexpr uint32_t cp_size = 8; + auto mc = makeFlashModelConfig(); + + ParallelismConfig prefill_pc; + prefill_pc.role_type = RoleType::PREFILL; + prefill_pc.tp_size = cp_size; + prefill_pc.prefill_cp_config.kv_cache_sharded = true; + + ParallelismConfig decode_pc; + decode_pc.role_type = RoleType::DECODE; + decode_pc.tp_size = 1; + decode_pc.dp_size = cp_size; + decode_pc.world_size = cp_size; + decode_pc.prefill_cp_config.method = CPRotateMethod::PREFILL_CP; + decode_pc.prefill_cp_config.kv_cache_sharded = true; + decode_pc.prefill_cp_config.prefill_cp_size = cp_size; + + auto prefill_config = CacheConfigCreator::createBasicConfig(mc, prefill_pc, makeDsv4KvCacheConfig(), false, 2); + auto decode_config = CacheConfigCreator::createBasicConfig(mc, decode_pc, makeDsv4KvCacheConfig(), false, 2); + + ASSERT_EQ(static_cast(prefill_config.groupNums()), 7u); + ASSERT_EQ(static_cast(decode_config.groupNums()), 7u); + + for (const auto& tag : {"indexer_state", "csa_state", "hca_state"}) { + const auto prefill_gid = gidForTag(prefill_config, tag); + const auto decode_gid = gidForTag(decode_config, tag); + auto* prefill_spec = dynamic_cast(prefill_config.specForGroup(prefill_gid).get()); + auto* decode_spec = dynamic_cast(decode_config.specForGroup(decode_gid).get()); + ASSERT_NE(prefill_spec, nullptr) << tag; + ASSERT_NE(decode_spec, nullptr) << tag; + 
EXPECT_EQ(decode_spec->tag, prefill_spec->tag) << tag; + const auto expected_entries = prefill_spec->entries_per_block * cp_size; + EXPECT_EQ(decode_spec->entries_per_block, expected_entries) << tag; + } + auto* prefill_swa = dynamic_cast( + prefill_config.specForGroup(gidForTag(prefill_config, "swa_kv")).get()); + auto* decode_swa = dynamic_cast( + decode_config.specForGroup(gidForTag(decode_config, "swa_kv")).get()); + ASSERT_NE(prefill_swa, nullptr); + ASSERT_NE(decode_swa, nullptr); + EXPECT_EQ(prefill_swa->entries_per_block, 136u); + EXPECT_EQ(decode_swa->entries_per_block, prefill_swa->entries_per_block); + + auto* indexer_state = dynamic_cast(decode_config.specForGroup(gidForTag(decode_config, "indexer_state")).get()); + auto* csa_state = dynamic_cast(decode_config.specForGroup(gidForTag(decode_config, "csa_state")).get()); + auto* hca_state = dynamic_cast(decode_config.specForGroup(gidForTag(decode_config, "hca_state")).get()); + auto* swa_kv = dynamic_cast(decode_config.specForGroup(gidForTag(decode_config, "swa_kv")).get()); + ASSERT_NE(indexer_state, nullptr); + ASSERT_NE(csa_state, nullptr); + ASSERT_NE(hca_state, nullptr); + ASSERT_NE(swa_kv, nullptr); + + EXPECT_EQ(indexer_state->entries_per_block, 16u); + EXPECT_EQ(csa_state->entries_per_block, 16u); + EXPECT_EQ(hca_state->entries_per_block, 136u); + EXPECT_EQ(swa_kv->entries_per_block, 136u); + for (const auto& tag : {"indexer_state", "csa_state", "hca_state", "swa_kv"}) { + const auto prefill_gid = gidForTag(prefill_config, tag); + const auto decode_gid = gidForTag(decode_config, tag); + EXPECT_EQ(prefill_config.group_seq_size_per_block[prefill_gid], kDsv4TokensPerBlock * cp_size) << tag; + EXPECT_EQ(decode_config.group_seq_size_per_block[decode_gid], kDsv4TokensPerBlock * cp_size) << tag; + } +} + +TEST(HybridPoolConfigCreatorTest, DecodeExplicitPrefillCpSizeHandlesDp16) { + constexpr uint32_t cp_size = 8; + auto mc = makeFlashModelConfig(); + + ParallelismConfig prefill_pc; + prefill_pc.role_type 
= RoleType::PREFILL; + prefill_pc.tp_size = cp_size; + prefill_pc.prefill_cp_config.kv_cache_sharded = true; + + ParallelismConfig decode_pc; + decode_pc.role_type = RoleType::DECODE; + decode_pc.tp_size = 1; + decode_pc.dp_size = 16; + decode_pc.world_size = 16; + decode_pc.prefill_cp_config.method = CPRotateMethod::PREFILL_CP; + decode_pc.prefill_cp_config.kv_cache_sharded = true; + decode_pc.prefill_cp_config.prefill_cp_size = cp_size; + + auto prefill_config = CacheConfigCreator::createBasicConfig(mc, prefill_pc, makeDsv4KvCacheConfig(), false, 2); + auto decode_config = CacheConfigCreator::createBasicConfig(mc, decode_pc, makeDsv4KvCacheConfig(), false, 2); + + for (const auto& tag : {"indexer_state", "csa_state", "hca_state"}) { + const auto prefill_gid = gidForTag(prefill_config, tag); + const auto decode_gid = gidForTag(decode_config, tag); + auto* prefill_spec = dynamic_cast(prefill_config.specForGroup(prefill_gid).get()); + auto* decode_spec = dynamic_cast(decode_config.specForGroup(decode_gid).get()); + ASSERT_NE(prefill_spec, nullptr) << tag; + ASSERT_NE(decode_spec, nullptr) << tag; + const auto expected_entries = prefill_spec->entries_per_block * cp_size; + EXPECT_EQ(decode_spec->entries_per_block, expected_entries) << tag; + EXPECT_EQ(prefill_config.group_seq_size_per_block[prefill_gid], kDsv4TokensPerBlock * cp_size) << tag; + EXPECT_EQ(decode_config.group_seq_size_per_block[decode_gid], kDsv4TokensPerBlock * cp_size) << tag; + } + auto* prefill_swa = dynamic_cast( + prefill_config.specForGroup(gidForTag(prefill_config, "swa_kv")).get()); + auto* decode_swa = dynamic_cast( + decode_config.specForGroup(gidForTag(decode_config, "swa_kv")).get()); + ASSERT_NE(prefill_swa, nullptr); + ASSERT_NE(decode_swa, nullptr); + EXPECT_EQ(prefill_swa->entries_per_block, 136u); + EXPECT_EQ(decode_swa->entries_per_block, prefill_swa->entries_per_block); + EXPECT_EQ(prefill_config.group_seq_size_per_block[gidForTag(prefill_config, "swa_kv")], + kDsv4TokensPerBlock * 
cp_size); + EXPECT_EQ(decode_config.group_seq_size_per_block[gidForTag(decode_config, "swa_kv")], + kDsv4TokensPerBlock * cp_size); +} + +TEST(CacheConfigTest, DSV4NonMtpSpConfigDoesNotInflateRing) { + // SP_TYPE_NONE with default gen_num_per_cycle=1 must NOT inflate state ring. + // Non-MTP DSV4 ring: R = ceil_even((1+overlap)*ratio + 0) = 8 for CSA. + auto mc = makeFlashModelConfig(); + ParallelismConfig pc; + RuntimeConfig rc; + rc.max_generate_batch_size = 2; + rc.fifo_scheduler_config.max_context_batch_size = 1; + KVCacheConfig kvc; + kvc.seq_size_per_block = 128; + kvc.kernel_seq_size_per_block = 128; + kvc.test_block_num = 50; + SpeculativeExecutionConfig sp_none; // type=SP_TYPE_NONE, gen_num_per_cycle=1 + auto config = CacheConfigCreator::createConfig(mc, pc, rc, kvc, std::nullopt, std::make_optional(sp_none)); + ASSERT_EQ(static_cast(config.groupNums()), 7u); + // CSA_STATE (pool 4): ratio=4, overlap=1, gen_num=0 → R=8 + auto* csa = dynamic_cast(config.specForGroup(gidForTag(config, "csa_state")).get()); + ASSERT_NE(csa, nullptr); + EXPECT_EQ(csa->entries_per_block, 8u) << "SP_TYPE_NONE should not inflate ring"; +} + +TEST(HybridPoolConfigCreatorTest, BlockIdConsistencyAcrossGroups) { + // DSV4 has multiple semantic cache tags per logical layer. The config must expose + // every tag's group id for the layer so model/runtime code can request the + // correct group by tag. + auto mc = makeProModelConfig(); + ParallelismConfig pc; + auto config = CacheConfigCreator::createBasicConfig(mc, pc, makeDsv4KvCacheConfig(), false, 0); + + // Verify every layer exposes its complete group ids directly. + const auto layer_group_ids = config.layerGroupIdsSnapshot(); + EXPECT_EQ(layer_group_ids.size(), 61u); + for (size_t i = 0; i < layer_group_ids.size(); i++) { + EXPECT_FALSE(layer_group_ids[i].empty()) << "layer " << i; + } + + // Verify group layer ids: each group has the correct layer list. 
+ EXPECT_EQ(config.layerIdsForGroup(gidForTag(config, "csa_kv")), + config.layerIdsForGroup(gidForTag(config, "indexer_kv"))); + EXPECT_EQ(config.layerIdsForGroup(gidForTag(config, "csa_kv")), + config.layerIdsForGroup(gidForTag(config, "indexer_state"))); + EXPECT_EQ(config.layerIdsForGroup(gidForTag(config, "csa_kv")), + config.layerIdsForGroup(gidForTag(config, "csa_state"))); + EXPECT_EQ(config.layerIdsForGroup(gidForTag(config, "hca_kv")), + config.layerIdsForGroup(gidForTag(config, "hca_state"))); +} + +// ============================================================ +// Helper: build a DSV4 CacheConfig with block_num set for allocator tests +// ============================================================ + +static CacheConfig makeDSV4AllocatorConfig(bool use_flash = false) { + auto mc = use_flash ? makeFlashModelConfig() : makeProModelConfig(); + ParallelismConfig pc; + auto config = CacheConfigCreator::createBasicConfig(mc, pc, makeDsv4KvCacheConfig(), false, 0); + // Set enough blocks for tests (7 groups × N blocks each) + config.block_num = 200; + return config; +} + +static CacheConfig makeDSV4CpAllocatorConfig(uint32_t cp_size) { + auto mc = makeProModelConfig(); + ParallelismConfig pc; + pc.role_type = RoleType::PREFILL; + pc.tp_size = cp_size; + pc.prefill_cp_config.kv_cache_sharded = true; + auto config = CacheConfigCreator::createBasicConfig(mc, pc, makeDsv4KvCacheConfig(), false, 0); + config.block_num = 200; + setGroupBlockNumsForTest(config, std::vector(static_cast(config.groupNums()), config.block_num)); + return config; +} + +// ============================================================ +// HybridTypeKVCacheAllocator integration tests with DSV4 7-group config +// ============================================================ + +class DSV4AllocatorTest: public ::testing::Test { +protected: + void SetUp() override { + rtp_llm::initLogger(); + createDevice(); + } +}; + +TEST_F(DSV4AllocatorTest, InitAndBasicProperties) { + auto config = 
makeDSV4AllocatorConfig(); + auto allocator = std::make_shared(config, AllocationType::DEVICE); + ASSERT_TRUE(allocator->init()); + + // 7 groups → HybridTypeKVCacheAllocator path + EXPECT_EQ(config.groupNums(), 7); + EXPECT_EQ(allocator->seqSizePerBlock(), static_cast(config.seq_size_per_block)); + EXPECT_EQ(allocator->totalBlocksNum(), config.block_num - 1); + EXPECT_EQ(allocator->freeBlocksNum(), config.block_num - 1); +} + +TEST_F(DSV4AllocatorTest, CpPageRrFixedAndSwaAllocateOneBlockPerVirtualBlock) { + constexpr uint32_t cp_size = 4; + auto config = makeDSV4CpAllocatorConfig(cp_size); + auto allocator = std::make_shared(config, AllocationType::DEVICE); + ASSERT_TRUE(allocator->init()); + + const int spb = allocator->seqSizePerBlock(); + const int seq_len = static_cast(cp_size) * spb; + allocator->setCPSlotMapper(std::make_shared(0, static_cast(cp_size), spb)); + + auto batch_res = std::make_shared(); + batch_res->resetBatchSize(1); + initDsv4BatchGroups(*batch_res, config); + batch_res->setBatchCacheKeys(0, CacheKeysType{100, 101, 102, 103}); + + auto cti = std::make_shared(1, 1, seq_len + spb, spb); + auto gi = std::make_shared(); + gi->input_ids = torch::arange(seq_len, torch::kInt32); + gi->generate_config = std::make_shared(); + cti->init(gi); + + MallocInfo info{batch_res, cti}; + info.enable_device_cache = false; + info.reuse_cache = false; + + auto result = allocator->malloc(info); + ASSERT_TRUE(result.success); + for (int gid = 0; gid < 7; ++gid) { + EXPECT_EQ(batch_res->blocksNum(0, gid), 1u) << "gid=" << gid; + } + + FreeInfo free_info{batch_res}; + allocator->free(free_info); +} + +TEST_F(DSV4AllocatorTest, FlashInitAndBasicProperties) { + auto config = makeDSV4AllocatorConfig(/*use_flash=*/true); + auto allocator = std::make_shared(config, AllocationType::DEVICE); + ASSERT_TRUE(allocator->init()); + + EXPECT_EQ(config.groupNums(), 7); + EXPECT_EQ(config.layer_num, 43u); + EXPECT_EQ(allocator->totalBlocksNum(), config.block_num - 1); +} + 
+// Every group must resolve a non-null kv address for block 1 of its first layer.
+TEST_F(DSV4AllocatorTest, AddressLookupAllGroups) {
+    auto config = makeDSV4AllocatorConfig();
+    auto allocator = std::make_shared(config, AllocationType::DEVICE);
+    ASSERT_TRUE(allocator->init());
+
+    // Verify address lookup works for a layer in each group
+    // Group 0 (CSA KV): csa_layer_ids[0]
+    // Group 1 (HCA KV): hca_layer_ids[0]
+    // Group 6 (SWA KV): all_layer_ids[0]
+    for (int gid = 0; gid < 7; gid++) {
+        ASSERT_FALSE(config.layerIdsForGroup(gid).empty()) << "group " << gid << " has no layers";
+        int layer_id = config.layerIdsForGroup(gid)[0];
+        auto addr = allocator->convertIndexToAddr(layer_id, gid, /*block_id=*/1);
+        EXPECT_NE(addr.kv_addr, nullptr) << "null kv_addr for group " << gid << " layer " << layer_id;
+    }
+}
+
+// After init() the allocator exposes a block pool and one defined kv buffer per layer.
+TEST_F(DSV4AllocatorTest, BlockPoolCreatedWithCorrectTensors) {
+    auto config = makeDSV4AllocatorConfig();
+    auto allocator = std::make_shared(config, AllocationType::DEVICE);
+    ASSERT_TRUE(allocator->init());
+
+    auto block_pool = allocator->getBlockPool();
+    ASSERT_NE(block_pool, nullptr);
+
+    // allLayerCacheBase should return tensors for all 61 layers
+    auto layout = allocator->allLayerCacheBase();
+    EXPECT_EQ(layout.layers_to_kv_buffer_ptrs.size(), static_cast(config.layer_num));
+    for (size_t i = 0; i < layout.layers_to_kv_buffer_ptrs.size(); ++i) {
+        EXPECT_TRUE(layout.layers_to_kv_buffer_ptrs[i].defined()) << "undefined kv buffer for layer " << i;
+    }
+}
+
+// Every group must yield a non-empty buffer list with a non-null first address.
+TEST_F(DSV4AllocatorTest, ConvertIndexToBufferAllGroups) {
+    auto config = makeDSV4AllocatorConfig();
+    auto allocator = std::make_shared(config, AllocationType::DEVICE);
+    ASSERT_TRUE(allocator->init());
+
+    // convertIndexToBuffer should work for layers in each of the 7 groups
+    for (int gid = 0; gid < 7; gid++) {
+        // Guard the [0] access below: indexing an empty layer list is undefined
+        // behavior, so fail the test cleanly instead (same guard as AddressLookupAllGroups).
+        ASSERT_FALSE(config.layerIdsForGroup(gid).empty()) << "group " << gid << " has no layers";
+        int layer_id = config.layerIdsForGroup(gid)[0];
+        auto buf = allocator->convertIndexToBuffer(layer_id, gid, /*block_id=*/1);
+        ASSERT_FALSE(buf.empty()) << "empty buffer for group " << gid;
+        EXPECT_NE(buf[0].addr, nullptr) << "null addr for group " << gid;
+    }
+}
+
+// Allocating 3 blocks lowers the free count by 3; freeing them restores it.
+TEST_F(DSV4AllocatorTest, MallocAndFreeBlocks) {
+    auto config = makeDSV4AllocatorConfig();
+    auto allocator = std::make_shared(config, AllocationType::DEVICE);
+    ASSERT_TRUE(allocator->init());
+
+    auto block_pool = allocator->getBlockPool();
+    ASSERT_NE(block_pool, nullptr);
+
+    size_t free_before = allocator->freeBlocksNum();
+    ASSERT_GT(free_before, 3u);
+
+    // Direct block pool malloc/free
+    auto blocks = block_pool->malloc(3);
+    ASSERT_EQ(blocks.size(), 3u);
+    EXPECT_EQ(allocator->freeBlocksNum(), free_before - 3);
+
+    block_pool->requestFree(blocks);
+    EXPECT_EQ(allocator->freeBlocksNum(), free_before);
+}
+
+// Pins the per-tag layer counts and cache group types of the default allocator config.
+TEST_F(DSV4AllocatorTest, SevenGroupLayerMapping) {
+    auto config = makeDSV4AllocatorConfig();
+
+    EXPECT_EQ(config.layerIdsForGroup(gidForTag(config, "csa_kv")).size(), 30u);
+    EXPECT_EQ(config.layerIdsForGroup(gidForTag(config, "hca_kv")).size(), 31u);
+    EXPECT_EQ(config.layerIdsForGroup(gidForTag(config, "indexer_kv")).size(), 30u);
+    EXPECT_EQ(config.layerIdsForGroup(gidForTag(config, "indexer_state")).size(), 30u);
+    EXPECT_EQ(config.layerIdsForGroup(gidForTag(config, "csa_state")).size(), 30u);
+    EXPECT_EQ(config.layerIdsForGroup(gidForTag(config, "hca_state")).size(), 31u);
+    EXPECT_EQ(config.layerIdsForGroup(gidForTag(config, "swa_kv")).size(), 61u);
+
+    EXPECT_EQ(config.typeForGroup(gidForTag(config, "csa_kv")), CacheGroupType::FULL);
+    EXPECT_EQ(config.typeForGroup(gidForTag(config, "hca_kv")), CacheGroupType::FULL);
+    EXPECT_EQ(config.typeForGroup(gidForTag(config, "indexer_kv")), CacheGroupType::FULL);
+    EXPECT_EQ(config.typeForGroup(gidForTag(config, "indexer_state")), CacheGroupType::SWA);
+    EXPECT_EQ(config.typeForGroup(gidForTag(config, "csa_state")), CacheGroupType::SWA);
+    EXPECT_EQ(config.typeForGroup(gidForTag(config, "hca_state")), CacheGroupType::SWA);
+    EXPECT_EQ(config.typeForGroup(gidForTag(config, "swa_kv")), CacheGroupType::SWA);
+}
+
+// Pins block_size_bytes() of each tag's spec for the default allocator config.
+TEST_F(DSV4AllocatorTest, SpecBlockSizesMatchPoolSpecs) {
+    auto config = makeDSV4AllocatorConfig();
+
+    ASSERT_EQ(static_cast(config.groupNums()), 7u);
+    EXPECT_EQ(config.specForGroup(gidForTag(config, "csa_kv"))->block_size_bytes(), 32u * kDsv4KvEntryBytes);
+    EXPECT_EQ(config.specForGroup(gidForTag(config, "hca_kv"))->block_size_bytes(), 1u * kDsv4KvEntryBytes);
+    EXPECT_EQ(config.specForGroup(gidForTag(config, "indexer_kv"))->block_size_bytes(), 32u * kDsv4IndexerEntryBytes);
+    EXPECT_EQ(config.specForGroup(gidForTag(config, "indexer_state"))->block_size_bytes(), 8u * 512u * 4u);
+    EXPECT_EQ(config.specForGroup(gidForTag(config, "csa_state"))->block_size_bytes(), 8u * 2048u * 4u);
+    EXPECT_EQ(config.specForGroup(gidForTag(config, "hca_state"))->block_size_bytes(), 128u * 1024u * 4u);
+    EXPECT_EQ(config.specForGroup(gidForTag(config, "swa_kv"))->block_size_bytes(), kDsv4TokensPerBlock * kDsv4KvEntryBytes);
+}
+
+TEST_F(DSV4AllocatorTest, KVBlockStrideIsMaxAcrossGroups) {
+    auto config = makeDSV4AllocatorConfig();
+
+    // kv_block_stride_bytes should be the max block_size_bytes across all 7 pools
+    size_t expected_max = 0;
+    for (int i = 0; i < kDsv4PoolNum; i++) {
+        expected_max = std::max(expected_max, config.specForGroup(i)->block_size_bytes());
+    }
+    EXPECT_EQ(config.kv_block_stride_bytes, expected_max);
+    // HCA_STATE has the largest per-block bytes (128 entries * 1024 * 4)
+    EXPECT_EQ(expected_max, config.specForGroup(gidForTag(config, "hca_state"))->block_size_bytes());
+}
+
+// Only the "hca_state" group is NON_REUSABLE; every other group is REUSABLE.
+TEST_F(DSV4AllocatorTest, HCAStateIsExcludedFromReuseCachePolicy) {
+    auto config = makeDSV4AllocatorConfig();
+    ASSERT_EQ(static_cast(config.groupNums()), 7u);
+    ASSERT_EQ(config.groupPoliciesSnapshot().size(), static_cast(config.groupNums()));
+
+    for (size_t gid = 0; gid < static_cast(config.groupNums()); ++gid) {
+        if (config.tagForGroup(gid) == "hca_state") {
+            EXPECT_EQ(config.policyForGroup(gid).reuse_policy, CacheReusePolicy::NON_REUSABLE)
+                << "HCA_STATE should skip reuse cache";
+        } else {
+            EXPECT_EQ(config.policyForGroup(gid).reuse_policy, CacheReusePolicy::REUSABLE) << "group " << gid;
+        }
+    }
+}
+
+// ============================================================
+// Flash config: allocator integration
+// ============================================================
+
+TEST_F(DSV4AllocatorTest, FlashGroupTypes) {
+    auto config = makeDSV4AllocatorConfig(/*use_flash=*/true);
+
+    // Flash: 21 CSA + 20 HCA + 2 SWA-only = 43 layers
+    EXPECT_EQ(config.layerIdsForGroup(gidForTag(config, "csa_kv")).size(), 21u);
+    EXPECT_EQ(config.layerIdsForGroup(gidForTag(config, "hca_kv")).size(), 20u);
+    EXPECT_EQ(config.layerIdsForGroup(gidForTag(config, "swa_kv")).size(), 43u);
+
+    EXPECT_EQ(config.typeForGroup(gidForTag(config, "csa_kv")), CacheGroupType::FULL);
+    EXPECT_EQ(config.typeForGroup(gidForTag(config, "hca_kv")), CacheGroupType::FULL);
+    EXPECT_EQ(config.typeForGroup(gidForTag(config, "indexer_kv")), CacheGroupType::FULL);
+    EXPECT_EQ(config.typeForGroup(gidForTag(config, "indexer_state")), CacheGroupType::SWA);
+    EXPECT_EQ(config.typeForGroup(gidForTag(config, "csa_state")), CacheGroupType::SWA);
+    EXPECT_EQ(config.typeForGroup(gidForTag(config, "hca_state")), CacheGroupType::SWA);
+    EXPECT_EQ(config.typeForGroup(gidForTag(config, "swa_kv")), CacheGroupType::SWA);
+}
+
+TEST_F(DSV4AllocatorTest, FlashAddressLookupAllGroups) {
+    auto config = makeDSV4AllocatorConfig(/*use_flash=*/true);
+    auto allocator = std::make_shared(config, AllocationType::DEVICE);
+    ASSERT_TRUE(allocator->init());
+
+    for (int gid = 0; gid < 7; gid++) {
+        ASSERT_FALSE(config.layerIdsForGroup(gid).empty()) << "Flash group " << gid << " has no layers";
+        int layer_id = config.layerIdsForGroup(gid)[0];
+        auto addr = allocator->convertIndexToAddr(layer_id, gid, /*block_id=*/1);
+        EXPECT_NE(addr.kv_addr, nullptr) << "Flash null kv_addr for group " << gid;
+    }
+}
+
+TEST_F(DSV4AllocatorTest, FlashBlockPoolTensors) {
+    auto config = makeDSV4AllocatorConfig(/*use_flash=*/true);
+    auto allocator = std::make_shared(config, AllocationType::DEVICE);
+    ASSERT_TRUE(allocator->init());
+
+    auto layout = allocator->allLayerCacheBase();
+    EXPECT_EQ(layout.layers_to_kv_buffer_ptrs.size(), 43u);
+    for (size_t i = 0; i < layout.layers_to_kv_buffer_ptrs.size(); ++i) {
+        EXPECT_TRUE(layout.layers_to_kv_buffer_ptrs[i].defined()) << "Flash undefined kv buffer for layer " << i;
+    }
+}
+
+TEST_F(DSV4AllocatorTest, FlashLayerMapping) {
+    auto config = makeDSV4AllocatorConfig(/*use_flash=*/true);
+
+    EXPECT_EQ(config.layerIdsForGroup(gidForTag(config, "csa_kv")).size(), 21u);
+    EXPECT_EQ(config.layerIdsForGroup(gidForTag(config, "hca_kv")).size(), 20u);
+    EXPECT_EQ(config.layerIdsForGroup(gidForTag(config, "indexer_kv")).size(), 21u);
+    EXPECT_EQ(config.layerIdsForGroup(gidForTag(config, "indexer_state")).size(), 21u);
+    EXPECT_EQ(config.layerIdsForGroup(gidForTag(config, "csa_state")).size(), 21u);
+    EXPECT_EQ(config.layerIdsForGroup(gidForTag(config, "hca_state")).size(), 20u);
+    EXPECT_EQ(config.layerIdsForGroup(gidForTag(config, "swa_kv")).size(), 43u);
+}
+
+TEST_F(DSV4AllocatorTest, FlashSpecBlockSizes) {
+    auto config = makeDSV4AllocatorConfig(/*use_flash=*/true);
+
+    ASSERT_EQ(static_cast(config.groupNums()), 7u);
+    EXPECT_EQ(config.specForGroup(gidForTag(config, "csa_kv"))->block_size_bytes(), 32u * kDsv4KvEntryBytes);
+    EXPECT_EQ(config.specForGroup(gidForTag(config, "hca_kv"))->block_size_bytes(), 1u * kDsv4KvEntryBytes);
+    EXPECT_EQ(config.specForGroup(gidForTag(config, "indexer_kv"))->block_size_bytes(), 32u * kDsv4IndexerEntryBytes);
+    EXPECT_EQ(config.specForGroup(gidForTag(config, "swa_kv"))->block_size_bytes(), kDsv4TokensPerBlock * kDsv4KvEntryBytes);
+}
+
+// Flash variant of MallocAndFreeBlocks: 5 blocks out, 5 blocks back.
+TEST_F(DSV4AllocatorTest, FlashMallocAndFree) {
+    auto config = makeDSV4AllocatorConfig(/*use_flash=*/true);
+    auto allocator = std::make_shared(config, AllocationType::DEVICE);
+    ASSERT_TRUE(allocator->init());
+
+    auto block_pool = allocator->getBlockPool();
+    // Fail fast instead of dereferencing a null pool below (matches MallocAndFreeBlocks).
+    ASSERT_NE(block_pool, nullptr);
+    size_t free_before = allocator->freeBlocksNum();
+    ASSERT_GT(free_before, 5u);
+
+    auto blocks = block_pool->malloc(5);
+    ASSERT_EQ(blocks.size(), 5u);
+    EXPECT_EQ(allocator->freeBlocksNum(), free_before - 5);
+
+    block_pool->requestFree(blocks);
+    EXPECT_EQ(allocator->freeBlocksNum(), free_before);
+}
+
+// ============================================================
+// Prefix cache: insertIntoCache skips HCA_STATE but keeps other groups reusable.
+// ============================================================
+
+TEST_F(DSV4AllocatorTest, InsertIntoCacheAllGroups) {
+    auto config = makeDSV4AllocatorConfig();
+    auto allocator = std::make_shared(config, AllocationType::DEVICE);
+    auto shared_cache = std::make_shared();
+    allocator->setSharedBlockCache(shared_cache);
+    ASSERT_TRUE(allocator->init());
+
+    auto block_pool = allocator->getBlockPool();
+    // Fail fast instead of dereferencing a null pool in the malloc loop below.
+    ASSERT_NE(block_pool, nullptr);
+
+    // Manually set up a BatchKVCacheResource with blocks for all 7 groups
+    auto batch_res = std::make_shared();
+    batch_res->resetBatchSize(1);
+    initDsv4BatchGroups(*batch_res, config);
+
+    CacheKeysType keys = {200, 201, 202, 203};
+    batch_res->setBatchCacheKeys(0, keys);
+
+    // Allocate 3 blocks per group (simulating 3 full blocks)
+    for (int gid = 0; gid < 7; gid++) {
+        auto blocks = block_pool->malloc(3);
+        ASSERT_EQ(blocks.size(), 3u);
+        batch_res->mutableBlockIds(0, gid).assign(BlockIndicesType(blocks.begin(), blocks.end()));
+    }
+
+    // Create CompleteTokenIds: 3 full blocks * seq_size_per_block tokens + partial
+    int seq_size_per_block = allocator->seqSizePerBlock();
+    auto complete_token_ids = std::make_shared(1, 1, 4096, seq_size_per_block);
+    auto generate_input = std::make_shared();
+    int total_tokens = 3 * seq_size_per_block + 1;  // 3 full blocks + 1 partial
+    generate_input->input_ids = torch::arange(total_tokens, torch::kInt32);
+    generate_input->generate_config = std::make_shared();
+    complete_token_ids->init(generate_input);
+
+    InsertInfo insert_info{batch_res, complete_token_ids, /*is_resident=*/false};
+    allocator->insertIntoCache(insert_info);
+
+    // HCA_STATE is runtime scratch state and must not be persisted as reusable prefix cache.
+    for (int gid = 0; gid < 7; gid++) {
+        if (config.tagForGroup(gid) == "hca_state") {
+            EXPECT_TRUE(isNullBlockIdx(shared_cache->matchGroup(200, gid))) << "HCA_STATE should skip key 200";
+            EXPECT_TRUE(isNullBlockIdx(shared_cache->matchGroup(201, gid))) << "HCA_STATE should skip tail key 201";
+            EXPECT_TRUE(isNullBlockIdx(shared_cache->matchGroup(202, gid))) << "HCA_STATE should skip tail key 202";
+            continue;
+        }
+        EXPECT_FALSE(isNullBlockIdx(shared_cache->matchGroup(200, gid))) << config.tagForGroup(gid);
+        if (config.typeForGroup(gid) != CacheGroupType::FULL) {
+            EXPECT_FALSE(isNullBlockIdx(shared_cache->matchGroup(201, gid))) << config.tagForGroup(gid);
+            EXPECT_FALSE(isNullBlockIdx(shared_cache->matchGroup(202, gid))) << config.tagForGroup(gid);
+        }
+    }
+
+    // Free all blocks
+    for (int gid = 0; gid < 7; gid++) {
+        const auto& blocks = batch_res->blocks(0, gid);
+        block_pool->requestFree(blocks);
+    }
+}
+
+// ============================================================
+// Prefix cache: Flash config insertIntoCache skips HCA_STATE.
+// ============================================================ + +TEST_F(DSV4AllocatorTest, FlashInsertIntoCacheAllGroups) { + auto config = makeDSV4AllocatorConfig(/*use_flash=*/true); + auto allocator = std::make_shared(config, AllocationType::DEVICE); + auto shared_cache = std::make_shared(); + allocator->setSharedBlockCache(shared_cache); + ASSERT_TRUE(allocator->init()); + + auto block_pool = allocator->getBlockPool(); + + auto batch_res = std::make_shared(); + batch_res->resetBatchSize(1); + initDsv4BatchGroups(*batch_res, config); + + CacheKeysType keys = {300, 301, 302, 303}; + batch_res->setBatchCacheKeys(0, keys); + + for (int gid = 0; gid < 7; gid++) { + auto blocks = block_pool->malloc(3); + ASSERT_EQ(blocks.size(), 3u); + batch_res->mutableBlockIds(0, gid).assign(BlockIndicesType(blocks.begin(), blocks.end())); + } + + int seq_size_per_block = allocator->seqSizePerBlock(); + auto complete_token_ids = std::make_shared(1, 1, 4096, seq_size_per_block); + auto generate_input = std::make_shared(); + int total_tokens = 3 * seq_size_per_block + 1; + generate_input->input_ids = torch::arange(total_tokens, torch::kInt32); + generate_input->generate_config = std::make_shared(); + complete_token_ids->init(generate_input); + + InsertInfo insert_info{batch_res, complete_token_ids, /*is_resident=*/false}; + allocator->insertIntoCache(insert_info); + + for (int gid = 0; gid < 7; gid++) { + if (config.tagForGroup(gid) == "hca_state") { + EXPECT_TRUE(isNullBlockIdx(shared_cache->matchGroup(300, gid))) << "Flash HCA_STATE should skip key 300"; + EXPECT_TRUE(isNullBlockIdx(shared_cache->matchGroup(301, gid))) << "Flash HCA_STATE should skip tail key 301"; + EXPECT_TRUE(isNullBlockIdx(shared_cache->matchGroup(302, gid))) << "Flash HCA_STATE should skip tail key 302"; + continue; + } + EXPECT_FALSE(isNullBlockIdx(shared_cache->matchGroup(300, gid))) << config.tagForGroup(gid); + if (config.typeForGroup(gid) != CacheGroupType::FULL) { + 
EXPECT_FALSE(isNullBlockIdx(shared_cache->matchGroup(301, gid))) << config.tagForGroup(gid); + EXPECT_FALSE(isNullBlockIdx(shared_cache->matchGroup(302, gid))) << config.tagForGroup(gid); + } + } + + for (int gid = 0; gid < 7; gid++) { + block_pool->requestFree(batch_res->blocks(0, gid)); + } +} + +// ============================================================ +// Prefix cache: paged FULL groups reuse; reusable SWA/state groups require a matched latest tail block. +// ============================================================ + +TEST_F(DSV4AllocatorTest, PrefixCacheReusePagedGroupsOnly) { + auto config = makeDSV4AllocatorConfig(); + auto allocator = std::make_shared(config, AllocationType::DEVICE); + auto shared_cache = std::make_shared(); + allocator->setSharedBlockCache(shared_cache); + ASSERT_TRUE(allocator->init()); + + auto block_pool = allocator->getBlockPool(); + + // Pre-populate cache for ALL 7 groups with keys {100,101,102} + constexpr int group_num = 7; + CacheKeysType cached_keys = {100, 101, 102}; + std::vector> cached_blocks(group_num); + for (int gid = 0; gid < group_num; gid++) { + auto blocks = block_pool->malloc(static_cast(cached_keys.size())); + ASSERT_EQ(blocks.size(), cached_keys.size()); + for (size_t i = 0; i < cached_keys.size(); ++i) { + std::vector group_slots(group_num, NULL_BLOCK_IDX); + group_slots[gid] = blocks[i]; + shared_cache->put(cached_keys[i], group_slots, true); + } + cached_blocks[gid] = blocks; + block_pool->requestFree(blocks); + } + + // Now do a malloc with reuse enabled — keys {100,101,102,103} + auto batch_res = std::make_shared(); + batch_res->resetBatchSize(1); + initDsv4BatchGroups(*batch_res, config); + batch_res->setBatchCacheKeys(0, CacheKeysType{100, 101, 102, 103}); + + int seq_size_per_block = allocator->seqSizePerBlock(); + int seq_len = 3 * seq_size_per_block + 1; // 3 full + partial + auto complete_token_ids = std::make_shared(1, 1, 4096, seq_size_per_block); + auto generate_input = std::make_shared(); + 
generate_input->input_ids = torch::arange(seq_len, torch::kInt32); + generate_input->generate_config = std::make_shared(); + complete_token_ids->init(generate_input); + + MallocInfo info{batch_res, complete_token_ids}; + info.enable_device_cache = true; + info.reuse_cache = true; + auto result = allocator->malloc(info); + ASSERT_TRUE(result.success); + + EXPECT_GT(result.reuse_len, 0) << "Prefix cache reuse should work with paged DSV4 groups"; + + for (int gid = 0; gid < group_num; gid++) { + const auto& out_blocks = batch_res->blocks(0, gid); + ASSERT_GE(out_blocks.size(), 3u) << config.tagForGroup(gid); + if (config.typeForGroup(gid) == CacheGroupType::FULL) { + EXPECT_EQ(out_blocks[0], cached_blocks[gid][0]) << config.tagForGroup(gid); + EXPECT_EQ(out_blocks[1], cached_blocks[gid][1]) << config.tagForGroup(gid); + continue; + } + EXPECT_TRUE(isNullBlockIdx(out_blocks[1])) << config.tagForGroup(gid); + if (config.tagForGroup(gid) == "hca_state") { + EXPECT_TRUE(isNullBlockIdx(out_blocks[2])) << "HCA_STATE should not reuse a cached tail block"; + continue; + } + EXPECT_EQ(out_blocks[2], cached_blocks[gid][2]) << config.tagForGroup(gid); + } + + // Clean up + FreeInfo free_info{batch_res}; + allocator->free(free_info); +} + +TEST_F(DSV4AllocatorTest, PrefixCacheReuseRequiresSWATailHit) { + auto config = makeDSV4AllocatorConfig(); + auto allocator = std::make_shared(config, AllocationType::DEVICE); + auto shared_cache = std::make_shared(); + allocator->setSharedBlockCache(shared_cache); + ASSERT_TRUE(allocator->init()); + + auto block_pool = allocator->getBlockPool(); + + constexpr int group_num = 7; + CacheKeysType cached_keys = {100, 101, 102}; + std::vector> cached_blocks(3); + for (int gid = 0; gid < 3; gid++) { + auto blocks = block_pool->malloc(static_cast(cached_keys.size())); + ASSERT_EQ(blocks.size(), cached_keys.size()); + for (size_t i = 0; i < cached_keys.size(); ++i) { + std::vector group_slots(group_num, NULL_BLOCK_IDX); + group_slots[gid] = blocks[i]; 
+ shared_cache->put(cached_keys[i], group_slots, true); + } + cached_blocks[gid] = blocks; + block_pool->requestFree(blocks); + } + + auto batch_res = std::make_shared(); + batch_res->resetBatchSize(1); + initDsv4BatchGroups(*batch_res, config); + batch_res->setBatchCacheKeys(0, CacheKeysType{100, 101, 102, 103}); + + int seq_size_per_block = allocator->seqSizePerBlock(); + int seq_len = 3 * seq_size_per_block + 1; + auto complete_token_ids = std::make_shared(1, 1, 4096, seq_size_per_block); + auto generate_input = std::make_shared(); + generate_input->input_ids = torch::arange(seq_len, torch::kInt32); + generate_input->generate_config = std::make_shared(); + complete_token_ids->init(generate_input); + + MallocInfo info{batch_res, complete_token_ids}; + info.enable_device_cache = true; + info.reuse_cache = true; + auto result = allocator->malloc(info); + ASSERT_TRUE(result.success); + + EXPECT_EQ(result.reuse_len, 0) << "SWA tail miss should veto paged prefix reuse"; + + FreeInfo free_info{batch_res}; + allocator->free(free_info); +} + +TEST_F(DSV4AllocatorTest, PrefixCacheReuseDoesNotRequireHCAStateHit) { + auto config = makeDSV4AllocatorConfig(); + auto allocator = std::make_shared(config, AllocationType::DEVICE); + auto shared_cache = std::make_shared(); + allocator->setSharedBlockCache(shared_cache); + ASSERT_TRUE(allocator->init()); + + auto block_pool = allocator->getBlockPool(); + + constexpr int group_num = 7; + CacheKeysType cached_keys = {1100, 1101, 1102}; + std::vector> cached_blocks(group_num); + for (int gid = 0; gid < group_num; gid++) { + if (config.tagForGroup(gid) == "hca_state") { + continue; + } + auto blocks = block_pool->malloc(static_cast(cached_keys.size())); + ASSERT_EQ(blocks.size(), cached_keys.size()); + for (size_t i = 0; i < cached_keys.size(); ++i) { + if (config.typeForGroup(gid) != CacheGroupType::FULL && i + 1 < cached_keys.size()) { + continue; + } + std::vector group_slots(group_num, NULL_BLOCK_IDX); + group_slots[gid] = 
blocks[i]; + shared_cache->put(cached_keys[i], group_slots, true); + } + cached_blocks[gid] = blocks; + block_pool->requestFree(blocks); + } + + auto batch_res = std::make_shared(); + batch_res->resetBatchSize(1); + initDsv4BatchGroups(*batch_res, config); + batch_res->setBatchCacheKeys(0, CacheKeysType{1100, 1101, 1102, 1103}); + + const int spb = allocator->seqSizePerBlock(); + auto cti = std::make_shared(1, 1, 4096, spb); + auto gi = std::make_shared(); + gi->input_ids = torch::arange(3 * spb + 1, torch::kInt32); + gi->generate_config = std::make_shared(); + cti->init(gi); + + MallocInfo info{batch_res, cti}; + info.enable_device_cache = true; + info.reuse_cache = true; + auto result = allocator->malloc(info); + ASSERT_TRUE(result.success); + + EXPECT_GT(result.reuse_len, 0) << "HCA_STATE miss should not veto DSV4 prefix reuse"; + const auto hca_state_gid = gidForTag(config, "hca_state"); + const auto swa_gid = gidForTag(config, "swa_kv"); + EXPECT_TRUE(isNullBlockIdx(batch_res->blocks(0, hca_state_gid).at(2))) << "HCA_STATE should remain non-reused"; + EXPECT_EQ(batch_res->blocks(0, swa_gid).at(2), cached_blocks[swa_gid][2]) << "SWA_KV tail should still gate reuse"; + + FreeInfo free_info{batch_res}; + allocator->free(free_info); +} + +TEST_F(DSV4AllocatorTest, PrefixCacheReuseAcceptsSingleLatestSWATailHit) { + auto config = makeDSV4AllocatorConfig(); + auto allocator = std::make_shared(config, AllocationType::DEVICE); + auto shared_cache = std::make_shared(); + allocator->setSharedBlockCache(shared_cache); + ASSERT_TRUE(allocator->init()); + + auto block_pool = allocator->getBlockPool(); + + constexpr int group_num = 7; + CacheKeysType cached_keys = {100, 101, 102}; + for (int gid = 0; gid < group_num; gid++) { + auto blocks = block_pool->malloc(static_cast(cached_keys.size())); + ASSERT_EQ(blocks.size(), cached_keys.size()); + for (size_t i = 0; i < cached_keys.size(); ++i) { + if (config.typeForGroup(gid) != CacheGroupType::FULL && i + 1 < 
cached_keys.size()) { + continue; + } + std::vector group_slots(group_num, NULL_BLOCK_IDX); + group_slots[gid] = blocks[i]; + shared_cache->put(cached_keys[i], group_slots, true); + } + block_pool->requestFree(blocks); + } + + auto batch_res = std::make_shared(); + batch_res->resetBatchSize(1); + initDsv4BatchGroups(*batch_res, config); + batch_res->setBatchCacheKeys(0, CacheKeysType{100, 101, 102, 103}); + + const int spb = allocator->seqSizePerBlock(); + auto cti = std::make_shared(1, 1, 4096, spb); + auto gi = std::make_shared(); + gi->input_ids = torch::arange(3 * spb + 1, torch::kInt32); + gi->generate_config = std::make_shared(); + cti->init(gi); + + MallocInfo info{batch_res, cti}; + info.enable_device_cache = true; + info.reuse_cache = true; + auto result = allocator->malloc(info); + ASSERT_TRUE(result.success); + + EXPECT_GT(result.reuse_len, 0) << "latest SWA tail hit should allow paged prefix reuse"; + + FreeInfo free_info{batch_res}; + allocator->free(free_info); +} + +TEST_F(DSV4AllocatorTest, FlashPrefixCacheReusePagedGroupsOnly) { + auto config = makeDSV4AllocatorConfig(/*use_flash=*/true); + auto allocator = std::make_shared(config, AllocationType::DEVICE); + auto shared_cache = std::make_shared(); + allocator->setSharedBlockCache(shared_cache); + ASSERT_TRUE(allocator->init()); + + auto block_pool = allocator->getBlockPool(); + + constexpr int group_num = 7; + CacheKeysType cached_keys = {500, 501, 502}; + std::vector> cached_blocks(group_num); + for (int gid = 0; gid < group_num; gid++) { + auto blocks = block_pool->malloc(static_cast(cached_keys.size())); + ASSERT_EQ(blocks.size(), cached_keys.size()); + for (size_t i = 0; i < cached_keys.size(); ++i) { + std::vector group_slots(group_num, NULL_BLOCK_IDX); + group_slots[gid] = blocks[i]; + shared_cache->put(cached_keys[i], group_slots, true); + } + cached_blocks[gid] = blocks; + block_pool->requestFree(blocks); + } + + auto batch_res = std::make_shared(); + batch_res->resetBatchSize(1); + 
initDsv4BatchGroups(*batch_res, config); + batch_res->setBatchCacheKeys(0, CacheKeysType{500, 501, 502, 503}); + + int seq_size_per_block = allocator->seqSizePerBlock(); + int seq_len = 3 * seq_size_per_block + 1; + auto complete_token_ids = std::make_shared(1, 1, 4096, seq_size_per_block); + auto generate_input = std::make_shared(); + generate_input->input_ids = torch::arange(seq_len, torch::kInt32); + generate_input->generate_config = std::make_shared(); + complete_token_ids->init(generate_input); + + MallocInfo info{batch_res, complete_token_ids}; + info.enable_device_cache = true; + info.reuse_cache = true; + auto result = allocator->malloc(info); + ASSERT_TRUE(result.success); + + EXPECT_GT(result.reuse_len, 0) << "Flash prefix cache reuse should work for paged groups"; + + for (int gid = 0; gid < group_num; gid++) { + const auto& out_blocks = batch_res->blocks(0, gid); + ASSERT_GE(out_blocks.size(), 3u) << config.tagForGroup(gid); + if (config.typeForGroup(gid) == CacheGroupType::FULL) { + EXPECT_EQ(out_blocks[0], cached_blocks[gid][0]) << config.tagForGroup(gid); + continue; + } + EXPECT_TRUE(isNullBlockIdx(out_blocks[1])) << config.tagForGroup(gid); + if (config.tagForGroup(gid) == "hca_state") { + EXPECT_TRUE(isNullBlockIdx(out_blocks[2])) << "Flash HCA_STATE should not reuse a cached tail block"; + continue; + } + EXPECT_EQ(out_blocks[2], cached_blocks[gid][2]) << config.tagForGroup(gid); + } + + FreeInfo free_info{batch_res}; + allocator->free(free_info); +} + +TEST_F(DSV4AllocatorTest, HybridPoolReserveBlocksAreDistributedAcrossGroups) { + auto config = makeDSV4AllocatorConfig(/*use_flash=*/true); + config.block_num = 200; + auto allocator = std::make_shared( + config, AllocationType::DEVICE, nullptr, /*reserve_block_ratio=*/10); + ASSERT_TRUE(allocator->init()); + + auto batch_res = std::make_shared(); + batch_res->resetBatchSize(1); + initDsv4BatchGroups(*batch_res, config); + batch_res->setBatchCacheKeys(0, CacheKeysType{600, 601}); + + const int spb 
= allocator->seqSizePerBlock(); + auto cti = std::make_shared(1, 1, 4096, spb); + auto gi = std::make_shared(); + gi->input_ids = torch::arange(spb, torch::kInt32); + gi->generate_config = std::make_shared(); + cti->init(gi); + + MallocInfo info{batch_res, cti}; + info.enable_device_cache = false; + info.reuse_cache = false; + info.verbose = true; + auto result = allocator->malloc(info); + ASSERT_TRUE(result.success); + + FreeInfo free_info{batch_res}; + allocator->free(free_info); +} + +TEST_F(DSV4AllocatorTest, HybridPoolReserveBlocksDoNotReduceExplicitHcaStateCapacity) { + auto mc = makeFlashModelConfig(); + ParallelismConfig pc; + auto kv_config = makeDsv4KvCacheConfig(); + setDsv4ExplicitPoolBlocks(mc, "hca_state", 11); + auto config = CacheConfigCreator::createBasicConfig(mc, pc, kv_config, false, 0); + config.block_num = 40; + std::vector block_nums(static_cast(config.groupNums()), config.block_num); + for (size_t gid = 0; gid < static_cast(config.groupNums()); ++gid) { + if (config.tagForGroup(gid) == "hca_state") { + block_nums[gid] = 11; + } + } + setGroupBlockNumsForTest(config, block_nums); + + auto allocator = std::make_shared( + config, AllocationType::DEVICE, nullptr, /*reserve_block_ratio=*/50); + ASSERT_TRUE(allocator->init()); + + auto batch_res = std::make_shared(); + batch_res->resetBatchSize(1); + initDsv4BatchGroups(*batch_res, config); + + const int spb = allocator->seqSizePerBlock(); + const int seq_len = 10 * spb; + auto cti = std::make_shared(1, 1, seq_len + spb, spb); + auto gi = std::make_shared(); + gi->input_ids = torch::arange(seq_len, torch::kInt32); + gi->generate_config = std::make_shared(); + cti->init(gi); + + MallocInfo info{batch_res, cti}; + info.enable_device_cache = false; + info.reuse_cache = false; + info.verbose = true; + auto result = allocator->malloc(info); + ASSERT_TRUE(result.success); + + FreeInfo free_info{batch_res}; + allocator->free(free_info); +} + +// 
============================================================ +// SWA (group 6) prefix cache: verify SWA blocks participate in reuse +// ============================================================ + +TEST_F(DSV4AllocatorTest, SWAGroupParticipatesInPrefixCacheReuse) { + auto config = makeDSV4AllocatorConfig(); + config.block_num = 100; + auto allocator = std::make_shared(config, AllocationType::DEVICE); + auto shared_cache = std::make_shared(); + allocator->setSharedBlockCache(shared_cache); + ASSERT_TRUE(allocator->init()); + + auto block_pool = allocator->getBlockPool(); + + constexpr int group_num = 7; + + // Only populate SWA group (6) and one paged group (0) to verify SWA participates + CacheKeysType cached_keys = {700, 701}; + std::vector swa_blocks, csa_blocks; + + // Group 0 (CSA KV) + { + auto blocks = block_pool->malloc(2); + for (size_t i = 0; i < 2; ++i) { + std::vector group_slots(group_num, NULL_BLOCK_IDX); + group_slots[0] = blocks[i]; + shared_cache->put(cached_keys[i], group_slots, true); + } + csa_blocks = blocks; + block_pool->requestFree(blocks); + } + // Group 6 (SWA KV) + { + auto blocks = block_pool->malloc(2); + for (size_t i = 0; i < 2; ++i) { + std::vector group_slots(group_num, NULL_BLOCK_IDX); + group_slots[6] = blocks[i]; + shared_cache->put(cached_keys[i], group_slots, true); + } + swa_blocks = blocks; + block_pool->requestFree(blocks); + } + + // Verify both groups have cache entries + EXPECT_FALSE(isNullBlockIdx(shared_cache->matchGroup(700, 0))); + EXPECT_FALSE(isNullBlockIdx(shared_cache->matchGroup(700, 6))); + EXPECT_FALSE(isNullBlockIdx(shared_cache->matchGroup(701, 0))); + EXPECT_FALSE(isNullBlockIdx(shared_cache->matchGroup(701, 6))); + + // Groups 1,2,3,4,5 not populated — they will limit reuse to 0 + // But this verifies SWA group 6 IS in the reuse path + EXPECT_TRUE(isNullBlockIdx(shared_cache->matchGroup(700, 3))); + EXPECT_TRUE(isNullBlockIdx(shared_cache->matchGroup(700, 4))); + 
EXPECT_TRUE(isNullBlockIdx(shared_cache->matchGroup(700, 5))); +} + +// ============================================================ +// SWA prefix cache: cache entries exist and the matched tail window gates reuse. +// ============================================================ + +TEST_F(DSV4AllocatorTest, SWAPrefixCacheRestoresTailReuse) { + auto config = makeDSV4AllocatorConfig(); + auto allocator = std::make_shared(config, AllocationType::DEVICE); + auto shared_cache = std::make_shared(); + allocator->setSharedBlockCache(shared_cache); + ASSERT_TRUE(allocator->init()); + + auto block_pool = allocator->getBlockPool(); + + // Populate ALL 7 groups with same keys + constexpr int group_num = 7; + CacheKeysType cached_keys = {800, 801}; + std::vector> cached_blocks(group_num); + for (int gid = 0; gid < group_num; gid++) { + auto blocks = block_pool->malloc(2); + for (size_t i = 0; i < 2; ++i) { + std::vector group_slots(group_num, NULL_BLOCK_IDX); + group_slots[gid] = blocks[i]; + shared_cache->put(cached_keys[i], group_slots, true); + } + cached_blocks[gid] = blocks; + block_pool->requestFree(blocks); + } + + // Malloc with reuse — keys {800, 801, 802} + auto batch_res = std::make_shared(); + batch_res->resetBatchSize(1); + initDsv4BatchGroups(*batch_res, config); + batch_res->setBatchCacheKeys(0, CacheKeysType{800, 801, 802}); + + int spb = allocator->seqSizePerBlock(); + int seq_len = 2 * spb + 1; + auto cti = std::make_shared(1, 1, 4096, spb); + auto gi = std::make_shared(); + gi->input_ids = torch::arange(seq_len, torch::kInt32); + gi->generate_config = std::make_shared(); + cti->init(gi); + + MallocInfo info{batch_res, cti}; + info.enable_device_cache = true; + info.reuse_cache = true; + auto result = allocator->malloc(info); + ASSERT_TRUE(result.success); + EXPECT_GT(result.reuse_len, 0); + + const auto& swa_out = batch_res->blocks(0, 6); + ASSERT_GE(swa_out.size(), 2u); + EXPECT_TRUE(isNullBlockIdx(swa_out[0])) << "SWA previous matched tail is evicted after 
new tail allocation"; + EXPECT_EQ(swa_out[1], cached_blocks[6][1]) << "SWA last matched tail block should remain"; + + FreeInfo free_info{batch_res}; + allocator->free(free_info); +} + +// ============================================================ +// incrMalloc: decode grows sequence after initial prefill +// ============================================================ + +TEST_F(DSV4AllocatorTest, IncrMallocDecodeGrowsBlocks) { + auto config = makeDSV4AllocatorConfig(); + auto allocator = std::make_shared(config, AllocationType::DEVICE); + ASSERT_TRUE(allocator->init()); + + int spb = allocator->seqSizePerBlock(); + + // Initial malloc: 1 block worth of tokens + auto batch_res = std::make_shared(); + batch_res->resetBatchSize(1); + initDsv4BatchGroups(*batch_res, config); + batch_res->setBatchCacheKeys(0, CacheKeysType{900, 901}); + + auto cti = std::make_shared(1, 1, 4096, spb); + auto gi = std::make_shared(); + gi->input_ids = torch::arange(spb, torch::kInt32); + gi->generate_config = std::make_shared(); + cti->init(gi); + + MallocInfo init_info{batch_res, cti}; + init_info.enable_device_cache = false; + auto init_result = allocator->malloc(init_info); + ASSERT_TRUE(init_result.success); + + // All 7 groups should have 1 block each + for (int gid = 0; gid < 7; gid++) { + EXPECT_EQ(batch_res->blocksNum(0, gid), 1u) << "group " << gid << " should have 1 block after init"; + } + + size_t free_after_init = allocator->freeBlocksNum(); + + // incrMalloc: grow to 2 blocks + cti->setSeqLength(2 * spb); + MallocInfo incr_info{batch_res, cti}; + incr_info.enable_device_cache = false; + auto incr_result = allocator->malloc(incr_info); + ASSERT_TRUE(incr_result.success); + + // All 7 groups should now have 2 blocks each + for (int gid = 0; gid < 7; gid++) { + EXPECT_EQ(batch_res->blocksNum(0, gid), 2u) << "group " << gid << " should have 2 blocks after incr"; + } + + // HCA_STATE is not reusable: decode may materialize a new tail, but the + // skipped old tail is 
released, so only the other six groups consume a net + // additional block. + EXPECT_EQ(allocator->freeBlocksNum(), free_after_init - 6); + + FreeInfo free_info{batch_res}; + allocator->free(free_info); +} + +// ============================================================ +// Free and reallocate: blocks return to pool +// ============================================================ + +TEST_F(DSV4AllocatorTest, FreeReturnsBlocksToPool) { + auto config = makeDSV4AllocatorConfig(); + auto allocator = std::make_shared(config, AllocationType::DEVICE); + ASSERT_TRUE(allocator->init()); + + size_t free_before = allocator->freeBlocksNum(); + int spb = allocator->seqSizePerBlock(); + + // Allocate + auto batch_res = std::make_shared(); + batch_res->resetBatchSize(1); + initDsv4BatchGroups(*batch_res, config); + batch_res->setBatchCacheKeys(0, CacheKeysType{1000, 1001}); + + auto cti = std::make_shared(1, 1, 4096, spb); + auto gi = std::make_shared(); + gi->input_ids = torch::arange(spb, torch::kInt32); + gi->generate_config = std::make_shared(); + cti->init(gi); + + MallocInfo info{batch_res, cti}; + info.enable_device_cache = false; + auto result = allocator->malloc(info); + ASSERT_TRUE(result.success); + + size_t free_after_alloc = allocator->freeBlocksNum(); + EXPECT_LT(free_after_alloc, free_before); + + // Free + FreeInfo free_info{batch_res}; + allocator->free(free_info); + + // All blocks should be returned + EXPECT_EQ(allocator->freeBlocksNum(), free_before); + + // Can allocate again + auto batch_res2 = std::make_shared(); + batch_res2->resetBatchSize(1); + initDsv4BatchGroups(*batch_res2, config); + batch_res2->setBatchCacheKeys(0, CacheKeysType{1100, 1101}); + + MallocInfo info2{batch_res2, cti}; + info2.enable_device_cache = false; + auto result2 = allocator->malloc(info2); + ASSERT_TRUE(result2.success); + + FreeInfo free_info2{batch_res2}; + allocator->free(free_info2); + EXPECT_EQ(allocator->freeBlocksNum(), free_before); +} + +// 
============================================================ +// Flash: incrMalloc decode path +// ============================================================ + +TEST_F(DSV4AllocatorTest, FlashIncrMallocDecode) { + auto config = makeDSV4AllocatorConfig(/*use_flash=*/true); + auto allocator = std::make_shared(config, AllocationType::DEVICE); + ASSERT_TRUE(allocator->init()); + + int spb = allocator->seqSizePerBlock(); + + auto batch_res = std::make_shared(); + batch_res->resetBatchSize(1); + initDsv4BatchGroups(*batch_res, config); + batch_res->setBatchCacheKeys(0, CacheKeysType{1200, 1201}); + + auto cti = std::make_shared(1, 1, 4096, spb); + auto gi = std::make_shared(); + gi->input_ids = torch::arange(spb, torch::kInt32); + gi->generate_config = std::make_shared(); + cti->init(gi); + + MallocInfo init_info{batch_res, cti}; + init_info.enable_device_cache = false; + ASSERT_TRUE(allocator->malloc(init_info).success); + + for (int gid = 0; gid < 7; gid++) { + EXPECT_EQ(batch_res->blocksNum(0, gid), 1u) << "Flash group " << gid; + } + + // Grow to 3 blocks + cti->setSeqLength(3 * spb); + MallocInfo incr_info{batch_res, cti}; + incr_info.enable_device_cache = false; + ASSERT_TRUE(allocator->malloc(incr_info).success); + + for (int gid = 0; gid < 7; gid++) { + EXPECT_EQ(batch_res->blocksNum(0, gid), 3u) << "Flash group " << gid << " after incr"; + } + + FreeInfo free_info{batch_res}; + allocator->free(free_info); +} + +} // namespace test +} // namespace rtp_llm diff --git a/rtp_llm/cpp/cache/test/FullKVCacheGroupTest.cc b/rtp_llm/cpp/cache/test/FullKVCacheGroupTest.cc index e1855bfdb9..ad317209d0 100644 --- a/rtp_llm/cpp/cache/test/FullKVCacheGroupTest.cc +++ b/rtp_llm/cpp/cache/test/FullKVCacheGroupTest.cc @@ -4,7 +4,8 @@ #include #include #include -#include "rtp_llm/cpp/cache/FullKVCacheGroup.h" +#include "rtp_llm/cpp/cache/group/FullKVCacheGroup.h" +#include "rtp_llm/cpp/cache/SharedBlockCache.h" #include "rtp_llm/cpp/cache/test/BlockPoolTestHelper.h" namespace 
rtp_llm { @@ -75,22 +76,21 @@ TEST_F(FullKVCacheGroupTest, MatchTest) { auto block_pool = createBlockPool(); block_pool->init(); - auto block_cache = block_pool->blockCache(); - BlockCache::CacheItem item = {101, 0, 1, false}; - auto result1 = block_cache->put(item); - EXPECT_TRUE(result1); - - BlockCache::CacheItem item2 = {102, 0, 2, false}; - auto result2 = block_cache->put(item2); - EXPECT_TRUE(result2); + auto shared_cache = std::make_shared(); + std::vector group_pools = {block_pool}; + shared_cache->init(1, group_pools); auto spec = std::make_shared(); spec->seq_size_per_block = 4; - FullKVCacheGroup group1({}, spec, block_pool, 0); + FullKVCacheGroup group1({}, spec, block_pool, 0, shared_cache.get()); + + // Put items into shared cache: cache_key -> group_slots (group 0 = block_idx) + shared_cache->put(101, {1}, false); + shared_cache->put(102, {2}, false); - // zero math + // zero match CacheKeysType cache_keys = {103, 104, 105, 106}; auto match_result1 = group1.match(cache_keys); ASSERT_EQ(match_result1.reuse_blocks, 0); @@ -107,13 +107,8 @@ TEST_F(FullKVCacheGroupTest, MatchTest) { ASSERT_EQ(match_result2.block_indices, expected_result); // all match - BlockCache::CacheItem item3 = {103, 0, 3, false}; - auto result3 = block_cache->put(item3); - EXPECT_TRUE(result3); - - BlockCache::CacheItem item4 = {104, 0, 4, false}; - auto result4 = block_cache->put(item4); - EXPECT_TRUE(result4); + shared_cache->put(103, {3}, false); + shared_cache->put(104, {4}, false); cache_keys = {101, 102, 103, 104}; auto match_result3 = group1.match(cache_keys); @@ -154,85 +149,6 @@ TEST_F(FullKVCacheGroupTest, MallocFreeTest) { ASSERT_FALSE(group1.malloc(block_ids2, 180)); } -TEST_F(FullKVCacheGroupTest, InsertIntoCacheTest) { - auto block_pool = createBlockPool(); - block_pool->init(); - ASSERT_EQ(block_pool->freeBlocksNum(), 9); - ASSERT_EQ(block_pool->availableBlocksNum(), 9); - - auto spec = std::make_shared(); - spec->seq_size_per_block = 2; - - FullKVCacheGroup 
group1({}, spec, block_pool, 0); - - CacheKeysType cache_keys = {103, 104, 105, 106}; - BlockIds block_ids(/*kernel_blocks_per_kv_block=*/1); - - group1.malloc(block_ids, 8); - ASSERT_EQ(block_pool->freeBlocksNum(), 5); - ASSERT_EQ(block_ids.blocks().size(), 4); - BlockIndicesType expected_result = {1, 2, 3, 4}; - ASSERT_EQ(block_ids.blocks(), expected_result); - - group1.insertIntoCache(cache_keys, block_ids.blocks(), false); - - CacheKeysType cache_keys1 = {107, 108}; - auto match_result1 = group1.match(cache_keys1); - ASSERT_EQ(match_result1.reuse_length, 0); - - CacheKeysType cache_keys2 = {103, 104, 107}; - auto match_result2 = group1.match(cache_keys2); - ASSERT_EQ(match_result2.reuse_length, 2 * 2); - BlockIndicesType expected_result2 = {1, 2}; - ASSERT_EQ(match_result2.block_indices, expected_result2); - - CacheKeysType cache_keys3 = {103, 104, 105, 106}; - auto match_result3 = group1.match(cache_keys3); - ASSERT_EQ(match_result3.reuse_length, 4 * 2); - BlockIndicesType expected_result3 = {1, 2, 3, 4}; - ASSERT_EQ(match_result3.block_indices, expected_result3); -} - -TEST_F(FullKVCacheGroupTest, EnsureFreeBlocksTest) { - auto block_pool = createBlockPool(); - block_pool->init(); - auto block_cache = block_pool->blockCache(); - auto total_blocks = block_pool->freeBlocksNum(); - - auto spec = std::make_shared(); - spec->seq_size_per_block = 2; - - FullKVCacheGroup group1({}, spec, block_pool, 0); - ASSERT_EQ(true, group1.ensureFreeBlocks(5)); - ASSERT_EQ(block_pool->freeBlocksNum(), total_blocks); - ASSERT_EQ(block_pool->availableBlocksNum(), total_blocks); - - ASSERT_EQ(false, group1.ensureFreeBlocks(10)); - - CacheKeysType cache_keys = {101, 102, 103, 104}; - BlockIds block_ids(/*kernel_blocks_per_kv_block=*/1); - - ASSERT_TRUE(group1.malloc(block_ids, 8)); - ASSERT_EQ(block_ids.blocks().size(), 4); - ASSERT_EQ(block_pool->freeBlocksNum(), total_blocks - 4); - ASSERT_EQ(block_pool->availableBlocksNum(), total_blocks - 4); - - 
group1.insertIntoCache(cache_keys, block_ids.blocks(), false); - ASSERT_EQ(block_cache->size(), 4); - ASSERT_EQ(block_pool->freeBlocksNum(), total_blocks - 4); - ASSERT_EQ(block_pool->availableBlocksNum(), total_blocks - 4); - - group1.free(block_ids.blocks()); - ASSERT_EQ(block_cache->size(), 4); - ASSERT_EQ(block_pool->freeBlocksNum(), total_blocks - 4); - ASSERT_EQ(block_pool->availableBlocksNum(), total_blocks); - - ASSERT_EQ(true, group1.ensureFreeBlocks(total_blocks - 2)); - ASSERT_EQ(block_cache->size(), 2); - ASSERT_EQ(block_pool->freeBlocksNum(), total_blocks - 2); - ASSERT_EQ(block_pool->availableBlocksNum(), total_blocks); -} - } // namespace test } // namespace rtp_llm diff --git a/rtp_llm/cpp/cache/test/HybridKVCacheAllocatorCPShardTest.cc b/rtp_llm/cpp/cache/test/HybridKVCacheAllocatorCPShardTest.cc new file mode 100644 index 0000000000..1f7d412312 --- /dev/null +++ b/rtp_llm/cpp/cache/test/HybridKVCacheAllocatorCPShardTest.cc @@ -0,0 +1,295 @@ +// CP-shard (Stage 5, Plan A) UTs for HybridKVCacheAllocator. +// +// These exercise the cp_slot_mapper plumbing in initMallocForCommonLen, +// incrMalloc, insertIntoCache, and getNeedBlocks. The shape of the tests +// piggybacks on the helpers in HybridTypeKVCacheAllocatorTest.cc but +// keeps the configuration self-contained so the two files build cleanly +// alongside each other. + +#include + +#include +#include + +#include "rtp_llm/cpp/cache/BatchKVCacheResource.h" +#include "rtp_llm/cpp/cache/CPSlotMapper.h" +#include "rtp_llm/cpp/cache/allocator/HybridTypeKVCacheAllocator.h" +#include "rtp_llm/cpp/cache/SharedBlockCache.h" +#include "rtp_llm/cpp/cache/test/BlockPoolTestHelper.h" +#include "rtp_llm/cpp/engine_base/stream/CompleteTokenIds.h" +#include "rtp_llm/cpp/utils/Logger.h" + +namespace rtp_llm { +namespace test { + +namespace { + +// Two-group hybrid: gid=0 linear (won't be exercised here), gid=1 full (the CP-shard target). 
+CacheConfig makeCPHybridConfig() { + CacheConfig config; + config.dtype = rtp_llm::DataType::TYPE_FP16; + config.layer_num = 4; + config.layer_all_num = 4; + config.block_num = 32; // headroom for cp_size=2 expansion + config.seq_size_per_block = 4; + config.kernel_seq_size_per_block = 2; + config.linear_step = 2; + config.group_layer_num = 2; + + auto linear_spec = std::make_shared(); + linear_spec->type = KVCacheSpecType::LinearAttention; + linear_spec->dtype = config.dtype; + linear_spec->local_num_k_heads = 1; + linear_spec->local_num_v_heads = 1; + linear_spec->head_k_dim = 1; + linear_spec->head_v_dim = 1; + linear_spec->conv_kernel_dim = 2; + linear_spec->local_head_num_kv = 1; + linear_spec->seq_size_per_block = static_cast(config.seq_size_per_block); + + auto full_spec = std::make_shared(); + full_spec->type = KVCacheSpecType::MultiHeadAttention; + full_spec->dtype = config.dtype; + full_spec->local_head_num_kv = 1; + full_spec->size_per_head = 1; + full_spec->seq_size_per_block = static_cast(config.seq_size_per_block); + + config.fromGroupedSpecs({linear_spec, full_spec}, + {{0, 1}, {2, 3}}, + {CacheGroupType::LINEAR, CacheGroupType::FULL}, + {"linear", "full"}); + + config.kv_block_stride_bytes = std::max(full_spec->block_size_bytes(), linear_spec->block_size_bytes()); + config.kv_block_size_bytes = static_cast(config.group_layer_num) * config.kv_block_stride_bytes; + config.kv_scale_stride_bytes = 0; + config.kv_scale_size_bytes = 0; + config.block_size_bytes = config.kv_block_size_bytes + config.kv_scale_size_bytes; + + return config; +} + +CompleteTokenIdsPtr makeTokens(int batch_size, int seq_length, int seq_size_per_block) { + auto tokens = std::make_shared(batch_size, batch_size, seq_length + 64, seq_size_per_block); + auto ids = torch::empty({(int64_t)seq_length}, torch::kInt32); + auto* p = ids.data_ptr(); + for (int i = 0; i < seq_length; ++i) { + p[i] = i + 1; + } + auto gen = std::make_shared(); + gen->input_ids = ids; + gen->generate_config 
= std::make_shared(); + tokens->init(gen); + return tokens; +} + +BatchKVCacheResourcePtr makeBatchRes(int batch_size, const CacheConfig& config, CacheKeysType keys) { + auto res = std::make_shared(); + res->resetBatchSize(batch_size); + res->initGroups(config.groupNums(), + static_cast(config.layer_all_num), + config.layerGroupIdsSnapshot()); + for (int b = 0; b < batch_size; ++b) { + res->setBatchCacheKeys(b, keys); + } + return res; +} + +// Cache (key, group-slot) pairs into SharedBlockCache and drop request refs so blocks are reusable. +std::vector seedCache( + BlockPoolPtr block_pool, SharedBlockCachePtr shared_cache, int group_num, int group_id, const CacheKeysType& keys) { + auto blocks = block_pool->malloc(static_cast(keys.size())); + EXPECT_EQ(blocks.size(), keys.size()); + for (size_t i = 0; i < keys.size(); ++i) { + std::vector group_slots(static_cast(group_num), NULL_BLOCK_IDX); + group_slots[static_cast(group_id)] = blocks[i]; + shared_cache->put(keys[i], group_slots, true); + } + block_pool->requestFree(blocks); + return blocks; +} + +} // namespace + +class HybridKVCacheAllocatorCPShardTest: public ::testing::Test { +protected: + void SetUp() override { + rtp_llm::initLogger(); + createDevice(); + } +}; + +// 1) When cp_slot_mapper is null/passthrough, behavior is identical to the non-CP baseline: +// a request occupying 4 logical blocks allocates 4 blocks in the full group. 
+TEST_F(HybridKVCacheAllocatorCPShardTest, NullMapperIsPassthrough) { + auto config = makeCPHybridConfig(); + auto allocator = std::make_shared(config, AllocationType::DEVICE); + allocator->setSharedBlockCache(std::make_shared()); + ASSERT_TRUE(allocator->init()); + + const int gid_full = 1; + auto batch_res = makeBatchRes(/*batch_size=*/1, config, CacheKeysType{100, 101, 102, 103}); + // seq_len=16 => 4 slots @ block_size=4 + auto tokens = makeTokens(/*batch=*/1, /*seq_len=*/16, /*sspb=*/4); + MallocInfo info{batch_res, tokens}; + info.enable_device_cache = false; + info.reuse_cache = false; + // cp_slot_mapper intentionally left null. + auto result = allocator->malloc(info); + ASSERT_TRUE(result.success); + EXPECT_EQ(batch_res->blocksNum(0, gid_full), 4); +} + +// 2) With cp_slot_mapper(cp_rank=0, cp_size=2, block_size=4): a 4-block request allocates ceil(4/2)=2 +// physical blocks on this rank for the full group. +TEST_F(HybridKVCacheAllocatorCPShardTest, ShardedAllocHalvesFullGroup) { + auto config = makeCPHybridConfig(); + auto allocator = std::make_shared(config, AllocationType::DEVICE); + allocator->setSharedBlockCache(std::make_shared()); + ASSERT_TRUE(allocator->init()); + + const int gid_full = 1; + auto batch_res = makeBatchRes(1, config, CacheKeysType{100, 101, 102, 103}); + auto tokens = makeTokens(1, 16, 4); // 4 logical blocks worth + + MallocInfo info{batch_res, tokens}; + info.enable_device_cache = false; + info.reuse_cache = false; + allocator->setCPSlotMapper(std::make_shared(/*cp_rank=*/0, /*cp_size=*/2, /*block_size=*/4)); + auto result = allocator->malloc(info); + ASSERT_TRUE(result.success); + EXPECT_EQ(batch_res->blocksNum(0, gid_full), 2) + << "cp_size=2 should halve allocation to ceil(4/2)=2 physical blocks per rank"; +} + +// 3) Reuse path: cache the last-rank canonical key and confirm a second malloc hits it, +// returning reuse_len in units of virtualBlockSize (= block_size * cp_size). 
+TEST_F(HybridKVCacheAllocatorCPShardTest, ReuseHitOnLastRankCanonicalKey) { + auto config = makeCPHybridConfig(); + auto allocator = std::make_shared(config, AllocationType::DEVICE); + allocator->setSharedBlockCache(std::make_shared()); + ASSERT_TRUE(allocator->init()); + + auto block_pool = allocator->getBlockPool(); + auto shared_cache = allocator->sharedBlockCache(); + ASSERT_NE(block_pool, nullptr); + ASSERT_NE(shared_cache, nullptr); + + const int gid_linear = 0; + const int gid_full = 1; + const int group_num = 2; + // Full keys for 4 blocks: {100,101,102,103}. + // localCacheKeys(cp_rank=cp_size-1=1, cp_size=2) selects indices {1,3} => {101, 103}. + // initMallocForCommonLen drops the last for matching => match_keys = {101}. + // Joint match requires the linear group's tail to also resolve, so seed both groups with key 101. + seedCache(block_pool, shared_cache, group_num, gid_full, CacheKeysType{101}); + seedCache(block_pool, shared_cache, group_num, gid_linear, CacheKeysType{101}); + + auto batch_res = makeBatchRes(1, config, CacheKeysType{100, 101, 102, 103}); + auto tokens = makeTokens(1, 16, 4); + + MallocInfo info{batch_res, tokens}; + info.enable_device_cache = true; + info.reuse_cache = true; + allocator->setCPSlotMapper(std::make_shared(/*cp_rank=*/0, /*cp_size=*/2, /*block_size=*/4)); + auto result = allocator->malloc(info); + ASSERT_TRUE(result.success); + + // Expect 1 reuse virtual-block * virtualBlockSize(=8 tokens). + EXPECT_EQ(result.reuse_len, 8); + // Per-rank physical blocks for full group still = ceil(4/2) = 2. + EXPECT_EQ(batch_res->blocksNum(0, gid_full), 2); +} + +// 4) When reuse is disabled, cp_slot_mapper still translates seq_len for malloc and skips the match. 
+TEST_F(HybridKVCacheAllocatorCPShardTest, ShardedAllocSkipsReuseWhenDisabled) { + auto config = makeCPHybridConfig(); + auto allocator = std::make_shared(config, AllocationType::DEVICE); + allocator->setSharedBlockCache(std::make_shared()); + ASSERT_TRUE(allocator->init()); + + auto block_pool = allocator->getBlockPool(); + auto shared_cache = allocator->sharedBlockCache(); + + const int gid_full = 1; + seedCache(block_pool, shared_cache, /*group_num=*/2, gid_full, CacheKeysType{101}); + + auto batch_res = makeBatchRes(1, config, CacheKeysType{100, 101, 102, 103}); + auto tokens = makeTokens(1, 16, 4); + + MallocInfo info{batch_res, tokens}; + info.enable_device_cache = false; + info.reuse_cache = false; + allocator->setCPSlotMapper(std::make_shared(0, 2, 4)); + auto result = allocator->malloc(info); + ASSERT_TRUE(result.success); + EXPECT_EQ(result.reuse_len, 0); + EXPECT_EQ(batch_res->blocksNum(0, gid_full), 2); +} + +// 5) insertIntoCache uses last-rank canonical keys and virtualBlockSize when sharded: +// a 16-token request (full_blocks_num = floor((16-1)/8)=1 virtual block) inserts only key {101} +// (= last-rank canonical key at index cp_size-1=1 of the first virtual block window {100, 101}). +TEST_F(HybridKVCacheAllocatorCPShardTest, InsertIntoCacheUsesCanonicalKeysAndVirtualBlockSize) { + auto config = makeCPHybridConfig(); + auto allocator = std::make_shared(config, AllocationType::DEVICE); + allocator->setSharedBlockCache(std::make_shared()); + ASSERT_TRUE(allocator->init()); + + auto shared_cache = allocator->sharedBlockCache(); + ASSERT_NE(shared_cache, nullptr); + + const int gid_full = 1; + auto batch_res = makeBatchRes(1, config, CacheKeysType{100, 101, 102, 103}); + + // seq_len=16 => allocator computes 4 logical blocks; cp_size=2 keeps 2 per rank. 
+ auto tokens = makeTokens(1, 16, 4); + MallocInfo malloc_info{batch_res, tokens}; + malloc_info.enable_device_cache = false; + malloc_info.reuse_cache = false; + allocator->setCPSlotMapper(std::make_shared(0, 2, 4)); + ASSERT_TRUE(allocator->malloc(malloc_info).success); + ASSERT_EQ(batch_res->blocksNum(0, gid_full), 2); + + // CompleteTokenIds reflects token-len 16, so token_len-1 = 15. virtualBlockSize=8 => + // full_blocks_num = floor(15/8) = 1. n = min(local_keys.size()=2, 1) = 1. + // local_keys = {101, 103}; first key is 101. + InsertInfo insert_info{batch_res, tokens, /*is_resident=*/false}; + allocator->insertIntoCache(insert_info); + + EXPECT_FALSE(isNullBlockIdx(shared_cache->matchGroup(101, gid_full))); + EXPECT_TRUE(isNullBlockIdx(shared_cache->matchGroup(100, gid_full))); + EXPECT_TRUE(isNullBlockIdx(shared_cache->matchGroup(102, gid_full))); + EXPECT_TRUE(isNullBlockIdx(shared_cache->matchGroup(103, gid_full))); +} + +// 6) Single-malloc smoke: cp_size=4 sharding (cp_rank=2), request occupies 8 logical blocks ⇒ ceil(8/4)=2 per rank. 
+TEST_F(HybridKVCacheAllocatorCPShardTest, ShardedAllocCpSize4) { + auto config = makeCPHybridConfig(); + auto allocator = std::make_shared(config, AllocationType::DEVICE); + allocator->setSharedBlockCache(std::make_shared()); + ASSERT_TRUE(allocator->init()); + + const int gid_full = 1; + CacheKeysType keys; + for (int i = 0; i < 8; ++i) { + keys.push_back(200 + i); + } + auto batch_res = makeBatchRes(1, config, keys); + auto tokens = makeTokens(1, /*seq_len=*/32, 4); // 8 logical blocks + + MallocInfo info{batch_res, tokens}; + info.enable_device_cache = false; + info.reuse_cache = false; + allocator->setCPSlotMapper(std::make_shared(/*cp_rank=*/2, /*cp_size=*/4, /*block_size=*/4)); + auto result = allocator->malloc(info); + ASSERT_TRUE(result.success); + EXPECT_EQ(batch_res->blocksNum(0, gid_full), 2); // ceil(8/4)=2 +} + +} // namespace test +} // namespace rtp_llm + +int main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/rtp_llm/cpp/cache/test/HybridPoolKVCacheAllocatorTest.cc b/rtp_llm/cpp/cache/test/HybridPoolKVCacheAllocatorTest.cc new file mode 100644 index 0000000000..2d8831b890 --- /dev/null +++ b/rtp_llm/cpp/cache/test/HybridPoolKVCacheAllocatorTest.cc @@ -0,0 +1,1367 @@ +#include + +#include +#include +#include +#include +#include +#include + +#include "rtp_llm/cpp/utils/AssertUtils.h" + +#include "rtp_llm/cpp/cache/BatchKVCacheResource.h" +#include "rtp_llm/cpp/cache/SharedBlockCache.h" +#include "rtp_llm/cpp/cache/BlockPool.h" +#include "rtp_llm/cpp/cache/CacheConfig.h" +#include "rtp_llm/cpp/cache/spec/CacheGroupType.h" +#include "rtp_llm/cpp/cache/CPSlotMapper.h" +#include "rtp_llm/cpp/cache/config_creator/CacheConfigCreator.h" +#include "rtp_llm/cpp/cache/config_creator/HybridPoolConfigCreator.h" +#include "rtp_llm/cpp/cache/allocator/HybridPoolKVCacheAllocator.h" +#include "rtp_llm/cpp/cache/spec/LinearKVCacheSpec.h" +#include "rtp_llm/cpp/cache/spec/MHAKVCacheSpec.h" +#include 
"rtp_llm/cpp/cache/test/BlockPoolTestHelper.h" +#include "rtp_llm/cpp/cache/test/CacheConfigTestUtils.h" +#include "rtp_llm/cpp/config/ModelConfig.h" +#include "rtp_llm/cpp/disaggregate/cache_store/CacheStore.h" +#include "rtp_llm/cpp/disaggregate/cache_store/MemoryUtil.h" +#include "rtp_llm/cpp/engine_base/stream/CompleteTokenIds.h" +#include "rtp_llm/cpp/utils/Logger.h" + +namespace rtp_llm { +namespace test { + +// --------------------------------------------------------------------------- +// Helpers +// --------------------------------------------------------------------------- + +// Build a tiny multi-pool config with two groups: gid=0 LINEAR(layers 0,1) +// and gid=1 FULL(layers 2,3). Each group has its own per-group block budget, +// so HybridPoolKVCacheAllocator creates two independent BlockPools. +static CacheConfig makeTinyMultiPoolHybridConfig(uint32_t linear_block_num = 6, + uint32_t full_block_num = 8, + CacheGroupType second_type = CacheGroupType::FULL) { + CacheConfig config; + config.dtype = rtp_llm::DataType::TYPE_FP16; + config.layer_num = 4; + config.layer_all_num = 4; + config.block_num = std::max(linear_block_num, full_block_num); + config.seq_size_per_block = 4; + config.kernel_seq_size_per_block = 4; + config.linear_step = 2; + config.group_layer_num = 2; + + auto linear_spec = std::make_shared(); + linear_spec->type = KVCacheSpecType::LinearAttention; + linear_spec->dtype = config.dtype; + linear_spec->local_num_k_heads = 1; + linear_spec->local_num_v_heads = 1; + linear_spec->head_k_dim = 1; + linear_spec->head_v_dim = 1; + linear_spec->conv_kernel_dim = 2; + linear_spec->local_head_num_kv = 1; + linear_spec->seq_size_per_block = static_cast(config.seq_size_per_block); + + auto full_spec = std::make_shared(); + full_spec->type = KVCacheSpecType::MultiHeadAttention; + full_spec->dtype = config.dtype; + full_spec->local_head_num_kv = 1; + full_spec->size_per_head = 1; + full_spec->seq_size_per_block = static_cast(config.seq_size_per_block); 
+ + config.use_independent_block_pools = true; + config.fromGroupedSpecs({linear_spec, full_spec}, + {{0, 1}, {2, 3}}, + {CacheGroupType::LINEAR, second_type}, + {"linear", second_type == CacheGroupType::SWA ? "swa" : "full"}); + + // Same tokens per block for both groups. + config.group_seq_size_per_block = {config.seq_size_per_block, config.seq_size_per_block}; + + config.kv_block_stride_bytes = std::max(full_spec->block_size_bytes(), linear_spec->block_size_bytes()); + config.kv_block_size_bytes = static_cast(config.group_layer_num) * config.kv_block_stride_bytes; + config.kv_scale_stride_bytes = 0; + config.kv_scale_size_bytes = 0; + config.block_size_bytes = config.kv_block_size_bytes + config.kv_scale_size_bytes; + config.layer_to_block_stride_bytes.assign(static_cast(config.layer_all_num), + static_cast(config.kv_block_stride_bytes)); + const auto linear_stride = linear_spec->block_size_bytes(); + const auto full_stride = full_spec->block_size_bytes(); + config.setGroupBlockLayout({linear_block_num, full_block_num}, {linear_stride, full_stride}, {0, 0}); + return config; +} + +static CacheConfig makeTinySwaMultiPoolHybridConfig(uint32_t linear_block_num = 6, uint32_t swa_block_num = 8) { + return makeTinyMultiPoolHybridConfig(linear_block_num, swa_block_num, CacheGroupType::SWA); +} + +static ModelConfig makeTinyDSV4ModelConfig() { + ModelConfig mc; + mc.num_layers = 5; + mc.hidden_size = 32; + mc.attn_config.head_num = 4; + mc.attn_config.kv_head_num = 1; + mc.attn_config.size_per_head = 8; + mc.attn_config.rope_head_dim = 4; + mc.attn_config.sliding_window = 128; + mc.attn_config.indexer_head_dim = 8; + mc.attn_config.indexer_head_num = 2; + mc.attn_config.indexer_topk = 16; + mc.attn_config.o_groups = 2; + mc.attn_config.o_lora_rank = 16; + mc.attn_config.layer_compress_ratios = {4, 128, 4, 128, 0}; + mc.hybrid_attention_config.enable_hybrid_attention = true; + mc.hybrid_attention_config.enable_independent_kv_cache_pools = true; + setDsv4KvCacheSpecs(mc); 
+ return mc; +} + +static ModelConfig makeProModelConfig() { + ModelConfig mc; + mc.num_layers = 61; + mc.hidden_size = 7168; + mc.attn_config.head_num = 128; + mc.attn_config.kv_head_num = 1; + mc.attn_config.size_per_head = 512; + mc.attn_config.rope_head_dim = 64; + mc.attn_config.sliding_window = 128; + mc.attn_config.indexer_head_dim = 128; + mc.attn_config.indexer_head_num = 64; + mc.attn_config.indexer_topk = 1024; + mc.attn_config.o_groups = 16; + mc.attn_config.o_lora_rank = 1024; + std::vector ratios; + ratios.push_back(128); + ratios.push_back(128); + for (int i = 2; i < 61; i++) { + ratios.push_back((i % 2 == 0) ? 4 : 128); + } + ratios.push_back(0); + mc.attn_config.layer_compress_ratios = ratios; + setDsv4KvCacheSpecs(mc); + return mc; +} + +// Build a DSV4 7-pool CacheConfig (uses use_independent_block_pools=true). +static CacheConfig makeDSV4HybridPoolConfig(uint32_t block_num = 200) { + auto mc = makeProModelConfig(); + mc.hybrid_attention_config.enable_hybrid_attention = true; + mc.hybrid_attention_config.enable_independent_kv_cache_pools = true; + ParallelismConfig pc; + KVCacheConfig kv_cache_config; + kv_cache_config.seq_size_per_block = 128; + kv_cache_config.kernel_seq_size_per_block = 128; + auto config = CacheConfigCreator::createBasicConfig(mc, pc, kv_cache_config, false, 0); + config.block_num = block_num; + return config; +} + +static void setExplicitBlocksForGroup(CacheConfig& config, size_t group_id, uint32_t block_num) { + ASSERT_LT(group_id, static_cast(config.groupNums())); + std::vector policies; + policies.reserve(static_cast(config.groupNums())); + for (size_t gid = 0; gid < static_cast(config.groupNums()); ++gid) { + policies.push_back(config.policyForGroup(gid)); + } + policies[group_id].explicit_block_num = block_num; + config.setGroupPolicies(policies); +} + +static size_t firstExplicitIndependentGroup(const CacheConfig& config) { + for (size_t gid = 0; gid < static_cast(config.groupNums()); ++gid) { + const auto policy = 
config.policyForGroup(gid); + if (policy.evict_policy == CacheEvictPolicy::INDEPENDENT && policy.explicit_block_num > 0) { + return gid; + } + } + ADD_FAILURE() << "missing explicit independent cache group"; + return 0; +} + +static CompleteTokenIdsPtr makeCompleteTokenIds(int batch_size, int seq_length, int seq_size_per_block) { + auto cti = std::make_shared(batch_size, batch_size, seq_length + 64, seq_size_per_block); + auto input_ids = torch::empty({(int64_t)seq_length}, torch::kInt32); + auto* token_data = input_ids.data_ptr(); + for (int i = 0; i < seq_length; ++i) { + token_data[i] = i + 1; + } + auto gi = std::make_shared(); + gi->input_ids = input_ids; + gi->generate_config = std::make_shared(); + cti->init(gi); + return cti; +} + +static BatchKVCacheResourcePtr makeBatchResource(int batch_size, const CacheConfig& config) { + auto res = std::make_shared(); + res->resetBatchSize(batch_size); + res->initGroups(config.groupNums(), + static_cast(config.layer_all_num), + config.layerGroupIdsSnapshot(), + config.kernelBlocksPerKvBlock(), + config.groupTypesSnapshot()); + return res; +} + +static std::vector groupBlockNumsSnapshot(const CacheConfig& config) { + std::vector block_nums; + block_nums.reserve(static_cast(config.groupNums())); + for (size_t gid = 0; gid < static_cast(config.groupNums()); ++gid) { + block_nums.push_back(config.blockNumForGroup(gid)); + } + return block_nums; +} + +static void setGroupBlockNums(CacheConfig& config, const std::vector& block_nums) { + std::vector kv_strides; + std::vector scale_strides; + kv_strides.reserve(static_cast(config.groupNums())); + scale_strides.reserve(static_cast(config.groupNums())); + for (size_t gid = 0; gid < static_cast(config.groupNums()); ++gid) { + kv_strides.push_back(config.kvBlockStrideBytesForGroup(gid)); + scale_strides.push_back(config.kvScaleStrideBytesForGroup(gid)); + } + config.setGroupBlockLayout(block_nums, kv_strides, scale_strides); +} + +static size_t validBlockCount(const 
BlockIndicesType& blocks) { + return static_cast( + std::count_if(blocks.begin(), blocks.end(), [](BlockIdxType block) { return !isNullBlockIdx(block); })); +} + +// Create HybridPoolKVCacheAllocator with SharedBlockCache injected (required before init()). +static HybridPoolKVCacheAllocatorPtr makeAllocator(const CacheConfig& config, RoleType role_type = RoleType::PDFUSION) { + auto allocator = + std::make_shared(config, AllocationType::DEVICE, nullptr, 0, role_type); + auto shared_cache = std::make_shared(); + allocator->setSharedBlockCache(shared_cache); + return allocator; +} + +class RecordingMemoryUtil: public MemoryUtil { +public: + bool regUserMr(void*, uint64_t, bool gpu, uint64_t) override { + reg_gpu_flags.push_back(gpu); + return true; + } + + bool deregUserMr(void*, bool gpu) override { + dereg_gpu_flags.push_back(gpu); + return true; + } + + bool isMemoryMr(void*, uint64_t, bool, bool) override { + return false; + } + + bool findMemoryMr(void*, void*, uint64_t, bool, bool) override { + return false; + } + + bool isRdmaMode() override { + return true; + } + + std::vector reg_gpu_flags; + std::vector dereg_gpu_flags; +}; + +class RecordingCacheStore: public CacheStore { +public: + explicit RecordingCacheStore(std::shared_ptr memory_util): memory_util_(std::move(memory_util)) {} + + void store(const std::shared_ptr&, CacheStoreStoreDoneCallback callback) override { + if (callback) { + callback(false, CacheStoreErrorCode::InvalidParams); + } + } + + void load(const std::shared_ptr&, + CacheStoreLoadDoneCallback callback, + const std::string&, + uint32_t, + uint32_t, + uint32_t, + int, + int) override { + if (callback) { + callback(false, CacheStoreErrorCode::InvalidParams); + } + } + + std::shared_ptr loadBuffers(const std::vector>&, + const std::string&, + uint32_t, + uint32_t, + int64_t, + LoadContext::CheckCancelFunc, + int, + int) override { + return nullptr; + } + + std::shared_ptr storeBuffers(const std::vector>&, + int64_t) override { + return 
nullptr; + } + + std::shared_ptr + submitRemoteStoreTask(const std::shared_ptr&, + const std::shared_ptr&, + RemoteStoreTask::CheckCancelFunc) override { + return nullptr; + } + + void releaseRemoteStoreTask(const std::shared_ptr&) override {} + + bool regUserBuffers(const std::vector>&) override { + return true; + } + + std::shared_ptr findUserBuffer(const std::string&) override { + return nullptr; + } + + const std::shared_ptr& getMemoryUtil() const override { + return memory_util_; + } + + void debugInfo() override {} + +private: + std::shared_ptr memory_util_; +}; + +// Insert a non-resident cache item into the shared block cache for a specific group. +// Returns the BlockIdx allocated for the item (kept blockCache-referenced + request-released). +static BlockIdxType +seedNonResidentCacheItem(const HybridPoolKVCacheAllocatorPtr& allocator, int gid, CacheKeyType key) { + auto pool = allocator->groupBlockPools()[static_cast(gid)]; + auto blocks = pool->malloc(1); + EXPECT_EQ(blocks.size(), 1u); + auto shared_cache = allocator->sharedBlockCache(); + std::vector group_slots(allocator->groupBlockPools().size(), NULL_BLOCK_IDX); + group_slots[static_cast(gid)] = blocks[0]; + shared_cache->put(key, group_slots, false); + // SharedBlockCache::put() internally calls pool->blockCacheReference() + pool->requestFree(blocks); + return blocks[0]; +} + +struct PoolCounters { + size_t free_blocks; + size_t available_blocks; + size_t request_refs; + size_t block_cache_refs; + size_t connector_refs; +}; + +static std::vector snapshotPoolCounters(const HybridPoolKVCacheAllocatorPtr& allocator) { + std::vector counters; + counters.reserve(allocator->groupBlockPools().size()); + for (const auto& pool : allocator->groupBlockPools()) { + counters.push_back({pool->freeBlocksNum(), + pool->availableBlocksNum(), + pool->requestRefBlocksNum(), + pool->blockCacheRefBlocksNum(), + pool->connectorRefBlocksNum()}); + } + return counters; +} + +static void expectPoolCountersEq(const 
HybridPoolKVCacheAllocatorPtr& allocator, + const std::vector& expected) { + ASSERT_EQ(allocator->groupBlockPools().size(), expected.size()); + for (size_t gid = 0; gid < expected.size(); ++gid) { + const auto& pool = allocator->groupBlockPools()[gid]; + EXPECT_EQ(pool->freeBlocksNum(), expected[gid].free_blocks) << "gid=" << gid; + EXPECT_EQ(pool->availableBlocksNum(), expected[gid].available_blocks) << "gid=" << gid; + EXPECT_EQ(pool->requestRefBlocksNum(), expected[gid].request_refs) << "gid=" << gid; + EXPECT_EQ(pool->blockCacheRefBlocksNum(), expected[gid].block_cache_refs) << "gid=" << gid; + EXPECT_EQ(pool->connectorRefBlocksNum(), expected[gid].connector_refs) << "gid=" << gid; + } +} + +class HybridPoolKVCacheAllocatorTest: public ::testing::Test { +protected: + void SetUp() override { + rtp_llm::initLogger(); + createDevice(); + } +}; + +// --------------------------------------------------------------------------- +// Init / per-group pool creation +// --------------------------------------------------------------------------- + +TEST_F(HybridPoolKVCacheAllocatorTest, InitCreatesIndependentBlockPoolPerGroup) { + auto config = makeTinyMultiPoolHybridConfig(/*linear_block_num=*/6, /*full_block_num=*/8); + auto allocator = makeAllocator(config); + ASSERT_TRUE(allocator->init()); + + ASSERT_EQ(allocator->groupBlockPools().size(), 2u); + EXPECT_NE(allocator->groupBlockPools()[0], allocator->groupBlockPools()[1]); + + // Per-pool totalBlocksNum = group_block_nums[gid] - 1 (block 0 reserved). 
+ EXPECT_EQ(allocator->groupBlockPools()[0]->totalBlocksNum(), 6u - 1u); + EXPECT_EQ(allocator->groupBlockPools()[1]->totalBlocksNum(), 8u - 1u); +} + +TEST_F(HybridPoolKVCacheAllocatorTest, SwaDefaultRegionGroupPoolUsesGpuBacking) { + auto config = makeTinySwaMultiPoolHybridConfig(/*linear_block_num=*/6, /*swa_block_num=*/8); + auto allocator = makeAllocator(config); + ASSERT_TRUE(allocator->init()); + + ASSERT_EQ(allocator->groupBlockPools().size(), 2u); + EXPECT_EQ(allocator->groupBlockPools()[0]->where(), MemoryType::MEMORY_GPU); + EXPECT_EQ(allocator->groupBlockPools()[1]->where(), MemoryType::MEMORY_GPU); +} + +TEST_F(HybridPoolKVCacheAllocatorTest, GetBlockPoolReturnsNullptrInHybridPoolMode) { + // HybridPoolKVCacheAllocator owns one BlockPool per group and does not + // expose a single canonical block_pool_; getBlockPool() must return nullptr. + auto config = makeTinyMultiPoolHybridConfig(); + auto allocator = makeAllocator(config); + ASSERT_TRUE(allocator->init()); + EXPECT_EQ(allocator->getBlockPool(), nullptr); +} + +// --------------------------------------------------------------------------- +// Aggregated counters +// --------------------------------------------------------------------------- + +TEST_F(HybridPoolKVCacheAllocatorTest, TotalAndFreeBlocksAggregateAcrossGroups) { + auto config = makeTinyMultiPoolHybridConfig(/*linear_block_num=*/6, /*full_block_num=*/8); + auto allocator = makeAllocator(config); + ASSERT_TRUE(allocator->init()); + + const size_t expected_total = (6u - 1u) + (8u - 1u); + EXPECT_EQ(allocator->totalBlocksNum(), expected_total); + EXPECT_EQ(allocator->freeBlocksNum(), expected_total); + EXPECT_EQ(allocator->availableBlocksNum(), expected_total); + EXPECT_EQ(allocator->notInUseBlocksNum(), expected_total); + EXPECT_EQ(allocator->requestRefBlocksNum(), 0u); + EXPECT_EQ(allocator->connectorRefBlocksNum(), 0u); + EXPECT_EQ(allocator->blockCacheRefBlocksNum(), 0u); +} + +TEST_F(HybridPoolKVCacheAllocatorTest, 
TokenAggregatorsUseDifferentCapacityScopes) { + auto config = makeTinyMultiPoolHybridConfig(/*linear_block_num=*/6, /*full_block_num=*/8); + // Group 0 (LINEAR): seq_size_per_block=2 -> 5 blocks * 2 = 10 + // Group 1 (FULL): seq_size_per_block=4 -> 7 blocks * 4 = 28 + config.group_seq_size_per_block = {2, 4}; + auto allocator = makeAllocator(config); + ASSERT_TRUE(allocator->init()); + + EXPECT_EQ(allocator->maxAvailableTokensNum(), 28u); + EXPECT_EQ(allocator->availableTokensNum(), 28u); + EXPECT_EQ(allocator->totalTokensNum(), 28u); +} + +TEST_F(HybridPoolKVCacheAllocatorTest, TokenAggregatorsUseCPVirtualBlockSizeForFullGroups) { + auto config = makeTinyMultiPoolHybridConfig(/*linear_block_num=*/6, /*full_block_num=*/8); + config.group_seq_size_per_block = {100, 4}; + auto allocator = makeAllocator(config); + ASSERT_TRUE(allocator->init()); + + EXPECT_EQ(allocator->maxAvailableTokensNum(), 7u * 4u); + EXPECT_EQ(allocator->availableTokensNum(), 7u * 4u); + + allocator->setCPSlotMapper(std::make_shared(/*cp_rank=*/0, /*cp_size=*/2, /*block_size=*/4)); + + EXPECT_EQ(allocator->maxAvailableTokensNum(), 7u * 8u); + EXPECT_EQ(allocator->availableTokensNum(), 7u * 8u); +} + +TEST_F(HybridPoolKVCacheAllocatorTest, TokenAggregatorsFallBackToGlobalSeqSize) { + auto config = makeTinyMultiPoolHybridConfig(/*linear_block_num=*/6, /*full_block_num=*/6); + config.group_seq_size_per_block.clear(); // fall back to config.seq_size_per_block + config.seq_size_per_block = 4; + auto allocator = makeAllocator(config); + ASSERT_TRUE(allocator->init()); + + EXPECT_EQ(allocator->maxAvailableTokensNum(), 5u * 4u); + EXPECT_EQ(allocator->availableTokensNum(), 5u * 4u); +} + +TEST_F(HybridPoolKVCacheAllocatorTest, RequestAndConnectorRefAggregateAcrossGroups) { + auto config = makeTinyMultiPoolHybridConfig(/*linear_block_num=*/6, /*full_block_num=*/8); + auto allocator = makeAllocator(config); + ASSERT_TRUE(allocator->init()); + + auto pool0 = allocator->groupBlockPools()[0]; + auto pool1 = 
allocator->groupBlockPools()[1]; + + const size_t free_total_before = allocator->freeBlocksNum(); + auto g0_blocks = pool0->malloc(2); + auto g1_blocks = pool1->malloc(3); + ASSERT_EQ(g0_blocks.size(), 2u); + ASSERT_EQ(g1_blocks.size(), 3u); + + EXPECT_EQ(allocator->requestRefBlocksNum(), 5u); + EXPECT_EQ(allocator->freeBlocksNum(), free_total_before - 5u); + EXPECT_EQ(allocator->availableBlocksNum(), free_total_before - 5u); + + // Mark some blocks as connector-referenced (simulating cache transfer). + pool0->connectorReference(g0_blocks[0]); + pool1->connectorReference(g1_blocks[0]); + EXPECT_EQ(allocator->connectorRefBlocksNum(), 2u); + + pool0->requestFree(g0_blocks); + pool1->requestFree(g1_blocks); + EXPECT_EQ(allocator->requestRefBlocksNum(), 0u); + + // Connector still holds 2 blocks → freeBlocksNum (set of returnable + // ids) drops by 2; notInUseBlocksNum counts blocks not held by *request* + // or *block cache* refs, so connector-held blocks still count as "not + // in use" → equals the full pool total. 
+ EXPECT_EQ(allocator->freeBlocksNum(), free_total_before - 2u); + EXPECT_EQ(allocator->notInUseBlocksNum(), free_total_before); + + pool0->connectorFree(g0_blocks[0]); + pool1->connectorFree(g1_blocks[0]); + EXPECT_EQ(allocator->connectorRefBlocksNum(), 0u); + EXPECT_EQ(allocator->freeBlocksNum(), free_total_before); + EXPECT_EQ(allocator->notInUseBlocksNum(), free_total_before); +} + +TEST_F(HybridPoolKVCacheAllocatorTest, BlockCacheRefAggregatesAcrossGroups) { + auto config = makeTinyMultiPoolHybridConfig(); + auto allocator = makeAllocator(config); + ASSERT_TRUE(allocator->init()); + + seedNonResidentCacheItem(allocator, /*gid=*/0, /*key=*/100); + seedNonResidentCacheItem(allocator, /*gid=*/1, /*key=*/200); + seedNonResidentCacheItem(allocator, /*gid=*/1, /*key=*/201); + + EXPECT_EQ(allocator->blockCacheRefBlocksNum(), 3u); +} + +// --------------------------------------------------------------------------- +// Address / buffer lookups +// --------------------------------------------------------------------------- + +TEST_F(HybridPoolKVCacheAllocatorTest, ConvertIndexToAddrAndBufferDefault) { + auto config = makeTinyMultiPoolHybridConfig(); + auto allocator = makeAllocator(config); + ASSERT_TRUE(allocator->init()); + + // Layer in linear group. + { + auto addr = allocator->convertIndexToAddr(/*layer_id=*/0, /*block_id=*/1); + EXPECT_NE(addr.kv_addr, nullptr); + auto bufs = allocator->convertIndexToBuffer(/*layer_id=*/0, /*block_id=*/1); + ASSERT_FALSE(bufs.empty()); + EXPECT_NE(bufs[0].addr, nullptr); + } + // Layer in full group. 
+ { + auto addr = allocator->convertIndexToAddr(/*layer_id=*/3, /*block_id=*/1); + EXPECT_NE(addr.kv_addr, nullptr); + auto bufs = allocator->convertIndexToBuffer(/*layer_id=*/3, /*block_id=*/1); + ASSERT_FALSE(bufs.empty()); + EXPECT_NE(bufs[0].addr, nullptr); + } +} + +TEST_F(HybridPoolKVCacheAllocatorTest, ConvertIndexToBufferPartitionDefault) { + auto config = makeTinyMultiPoolHybridConfig(); + auto allocator = makeAllocator(config); + ASSERT_TRUE(allocator->init()); + + auto bufs = allocator->convertIndexToBuffer( + /*layer_id=*/3, /*block_id=*/1, /*partition_count=*/1, /*partition_id=*/0); + ASSERT_FALSE(bufs.empty()); + EXPECT_NE(bufs[0].addr, nullptr); +} + +TEST_F(HybridPoolKVCacheAllocatorTest, ConvertIndexToAddrAndBufferByGroup) { + auto config = makeTinyMultiPoolHybridConfig(); + auto allocator = makeAllocator(config); + ASSERT_TRUE(allocator->init()); + + auto addr_default = allocator->convertIndexToAddr(/*layer_id=*/0, /*group_id=*/0, /*block_id=*/1); + auto addr_via_layer = allocator->convertIndexToAddr(/*layer_id=*/0, /*block_id=*/1); + EXPECT_EQ(addr_default.kv_addr, addr_via_layer.kv_addr); + + auto bufs_default = allocator->convertIndexToBuffer(/*layer_id=*/0, /*group_id=*/0, /*block_id=*/1); + ASSERT_FALSE(bufs_default.empty()); + EXPECT_NE(bufs_default[0].addr, nullptr); + + auto bufs_partitioned = allocator->convertIndexToBuffer( + /*layer_id=*/0, /*group_id=*/0, /*block_id=*/1, /*partition_count=*/1, /*partition_id=*/0); + ASSERT_FALSE(bufs_partitioned.empty()); + EXPECT_NE(bufs_partitioned[0].addr, nullptr); +} + +TEST_F(HybridPoolKVCacheAllocatorTest, AllLayerCacheBaseExposesPerLayerAndPerGroupTensors) { + auto config = makeTinyMultiPoolHybridConfig(); + auto allocator = makeAllocator(config); + ASSERT_TRUE(allocator->init()); + + auto layout = allocator->allLayerCacheBase(); + ASSERT_EQ(layout.layers_to_kv_buffer_ptrs.size(), static_cast(config.layer_all_num)); + for (size_t i = 0; i < layout.layers_to_kv_buffer_ptrs.size(); ++i) { + 
EXPECT_TRUE(layout.layers_to_kv_buffer_ptrs[i].defined()) << "layer " << i << " missing kv buffer"; + } + EXPECT_EQ(layout.layer_to_group_ids, config.layerGroupIdsSnapshot()); + EXPECT_EQ(layout.group_types, config.groupTypesSnapshot()); + + ASSERT_EQ(layout.layers_to_kv_buffer_ptrs_by_group.size(), static_cast(config.layer_all_num)); + for (size_t i = 0; i < layout.layers_to_kv_buffer_ptrs_by_group.size(); ++i) { + EXPECT_EQ(layout.layers_to_kv_buffer_ptrs_by_group[i].size(), static_cast(config.groupNums())); + } + + for (size_t i = 0; i < static_cast(config.layer_all_num); ++i) { + ASSERT_FALSE(layout.layer_to_group_ids[i].empty()); + const auto gid = static_cast(layout.layer_to_group_ids[i].front()); + const auto& by_default = layout.layers_to_kv_buffer_ptrs_by_group[i][gid]; + EXPECT_TRUE(by_default.defined()) << "layer " << i << " primary group tensor undefined"; + EXPECT_EQ(by_default.data_ptr(), layout.layers_to_kv_buffer_ptrs[i].data_ptr()); + } +} + +// --------------------------------------------------------------------------- +// regUserMr / getMrCostTimeMs +// --------------------------------------------------------------------------- + +TEST_F(HybridPoolKVCacheAllocatorTest, RegUserMrWithoutCacheStoreIsNoOpAndZeroCost) { + auto config = makeTinyMultiPoolHybridConfig(); + auto allocator = makeAllocator(config); + ASSERT_TRUE(allocator->init()); + + // No CacheStore is plumbed in: regUserMr should be a benign no-op for every + // group pool, and the aggregated MR cost remains zero. 
+ EXPECT_NO_THROW(allocator->regUserMr(/*model_id=*/0, /*cache_store=*/nullptr)); + EXPECT_EQ(allocator->getMrCostTimeMs(), 0); +} + +// --------------------------------------------------------------------------- +// popBlocksFromCache / blockCacheFree +// --------------------------------------------------------------------------- + +TEST_F(HybridPoolKVCacheAllocatorTest, PopBlocksFromCacheReturnsEvictedBatchAcrossGroups) { + auto config = makeTinyMultiPoolHybridConfig(/*linear_block_num=*/6, /*full_block_num=*/8); + auto allocator = makeAllocator(config); + ASSERT_TRUE(allocator->init()); + + // Seed identical key on both groups, plus a unique key on the full group. + auto g0_block_for_100 = seedNonResidentCacheItem(allocator, /*gid=*/0, /*key=*/100); + auto g1_block_for_100 = seedNonResidentCacheItem(allocator, /*gid=*/1, /*key=*/100); + auto g1_block_for_200 = seedNonResidentCacheItem(allocator, /*gid=*/1, /*key=*/200); + EXPECT_EQ(allocator->blockCacheRefBlocksNum(), 3u); + + auto evicted = allocator->popBlocksFromCache(/*min_blocks_to_free=*/3); + ASSERT_NE(evicted, nullptr); + EXPECT_EQ(evicted->batchSize(), 1); + EXPECT_EQ(evicted->groupNums(), 2); + EXPECT_TRUE(evicted->cacheResource(0).cacheKeysAreCpCanonical()); + const auto& keys = evicted->cacheKeys(0); + EXPECT_EQ(keys.size(), 2u); // 100 (shared) + 200 (g1 only) + + std::unordered_set key_set(keys.begin(), keys.end()); + EXPECT_TRUE(key_set.count(100)); + EXPECT_TRUE(key_set.count(200)); + + // Per-group block ids: each group's blocks should be set only at the slot + // matching the key it owned, and NULL elsewhere. 
+ const auto& g0_blocks = evicted->blocks(/*batch_id=*/0, /*gid=*/0); + const auto& g1_blocks = evicted->blocks(/*batch_id=*/0, /*gid=*/1); + ASSERT_EQ(g0_blocks.size(), 2u); + ASSERT_EQ(g1_blocks.size(), 2u); + + auto idx_of = [&](CacheKeyType k) -> size_t { + for (size_t i = 0; i < keys.size(); ++i) { + if (keys[i] == k) { + return i; + } + } + return keys.size(); + }; + const size_t pos_100 = idx_of(100); + const size_t pos_200 = idx_of(200); + ASSERT_LT(pos_100, keys.size()); + ASSERT_LT(pos_200, keys.size()); + + EXPECT_EQ(g0_blocks[pos_100], g0_block_for_100); + EXPECT_TRUE(isNullBlockIdx(g0_blocks[pos_200])); + EXPECT_EQ(g1_blocks[pos_100], g1_block_for_100); + EXPECT_EQ(g1_blocks[pos_200], g1_block_for_200); +} + +TEST_F(HybridPoolKVCacheAllocatorTest, PopBlocksFromCacheZeroFreeReturnsNull) { + auto config = makeTinyMultiPoolHybridConfig(); + auto allocator = makeAllocator(config); + ASSERT_TRUE(allocator->init()); + EXPECT_EQ(allocator->popBlocksFromCache(0), nullptr); +} + +TEST_F(HybridPoolKVCacheAllocatorTest, PopBlocksFromCacheEmptyCachesReturnsNull) { + auto config = makeTinyMultiPoolHybridConfig(); + auto allocator = makeAllocator(config); + ASSERT_TRUE(allocator->init()); + EXPECT_EQ(allocator->popBlocksFromCache(/*min_blocks_to_free=*/4), nullptr); +} + +TEST_F(HybridPoolKVCacheAllocatorTest, BlockCacheFreeReleasesEvictedBatchAcrossGroups) { + auto config = makeTinyMultiPoolHybridConfig(/*linear_block_num=*/6, /*full_block_num=*/6); + auto allocator = makeAllocator(config); + ASSERT_TRUE(allocator->init()); + + seedNonResidentCacheItem(allocator, /*gid=*/0, /*key=*/100); + seedNonResidentCacheItem(allocator, /*gid=*/1, /*key=*/200); + EXPECT_EQ(allocator->blockCacheRefBlocksNum(), 2u); + + const size_t free_before = allocator->freeBlocksNum(); + auto evicted = allocator->popBlocksFromCache(/*min_blocks_to_free=*/2); + ASSERT_NE(evicted, nullptr); + // Eviction releases the LRU entries from BlockCache; the underlying blocks + // are still referenced 
by blockCacheRef. Releasing those refs is what + // blockCacheFree() does. + allocator->blockCacheFree(evicted); + EXPECT_EQ(allocator->blockCacheRefBlocksNum(), 0u); + EXPECT_EQ(allocator->freeBlocksNum(), free_before + 2u); +} + +TEST_F(HybridPoolKVCacheAllocatorTest, BlockCacheFreeNullPtrIsNoOp) { + auto config = makeTinyMultiPoolHybridConfig(); + auto allocator = makeAllocator(config); + ASSERT_TRUE(allocator->init()); + EXPECT_NO_THROW(allocator->blockCacheFree(nullptr)); +} + +TEST_F(HybridPoolKVCacheAllocatorTest, BlockCacheFreeIgnoresDuplicateAndNullBlockIds) { + auto config = makeTinyMultiPoolHybridConfig(); + auto allocator = makeAllocator(config); + ASSERT_TRUE(allocator->init()); + + auto seeded = seedNonResidentCacheItem(allocator, /*gid=*/1, /*key=*/300); + EXPECT_EQ(allocator->blockCacheRefBlocksNum(), 1u); + + auto batch = std::make_shared(); + batch->resetBatchSize(1); + batch->initGroups(config.groupNums(), static_cast(config.layer_all_num), config.layerGroupIdsSnapshot()); + // Same block listed twice in the same group should only be released once; + // NULL_BLOCK_IDX entries should be skipped. + batch->mutableBlockIds(0, /*gid=*/1).assign(BlockIndicesType{seeded, seeded, NULL_BLOCK_IDX}); + EXPECT_NO_THROW(allocator->blockCacheFree(batch)); + EXPECT_EQ(allocator->blockCacheRefBlocksNum(), 0u); +} + +// --------------------------------------------------------------------------- +// hasAvailableBlocksForReserve via reserve_block_num +// --------------------------------------------------------------------------- + +TEST_F(HybridPoolKVCacheAllocatorTest, ReserveBlocksAreDistributedAcrossGroupsForInitMalloc) { + // Group 0 (linear) gets 6 blocks (5 free), group 1 (full) gets 4 blocks (3 free). + // total_available = 8. Set reserve = 4. + // Expected per-group reserve: floor(4 * 5/8) = 2 for gid=0, floor(4 * 3/8) = 1 for gid=1. 
+ auto config = makeTinyMultiPoolHybridConfig(/*linear_block_num=*/6, /*full_block_num=*/4); + auto allocator = makeAllocator(config); + ASSERT_TRUE(allocator->init()); + + allocator->setReserveBlockNum(4); + + // seq_len=4 -> 1 block per group. + auto batch_res = makeBatchResource(/*batch_size=*/1, config); + batch_res->setBatchCacheKeys(0, CacheKeysType{100}); + auto token_ids = makeCompleteTokenIds(/*batch_size=*/1, /*seq_length=*/4, /*seq_size_per_block=*/4); + MallocInfo malloc_info{batch_res, token_ids}; + malloc_info.enable_device_cache = false; + malloc_info.reuse_cache = false; + auto result = allocator->malloc(malloc_info); + EXPECT_TRUE(result.success); +} + +TEST_F(HybridPoolKVCacheAllocatorTest, ReserveBlocksRejectsWhenGroupCannotMeetItsShare) { + // Force a group whose available_blocks < need + group_reserve_blocks. + auto config = makeTinyMultiPoolHybridConfig(/*linear_block_num=*/6, /*full_block_num=*/4); + auto allocator = makeAllocator(config); + ASSERT_TRUE(allocator->init()); + + // A reserve large enough to hide most blocks should reject init malloc. 
+ allocator->setReserveBlockNum(allocator->availableBlocksNum()); + + auto batch_res = makeBatchResource(/*batch_size=*/1, config); + batch_res->setBatchCacheKeys(0, CacheKeysType{100}); + auto token_ids = makeCompleteTokenIds(/*batch_size=*/1, /*seq_length=*/4, /*seq_size_per_block=*/4); + MallocInfo malloc_info{batch_res, token_ids}; + malloc_info.enable_device_cache = false; + malloc_info.reuse_cache = false; + malloc_info.verbose = false; + auto result = allocator->malloc(malloc_info); + EXPECT_FALSE(result.success); +} + +TEST_F(HybridPoolKVCacheAllocatorTest, PoolMetricsSnapshotsReportReserveBlocks) { + auto config = makeTinyMultiPoolHybridConfig(/*linear_block_num=*/6, /*full_block_num=*/8); + auto allocator = makeAllocator(config); + ASSERT_TRUE(allocator->init()); + + constexpr size_t reserve_blocks = 6; + allocator->setReserveBlockNum(reserve_blocks); + + const auto snapshots = allocator->poolMetricsSnapshots(); + ASSERT_EQ(snapshots.size(), 2u); + EXPECT_EQ("linear", snapshots[0].pool_name); + EXPECT_EQ("full", snapshots[1].pool_name); + + const size_t total_reservable_available_blocks = + snapshots[0].available_blocks + snapshots[1].available_blocks; + ASSERT_GT(total_reservable_available_blocks, 0u); + EXPECT_EQ(reserve_blocks * snapshots[0].available_blocks / total_reservable_available_blocks, + snapshots[0].reserve_blocks); + EXPECT_EQ(reserve_blocks * snapshots[1].available_blocks / total_reservable_available_blocks, + snapshots[1].reserve_blocks); +} + +TEST_F(HybridPoolKVCacheAllocatorTest, ReserveBlocksUseCPShardedFullGroupNeed) { + auto config = makeTinyMultiPoolHybridConfig(/*linear_block_num=*/20, /*full_block_num=*/6); + auto allocator = makeAllocator(config); + ASSERT_TRUE(allocator->init()); + + allocator->setReserveBlockNum(1); + + auto batch_res = makeBatchResource(/*batch_size=*/1, config); + batch_res->setBatchCacheKeys(0, CacheKeysType{100, 101, 102, 103, 104, 105, 106, 107}); + auto token_ids = makeCompleteTokenIds(/*batch_size=*/1, 
/*seq_length=*/32, /*seq_size_per_block=*/4); + allocator->setCPSlotMapper(std::make_shared(/*cp_rank=*/0, /*cp_size=*/2, /*block_size=*/4)); + + MallocInfo malloc_info{batch_res, token_ids}; + malloc_info.enable_device_cache = false; + malloc_info.reuse_cache = false; + + auto result = allocator->malloc(malloc_info); + ASSERT_TRUE(result.success); + EXPECT_EQ(validBlockCount(batch_res->blocks(0, /*gid=*/1)), 4u); + + FreeInfo free_info{batch_res, token_ids}; + allocator->free(free_info); +} + +TEST_F(HybridPoolKVCacheAllocatorTest, ReserveCheckIsBypassedWhenMallocInfoLacksContext) { + // hasAvailableBlocksForReserve returns true when info has no resource/tokens. + auto config = makeTinyMultiPoolHybridConfig(); + auto allocator = makeAllocator(config); + ASSERT_TRUE(allocator->init()); + + MallocInfo info{}; + EXPECT_TRUE(allocator->hasAvailableBlocksForReserve(info, /*reserve_blocks=*/9999)); +} + +TEST_F(HybridPoolKVCacheAllocatorTest, InitMallocRollbackFreesPartiallyAllocatedGroupBlocks) { + // gid=0 has enough room for the LINEAR tail block; gid=1 cannot satisfy + // the 3 FULL blocks needed for seq_len=9. initMallocForCommonLen should + // roll gid=0 back after gid=1 fails. 
+ auto config = makeTinyMultiPoolHybridConfig(/*linear_block_num=*/3, /*full_block_num=*/3); + auto allocator = makeAllocator(config); + ASSERT_TRUE(allocator->init()); + + const auto counters_before = snapshotPoolCounters(allocator); + + auto batch_res = makeBatchResource(/*batch_size=*/1, config); + batch_res->setBatchCacheKeys(0, CacheKeysType{100, 101, 102}); + auto token_ids = makeCompleteTokenIds(/*batch_size=*/1, /*seq_length=*/9, /*seq_size_per_block=*/4); + MallocInfo malloc_info{batch_res, token_ids}; + malloc_info.enable_device_cache = false; + malloc_info.reuse_cache = false; + malloc_info.verbose = false; + + auto result = allocator->malloc(malloc_info); + EXPECT_FALSE(result.success); + + EXPECT_EQ(batch_res->curBlocksNum(), 0u); + EXPECT_EQ(batch_res->blocksNum(0, /*gid=*/0), 0u); + EXPECT_EQ(batch_res->blocksNum(0, /*gid=*/1), 0u); + EXPECT_EQ(allocator->requestRefBlocksNum(), 0u); + expectPoolCountersEq(allocator, counters_before); +} + +TEST_F(HybridPoolKVCacheAllocatorTest, InitMallocRollbackReleasesDeviceReuseReferencesOnReserveReject) { + auto config = makeTinyMultiPoolHybridConfig(/*linear_block_num=*/4, /*full_block_num=*/4); + auto allocator = makeAllocator(config); + ASSERT_TRUE(allocator->init()); + + const auto linear_cached = seedNonResidentCacheItem(allocator, /*gid=*/0, /*key=*/100); + const auto full_cached = seedNonResidentCacheItem(allocator, /*gid=*/1, /*key=*/100); + ASSERT_FALSE(isNullBlockIdx(linear_cached)); + ASSERT_FALSE(isNullBlockIdx(full_cached)); + ASSERT_EQ(allocator->requestRefBlocksNum(), 0u); + ASSERT_EQ(allocator->blockCacheRefBlocksNum(), 2u); + + const size_t available_before = allocator->availableBlocksNum(); + const auto counters_before = snapshotPoolCounters(allocator); + allocator->setReserveBlockNum(std::max(1, available_before * 8)); + + auto batch_res = makeBatchResource(/*batch_size=*/1, config); + batch_res->setBatchCacheKeys(0, CacheKeysType{100, 101, 102}); + auto token_ids = 
makeCompleteTokenIds(/*batch_size=*/1, /*seq_length=*/8, /*seq_size_per_block=*/4); + MallocInfo malloc_info{batch_res, token_ids}; + malloc_info.enable_device_cache = true; + malloc_info.reuse_cache = true; + malloc_info.verbose = false; + + auto result = allocator->malloc(malloc_info); + EXPECT_FALSE(result.success); + + EXPECT_EQ(batch_res->curBlocksNum(), 0u); + EXPECT_EQ(batch_res->blocksNum(0, /*gid=*/0), 0u); + EXPECT_EQ(batch_res->blocksNum(0, /*gid=*/1), 0u); + EXPECT_EQ(allocator->requestRefBlocksNum(), 0u); + EXPECT_EQ(allocator->blockCacheRefBlocksNum(), 2u); + expectPoolCountersEq(allocator, counters_before); +} + +TEST_F(HybridPoolKVCacheAllocatorTest, IncrMallocRollbackFreesPartiallyAllocatedGroupBlocks) { + auto config = makeTinyMultiPoolHybridConfig(/*linear_block_num=*/4, /*full_block_num=*/2); + auto allocator = makeAllocator(config); + ASSERT_TRUE(allocator->init()); + + auto batch_res = makeBatchResource(/*batch_size=*/1, config); + batch_res->setBatchCacheKeys(0, CacheKeysType{100, 101, 102}); + + auto token_ids = makeCompleteTokenIds(/*batch_size=*/1, /*seq_length=*/4, /*seq_size_per_block=*/4); + MallocInfo init_info{batch_res, token_ids}; + init_info.enable_device_cache = false; + init_info.reuse_cache = false; + ASSERT_TRUE(allocator->malloc(init_info).success); + + ASSERT_EQ(batch_res->blocksNum(0, /*gid=*/0), 1u); + ASSERT_EQ(batch_res->blocksNum(0, /*gid=*/1), 1u); + const auto linear_block_before = batch_res->blocks(0, /*gid=*/0)[0]; + const auto full_block_before = batch_res->blocks(0, /*gid=*/1)[0]; + const auto counters_before = snapshotPoolCounters(allocator); + + // gid=0 can append one real LINEAR tail block. gid=1 has no remaining + // free blocks and no cache to evict, so FULL allocation fails. 
+ token_ids->setSeqLength(9); + MallocInfo incr_info{batch_res, token_ids}; + incr_info.enable_device_cache = false; + incr_info.reuse_cache = false; + auto incr_result = allocator->malloc(incr_info); + EXPECT_FALSE(incr_result.success); + + ASSERT_EQ(batch_res->blocksNum(0, /*gid=*/0), 1u); + ASSERT_EQ(batch_res->blocksNum(0, /*gid=*/1), 1u); + EXPECT_EQ(batch_res->blocks(0, /*gid=*/0)[0], linear_block_before); + EXPECT_EQ(batch_res->blocks(0, /*gid=*/1)[0], full_block_before); + expectPoolCountersEq(allocator, counters_before); +} + +// --------------------------------------------------------------------------- +// Full malloc / free cycle +// --------------------------------------------------------------------------- + +TEST_F(HybridPoolKVCacheAllocatorTest, MallocAndFreeCycleAcrossPerGroupPools) { + auto config = makeTinyMultiPoolHybridConfig(/*linear_block_num=*/8, /*full_block_num=*/8); + auto allocator = makeAllocator(config); + ASSERT_TRUE(allocator->init()); + + const size_t free_before = allocator->freeBlocksNum(); + + auto batch_res = makeBatchResource(/*batch_size=*/1, config); + batch_res->setBatchCacheKeys(0, CacheKeysType{100, 101, 102}); + auto token_ids = makeCompleteTokenIds(/*batch_size=*/1, /*seq_length=*/12, /*seq_size_per_block=*/4); + MallocInfo malloc_info{batch_res, token_ids}; + malloc_info.enable_device_cache = false; + malloc_info.reuse_cache = false; + auto result = allocator->malloc(malloc_info); + ASSERT_TRUE(result.success); + EXPECT_LT(allocator->freeBlocksNum(), free_before); + + FreeInfo free_info{batch_res, token_ids}; + allocator->free(free_info); + EXPECT_EQ(allocator->freeBlocksNum(), free_before); +} + +// --------------------------------------------------------------------------- +// DSV4 7-group HybridPool: covers per-tag addressing and SWA tail +// --------------------------------------------------------------------------- + +TEST_F(HybridPoolKVCacheAllocatorTest, DSV4InitAndAggregatedCounters) { + auto config = 
makeDSV4HybridPoolConfig(/*block_num=*/200); + auto allocator = makeAllocator(config); + ASSERT_TRUE(allocator->init()); + + EXPECT_EQ(config.groupNums(), 7); + ASSERT_EQ(allocator->groupBlockPools().size(), 7u); + + // Sum of per-pool totals must equal aggregated totalBlocksNum. + size_t expected_total = 0; + for (const auto& pool : allocator->groupBlockPools()) { + expected_total += pool->totalBlocksNum(); + } + EXPECT_EQ(allocator->totalBlocksNum(), expected_total); + EXPECT_EQ(allocator->freeBlocksNum(), expected_total); + EXPECT_EQ(allocator->availableBlocksNum(), expected_total); +} + +TEST_F(HybridPoolKVCacheAllocatorTest, DSV4FixedTagPoolsUseGpuBacking) { + auto config = makeDSV4HybridPoolConfig(/*block_num=*/200); + auto allocator = makeAllocator(config); + ASSERT_TRUE(allocator->init()); + + ASSERT_EQ(allocator->groupBlockPools().size(), 7u); + for (size_t gid = 0; gid < allocator->groupBlockPools().size(); ++gid) { + EXPECT_EQ(allocator->groupBlockPools()[gid]->where(), MemoryType::MEMORY_GPU) + << "gid=" << gid << " tag=" << config.tagForGroup(gid); + } +} + +TEST_F(HybridPoolKVCacheAllocatorTest, DSV4HCAStateReuseEnabledAllocatesTailOnly) { + auto config = makeDSV4HybridPoolConfig(/*block_num=*/200); + config.linear_step = 4; + auto allocator = makeAllocator(config); + ASSERT_TRUE(allocator->init()); + + const int hca_state_gid = config.groupIdForTag("hca_state"); + ASSERT_GT(config.groupNums(), hca_state_gid); + ASSERT_GT(allocator->groupBlockPools().size(), static_cast(hca_state_gid)); + + const size_t hca_free_before = allocator->groupBlockPools()[hca_state_gid]->freeBlocksNum(); + + auto batch_res = makeBatchResource(/*batch_size=*/1, config); + batch_res->setBatchCacheKeys(0, CacheKeysType{100, 101, 102, 103, 104, 105, 106, 107, 108, 109}); + auto token_ids = makeCompleteTokenIds( + /*batch_size=*/1, /*seq_length=*/10 * static_cast(config.seq_size_per_block), config.seq_size_per_block); + + MallocInfo malloc_info{batch_res, token_ids}; + 
malloc_info.enable_device_cache = false; + malloc_info.reuse_cache = true; + auto result = allocator->malloc(malloc_info); + ASSERT_TRUE(result.success); + + const auto& hca_blocks = batch_res->blocks(0, hca_state_gid); + ASSERT_EQ(hca_blocks.size(), 10u); + EXPECT_EQ(validBlockCount(hca_blocks), 1u); + EXPECT_TRUE(isNullBlockIdx(hca_blocks[8])); + EXPECT_FALSE(isNullBlockIdx(hca_blocks[9])); + EXPECT_EQ(hca_free_before - allocator->groupBlockPools()[hca_state_gid]->freeBlocksNum(), 1u); +} + +TEST_F(HybridPoolKVCacheAllocatorTest, TokenAggregatorsIgnoreSmallHCAStatePool) { + auto config = makeDSV4HybridPoolConfig(/*block_num=*/50); + + const int hca_state_gid = config.groupIdForTag("hca_state"); + ASSERT_GT(config.groupNums(), hca_state_gid); + auto block_nums = groupBlockNumsSnapshot(config); + block_nums[hca_state_gid] = 2; + setGroupBlockNums(config, block_nums); + + auto allocator = makeAllocator(config); + ASSERT_TRUE(allocator->init()); + ASSERT_GT(allocator->groupBlockPools().size(), static_cast(hca_state_gid)); + + const auto hca_state_tokens = + allocator->groupBlockPools()[hca_state_gid]->totalBlocksNum() * config.group_seq_size_per_block[hca_state_gid]; + EXPECT_LT(hca_state_tokens, allocator->totalTokensNum()); + EXPECT_EQ(allocator->availableTokensNum(), allocator->maxAvailableTokensNum()); + EXPECT_EQ(allocator->totalTokensNum(), allocator->maxAvailableTokensNum()); +} + +TEST_F(HybridPoolKVCacheAllocatorTest, DSV4ConfigUsesOnlyPagedGroupsForBlockSize) { + auto mc = makeTinyDSV4ModelConfig(); + ParallelismConfig pc; + KVCacheConfig kv_cache_config; + kv_cache_config.seq_size_per_block = 128; + kv_cache_config.kernel_seq_size_per_block = 128; + auto config = CacheConfigCreator::createBasicConfig(mc, pc, kv_cache_config, false, 0); + + ASSERT_EQ(config.groupNums(), 7); + ASSERT_EQ(config.groupNums(), 7); + + size_t expected_non_full_bytes = 0; + size_t expected_full_bytes = 0; + for (size_t gid = 0; gid < static_cast(config.groupNums()); ++gid) { + 
const auto type = config.typeForGroup(gid); + if (type == CacheGroupType::FULL) { + expected_full_bytes += config.blockSizeBytesForGroup(gid); + } else { + expected_non_full_bytes += config.blockSizeBytesForGroup(gid); + } + } + + EXPECT_GT(expected_non_full_bytes, 0u); + EXPECT_GT(expected_full_bytes, 0u); + + EXPECT_EQ(config.block_size_bytes, expected_full_bytes); +} + +TEST_F(HybridPoolKVCacheAllocatorTest, DSV4FinalizeBlockNumsUsesHcaStatePoolBlocks) { + auto config = makeDSV4HybridPoolConfig(/*block_num=*/50); + const size_t explicit_gid = firstExplicitIndependentGroup(config); + setExplicitBlocksForGroup(config, explicit_gid, 50); + + RuntimeConfig rt; // unused inside finalizeBlockNums today + config.finalizeBlockNums(/*global_block_num=*/200, rt); + + for (size_t gid = 0; gid < static_cast(config.groupNums()); ++gid) { + const uint32_t expected = config.policyForGroup(gid).explicit_block_num > 0 ? 50u : 200u; + EXPECT_EQ(config.blockNumForGroup(gid), expected) << "gid=" << gid; + } + + const size_t expected_reserve = 50u * config.blockSizeBytesForGroup(explicit_gid); + EXPECT_EQ(config.explicitly_sized_pool_reserve_bytes, expected_reserve); +} + +TEST_F(HybridPoolKVCacheAllocatorTest, DSV4FinalizeBlockNumsUsesGlobalBlocksWhenHcaStateBlocksDisabled) { + auto config = makeDSV4HybridPoolConfig(/*block_num=*/123); + setExplicitBlocksForGroup(config, firstExplicitIndependentGroup(config), 0); + + RuntimeConfig rt; + config.finalizeBlockNums(/*global_block_num=*/123, rt); + + for (size_t gid = 0; gid < static_cast(config.groupNums()); ++gid) { + EXPECT_EQ(config.blockNumForGroup(gid), 123u); + } + EXPECT_EQ(config.explicitly_sized_pool_reserve_bytes, 0u); +} + +TEST_F(HybridPoolKVCacheAllocatorTest, DSV4GpuHcaStatePoolIncludesFixedReserve) { + auto config = makeDSV4HybridPoolConfig(/*block_num=*/50); + const size_t explicit_gid = firstExplicitIndependentGroup(config); + setExplicitBlocksForGroup(config, explicit_gid, 50); + + RuntimeConfig rt; + 
config.finalizeBlockNums(/*global_block_num=*/200, rt); + + for (size_t gid = 0; gid < static_cast(config.groupNums()); ++gid) { + const uint32_t expected = config.policyForGroup(gid).explicit_block_num > 0 ? 50u : 200u; + EXPECT_EQ(config.blockNumForGroup(gid), expected) << "gid=" << gid; + } + const size_t expected_reserve = 50u * config.blockSizeBytesForGroup(explicit_gid); + EXPECT_EQ(config.explicitly_sized_pool_reserve_bytes, expected_reserve); +} + +TEST_F(HybridPoolKVCacheAllocatorTest, DSV4StateSwaPoolsWithoutExplicitBlocksUseGlobalBlocks) { + auto mc = makeProModelConfig(); + mc.hybrid_attention_config.enable_hybrid_attention = true; + mc.hybrid_attention_config.enable_independent_kv_cache_pools = true; + ParallelismConfig pc; + KVCacheConfig kv_cache_config; + kv_cache_config.seq_size_per_block = 128; + kv_cache_config.kernel_seq_size_per_block = 128; + setDsv4ExplicitPoolBlocks(mc, "hca_state", 0); + auto config = CacheConfigCreator::createBasicConfig(mc, pc, kv_cache_config, false, 0); + config.linear_step = 4; + + RuntimeConfig rt; + config.finalizeBlockNums(/*global_block_num=*/128, rt); + + for (size_t gid = 0; gid < static_cast(config.groupNums()); ++gid) { + EXPECT_EQ(config.blockNumForGroup(gid), 128u) << "gid=" << gid; + } + EXPECT_EQ(config.explicitly_sized_pool_reserve_bytes, 0u); +} + +TEST_F(HybridPoolKVCacheAllocatorTest, DSV4ConvertIndexToAddrByTagRoutesToCorrectPool) { + auto config = makeDSV4HybridPoolConfig(); + auto allocator = makeAllocator(config); + ASSERT_TRUE(allocator->init()); + + // CSA layer (compress_ratio=4) -- pick the first one. + int csa_layer = -1; + for (size_t l = 0; l < config.layer_all_num; ++l) { + if (config.layerTagToGroupIdSnapshot()[l].count("csa_kv") > 0) { + csa_layer = static_cast(l); + break; + } + } + ASSERT_GE(csa_layer, 0); + + // csa_kv tag routes to gid=0; it must produce a non-null kv address that + // matches the CSA group's pool. 
+ auto addr_csa = allocator->convertIndexToAddrByTag(csa_layer, "csa_kv", 1); + EXPECT_NE(addr_csa.kv_addr, nullptr); + + auto addr_swa = allocator->convertIndexToAddrByTag(csa_layer, "swa_kv", 1); + EXPECT_NE(addr_swa.kv_addr, nullptr); + + // The two tags live in different pools, so their addresses cannot alias. + EXPECT_NE(addr_csa.kv_addr, addr_swa.kv_addr); + + // Default single-group access is ambiguous for multi-tag layers. + EXPECT_ANY_THROW((void)allocator->convertIndexToAddr(csa_layer, /*block_id=*/1)); +} + +TEST_F(HybridPoolKVCacheAllocatorTest, DSV4ConvertIndexToBufferByTagAndPartition) { + auto config = makeDSV4HybridPoolConfig(); + auto allocator = makeAllocator(config); + ASSERT_TRUE(allocator->init()); + + int csa_layer = -1; + for (size_t l = 0; l < config.layer_all_num; ++l) { + if (config.layerTagToGroupIdSnapshot()[l].count("csa_kv") > 0) { + csa_layer = static_cast(l); + break; + } + } + ASSERT_GE(csa_layer, 0); + + auto buf = allocator->convertIndexToBufferByTag(csa_layer, "csa_kv", /*block_id=*/1); + ASSERT_FALSE(buf.empty()); + EXPECT_NE(buf[0].addr, nullptr); + + auto buf_part = allocator->convertIndexToBufferByTag( + csa_layer, "csa_kv", /*block_id=*/1, /*partition_count=*/1, /*partition_id=*/0); + ASSERT_FALSE(buf_part.empty()); + EXPECT_NE(buf_part[0].addr, nullptr); +} + +TEST_F(HybridPoolKVCacheAllocatorTest, DSV4AllLayerCacheBaseHasPerGroupTensors) { + auto config = makeDSV4HybridPoolConfig(); + auto allocator = makeAllocator(config); + ASSERT_TRUE(allocator->init()); + + auto layout = allocator->allLayerCacheBase(); + ASSERT_EQ(layout.layers_to_kv_buffer_ptrs.size(), static_cast(config.layer_all_num)); + ASSERT_EQ(layout.layers_to_kv_buffer_ptrs_by_group.size(), static_cast(config.layer_all_num)); + + const int swa_kv_gid = config.groupIdForTag("swa_kv"); + for (size_t l = 0; l < static_cast(config.layer_all_num); ++l) { + EXPECT_FALSE(layout.layers_to_kv_buffer_ptrs[l].defined()) + << "multi-tag DSV4 layer should not publish a 
legacy single-group tensor"; + const auto& swa_t = layout.layers_to_kv_buffer_ptrs_by_group[l][swa_kv_gid]; + EXPECT_TRUE(swa_t.defined()) << "layer " << l << " missing SWA_KV tensor"; + } + EXPECT_EQ(layout.group_tags.size(), 7u); + EXPECT_EQ(layout.group_types.size(), 7u); +} + +TEST_F(HybridPoolKVCacheAllocatorTest, DSV4SharedBlockCacheIsUnifiedAcrossGroups) { + auto config = makeDSV4HybridPoolConfig(); + auto allocator = makeAllocator(config); + ASSERT_TRUE(allocator->init()); + + // All groups share a single SharedBlockCache owned by the allocator. + auto shared_cache = allocator->sharedBlockCache(); + ASSERT_NE(shared_cache, nullptr); + + // Inserting a cache item for one group is visible via the shared cache. + auto pool0 = allocator->groupBlockPools()[0]; + auto blocks = pool0->malloc(1); + ASSERT_EQ(blocks.size(), 1u); + std::vector group_slots(allocator->groupBlockPools().size(), NULL_BLOCK_IDX); + group_slots[0] = blocks[0]; + shared_cache->put(/*cache_key=*/42, group_slots, /*is_resident=*/false); + EXPECT_TRUE(shared_cache->contains(42)); + + // The same cache is returned by the allocator accessor. + EXPECT_EQ(allocator->sharedBlockCache(), shared_cache); + + // Clean up. + pool0->requestFree(blocks); +} + +TEST_F(HybridPoolKVCacheAllocatorTest, DSV4CPShardedInsertThenReuseSamePrefix) { + auto config = makeDSV4HybridPoolConfig(/*block_num=*/64); + auto allocator = makeAllocator(config); + ASSERT_TRUE(allocator->init()); + + const int spb = static_cast(config.seq_size_per_block); + const int seq_len = 10 * spb + 17; + + CacheKeysType full_keys; + for (int i = 0; i < 10; ++i) { + full_keys.push_back(1000 + i); + } + CacheKeysType request_keys = full_keys; + request_keys.push_back(2000); // partial tail key present on the incoming request. 
+ + auto cp_mapper = std::make_shared(/*cp_rank=*/0, /*cp_size=*/2, spb); + allocator->setCPSlotMapper(cp_mapper); + + auto seed_res = makeBatchResource(/*batch_size=*/1, config); + seed_res->setBatchCacheKeys(0, full_keys); + auto seed_tokens = makeCompleteTokenIds(/*batch_size=*/1, seq_len, spb); + + MallocInfo seed_malloc{seed_res, seed_tokens}; + seed_malloc.reuse_cache = true; + seed_malloc.enable_device_cache = false; + allocator->setCPSlotMapper(cp_mapper); + ASSERT_TRUE(allocator->malloc(seed_malloc).success); + + InsertInfo insert_info{seed_res, seed_tokens, /*is_resident=*/false}; + allocator->setCPSlotMapper(cp_mapper); + allocator->insertIntoCache(insert_info); + + FreeInfo seed_free{seed_res, seed_tokens}; + allocator->free(seed_free); + + auto hit_res = makeBatchResource(/*batch_size=*/1, config); + hit_res->setBatchCacheKeys(0, request_keys); + auto hit_tokens = makeCompleteTokenIds(/*batch_size=*/1, seq_len, spb); + + MallocInfo hit_malloc{hit_res, hit_tokens}; + hit_malloc.reuse_cache = true; + hit_malloc.enable_device_cache = true; + allocator->setCPSlotMapper(cp_mapper); + auto result = allocator->malloc(hit_malloc); + + ASSERT_TRUE(result.success); + EXPECT_EQ(result.reuse_len, 5 * spb * 2); + + FreeInfo hit_free{hit_res, hit_tokens}; + allocator->free(hit_free); +} + +TEST_F(HybridPoolKVCacheAllocatorTest, DSV4CPShardedEvictionMarksCanonicalResource) { + auto config = makeDSV4HybridPoolConfig(/*block_num=*/64); + auto allocator = makeAllocator(config); + ASSERT_TRUE(allocator->init()); + + const int spb = static_cast(config.seq_size_per_block); + const int seq_len = 10 * spb + 17; + + CacheKeysType full_keys; + for (int i = 0; i < 10; ++i) { + full_keys.push_back(1000 + i); + } + + auto cp_mapper = std::make_shared(/*cp_rank=*/0, /*cp_size=*/2, spb); + allocator->setCPSlotMapper(cp_mapper); + + auto seed_res = makeBatchResource(/*batch_size=*/1, config); + seed_res->setBatchCacheKeys(0, full_keys); + auto seed_tokens = 
makeCompleteTokenIds(/*batch_size=*/1, seq_len, spb); + + MallocInfo seed_malloc{seed_res, seed_tokens}; + seed_malloc.reuse_cache = true; + seed_malloc.enable_device_cache = false; + ASSERT_TRUE(allocator->malloc(seed_malloc).success); + + InsertInfo insert_info{seed_res, seed_tokens, /*is_resident=*/false}; + allocator->insertIntoCache(insert_info); + + FreeInfo seed_free{seed_res, seed_tokens}; + allocator->free(seed_free); + + auto evicted = allocator->popBlocksFromCache(/*min_blocks_to_free=*/4); + ASSERT_NE(evicted, nullptr); + ASSERT_TRUE(evicted->hasCacheKeys()); + EXPECT_TRUE(evicted->cacheResource(0).cacheKeysAreCpCanonical()); + + KVCacheResource canonical_source; + canonical_source.setCacheKeys(full_keys); + const auto expected_canonical = canonical_source.localCacheKeys(cp_mapper->cpSize() - 1, cp_mapper->cpSize()); + EXPECT_EQ(evicted->cacheKeys(0), expected_canonical); + const auto& dependencies = evicted->cacheResource(0).blockDependencies(); + ASSERT_EQ(dependencies.size(), expected_canonical.size()); + for (size_t i = 0; i < dependencies.size(); ++i) { + EXPECT_EQ(dependencies[i].ordinal, static_cast(i)); + if (i == 0) { + EXPECT_FALSE(dependencies[i].has_parent); + } else { + EXPECT_TRUE(dependencies[i].has_parent); + EXPECT_EQ(dependencies[i].parent_key, expected_canonical[i - 1]); + } + } +} + +} // namespace test +} // namespace rtp_llm + +int main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/rtp_llm/cpp/cache/test/HybridTypeKVCacheAllocatorTest.cc b/rtp_llm/cpp/cache/test/HybridTypeKVCacheAllocatorTest.cc index c175826cee..ab99d3f1bc 100644 --- a/rtp_llm/cpp/cache/test/HybridTypeKVCacheAllocatorTest.cc +++ b/rtp_llm/cpp/cache/test/HybridTypeKVCacheAllocatorTest.cc @@ -3,13 +3,15 @@ #include #include #include +#include #include #include "rtp_llm/cpp/cache/BatchKVCacheResource.h" -#include "rtp_llm/cpp/cache/BlockCache.h" -#include "rtp_llm/cpp/cache/HybridTypeKVCacheAllocator.h" 
-#include "rtp_llm/cpp/cache/CacheConfigCreator.h" +#include "rtp_llm/cpp/cache/SharedBlockCache.h" +#include "rtp_llm/cpp/cache/allocator/HybridTypeKVCacheAllocator.h" +#include "rtp_llm/cpp/cache/config_creator/CacheConfigCreator.h" #include "rtp_llm/cpp/cache/test/BlockPoolTestHelper.h" +#include "rtp_llm/cpp/cache/test/CacheConfigTestUtils.h" #include "rtp_llm/cpp/config/ModelConfig.h" #include "rtp_llm/cpp/engine_base/stream/CompleteTokenIds.h" #include "rtp_llm/cpp/utils/Logger.h" @@ -33,7 +35,6 @@ static CacheConfig makeTinyHybridConfig() { auto linear_spec = std::make_shared(); linear_spec->type = KVCacheSpecType::LinearAttention; linear_spec->dtype = config.dtype; - linear_spec->layer_num = 2; linear_spec->local_num_k_heads = 1; linear_spec->local_num_v_heads = 1; linear_spec->head_k_dim = 1; @@ -46,17 +47,15 @@ static CacheConfig makeTinyHybridConfig() { auto full_spec = std::make_shared(); full_spec->type = KVCacheSpecType::MultiHeadAttention; full_spec->dtype = config.dtype; - full_spec->layer_num = 2; full_spec->local_head_num_kv = 1; full_spec->size_per_head = 1; full_spec->seq_size_per_block = static_cast(config.seq_size_per_block); // Order matters: linear groups first, then full groups (as in CacheConfigCreator). - config.layer_ids = {{0, 1}, {2, 3}}; - config.global_layer_ids = config.layer_ids; - config.cache_specs = {linear_spec, full_spec}; - config.linear_group_num = 1; - config.full_group_num = 1; + config.fromGroupedSpecs({linear_spec, full_spec}, + {{0, 1}, {2, 3}}, + {CacheGroupType::LINEAR, CacheGroupType::FULL}, + {"linear", "full"}); // Physical block strides: take max between full and linear. 
config.kv_block_stride_bytes = std::max(full_spec->block_size_bytes(), linear_spec->block_size_bytes()); @@ -68,12 +67,6 @@ static CacheConfig makeTinyHybridConfig() { config.block_size_bytes = config.kv_block_size_bytes + config.kv_scale_size_bytes; - config.layer_to_group_id.assign(static_cast(config.layer_num), 0); - for (size_t gid = 0; gid < config.layer_ids.size(); ++gid) { - for (int layer_id : config.layer_ids[gid]) { - config.layer_to_group_id[static_cast(layer_id)] = static_cast(gid); - } - } return config; } @@ -105,6 +98,8 @@ static CacheConfig makeTinyHybridMtpConfigByCreateSpConfig() { score_model_cfg.linear_attention_config.linear_value_head_dim = 8; score_model_cfg.linear_attention_config.linear_num_key_heads = 2; score_model_cfg.linear_attention_config.linear_num_value_heads = 2; + setHybridAttentionKvCacheSpecs(score_model_cfg); + setDefaultKvCacheSpec(propose_model_cfg); ParallelismConfig parallelism_cfg; parallelism_cfg.tp_size = 1; @@ -143,11 +138,12 @@ static CompleteTokenIdsPtr makeCompleteTokenIds(int batch_size, int seq_length, return complete_token_ids; } -static BatchKVCacheResourcePtr makeBatchResource( - int batch_size, int group_nums, int layer_num, const std::vector& layer_to_group_id, CacheKeysType keys) { +static BatchKVCacheResourcePtr makeBatchResource(int batch_size, const CacheConfig& config, CacheKeysType keys) { auto res = std::make_shared(); res->resetBatchSize(batch_size); - res->initGroups(group_nums, layer_num, layer_to_group_id); + res->initGroups(config.groupNums(), + static_cast(config.layer_all_num), + config.layerGroupIdsSnapshot()); for (int b = 0; b < batch_size; ++b) { res->setBatchCacheKeys(b, keys); } @@ -155,21 +151,18 @@ static BatchKVCacheResourcePtr makeBatchResource( } static std::vector allocateAndCache(BlockPoolPtr block_pool, - BlockCachePtr block_cache, + SharedBlockCachePtr shared_cache, int group_id, + int group_num, const CacheKeysType& keys, bool is_resident = true) { auto blocks = 
block_pool->malloc(static_cast(keys.size())); EXPECT_EQ(blocks.size(), keys.size()); for (size_t i = 0; i < keys.size(); ++i) { - BlockCache::CacheItem item; - item.cache_key = keys[i]; - item.group_id = group_id; - item.block_index = blocks[i]; - item.is_resident = is_resident; - EXPECT_TRUE(block_cache->put(item)); - block_pool->blockCacheReference(blocks[i]); + std::vector group_slots(static_cast(group_num), NULL_BLOCK_IDX); + group_slots[static_cast(group_id)] = blocks[i]; + shared_cache->put(keys[i], group_slots, is_resident); } // Drop request references so these blocks behave like "cached but available" blocks. @@ -178,21 +171,18 @@ static std::vector allocateAndCache(BlockPoolPtr block_poo } static std::vector allocateAndCacheKeepAllocated(BlockPoolPtr block_pool, - BlockCachePtr block_cache, + SharedBlockCachePtr shared_cache, int group_id, + int group_num, const CacheKeysType& keys, bool is_resident = true) { auto blocks = block_pool->malloc(static_cast(keys.size())); EXPECT_EQ(blocks.size(), keys.size()); for (size_t i = 0; i < keys.size(); ++i) { - BlockCache::CacheItem item; - item.cache_key = keys[i]; - item.group_id = group_id; - item.block_index = blocks[i]; - item.is_resident = is_resident; - EXPECT_TRUE(block_cache->put(item)); - block_pool->blockCacheReference(blocks[i]); + std::vector group_slots(static_cast(group_num), NULL_BLOCK_IDX); + group_slots[static_cast(group_id)] = blocks[i]; + shared_cache->put(keys[i], group_slots, is_resident); } // NOTE: intentionally keep these blocks allocated/unavailable to avoid accidental reuse via malloc(). 
@@ -220,6 +210,7 @@ class HybridTypeKVCacheAllocatorTest: public ::testing::Test { TEST_F(HybridTypeKVCacheAllocatorTest, InitAndAddressLookupSmoke) { auto config = makeTinyHybridConfig(); auto allocator = std::make_shared(config, AllocationType::DEVICE); + allocator->setSharedBlockCache(std::make_shared()); ASSERT_TRUE(allocator->init()); EXPECT_EQ(allocator->seqSizePerBlock(), 4); @@ -247,7 +238,7 @@ TEST_F(HybridTypeKVCacheAllocatorTest, ConvertToGlobalLayerIdHybridNoMtp) { std::numeric_limits::max()); } -TEST_F(HybridTypeKVCacheAllocatorTest, ConvertToGlobalLayerIdHybridWithMtpSubConfigs) { +TEST_F(HybridTypeKVCacheAllocatorTest, DISABLED_ConvertToGlobalLayerIdHybridWithMtpSubConfigs) { auto config = makeTinyHybridMtpConfigByCreateSpConfig(); auto allocator = std::make_shared(config, AllocationType::DEVICE); @@ -263,41 +254,34 @@ TEST_F(HybridTypeKVCacheAllocatorTest, ConvertToGlobalLayerIdHybridWithMtpSubCon TEST_F(HybridTypeKVCacheAllocatorTest, GetNeedBlocksUsesGroupGetNeedBlocksAndReuseFlag) { auto config = makeTinyHybridConfig(); auto allocator = std::make_shared(config, AllocationType::DEVICE); + allocator->setSharedBlockCache(std::make_shared()); ASSERT_TRUE(allocator->init()); // batch=2, seq_len=12 (3 slots), reserve_step=2 auto token_ids = makeCompleteTokenIds(/*batch_size=*/2, /*seq_length=*/12, /*seq_size_per_block=*/4); token_ids->setReserveStep(2); - // Reuse disabled: linear group keeps only tail for common blocks; reserve_step contributes extra blocks. + // Reuse disabled: linear group keeps tail and tail-1 for common blocks; reserve_step contributes extra blocks. // full group contributes common=3, extra=1. 
{ - auto batch_res = makeBatchResource(/*batch_size=*/2, - /*group_nums=*/2, - /*layer_num=*/static_cast(config.layer_all_num), - /*layer_to_group_id=*/config.layer_to_group_id, - CacheKeysType{100, 101, 102, 103}); + auto batch_res = makeBatchResource(/*batch_size=*/2, config, CacheKeysType{100, 101, 102, 103}); MallocInfo info{batch_res, token_ids}; info.enable_device_cache = false; info.reuse_cache = false; - // common_total = full(3) + linear(1) = 4 + // common_total = full(3) + linear(2) = 5 // extra_total = full(1) + linear(reserve_step-1=1) = 2 - // total = 4 + 2*2 = 8 - EXPECT_EQ(allocator->getNeedBlocks(info), 8); + // total = 5 + 2*2 = 9 + EXPECT_EQ(allocator->getNeedBlocks(info), 9); } - // Reuse enabled but no existing blocks: linear group uses sparse counting from begin=0. + // Reuse enabled but no existing blocks: linear group keeps step hits plus tail/tail-1. { - auto batch_res = makeBatchResource(/*batch_size=*/2, - /*group_nums=*/2, - /*layer_num=*/static_cast(config.layer_all_num), - /*layer_to_group_id=*/config.layer_to_group_id, - CacheKeysType{100, 101, 102, 103}); + auto batch_res = makeBatchResource(/*batch_size=*/2, config, CacheKeysType{100, 101, 102, 103}); MallocInfo info{batch_res, token_ids}; info.enable_device_cache = true; info.reuse_cache = true; // full: common=3 extra=1 - // linear: common=count(0,3]=2, extra=reserve_step-1(=1) + // linear: common=2, extra=reserve_step-1(=1) // common_total = 3 + 2 = 5 // extra_total = 1 + 1 = 2 // total = 5 + 2*2 = 9 @@ -308,32 +292,30 @@ TEST_F(HybridTypeKVCacheAllocatorTest, GetNeedBlocksUsesGroupGetNeedBlocksAndReu TEST_F(HybridTypeKVCacheAllocatorTest, JointReuseUsesFullPrefixAndLinearTailOnly) { auto config = makeTinyHybridConfig(); auto allocator = std::make_shared(config, AllocationType::DEVICE); + allocator->setSharedBlockCache(std::make_shared()); ASSERT_TRUE(allocator->init()); - auto block_pool = allocator->getBlockPool(); - auto block_cache = block_pool->blockCache(); + auto block_pool 
= allocator->getBlockPool(); + auto shared_cache = allocator->sharedBlockCache(); ASSERT_NE(block_pool, nullptr); - ASSERT_NE(block_cache, nullptr); + ASSERT_NE(shared_cache, nullptr); // Config order: gid=0 linear, gid=1 full. const int gid_linear = 0; const int gid_full = 1; + const int group_num = 2; // Full group has prefix matches for {100,101,102}. CacheKeysType full_keys = {100, 101, 102}; - auto full_blocks = allocateAndCache(block_pool, block_cache, gid_full, full_keys); + auto full_blocks = allocateAndCache(block_pool, shared_cache, gid_full, group_num, full_keys); // Linear group only matches key 101 (so joint match should backoff to pos=1 => reuse_blocks_len=2). CacheKeysType linear_keys = {101}; - auto linear_blocks = allocateAndCache(block_pool, block_cache, gid_linear, linear_keys); + auto linear_blocks = allocateAndCache(block_pool, shared_cache, gid_linear, group_num, linear_keys); ASSERT_EQ(linear_blocks.size(), 1u); // Request has 4 keys, but allocator drops the last for matching. - auto batch_res = makeBatchResource(/*batch_size=*/1, - /*group_nums=*/2, - /*layer_num=*/static_cast(config.layer_all_num), - /*layer_to_group_id=*/config.layer_to_group_id, - CacheKeysType{100, 101, 102, 103}); + auto batch_res = makeBatchResource(/*batch_size=*/1, config, CacheKeysType{100, 101, 102, 103}); // Enable device cache reuse for joint match. // seq_len=12 => 3 slots (4 tokens per block). 
@@ -359,16 +341,13 @@ TEST_F(HybridTypeKVCacheAllocatorTest, JointReuseUsesFullPrefixAndLinearTailOnly EXPECT_FALSE(isNullBlockIdx(linear_out[2])); // allocated tail for common length } -TEST_F(HybridTypeKVCacheAllocatorTest, DisableReuseKeepsOnlyLinearTailOnInitMalloc) { +TEST_F(HybridTypeKVCacheAllocatorTest, DisableReuseKeepsLinearTailAndTailMinusOneOnInitMalloc) { auto config = makeTinyHybridConfig(); auto allocator = std::make_shared(config, AllocationType::DEVICE); + allocator->setSharedBlockCache(std::make_shared()); ASSERT_TRUE(allocator->init()); - auto batch_res = makeBatchResource(/*batch_size=*/1, - /*group_nums=*/2, - /*layer_num=*/static_cast(config.layer_all_num), - /*layer_to_group_id=*/config.layer_to_group_id, - CacheKeysType{100, 101, 102, 103}); + auto batch_res = makeBatchResource(/*batch_size=*/1, config, CacheKeysType{100, 101, 102, 103}); // Disable device cache reuse. auto token_ids = makeCompleteTokenIds(/*batch_size=*/1, /*seq_length=*/12, /*seq_size_per_block=*/4); @@ -379,39 +358,37 @@ TEST_F(HybridTypeKVCacheAllocatorTest, DisableReuseKeepsOnlyLinearTailOnInitMall auto result = allocator->malloc(info); ASSERT_TRUE(result.success); - // Linear group should keep only the tail block across common length slots. + // Linear group should keep tail and tail-1 across common length slots. 
const auto& linear_out = batch_res->blocks(0, /*group_id=*/0); ASSERT_EQ(linear_out.size(), 3u); EXPECT_TRUE(isNullBlockIdx(linear_out[0])); - EXPECT_TRUE(isNullBlockIdx(linear_out[1])); + EXPECT_FALSE(isNullBlockIdx(linear_out[1])); EXPECT_FALSE(isNullBlockIdx(linear_out[2])); } -TEST_F(HybridTypeKVCacheAllocatorTest, DisableDeviceCacheSkipsReuseMatchAndAllocatesOnlyLinearTail) { +TEST_F(HybridTypeKVCacheAllocatorTest, DisableDeviceCacheSkipsReuseMatchAndAllocatesLinearTailAndTailMinusOne) { auto config = makeTinyHybridConfig(); auto allocator = std::make_shared(config, AllocationType::DEVICE); + allocator->setSharedBlockCache(std::make_shared()); ASSERT_TRUE(allocator->init()); - auto block_pool = allocator->getBlockPool(); - auto block_cache = block_pool->blockCache(); + auto block_pool = allocator->getBlockPool(); + auto shared_cache = allocator->sharedBlockCache(); ASSERT_NE(block_pool, nullptr); - ASSERT_NE(block_cache, nullptr); + ASSERT_NE(shared_cache, nullptr); // Config order: gid=0 linear, gid=1 full. const int gid_linear = 0; const int gid_full = 1; + const int group_num = 2; // Prepare cached blocks for full group; keep them allocated so allocator's malloc() cannot accidentally return same // ids. CacheKeysType full_keys = {100, 101, 102}; - auto full_blocks = allocateAndCacheKeepAllocated(block_pool, block_cache, gid_full, full_keys); + auto full_blocks = allocateAndCacheKeepAllocated(block_pool, shared_cache, gid_full, group_num, full_keys); ASSERT_EQ(full_blocks.size(), 3u); - auto batch_res = makeBatchResource(/*batch_size=*/1, - /*group_nums=*/2, - /*layer_num=*/static_cast(config.layer_all_num), - /*layer_to_group_id=*/config.layer_to_group_id, - CacheKeysType{100, 101, 102, 103}); + auto batch_res = makeBatchResource(/*batch_size=*/1, config, CacheKeysType{100, 101, 102, 103}); // Disable device cache reuse: allocator should skip reuse match even if cache exists. 
auto token_ids = makeCompleteTokenIds(/*batch_size=*/1, /*seq_length=*/12, /*seq_size_per_block=*/4); // 3 slots @@ -435,18 +412,19 @@ TEST_F(HybridTypeKVCacheAllocatorTest, DisableDeviceCacheSkipsReuseMatchAndAlloc EXPECT_NE(full_out[1], full_blocks[1]); EXPECT_NE(full_out[2], full_blocks[2]); - // Linear group keeps only tail block (others NULL) when reuse is disabled. + // Linear group keeps tail and tail-1 when reuse is disabled. const auto& linear_out = batch_res->blocks(0, gid_linear); ASSERT_EQ(linear_out.size(), 3u); EXPECT_TRUE(isNullBlockIdx(linear_out[0])); - EXPECT_TRUE(isNullBlockIdx(linear_out[1])); + EXPECT_FALSE(isNullBlockIdx(linear_out[1])); EXPECT_FALSE(isNullBlockIdx(linear_out[2])); - EXPECT_EQ(countValidBlocks(linear_out), 1u); + EXPECT_EQ(countValidBlocks(linear_out), 2u); } TEST_F(HybridTypeKVCacheAllocatorTest, IncrDecrKVCacheRefReferencesOnlyMatchedValidBlocksAcrossGroups) { auto config = makeTinyHybridConfig(); auto allocator = std::make_shared(config, AllocationType::HOST); + allocator->setSharedBlockCache(std::make_shared()); ASSERT_TRUE(allocator->init()); auto block_pool = allocator->getBlockPool(); @@ -458,9 +436,12 @@ TEST_F(HybridTypeKVCacheAllocatorTest, IncrDecrKVCacheRefReferencesOnlyMatchedVa EXPECT_EQ(allocator->freeBlocksNum(), free_before - 4); KVCacheResource resource; + std::vector group_types = {CacheGroupType::LINEAR, CacheGroupType::FULL}; resource.initGroups(/*group_nums=*/2, /*layer_num=*/static_cast(config.layer_all_num), - /*layer_to_group_id=*/config.layer_to_group_id); + /*layer_group_ids=*/config.layerGroupIdsSnapshot(), + /*kernel_blocks_per_kv_block=*/config.kernelBlocksPerKvBlock(), + /*group_types=*/group_types); resource.cacheKeys() = CacheKeysType{100, 101, 102}; resource.mutableBlockIds(/*gid=*/0).assign( BlockIndicesType{blocks[0], 0, blocks[1]}); // linear group (contains a 0) @@ -470,7 +451,7 @@ TEST_F(HybridTypeKVCacheAllocatorTest, IncrDecrKVCacheRefReferencesOnlyMatchedVa auto ref = 
allocator->incrKVCacheRef(resource, CacheKeysType{101, 999, 102}); ASSERT_NE(ref, nullptr); ASSERT_EQ(ref->groupNums(), 2); - ASSERT_EQ(ref->cacheKeys().size(), 3u); + ASSERT_EQ(ref->cacheKeys(), (CacheKeysType{101, 102})); ASSERT_EQ(ref->blocks(0).size(), 2u); ASSERT_EQ(ref->blocks(1).size(), 2u); @@ -481,28 +462,65 @@ TEST_F(HybridTypeKVCacheAllocatorTest, IncrDecrKVCacheRefReferencesOnlyMatchedVa EXPECT_EQ(allocator->freeBlocksNum(), free_before); } -TEST_F(HybridTypeKVCacheAllocatorTest, InsertIntoCacheInsertsOnlyFullBlocks) { +TEST_F(HybridTypeKVCacheAllocatorTest, IncrKVCacheRefPreservesConnectorDummyTail) { + auto config = makeTinyHybridConfig(); + auto allocator = std::make_shared(config, AllocationType::HOST); + allocator->setSharedBlockCache(std::make_shared()); + ASSERT_TRUE(allocator->init()); + + auto block_pool = allocator->getBlockPool(); + ASSERT_NE(block_pool, nullptr); + + const size_t free_before = allocator->freeBlocksNum(); + auto blocks = block_pool->malloc(2); + ASSERT_EQ(blocks.size(), 2u); + + KVCacheResource resource; + resource.initGroups(/*group_nums=*/2, + /*layer_num=*/static_cast(config.layer_all_num), + /*layer_group_ids=*/config.layerGroupIdsSnapshot()); + resource.cacheKeys() = CacheKeysType{101, 103, 999}; + resource.rebuildLinearBlockDependencies(); + resource.setLastBlockAligned(false); + resource.mutableBlockIds(/*gid=*/0).assign(BlockIndicesType{NULL_BLOCK_IDX, NULL_BLOCK_IDX}); + resource.mutableBlockIds(/*gid=*/1).assign(BlockIndicesType{blocks[0], blocks[1]}); + + auto ref = allocator->incrKVCacheRef(resource, CacheKeysType{101, 103, 999}, /*is_connector=*/true); + ASSERT_NE(ref, nullptr); + EXPECT_FALSE(ref->lastBlockAligned()); + EXPECT_EQ(ref->cacheKeys(), (CacheKeysType{101, 103, 999})); + ASSERT_EQ(ref->blocks(0).size(), 3u); + ASSERT_EQ(ref->blocks(1).size(), 3u); + EXPECT_TRUE(isNullBlockIdx(ref->blocks(0)[2])); + EXPECT_TRUE(isNullBlockIdx(ref->blocks(1)[2])); + + block_pool->requestFree(blocks); + 
EXPECT_EQ(allocator->freeBlocksNum(), free_before - 2); + + ref.reset(); + EXPECT_EQ(allocator->freeBlocksNum(), free_before); +} + +TEST_F(HybridTypeKVCacheAllocatorTest, InsertIntoCachePreservesLegacyNonCpAggregateSurface) { auto config = makeTinyHybridConfig(); auto allocator = std::make_shared(config, AllocationType::DEVICE); + allocator->setSharedBlockCache(std::make_shared()); ASSERT_TRUE(allocator->init()); - auto block_pool = allocator->getBlockPool(); - auto block_cache = block_pool->blockCache(); + auto block_pool = allocator->getBlockPool(); + auto shared_cache = allocator->sharedBlockCache(); ASSERT_NE(block_pool, nullptr); - ASSERT_NE(block_cache, nullptr); + ASSERT_NE(shared_cache, nullptr); // gid=0 linear, gid=1 full. const int gid_linear = 0; const int gid_full = 1; - auto batch_res = makeBatchResource(/*batch_size=*/1, - /*group_nums=*/2, - /*layer_num=*/static_cast(config.layer_all_num), - /*layer_to_group_id=*/config.layer_to_group_id, - CacheKeysType{100, 101, 102}); + auto batch_res = makeBatchResource(/*batch_size=*/1, config, CacheKeysType{100, 101, 102}); // Disable device cache reuse. - // seq_len=10 => 3 slots, full_blocks_num = floor(10/4)=2 -> only first 2 keys inserted. + // Non-CP insert keeps the legacy aggregate surface: every materialized + // group slot is merged under its key, including hybrid tail slots. auto token_ids = makeCompleteTokenIds(/*batch_size=*/1, /*seq_length=*/10, /*seq_size_per_block=*/4); MallocInfo malloc_info{batch_res, token_ids}; @@ -516,19 +534,21 @@ TEST_F(HybridTypeKVCacheAllocatorTest, InsertIntoCacheInsertsOnlyFullBlocks) { InsertInfo insert_info{batch_res, token_ids, /*is_resident=*/false}; allocator->insertIntoCache(insert_info); - // Full group should have cached first two keys. 
- EXPECT_TRUE(block_cache->contains(100, gid_full)); - EXPECT_TRUE(block_cache->contains(101, gid_full)); - EXPECT_FALSE(block_cache->contains(102, gid_full)); + // Full group has all allocated slots cached, including the trailing block. + EXPECT_FALSE(isNullBlockIdx(shared_cache->matchGroup(100, gid_full))); + EXPECT_FALSE(isNullBlockIdx(shared_cache->matchGroup(101, gid_full))); + EXPECT_FALSE(isNullBlockIdx(shared_cache->matchGroup(102, gid_full))); - // Linear group has NULL in early slots when reuse disabled, thus should not insert these full blocks. - EXPECT_FALSE(block_cache->contains(100, gid_linear)); - EXPECT_FALSE(block_cache->contains(101, gid_linear)); + // Linear group keeps its tail and tail-minus-one slots. + EXPECT_TRUE(isNullBlockIdx(shared_cache->matchGroup(100, gid_linear))); + EXPECT_FALSE(isNullBlockIdx(shared_cache->matchGroup(101, gid_linear))); + EXPECT_FALSE(isNullBlockIdx(shared_cache->matchGroup(102, gid_linear))); } TEST_F(HybridTypeKVCacheAllocatorTest, ConvertIndexToBufferAndAllLayerCacheBaseSmoke) { auto config = makeTinyHybridConfig(); auto allocator = std::make_shared(config, AllocationType::DEVICE); + allocator->setSharedBlockCache(std::make_shared()); ASSERT_TRUE(allocator->init()); KVCacheAllocator* base = allocator.get(); @@ -547,17 +567,14 @@ TEST_F(HybridTypeKVCacheAllocatorTest, IncrMallocRollbackFreesPartiallyAllocated auto config = makeTinyHybridConfig(); config.block_num = 6; // free=5 auto allocator = std::make_shared(config, AllocationType::DEVICE); + allocator->setSharedBlockCache(std::make_shared()); ASSERT_TRUE(allocator->init()); auto block_pool = allocator->getBlockPool(); ASSERT_NE(block_pool, nullptr); - auto batch_res = makeBatchResource(/*batch_size=*/1, - /*group_nums=*/2, - /*layer_num=*/static_cast(config.layer_all_num), - /*layer_to_group_id=*/config.layer_to_group_id, - CacheKeysType{100, 101, 102}); - // Disable device cache reuse (makes linear group allocate only tail for new slots). 
+ auto batch_res = makeBatchResource(/*batch_size=*/1, config, CacheKeysType{100, 101, 102}); + // Disable device cache reuse (linear group still materializes tail and tail-1). // Initial small allocation: seq_len=4 => 1 slot per group. auto token_ids = makeCompleteTokenIds(/*batch_size=*/1, /*seq_length=*/4, /*seq_size_per_block=*/4); @@ -597,114 +614,6 @@ TEST_F(HybridTypeKVCacheAllocatorTest, IncrMallocRollbackFreesPartiallyAllocated block_pool->requestFree(keep); } -// Prefill init path (StreamCacheResource::initKVBlock sets enable_remove_skipped_blocks=false). -// With step=2 and reuse_blocks_len=3, the reused linear tail lands at pos 2, which is NOT -// a step hit ((2+1)%2==1). Without sparse cleanup, that slot must survive so that -// causal_conv1d can still read it by prefix_length. -TEST_F(HybridTypeKVCacheAllocatorTest, PrefillInitSkipsSparseCleanupAndPreservesReusedLinearTail) { - auto config = makeTinyHybridConfig(); - config.block_num = 16; // 6 cached (resident, non-evictable) + 4 new + 1 null reserved - auto allocator = std::make_shared(config, AllocationType::DEVICE); - ASSERT_TRUE(allocator->init()); - - auto block_pool = allocator->getBlockPool(); - auto block_cache = block_pool->blockCache(); - ASSERT_NE(block_pool, nullptr); - ASSERT_NE(block_cache, nullptr); - - const int gid_linear = 0; - const int gid_full = 1; - - CacheKeysType shared_keys = {100, 101, 102}; - auto cached_full_blocks = allocateAndCache(block_pool, block_cache, gid_full, shared_keys); - auto cached_linear_blocks = allocateAndCache(block_pool, block_cache, gid_linear, shared_keys); - ASSERT_EQ(cached_linear_blocks.size(), 3u); - - // Request has 5 keys; allocator drops the last before matching, leaving {100,101,102,103}. - // Full matches the first 3 (103 is absent); linear joint backoff stops at pos=2 => reuse_blocks_len=3. 
- auto batch_res = makeBatchResource(/*batch_size=*/1, - /*group_nums=*/2, - /*layer_num=*/static_cast(config.layer_all_num), - /*layer_to_group_id=*/config.layer_to_group_id, - CacheKeysType{100, 101, 102, 103, 104}); - - // seq_len=20 => 5 slots. block_size-3-reserve_step = 2, so removeSkippedBlocks would scan pos 2. - auto token_ids = makeCompleteTokenIds(/*batch_size=*/1, /*seq_length=*/20, /*seq_size_per_block=*/4); - - MallocInfo info{batch_res, token_ids}; - info.enable_device_cache = true; - info.reuse_cache = true; - info.enable_remove_skipped_blocks = false; // prefill init path - auto result = allocator->malloc(info); - ASSERT_TRUE(result.success); - - const auto& linear_out = batch_res->blocks(0, gid_linear); - ASSERT_EQ(linear_out.size(), 5u); - EXPECT_TRUE(isNullBlockIdx(linear_out[0])); - EXPECT_TRUE(isNullBlockIdx(linear_out[1])); - EXPECT_EQ(linear_out[2], cached_linear_blocks[2]) << "reused linear tail must survive prefill init"; - EXPECT_FALSE(isNullBlockIdx(linear_out[3])); - EXPECT_FALSE(isNullBlockIdx(linear_out[4])); -} - -// Decode path (StreamCacheResource::incrKVBlock sets enable_remove_skipped_blocks=true). -// The allocator is invoked on an already-populated resource, so malloc() dispatches directly -// to incrMalloc(). Sparse cleanup must prune non-step blocks while preserving step hits and -// the last two slots. 
-TEST_F(HybridTypeKVCacheAllocatorTest, DecodeIncrMallocAppliesSparseCleanupOnLinearGroups) { - auto config = makeTinyHybridConfig(); - config.block_num = 16; // pre-allocates 6 + 6 = 12 blocks plus the reserved null block - auto allocator = std::make_shared(config, AllocationType::DEVICE); - ASSERT_TRUE(allocator->init()); - - auto block_pool = allocator->getBlockPool(); - ASSERT_NE(block_pool, nullptr); - - const int gid_linear = 0; - const int gid_full = 1; - - auto linear_alloc = block_pool->malloc(6); - auto full_alloc = block_pool->malloc(6); - ASSERT_EQ(linear_alloc.size(), 6u); - ASSERT_EQ(full_alloc.size(), 6u); - - auto batch_res = makeBatchResource(/*batch_size=*/1, - /*group_nums=*/2, - /*layer_num=*/static_cast(config.layer_all_num), - /*layer_to_group_id=*/config.layer_to_group_id, - CacheKeysType{}); - batch_res->mutableBlockIds(0, gid_linear).assign(linear_alloc); - batch_res->mutableBlockIds(0, gid_full).assign(full_alloc); - ASSERT_GT(batch_res->curBlocksNum(), 0); - - // seq_len=24 => 6 slots; current_blocks==6 so group malloc is a no-op and only cleanup runs. - auto token_ids = makeCompleteTokenIds(/*batch_size=*/1, /*seq_length=*/24, /*seq_size_per_block=*/4); - - MallocInfo info{batch_res, token_ids}; - info.enable_device_cache = false; - info.reuse_cache = true; - info.enable_remove_skipped_blocks = true; // decode path - auto result = allocator->malloc(info); - ASSERT_TRUE(result.success); - - // For step=2 and size=6: keep pos 1, 3 (step hits) and last two (4, 5); null pos 0, 2. - const auto& linear_out = batch_res->blocks(0, gid_linear); - ASSERT_EQ(linear_out.size(), 6u); - EXPECT_TRUE(isNullBlockIdx(linear_out[0])); - EXPECT_FALSE(isNullBlockIdx(linear_out[1])); - EXPECT_TRUE(isNullBlockIdx(linear_out[2])); - EXPECT_FALSE(isNullBlockIdx(linear_out[3])); - EXPECT_FALSE(isNullBlockIdx(linear_out[4])); - EXPECT_FALSE(isNullBlockIdx(linear_out[5])); - - // Full group is untouched by sparse cleanup. 
- const auto& full_out = batch_res->blocks(0, gid_full); - ASSERT_EQ(full_out.size(), 6u); - for (size_t i = 0; i < full_out.size(); ++i) { - EXPECT_EQ(full_out[i], full_alloc[i]); - } -} - } // namespace test } // namespace rtp_llm diff --git a/rtp_llm/cpp/cache/test/KVCacheGroupTest.cc b/rtp_llm/cpp/cache/test/KVCacheGroupTest.cc index 438f4ab870..77cb7b4881 100644 --- a/rtp_llm/cpp/cache/test/KVCacheGroupTest.cc +++ b/rtp_llm/cpp/cache/test/KVCacheGroupTest.cc @@ -4,7 +4,7 @@ #include #include #include -#include "rtp_llm/cpp/cache/FullKVCacheGroup.h" +#include "rtp_llm/cpp/cache/group/FullKVCacheGroup.h" #include "rtp_llm/cpp/cache/test/BlockPoolTestHelper.h" namespace rtp_llm { diff --git a/rtp_llm/cpp/cache/test/KVCacheManagerCPSlotMapperTest.cc b/rtp_llm/cpp/cache/test/KVCacheManagerCPSlotMapperTest.cc new file mode 100644 index 0000000000..1070266c64 --- /dev/null +++ b/rtp_llm/cpp/cache/test/KVCacheManagerCPSlotMapperTest.cc @@ -0,0 +1,309 @@ +#include +#include +#include + +#include "rtp_llm/cpp/cache/KVCacheManager.h" +#include "rtp_llm/cpp/cache/CPSlotMapper.h" +#include "rtp_llm/cpp/cache/test/CacheConfigTestUtils.h" +#include "rtp_llm/cpp/cache/test/BlockPoolTestHelper.h" +#include "rtp_llm/cpp/cache/BatchKVCacheResource.h" +#include "rtp_llm/cpp/engine_base/stream/CompleteTokenIds.h" +#include "rtp_llm/cpp/utils/Logger.h" + +namespace rtp_llm { +namespace test { + +static CacheConfig makeTestConfig(int block_num = 20, int seq_size_per_block = 4) { + return makeSimpleMhaCacheConfig( + /*layer_num=*/2, + block_num, + /*tokens_per_block=*/static_cast(seq_size_per_block), + rtp_llm::DataType::TYPE_FP16, + /*local_head_num_kv=*/1, + /*size_per_head=*/16); +} + +static CompleteTokenIdsPtr makeTokenIds(int batch_size, int seq_len, int block_size) { + auto ids = std::make_shared(batch_size, batch_size, seq_len + 100, block_size); + auto input_ids = torch::empty({(int64_t)seq_len}, torch::kInt32); + auto* ptr = input_ids.data_ptr(); + for (int i = 0; i < 
seq_len; ++i) + ptr[i] = i + 1; + auto gi = std::make_shared(); + gi->input_ids = input_ids; + gi->generate_config = std::make_shared(); + ids->init(gi); + return ids; +} + +static BatchKVCacheResourcePtr makeResource(int batch_size, int layer_num) { + auto res = std::make_shared(); + res->resetBatchSize(batch_size); + std::vector> layer_group_ids(static_cast(layer_num), std::vector{0}); + res->initGroups(/*group_nums=*/1, layer_num, layer_group_ids); + return res; +} + +class KVCacheManagerCPSlotMapperTest: public ::testing::Test { +protected: + void SetUp() override { + rtp_llm::initLogger(); + createDevice(); + } +}; + +// When kv_cache_sharded is false (default), cpSlotMapper() should return nullptr. +TEST_F(KVCacheManagerCPSlotMapperTest, NoCPSharding_ReturnsNullMapper) { + auto config = makeTestConfig(); + ParallelismConfig par; + par.tp_rank = 0; + par.tp_size = 2; + par.prefill_cp_config.kv_cache_sharded = false; + + // warmup=true skips allocateAndSync (which would NCCL all-gather across the + // tp_size process group; in single-process UT there are no peers). cp_slot_mapper_ + // is constructed regardless of warmup, so cpSlotMapper() check is unaffected. + auto mgr = std::make_shared(config, /*warmup=*/true, nullptr, KVCacheConfig{}, par); + ASSERT_TRUE(mgr->init()); + + EXPECT_EQ(mgr->cpSlotMapper(), nullptr); +} + +// When tp_size == 1, cpSlotMapper() should return nullptr even if kv_cache_sharded is true. +TEST_F(KVCacheManagerCPSlotMapperTest, SingleRank_ReturnsNullMapper) { + auto config = makeTestConfig(); + ParallelismConfig par; + par.tp_rank = 0; + par.tp_size = 1; + par.prefill_cp_config.kv_cache_sharded = true; + + // warmup=true skips allocateAndSync (which would NCCL all-gather across the + // tp_size process group; in single-process UT there are no peers). cp_slot_mapper_ + // is constructed regardless of warmup, so cpSlotMapper() check is unaffected. 
+ auto mgr = std::make_shared(config, /*warmup=*/true, nullptr, KVCacheConfig{}, par); + ASSERT_TRUE(mgr->init()); + + EXPECT_EQ(mgr->cpSlotMapper(), nullptr); +} + +// When kv_cache_sharded is true and tp_size > 1, cpSlotMapper() should return a valid mapper. +TEST_F(KVCacheManagerCPSlotMapperTest, CPShardingEnabled_ReturnsValidMapper) { + const int seq_size_per_block = 4; + auto config = makeTestConfig(/*block_num=*/20, seq_size_per_block); + + ParallelismConfig par; + par.tp_rank = 1; + par.tp_size = 2; + par.prefill_cp_config.kv_cache_sharded = true; + + // warmup=true skips allocateAndSync (which would NCCL all-gather across the + // tp_size process group; in single-process UT there are no peers). cp_slot_mapper_ + // is constructed regardless of warmup, so cpSlotMapper() check is unaffected. + auto mgr = std::make_shared(config, /*warmup=*/true, nullptr, KVCacheConfig{}, par); + ASSERT_TRUE(mgr->init()); + + auto mapper = mgr->cpSlotMapper(); + ASSERT_NE(mapper, nullptr); + EXPECT_TRUE(mapper->isSharded()); + EXPECT_EQ(mapper->cpRank(), 1); + EXPECT_EQ(mapper->cpSize(), 2); + EXPECT_EQ(mapper->blockSize(), seq_size_per_block); + EXPECT_EQ(mapper->virtualBlockSize(), seq_size_per_block * 2); +} + +TEST_F(KVCacheManagerCPSlotMapperTest, CPShardingEnabled_CacheInfoReportsVirtualBlockSize) { + const int seq_size_per_block = 4; + auto config = makeTestConfig(/*block_num=*/20, seq_size_per_block); + + ParallelismConfig par; + par.tp_rank = 0; + par.tp_size = 4; + par.prefill_cp_config.kv_cache_sharded = true; + + auto mgr = std::make_shared(config, /*warmup=*/true, nullptr, KVCacheConfig{}, par); + ASSERT_TRUE(mgr->init()); + + auto info = mgr->getKVCacheInfo(/*latest_version=*/-1, /*need_cache_keys=*/false); + EXPECT_EQ(info.block_size, static_cast(seq_size_per_block * par.tp_size)); +} + +// Partial tails may be allocated as live KV blocks before they become cacheable +// full blocks. 
CP invariants must therefore be based on logical sequence length, +// not cacheKeys().size(). +TEST_F(KVCacheManagerCPSlotMapperTest, CPShardedMallocAllowsPartialTailWithoutCacheKey) { + const int seq_size_per_block = 4; + auto config = makeTestConfig(/*block_num=*/20, seq_size_per_block); + + ParallelismConfig par; + + auto mgr = std::make_shared(config, /*warmup=*/false, nullptr, KVCacheConfig{}, par); + ASSERT_TRUE(mgr->init()); + + auto resource = makeResource(1, config.layer_num); + auto token_ids = makeTokenIds(1, /*seq_len=*/1, seq_size_per_block); + + MallocInfo info{resource, token_ids}; + auto cp_mapper = std::make_shared(0, 2, seq_size_per_block); + mgr->cp_slot_mapper_ = cp_mapper; + mgr->allocator_->setCPSlotMapper(cp_mapper); + + auto result = mgr->malloc(info); + ASSERT_TRUE(result.success); + EXPECT_EQ(resource->blocksNum(0, 0), 1); + + token_ids->setSeqLength(2); + result = mgr->malloc(info); + ASSERT_TRUE(result.success); + EXPECT_EQ(resource->blocksNum(0, 0), 1); + EXPECT_EQ(resource->cacheKeys(0).size(), 0); +} + +// malloc() should use the manager-level cpSlotMapper. +// With CP sharding (cp_size=2, block_size=4), virtual_block_size=8. +// A sequence of 16 tokens needs ceil(16/8)=2 physical blocks per batch (not 4). +// DISABLED: needs multi-rank NCCL harness (KVCacheManager::allocateAndSync calls +// execAllGather across the tp_size group); covered end-to-end in Stage 6 smoke. +TEST_F(KVCacheManagerCPSlotMapperTest, DISABLED_MallocAutoInjectReducesBlockCount) { + const int seq_size_per_block = 4; + auto config = makeTestConfig(/*block_num=*/20, seq_size_per_block); + + ParallelismConfig par; + par.tp_rank = 0; + par.tp_size = 2; + par.prefill_cp_config.kv_cache_sharded = true; + + // warmup=true skips allocateAndSync (which would NCCL all-gather across the + // tp_size process group; in single-process UT there are no peers). cp_slot_mapper_ + // is constructed regardless of warmup, so cpSlotMapper() check is unaffected. 
+ auto mgr = std::make_shared(config, /*warmup=*/true, nullptr, KVCacheConfig{}, par); + ASSERT_TRUE(mgr->init()); + + const int seq_len = 16; + auto resource = makeResource(1, config.layer_num); + auto token_ids = makeTokenIds(1, seq_len, seq_size_per_block); + + MallocInfo info{resource, token_ids}; + auto result = mgr->malloc(info); + ASSERT_TRUE(result.success); + + // virtual_block_size = 4 * 2 = 8 + // effectiveSeqLenForAlloc(16) = ceil(16/8) * 4 = 8 tokens worth => ceil(8/4) = 2 blocks + EXPECT_EQ(resource->blocksNum(0, 0), 2); +} + +// Without CP sharding, the same seq_len should allocate more blocks. +// DISABLED: needs multi-rank NCCL harness (KVCacheManager::allocateAndSync calls +// execAllGather across the tp_size group); covered end-to-end in Stage 6 smoke. +TEST_F(KVCacheManagerCPSlotMapperTest, DISABLED_MallocWithoutCPAllocatesFullBlocks) { + const int seq_size_per_block = 4; + auto config = makeTestConfig(/*block_num=*/20, seq_size_per_block); + + ParallelismConfig par; + par.tp_rank = 0; + par.tp_size = 2; + par.prefill_cp_config.kv_cache_sharded = false; + + // warmup=true skips allocateAndSync (which would NCCL all-gather across the + // tp_size process group; in single-process UT there are no peers). cp_slot_mapper_ + // is constructed regardless of warmup, so cpSlotMapper() check is unaffected. + auto mgr = std::make_shared(config, /*warmup=*/true, nullptr, KVCacheConfig{}, par); + ASSERT_TRUE(mgr->init()); + + const int seq_len = 16; + auto resource = makeResource(1, config.layer_num); + auto token_ids = makeTokenIds(1, seq_len, seq_size_per_block); + + MallocInfo info{resource, token_ids}; + auto result = mgr->malloc(info); + ASSERT_TRUE(result.success); + + // Without CP: ceil(16/4) = 4 blocks + EXPECT_EQ(resource->blocksNum(0, 0), 4); +} + +// Allocator-level cp_slot_mapper should drive malloc sharding. 
+// DISABLED: needs multi-rank NCCL harness (KVCacheManager::allocateAndSync calls +// execAllGather across the tp_size group); covered end-to-end in Stage 6 smoke. +TEST_F(KVCacheManagerCPSlotMapperTest, DISABLED_AllocatorMapperControlsMalloc) { + const int seq_size_per_block = 4; + auto config = makeTestConfig(/*block_num=*/30, seq_size_per_block); + + ParallelismConfig par; + par.tp_rank = 0; + par.tp_size = 2; + par.prefill_cp_config.kv_cache_sharded = true; + + // warmup=true skips allocateAndSync (which would NCCL all-gather across the + // tp_size process group; in single-process UT there are no peers). cp_slot_mapper_ + // is constructed regardless of warmup, so cpSlotMapper() check is unaffected. + auto mgr = std::make_shared(config, /*warmup=*/true, nullptr, KVCacheConfig{}, par); + ASSERT_TRUE(mgr->init()); + + const int seq_len = 64; + auto resource = makeResource(1, config.layer_num); + auto token_ids = makeTokenIds(1, seq_len, seq_size_per_block); + + auto explicit_mapper = std::make_shared(0, 4, seq_size_per_block); + // virtual_block_size = 4 * 4 = 16 + // effectiveSeqLenForAlloc(64) = ceil(64/16)*4 = 16 tokens => ceil(16/4) = 4 blocks + + MallocInfo info{resource, token_ids}; + mgr->cp_slot_mapper_ = explicit_mapper; + mgr->allocator_->setCPSlotMapper(explicit_mapper); + auto result = mgr->malloc(info); + ASSERT_TRUE(result.success); + + EXPECT_EQ(resource->blocksNum(0, 0), 4); +} + +// insertIntoCache() should also use the manager-level mapper. +// DISABLED: same reason as above (multi-rank harness needed). 
+TEST_F(KVCacheManagerCPSlotMapperTest, DISABLED_InsertAutoInjectsMapper) { + const int seq_size_per_block = 4; + auto config = makeTestConfig(/*block_num=*/20, seq_size_per_block); + + ParallelismConfig par; + par.tp_rank = 0; + par.tp_size = 2; + par.prefill_cp_config.kv_cache_sharded = true; + + KVCacheConfig kv_cfg; + kv_cfg.reuse_cache = true; + kv_cfg.enable_device_cache = true; + + auto mgr = std::make_shared(config, false, nullptr, kv_cfg, par); + ASSERT_TRUE(mgr->init()); + // virtual_block_size = 4 * 2 = 8 + // effectiveSeqLenForAlloc(16) = ceil(16/8) * 4 = 8 tokens worth => ceil(8/4) = 2 blocks + + const int seq_len = 16; + auto resource = makeResource(1, config.layer_num); + auto token_ids = makeTokenIds(1, seq_len, seq_size_per_block); + + MallocInfo malloc_info{resource, token_ids}; + malloc_info.reuse_cache = true; + malloc_info.enable_device_cache = true; + auto result = mgr->malloc(malloc_info); + ASSERT_TRUE(result.success); + + // Insert into cache using the allocator-level cp_slot_mapper. + // This should not crash and should use sharded insert logic. + InsertInfo insert_info{resource, token_ids, /*is_resident=*/false}; + EXPECT_NO_THROW(mgr->insertIntoCache(insert_info)); + + // Now try to malloc again with the same token_ids -- should get reuse hit. + auto resource2 = makeResource(1, config.layer_num); + MallocInfo malloc_info2{resource2, token_ids}; + malloc_info2.reuse_cache = true; + malloc_info2.enable_device_cache = true; + auto result2 = mgr->malloc(malloc_info2); + ASSERT_TRUE(result2.success); + // With CP sharding (cp_size=2, block_size=4), virtual_block_size=8. + // seq_len=16 produces 2 cache keys (each covering 8 tokens). + // match drops the last key → 1 matched key → reuse_len = 1 * virtual_block_size = 8. + // The sharded reuse_length adjustment ensures this is 1 * virtual_block_size = 8, not 1 * seq_size_per_block = 4. 
+ EXPECT_EQ(result2.reuse_len, seq_size_per_block * par.tp_size); // = 4 * 2 = 8 +} + +} // namespace test +} // namespace rtp_llm diff --git a/rtp_llm/cpp/cache/test/KVCacheManagerTest.cc b/rtp_llm/cpp/cache/test/KVCacheManagerTest.cc index 36d45115e8..d55a76e97e 100644 --- a/rtp_llm/cpp/cache/test/KVCacheManagerTest.cc +++ b/rtp_llm/cpp/cache/test/KVCacheManagerTest.cc @@ -4,10 +4,14 @@ #include #include #include +#include #include #include "kmonitor/client/MetricsReporter.h" -#include "rtp_llm/cpp/cache/BlockCache.h" +#include "rtp_llm/cpp/cache/SharedBlockCache.h" +#include "rtp_llm/cpp/cache/config_creator/CacheConfigCreator.h" +#include "rtp_llm/cpp/cache/allocator/HybridPoolKVCacheAllocator.h" +#include "rtp_llm/cpp/cache/config_creator/HybridPoolConfigCreator.h" #include "rtp_llm/cpp/cache/KVCacheManager.h" #include "rtp_llm/cpp/cache/test/CacheConfigTestUtils.h" #include "rtp_llm/cpp/cache/test/BlockPoolTestHelper.h" @@ -16,19 +20,35 @@ #include "rtp_llm/cpp/cache/connector/test/mock/MockAsyncContext.h" #include "rtp_llm/cpp/cache/connector/test/mock/MockKVCacheConnectorCoordinator.h" #include "rtp_llm/cpp/cache/connector/test/mock/MockKVCacheConnectorReadWriteContext.h" +#include "rtp_llm/cpp/config/ModelConfig.h" +#include "rtp_llm/cpp/config/StaticConfig.h" +#include "rtp_llm/cpp/engine_base/stream/CompleteTokenIds.h" #include "rtp_llm/cpp/utils/Logger.h" namespace rtp_llm { namespace test { +namespace { +constexpr int kDsv4PoolNum = 7; +const std::vector kDsv4Tags = { + "swa_kv", "csa_kv", "indexer_kv", "indexer_state", "csa_state", "hca_kv", "hca_state"}; +} + class KVCacheManagerTest: public ::testing::Test { protected: void SetUp() override { + old_core_dump_on_exception_ = StaticConfig::user_ft_core_dump_on_exception; + StaticConfig::user_ft_core_dump_on_exception = false; rtp_llm::initLogger(); createDevice(); } -protected: + void TearDown() override { + StaticConfig::user_ft_core_dump_on_exception = old_core_dump_on_exception_; + } + +private: + 
bool old_core_dump_on_exception_{false}; }; static void assertBlockBytesEq(const std::shared_ptr& cache_manager, @@ -79,6 +99,207 @@ static void assertScaleEq(const std::shared_ptr& cache_ } } +static ModelConfig makeDSV4ManagerFlashModelConfig() { + ModelConfig mc; + mc.num_layers = 43; + mc.hidden_size = 4096; + mc.attn_config.head_num = 64; + mc.attn_config.kv_head_num = 1; + mc.attn_config.size_per_head = 512; + mc.attn_config.rope_head_dim = 64; + mc.attn_config.sliding_window = 128; + mc.attn_config.indexer_head_dim = 128; + mc.attn_config.indexer_head_num = 64; + mc.attn_config.indexer_topk = 512; + mc.attn_config.o_groups = 8; + mc.attn_config.o_lora_rank = 1024; + std::vector ratios = {0, 0}; + for (int i = 2; i < 43; i++) { + ratios.push_back((i % 2 == 0) ? 4 : 128); + } + ratios.push_back(0); + mc.attn_config.layer_compress_ratios = ratios; + mc.hybrid_attention_config.enable_hybrid_attention = true; + mc.hybrid_attention_config.enable_independent_kv_cache_pools = true; + setDsv4KvCacheSpecs(mc); + return mc; +} + +static void setGroupBlockNumsForTest(CacheConfig& config, const std::vector& block_nums) { + std::vector kv_strides; + std::vector scale_strides; + kv_strides.reserve(static_cast(config.groupNums())); + scale_strides.reserve(static_cast(config.groupNums())); + for (size_t gid = 0; gid < static_cast(config.groupNums()); ++gid) { + kv_strides.push_back(config.kvBlockStrideBytesForGroup(gid)); + scale_strides.push_back(config.kvScaleStrideBytesForGroup(gid)); + } + config.setGroupBlockLayout(block_nums, kv_strides, scale_strides); +} + +static CacheConfig makeCompactDSV4ManagerConfig(uint32_t block_num = 16) { + ParallelismConfig pc; + KVCacheConfig kv_cache_config; + kv_cache_config.seq_size_per_block = 128; + auto mc = makeDSV4ManagerFlashModelConfig(); + setDsv4ExplicitPoolBlocks(mc, "hca_state", 0); + auto config = CacheConfigCreator::createBasicConfig(mc, pc, kv_cache_config, false, 0); + config.block_num = block_num; + 
setGroupBlockNumsForTest(config, std::vector(static_cast(config.groupNums()), block_num)); + return config; +} + +static bool isValidGroup(const CacheConfig& config, int gid) { + return gid >= 0 && static_cast(gid) < static_cast(config.groupNums()); +} + +static bool isFullGroup(const CacheConfig& config, int gid) { + return isValidGroup(config, gid) && config.typeForGroup(static_cast(gid)) == CacheGroupType::FULL; +} + +static bool isFixedTailGroup(const CacheConfig& config, int gid) { + return isValidGroup(config, gid) && config.typeForGroup(static_cast(gid)) != CacheGroupType::FULL; +} + +static bool isHcaStateGroup(const CacheConfig& config, int gid) { + return isValidGroup(config, gid) && config.tagForGroup(static_cast(gid)) == "hca_state"; +} + +static std::vector dsv4GroupIdsByType(const CacheConfig& config, CacheGroupType type) { + std::vector group_ids; + for (int gid = 0; gid < config.groupNums(); ++gid) { + if (config.typeForGroup(static_cast(gid)) == type) { + group_ids.push_back(gid); + } + } + return group_ids; +} + +static std::vector dsv4FixedTailGroupIds(const CacheConfig& config) { + std::vector group_ids; + for (int gid = 0; gid < config.groupNums(); ++gid) { + if (isFixedTailGroup(config, gid)) { + group_ids.push_back(gid); + } + } + return group_ids; +} + +static int dsv4ActiveTailBlocks(const CacheConfig& config, int gid) { + return isHcaStateGroup(config, gid) ? 
1 : 2; +} + +static void expectDsv4SwaAllocatedBlocks(const CacheConfig& config, + const BlockIndicesType& blocks, + int gid, + const std::string& label, + bool enable_reuse_cache = false) { + const int active_tail_blocks = dsv4ActiveTailBlocks(config, gid); + const int tail_begin = std::max(static_cast(blocks.size()) - active_tail_blocks, 0); + const int linear_step = std::max(1, config.linear_step); + const bool effective_reuse = enable_reuse_cache && !isHcaStateGroup(config, gid); + for (int i = 0; i < static_cast(blocks.size()); ++i) { + const bool should_allocate = i >= tail_begin || (effective_reuse && ((i + 1) % linear_step == 0)); + if (should_allocate) { + EXPECT_FALSE(isNullBlockIdx(blocks[static_cast(i)])) + << label << " group " << gid << " pos " << i; + } else { + EXPECT_TRUE(isNullBlockIdx(blocks[static_cast(i)])) + << label << " group " << gid << " pos " << i; + } + } +} + +// Creates an intentionally tight DSV4 config for eviction stress tests: FULL +// groups use a large paged pool, while SWA groups use a small independent pool. +static CacheConfig makeDSV4ConfigWithConcurrencyPool(uint32_t full_block_num, uint32_t swa_batch_size) { + ParallelismConfig pc; + KVCacheConfig kv_cache_config; + kv_cache_config.seq_size_per_block = 128; + auto mc = makeDSV4ManagerFlashModelConfig(); + setDsv4ExplicitPoolBlocks(mc, "hca_state", 0); + auto config = CacheConfigCreator::createBasicConfig(mc, pc, kv_cache_config, false, 0); + config.block_num = full_block_num; + std::vector block_nums(static_cast(config.groupNums()), full_block_num); + for (int gid = 0; gid < config.groupNums(); ++gid) { + block_nums[static_cast(gid)] = isFullGroup(config, gid) ? 
full_block_num : (2u * swa_batch_size); + } + setGroupBlockNumsForTest(config, block_nums); + return config; +} + +static CacheConfig +makeProductionDSV4Config(uint32_t full_block_num, uint32_t max_concurrency, uint32_t hca_state_pool_blocks = 4) { + ParallelismConfig pc; + RuntimeConfig runtime_config; + KVCacheConfig kv_cache_config; + kv_cache_config.seq_size_per_block = 128; + kv_cache_config.test_block_num = full_block_num; + auto mc = makeDSV4ManagerFlashModelConfig(); + setDsv4ExplicitPoolBlocks(mc, "hca_state", hca_state_pool_blocks); + runtime_config.max_generate_batch_size = max_concurrency; + runtime_config.fifo_scheduler_config.max_context_batch_size = max_concurrency; + return CacheConfigCreator::createConfig(mc, pc, runtime_config, kv_cache_config); +} + +static BatchKVCacheResourcePtr makeDSV4BatchResource(const CacheConfig& config) { + auto res = std::make_shared(); + res->resetBatchSize(1); + res->initGroups(config.groupNums(), + static_cast(config.layer_all_num), + config.layerGroupIdsSnapshot(), + config.kernelBlocksPerKvBlock(), + config.groupTypesSnapshot()); + return res; +} + +static CompleteTokenIdsPtr makeDSV4CompleteTokenIds(int initial_seq_len, int max_seq_len, int seq_size_per_block) { + auto input_ids = torch::arange(max_seq_len, torch::kInt32); + auto gi = std::make_shared(); + gi->input_ids = input_ids; + gi->generate_config = std::make_shared(); + + auto complete_token_ids = std::make_shared(1, 1, max_seq_len + 16, seq_size_per_block); + complete_token_ids->init(gi); + complete_token_ids->setSeqLength(initial_seq_len); + return complete_token_ids; +} + +static void writeDsv4RegionPattern(const std::shared_ptr& manager, + int block_id, + int layer_id, + int group_id, + size_t bytes, + uint8_t pattern) { + auto addr_info = manager->convertIndexToAddr(block_id, layer_id, group_id); + ASSERT_NE(addr_info.kv_addr, nullptr); + + auto dst = + torch::from_blob(addr_info.kv_addr, {(int64_t)bytes}, 
torch::TensorOptions(torch::kUInt8).device(torch::kCUDA)); + auto src = torch::full({(int64_t)bytes}, pattern, torch::TensorOptions(torch::kUInt8).device(torch::kCPU)); + dst.copy_(src); + runtimeSyncAndCheck(); +} + +static void assertDsv4RegionPatternEq(const std::shared_ptr& manager, + int block_id, + int layer_id, + int group_id, + size_t bytes, + uint8_t expected) { + auto addr_info = manager->convertIndexToAddr(block_id, layer_id, group_id); + ASSERT_NE(addr_info.kv_addr, nullptr); + + auto dev_t = + torch::from_blob(addr_info.kv_addr, {(int64_t)bytes}, torch::TensorOptions(torch::kUInt8).device(torch::kCUDA)); + auto host_t = dev_t.cpu(); + const auto* ptr = host_t.data_ptr(); + for (size_t i = 0; i < bytes; ++i) { + ASSERT_EQ(ptr[i], expected) << "mismatch at byte " << i << " layer=" << layer_id << " block=" << block_id + << " group=" << group_id; + } +} + TEST_F(KVCacheManagerTest, WarmupConfigSmoke) { auto cache_config = makeSimpleMhaCacheConfig( /*layer_num=*/1, /*block_num=*/4, /*tokens_per_block=*/2, rtp_llm::DataType::TYPE_INT8); @@ -92,6 +313,39 @@ TEST_F(KVCacheManagerTest, WarmupConfigSmoke) { EXPECT_EQ(cache_manager->freeBlocksNum(), 0); } +TEST_F(KVCacheManagerTest, DSV4IndependentPoolsUseGpuBacking) { + auto expect_pool_backing = [](RoleType role_type) { + auto config = makeCompactDSV4ManagerConfig(/*block_num=*/8); + + PDSepConfig pd_sep_config; + pd_sep_config.role_type = role_type; + KVCacheConfig kv_cache_config; + auto cache_manager = std::make_shared(config, + /*warmup=*/false, + nullptr, + kv_cache_config, + ParallelismConfig{}, + RuntimeConfig{}, + SpeculativeExecutionConfig{}, + pd_sep_config); + ASSERT_TRUE(cache_manager->init()); + + auto allocator = std::dynamic_pointer_cast(cache_manager->allocator_); + ASSERT_NE(allocator, nullptr); + ASSERT_EQ(allocator->groupBlockPools().size(), static_cast(config.groupNums())); + + for (size_t gid = 0; gid < allocator->groupBlockPools().size(); ++gid) { + const auto& tag = 
config.tagForGroup(gid); + EXPECT_EQ(allocator->groupBlockPools()[gid]->where(), MemoryType::MEMORY_GPU) + << "role=" << static_cast(role_type) << " gid=" << gid << " tag=" << tag; + } + }; + + expect_pool_backing(RoleType::PREFILL); + expect_pool_backing(RoleType::DECODE); + expect_pool_backing(RoleType::PDFUSION); +} + TEST_F(KVCacheManagerTest, MetricsThreadSmoke) { auto cache_config = makeSimpleMhaCacheConfig( /*layer_num=*/1, /*block_num=*/4, /*tokens_per_block=*/2, rtp_llm::DataType::TYPE_INT8); @@ -115,7 +369,7 @@ TEST_F(KVCacheManagerTest, SetKVBlockValueAndBlockCopy) { auto cache_manager = std::make_shared(cache_config, /*warmup=*/false); ASSERT_TRUE(cache_manager->init()); - auto& spec = cache_manager->cacheConfig().cache_specs[0]; + auto& spec = cache_manager->cacheConfig().specForGroup(0); const size_t k_bytes = spec->k_block_size_bytes(); const size_t v_bytes = spec->v_block_size_bytes(); ASSERT_GT(k_bytes, 0u); @@ -129,7 +383,7 @@ TEST_F(KVCacheManagerTest, SetKVBlockValueAndBlockCopy) { auto k_t = torch::from_blob(k_vec.data(), {(int64_t)k_bytes}, torch::kInt8).clone(); auto v_t = torch::from_blob(v_vec.data(), {(int64_t)v_bytes}, torch::kInt8).clone(); - ASSERT_TRUE(cache_manager->setKVBlockValue(block_src, k_t, v_t)); + ASSERT_TRUE(cache_manager->writeKVBlockForTest(block_src, k_t, v_t)); std::vector expected_block(k_bytes + v_bytes, 0); std::fill(expected_block.begin(), expected_block.begin() + k_bytes, 7); @@ -149,7 +403,7 @@ TEST_F(KVCacheManagerTest, SetKVBlockValueAndBlockCopy) { std::vector v2_vec(v_bytes, 2); auto k2_t = torch::from_blob(k2_vec.data(), {(int64_t)k_bytes}, torch::kInt8).clone(); auto v2_t = torch::from_blob(v2_vec.data(), {(int64_t)v_bytes}, torch::kInt8).clone(); - ASSERT_TRUE(cache_manager->setKVBlockValue(block_dst, /*layer_id=*/0, k2_t, v2_t)); + ASSERT_TRUE(cache_manager->writeKVBlockForTest(block_dst, /*layer_id=*/0, k2_t, v2_t)); std::vector expected_layer0(k_bytes + v_bytes, 0); std::fill(expected_layer0.begin(), 
expected_layer0.begin() + k_bytes, 1); @@ -212,7 +466,7 @@ TEST_F(KVCacheManagerTest, BlockBatchCopy) { auto cache_manager = std::make_shared(cache_config, /*warmup=*/false); ASSERT_TRUE(cache_manager->init()); - auto& spec = cache_manager->cacheConfig().cache_specs[0]; + auto& spec = cache_manager->cacheConfig().specForGroup(0); const size_t k_bytes = spec->k_block_size_bytes(); const size_t v_bytes = spec->v_block_size_bytes(); @@ -226,7 +480,7 @@ TEST_F(KVCacheManagerTest, BlockBatchCopy) { std::vector v_vec(v_bytes, static_cast(block_id + 10)); auto k_t = torch::from_blob(k_vec.data(), {(int64_t)k_bytes}, torch::kInt8).clone(); auto v_t = torch::from_blob(v_vec.data(), {(int64_t)v_bytes}, torch::kInt8).clone(); - ASSERT_TRUE(cache_manager->setKVBlockValue(block_id, k_t, v_t)); + ASSERT_TRUE(cache_manager->writeKVBlockForTest(block_id, k_t, v_t)); } std::vector mapping; @@ -253,6 +507,362 @@ TEST_F(KVCacheManagerTest, BlockBatchCopy) { } } +TEST_F(KVCacheManagerTest, DSV4MallocIncrFreeExposesSevenTypedRegions) { + auto manager_config = makeCompactDSV4ManagerConfig(/*block_num=*/16); + auto manager = std::make_shared(manager_config, /*warmup=*/false); + ASSERT_TRUE(manager->init()); + + const size_t free_before = manager->freeBlocksNum(); + const int spb = static_cast(manager_config.seq_size_per_block); + auto resource = makeDSV4BatchResource(manager_config); + auto tokens = makeDSV4CompleteTokenIds(/*initial_seq_len=*/2 * spb + 17, + /*max_seq_len=*/4 * spb + 32, + spb); + + MallocInfo malloc_info{resource, tokens}; + malloc_info.reuse_cache = false; + malloc_info.enable_device_cache = false; + auto malloc_result = manager->malloc(malloc_info); + ASSERT_TRUE(malloc_result.success); + ASSERT_EQ(resource->groupNums(), kDsv4PoolNum); + + for (int gid = 0; gid < kDsv4PoolNum; ++gid) { + ASSERT_EQ(resource->blocksNum(0, gid), 3) << "group " << gid; + const auto& blocks = resource->blocks(0, gid); + if (isFullGroup(manager_config, gid)) { + 
EXPECT_FALSE(isNullBlockIdx(blocks[0])) << "paged group " << gid; + EXPECT_FALSE(isNullBlockIdx(blocks[1])) << "paged group " << gid; + EXPECT_FALSE(isNullBlockIdx(blocks[2])) << "paged group " << gid; + } else { + expectDsv4SwaAllocatedBlocks(manager_config, blocks, gid, "tail group"); + } + } + + tokens->setSeqLength(4 * spb); + MallocInfo incr_info{resource, tokens}; + incr_info.reuse_cache = false; + incr_info.enable_device_cache = false; + auto incr_result = manager->malloc(incr_info); + ASSERT_TRUE(incr_result.success); + + for (int gid = 0; gid < kDsv4PoolNum; ++gid) { + EXPECT_EQ(resource->blocksNum(0, gid), 4) << "group " << gid; + } + + auto layout = manager->getMainModelCacheLayerLayout(); + ASSERT_EQ(layout.group_tags.size(), static_cast(kDsv4PoolNum)); + EXPECT_EQ(layout.group_tags, kDsv4Tags); + ASSERT_EQ(layout.group_seq_size_per_block, manager_config.group_seq_size_per_block); + EXPECT_EQ(layout.layers_to_kv_buffer_ptrs_by_group.size(), static_cast(manager_config.layer_num)); + + const int swa_gid = manager_config.groupIdForTag("swa_kv"); + const int csa_gid = manager_config.groupIdForTag("csa_kv"); + const int indexer_gid = manager_config.groupIdForTag("indexer_kv"); + const int csa_state_gid = manager_config.groupIdForTag("csa_state"); + const int hca_gid = manager_config.groupIdForTag("hca_kv"); + const int hca_state_gid = manager_config.groupIdForTag("hca_state"); + const int csa_layer = manager_config.layerIdsForGroup(static_cast(csa_gid))[0]; + const int hca_layer = manager_config.layerIdsForGroup(static_cast(hca_gid))[0]; + EXPECT_NE(manager->convertIndexToAddr(resource->blocks(0, csa_gid)[0], csa_layer, csa_gid).kv_addr, nullptr); + EXPECT_NE(manager->convertIndexToAddr(resource->blocks(0, indexer_gid)[0], csa_layer, indexer_gid).kv_addr, + nullptr); + EXPECT_NE(manager->convertIndexToAddr(resource->blocks(0, csa_state_gid)[2], csa_layer, csa_state_gid).kv_addr, + nullptr); + EXPECT_NE(manager->convertIndexToAddr(resource->blocks(0, 
hca_state_gid).back(), hca_layer, hca_state_gid).kv_addr, + nullptr); + EXPECT_NE(manager->convertIndexToAddr(resource->blocks(0, hca_gid)[0], hca_layer, hca_gid).kv_addr, nullptr); + EXPECT_NE(manager->convertIndexToAddr(resource->blocks(0, swa_gid)[2], csa_layer, swa_gid).kv_addr, nullptr); + EXPECT_ANY_THROW((void)manager->convertIndexToAddr(resource->blocks(0, hca_gid)[0], csa_layer, hca_gid)); + + FreeInfo free_info{resource, tokens}; + manager->free(free_info); + EXPECT_EQ(manager->freeBlocksNum(), free_before); +} + +TEST_F(KVCacheManagerTest, DSV4LayerRegionBlockTablesMatchInferenceAccessPattern) { + auto manager_config = makeCompactDSV4ManagerConfig(/*block_num=*/16); + auto manager = std::make_shared(manager_config, /*warmup=*/false); + ASSERT_TRUE(manager->init()); + + const int spb = static_cast(manager_config.seq_size_per_block); + auto resource = makeDSV4BatchResource(manager_config); + auto tokens = makeDSV4CompleteTokenIds(/*initial_seq_len=*/3 * spb + 17, + /*max_seq_len=*/4 * spb + 32, + spb); + + MallocInfo malloc_info{resource, tokens}; + malloc_info.reuse_cache = false; + malloc_info.enable_device_cache = false; + ASSERT_TRUE(manager->malloc(malloc_info).success); + + auto expectTagGroup = [&](int layer_id, const std::string& tag, int expected_gid) { + EXPECT_EQ(manager_config.groupIdForLayerTag(layer_id, tag), expected_gid) + << "layer=" << layer_id << " tag=" << tag; + EXPECT_EQ(resource->groupId(/*batch_id=*/0, layer_id, expected_gid), expected_gid) + << "layer=" << layer_id << " tag=" << tag; + EXPECT_EQ(resource->blocks(/*batch_id=*/0, layer_id, expected_gid), resource->blocks(0, expected_gid)) + << "layer=" << layer_id << " tag=" << tag; + EXPECT_EQ(resource->kernelBlocks(/*batch_id=*/0, layer_id, expected_gid), + resource->kernelBlocks(0, expected_gid)) + << "layer=" << layer_id << " tag=" << tag; + }; + + // Flash DSV4 layers 0/1 are SWA-only. Inference resolves typed block tables by semantic tag. 
+ expectTagGroup(/*layer_id=*/0, "swa_kv", manager_config.groupIdForTag("swa_kv")); + EXPECT_THROW((void)manager_config.groupIdForLayerTag(/*layer_id=*/0, "csa_kv"), std::exception); + EXPECT_THROW((void)manager_config.groupIdForLayerTag(/*layer_id=*/0, "hca_kv"), std::exception); + + // Layer 2 is CSA: CSA_KV + INDEXER_KV + INDEXER_STATE + CSA_STATE + SWA_KV. + const int csa_layer = manager_config.layerIdsForGroup(static_cast(manager_config.groupIdForTag("csa_kv")))[0]; + expectTagGroup(csa_layer, "csa_kv", manager_config.groupIdForTag("csa_kv")); + expectTagGroup(csa_layer, "indexer_kv", manager_config.groupIdForTag("indexer_kv")); + expectTagGroup(csa_layer, "indexer_state", manager_config.groupIdForTag("indexer_state")); + expectTagGroup(csa_layer, "csa_state", manager_config.groupIdForTag("csa_state")); + expectTagGroup(csa_layer, "swa_kv", manager_config.groupIdForTag("swa_kv")); + EXPECT_THROW((void)manager_config.groupIdForLayerTag(csa_layer, "hca_kv"), std::exception); + + // Layer 3 is HCA: HCA_KV + HCA_STATE + SWA_KV. 
+ const int hca_layer = manager_config.layerIdsForGroup(static_cast(manager_config.groupIdForTag("hca_kv")))[0]; + expectTagGroup(hca_layer, "hca_kv", manager_config.groupIdForTag("hca_kv")); + expectTagGroup(hca_layer, "hca_state", manager_config.groupIdForTag("hca_state")); + expectTagGroup(hca_layer, "swa_kv", manager_config.groupIdForTag("swa_kv")); + EXPECT_THROW((void)manager_config.groupIdForLayerTag(hca_layer, "csa_kv"), std::exception); + + FreeInfo free_info{resource, tokens}; + manager->free(free_info); +} + +TEST_F(KVCacheManagerTest, DSV4BlockCopyPreservesTypedRegionBytes) { + auto manager_config = makeCompactDSV4ManagerConfig(/*block_num=*/8); + auto manager = std::make_shared(manager_config, /*warmup=*/false); + ASSERT_TRUE(manager->init()); + + const int spb = static_cast(manager_config.seq_size_per_block); + const int seq_len = 3 * spb + 1; + auto resource = makeDSV4BatchResource(manager_config); + auto tokens = makeDSV4CompleteTokenIds(seq_len, seq_len, spb); + + MallocInfo malloc_info{resource, tokens}; + malloc_info.reuse_cache = false; + malloc_info.enable_device_cache = false; + ASSERT_TRUE(manager->malloc(malloc_info).success); + + const int src_block = 1; + const int dst_block = 2; + const int swa_gid = manager_config.groupIdForTag("swa_kv"); + const int csa_gid = manager_config.groupIdForTag("csa_kv"); + const int indexer_gid = manager_config.groupIdForTag("indexer_kv"); + const int indexer_state_gid = manager_config.groupIdForTag("indexer_state"); + const int csa_state_gid = manager_config.groupIdForTag("csa_state"); + const int hca_gid = manager_config.groupIdForTag("hca_kv"); + const int hca_state_gid = manager_config.groupIdForTag("hca_state"); + const int csa_layer = manager_config.layerIdsForGroup(static_cast(csa_gid))[0]; + const int hca_layer = manager_config.layerIdsForGroup(static_cast(hca_gid))[0]; + const int swa_only_layer = 0; + + for (int gid = 0; gid < kDsv4PoolNum; ++gid) { + const auto& blocks = resource->blocks(0, gid); + 
EXPECT_NE(std::find(blocks.begin(), blocks.end(), src_block), blocks.end()) << "group " << gid; + if (!isHcaStateGroup(manager_config, gid)) { + EXPECT_NE(std::find(blocks.begin(), blocks.end(), dst_block), blocks.end()) << "group " << gid; + } + } + + struct RegionCase { + int gid; + int layer_id; + uint8_t pattern; + }; + + const std::vector cases = { + {swa_gid, csa_layer, 0x11}, + {csa_gid, csa_layer, 0x22}, + {indexer_gid, csa_layer, 0x33}, + {indexer_state_gid, csa_layer, 0x44}, + {csa_state_gid, csa_layer, 0x55}, + {hca_gid, hca_layer, 0x66}, + {hca_state_gid, hca_layer, 0x77}, + {swa_gid, swa_only_layer, 0x88}, + }; + + for (const auto& region_case : cases) { + const size_t bytes = manager_config.specForGroup(static_cast(region_case.gid))->block_size_bytes(); + ASSERT_GT(bytes, 0u); + writeDsv4RegionPattern(manager, src_block, region_case.layer_id, region_case.gid, bytes, region_case.pattern); + writeDsv4RegionPattern(manager, dst_block, region_case.layer_id, region_case.gid, bytes, 0); + assertDsv4RegionPatternEq( + manager, src_block, region_case.layer_id, region_case.gid, bytes, region_case.pattern); + assertDsv4RegionPatternEq(manager, dst_block, region_case.layer_id, region_case.gid, bytes, 0); + } + + manager->blockCopy(src_block, dst_block); + runtimeSyncAndCheck(); + + for (const auto& region_case : cases) { + const size_t bytes = manager_config.specForGroup(static_cast(region_case.gid))->block_size_bytes(); + assertDsv4RegionPatternEq(manager, dst_block, region_case.layer_id, region_case.gid, bytes, region_case.pattern); + } + + FreeInfo free_info{resource, tokens}; + manager->free(free_info); +} + +TEST_F(KVCacheManagerTest, DSV4InsertIntoDeviceBlockCacheThenReuseSamePrefix) { + auto manager_config = makeCompactDSV4ManagerConfig(/*block_num=*/16); + auto manager = std::make_shared(manager_config, /*warmup=*/false); + ASSERT_TRUE(manager->init()); + + const int spb = static_cast(manager_config.seq_size_per_block); + const int seq_len = 3 * spb + 
17; + + auto first_resource = makeDSV4BatchResource(manager_config); + auto first_tokens = makeDSV4CompleteTokenIds(seq_len, seq_len, spb); + + MallocInfo first_malloc{first_resource, first_tokens}; + first_malloc.reuse_cache = true; + first_malloc.enable_device_cache = false; + ASSERT_TRUE(manager->malloc(first_malloc).success); + + std::vector first_blocks; + first_blocks.reserve(kDsv4PoolNum); + for (int gid = 0; gid < kDsv4PoolNum; ++gid) { + first_blocks.push_back(first_resource->blocks(0, gid)); + } + + InsertInfo insert_info{first_resource, first_tokens, /*is_resident=*/false}; + manager->insertIntoCache(insert_info); + + FreeInfo first_free{first_resource, first_tokens}; + manager->free(first_free); + + auto second_resource = makeDSV4BatchResource(manager_config); + auto second_tokens = makeDSV4CompleteTokenIds(seq_len, seq_len, spb); + + MallocInfo second_malloc{second_resource, second_tokens}; + second_malloc.reuse_cache = true; + second_malloc.enable_device_cache = true; + auto reuse_result = manager->malloc(second_malloc); + ASSERT_TRUE(reuse_result.success); + EXPECT_GE(reuse_result.reuse_len, spb); + + for (int gid : dsv4GroupIdsByType(manager_config, CacheGroupType::FULL)) { + ASSERT_GE(second_resource->blocksNum(0, gid), 3) << "paged group " << gid; + EXPECT_EQ(second_resource->blocks(0, gid)[0], first_blocks[gid][0]); + EXPECT_EQ(second_resource->blocks(0, gid)[1], first_blocks[gid][1]); + } + for (int gid : dsv4FixedTailGroupIds(manager_config)) { + if (manager_config.policyForGroup(static_cast(gid)).reuse_policy == CacheReusePolicy::NON_REUSABLE) { + continue; + } + ASSERT_GE(second_resource->blocksNum(0, gid), 3) << "tail group " << gid; + EXPECT_EQ(second_resource->blocks(0, gid)[2], first_blocks[gid][2]); + } + + FreeInfo second_free{second_resource, second_tokens}; + manager->free(second_free); +} + +TEST_F(KVCacheManagerTest, DSV4InitReuseKeepsSWAPrefixTailBlock) { + auto manager_config = makeCompactDSV4ManagerConfig(/*block_num=*/64); + 
auto manager = std::make_shared(manager_config, /*warmup=*/false); + ASSERT_TRUE(manager->init()); + + const int spb = static_cast(manager_config.seq_size_per_block); + + auto first_resource = makeDSV4BatchResource(manager_config); + auto first_tokens = makeDSV4CompleteTokenIds(/*initial_seq_len=*/4 * spb, /*max_seq_len=*/4 * spb + 1, spb); + + MallocInfo first_malloc{first_resource, first_tokens}; + first_malloc.reuse_cache = false; + first_malloc.enable_device_cache = false; + ASSERT_TRUE(manager->malloc(first_malloc).success); + + std::vector first_swa_tail_blocks(static_cast(kDsv4PoolNum), NULL_BLOCK_IDX); + for (int gid : dsv4FixedTailGroupIds(manager_config)) { + ASSERT_EQ(first_resource->blocksNum(0, gid), 4) << "first SWA group " << gid; + expectDsv4SwaAllocatedBlocks(manager_config, first_resource->blocks(0, gid), gid, "first SWA"); + first_swa_tail_blocks[static_cast(gid)] = first_resource->blocks(0, gid)[3]; + } + + // Simulate one generated token before inserting into the device cache, so + // the fourth full block is cached and can be reused by the next prefill. 
+ first_tokens->setSeqLength(4 * spb + 1); + manager->insertIntoCache(InsertInfo{first_resource, first_tokens, /*is_resident=*/false}); + manager->free(FreeInfo{first_resource, first_tokens}); + + auto second_resource = makeDSV4BatchResource(manager_config); + auto second_tokens = makeDSV4CompleteTokenIds(/*initial_seq_len=*/24 * spb, /*max_seq_len=*/24 * spb, spb); + + MallocInfo second_malloc{second_resource, second_tokens}; + second_malloc.reuse_cache = true; + second_malloc.enable_device_cache = true; + second_malloc.enable_remove_skipped_blocks = false; + auto reuse_result = manager->malloc(second_malloc); + ASSERT_TRUE(reuse_result.success); + EXPECT_EQ(reuse_result.reuse_len, 4 * spb); + + for (int gid : dsv4FixedTailGroupIds(manager_config)) { + if (manager_config.policyForGroup(static_cast(gid)).reuse_policy == CacheReusePolicy::NON_REUSABLE) { + continue; + } + const auto& blocks = second_resource->blocks(0, gid); + ASSERT_EQ(blocks.size(), 24u) << "second SWA group " << gid; + EXPECT_TRUE(isNullBlockIdx(blocks[2])) << "SWA reuse prefix penultimate block is NULL (no prev lookup)"; + EXPECT_EQ(blocks[3], first_swa_tail_blocks[static_cast(gid)]) + << "SWA reuse prefix tail block must stay readable"; + EXPECT_FALSE(isNullBlockIdx(blocks[22])) << "second SWA group " << gid << " fresh tail block 22"; + EXPECT_FALSE(isNullBlockIdx(blocks[23])) << "second SWA group " << gid << " fresh tail block 23"; + } + + manager->free(FreeInfo{second_resource, second_tokens}); +} + +TEST_F(KVCacheManagerTest, DSV4PopCachedBlocksPreservesGroupShape) { + auto manager_config = makeCompactDSV4ManagerConfig(/*block_num=*/16); + auto manager = std::make_shared(manager_config, /*warmup=*/false); + ASSERT_TRUE(manager->init()); + + const int spb = static_cast(manager_config.seq_size_per_block); + const int seq_len = 3 * spb + 1; + auto resource = makeDSV4BatchResource(manager_config); + auto tokens = makeDSV4CompleteTokenIds(seq_len, seq_len, spb); + + MallocInfo 
malloc_info{resource, tokens}; + malloc_info.reuse_cache = true; + malloc_info.enable_device_cache = false; + ASSERT_TRUE(manager->malloc(malloc_info).success); + + InsertInfo insert_info{resource, tokens, /*is_resident=*/false}; + manager->insertIntoCache(insert_info); + FreeInfo free_info{resource, tokens}; + manager->free(free_info); + + auto evicted = manager->popBlocksFromCache(/*min_blocks_to_free=*/10); + ASSERT_NE(evicted, nullptr); + ASSERT_TRUE(evicted->hasCacheKeys()); + EXPECT_EQ(evicted->groupNums(), kDsv4PoolNum); + EXPECT_EQ(evicted->cacheResource(0).layerGroupBlocks().size(), static_cast(manager_config.layer_num)); + + bool saw_paged_block = false; + bool saw_tail_block = false; + for (int gid = 0; gid < kDsv4PoolNum; ++gid) { + ASSERT_EQ(evicted->blocksNum(0, gid), static_cast(evicted->cacheKeys(0).size())) << "group " << gid; + for (auto block : evicted->blocks(0, gid)) { + if (!isNullBlockIdx(block)) { + if (isFullGroup(manager_config, gid)) { + saw_paged_block = true; + } else { + saw_tail_block = true; + } + } + } + } + EXPECT_TRUE(saw_paged_block); + EXPECT_TRUE(saw_tail_block); + + manager->blockCacheFree(evicted); +} + TEST_F(KVCacheManagerTest, Init_ReturnTrue_WhenMemoryCacheDisabled) { auto cache_config = makeSimpleMhaCacheConfig(1, 4, 2, rtp_llm::DataType::TYPE_INT8); KVCacheConfig kv_cache_config; @@ -405,28 +1015,23 @@ TEST_F(KVCacheManagerTest, GetKVCacheInfo_MergesDeviceAndMemoryKeys_Dedup) { ASSERT_NE(kv_cache_manager->coordinator_, nullptr); // Seed device block cache with keys: 10, 11, 12 (put makes MRU at front => snapshot order: 12,11,10) - auto block_cache = kv_cache_manager->allocator_->getBlockPool()->blockCache(); - ASSERT_NE(block_cache, nullptr); + auto shared_cache = kv_cache_manager->allocator_->sharedBlockCache(); + ASSERT_NE(shared_cache, nullptr); { - BlockCache::CacheItem item; - item.group_id = 0; - item.is_resident = false; - item.cache_key = 10; - item.block_index = 1; - ASSERT_TRUE(block_cache->put(item)); - 
item.cache_key = 11; - item.block_index = 2; - ASSERT_TRUE(block_cache->put(item)); - item.cache_key = 12; - item.block_index = 3; - ASSERT_TRUE(block_cache->put(item)); + std::vector group_slots(1); + group_slots[0] = 1; + shared_cache->put(10, group_slots, false); + group_slots[0] = 2; + shared_cache->put(11, group_slots, false); + group_slots[0] = 3; + shared_cache->put(12, group_slots, false); } // Inject a lightweight memory connector with a MemoryBlockCache snapshot: // put 11 then 13 => MRU order: 13,11 (11 duplicates device key) auto mem_connector = std::make_shared( cache_config, kv_cache_config, kv_cache_manager->allocator_, std::vector{}); - mem_connector->block_cache_ = std::make_shared(); + mem_connector->block_cache_ = std::make_shared(); { MemoryBlockCache::CacheItem item; item.cache_key = 11; @@ -440,7 +1045,7 @@ TEST_F(KVCacheManagerTest, GetKVCacheInfo_MergesDeviceAndMemoryKeys_Dedup) { } kv_cache_manager->coordinator_->memory_connector_ = mem_connector; - // latest_version=-1 forces BlockCache snapshot to return all current keys. + // latest_version=-1 forces SharedBlockCache snapshot to return all current keys. auto info = kv_cache_manager->getKVCacheInfo(/*latest_version=*/-1, /*need_cache_keys=*/true); // Current implementation uses unordered_set -> assign, so order is not stable. 
@@ -452,6 +1057,68 @@ TEST_F(KVCacheManagerTest, GetKVCacheInfo_MergesDeviceAndMemoryKeys_Dedup) { EXPECT_EQ(got, expected); } +TEST_F(KVCacheManagerTest, GetKVCacheInfo_UsesSmallestHybridPoolTokenCapacity) { + auto cache_config = makeDSV4ConfigWithConcurrencyPool(/*full_block_num=*/16, /*swa_batch_size=*/3); + + auto kv_cache_manager = std::make_shared(cache_config); + ASSERT_TRUE(kv_cache_manager->init()); + + auto hybrid_allocator = std::dynamic_pointer_cast(kv_cache_manager->allocator_); + ASSERT_NE(hybrid_allocator, nullptr); + + size_t expected_total_tokens = std::numeric_limits::max(); + size_t expected_available_tokens = std::numeric_limits::max(); + const auto& pools = hybrid_allocator->groupBlockPools(); + ASSERT_GT(pools.size(), 1u); + + for (size_t gid = 0; gid < pools.size(); ++gid) { + ASSERT_NE(pools[gid], nullptr); + const size_t seq_size = + (gid < cache_config.group_seq_size_per_block.size() && cache_config.group_seq_size_per_block[gid] > 0) ? + cache_config.group_seq_size_per_block[gid] : + cache_config.seq_size_per_block; + expected_total_tokens = std::min(expected_total_tokens, pools[gid]->totalBlocksNum() * seq_size); + expected_available_tokens = std::min(expected_available_tokens, pools[gid]->availableBlocksNum() * seq_size); + } + + auto info = kv_cache_manager->getKVCacheInfo(/*latest_version=*/-1, /*need_cache_keys=*/false); + + EXPECT_EQ(info.total_kv_cache, expected_total_tokens); + EXPECT_EQ(info.available_kv_cache, expected_available_tokens); + EXPECT_LT(info.total_kv_cache, kv_cache_manager->totalBlocksNum() * cache_config.seq_size_per_block); +} + +TEST_F(KVCacheManagerTest, MaxAvailableTokensNumUsesCPVirtualBlockSizeForHybridPoolFullGroups) { + auto cache_config = makeDSV4ConfigWithConcurrencyPool(/*full_block_num=*/16, /*swa_batch_size=*/3); + + auto kv_cache_manager = std::make_shared(cache_config); + ASSERT_TRUE(kv_cache_manager->init()); + + auto hybrid_allocator = std::dynamic_pointer_cast(kv_cache_manager->allocator_); + 
ASSERT_NE(hybrid_allocator, nullptr); + + const size_t physical_capacity = hybrid_allocator->maxAvailableTokensNum(); + auto cp_slot_mapper = + std::make_shared(/*cp_rank=*/0, /*cp_size=*/2, static_cast(cache_config.seq_size_per_block)); + kv_cache_manager->cp_slot_mapper_ = cp_slot_mapper; + hybrid_allocator->setCPSlotMapper(cp_slot_mapper); + + size_t expected_logical_capacity = std::numeric_limits::max(); + const auto& pools = hybrid_allocator->groupBlockPools(); + for (size_t gid = 0; gid < pools.size(); ++gid) { + if (gid < static_cast(cache_config.groupNums()) + && cache_config.typeForGroup(gid) != CacheGroupType::FULL) { + continue; + } + expected_logical_capacity = + std::min(expected_logical_capacity, + pools[gid]->totalBlocksNum() * static_cast(cache_config.seq_size_per_block * 2)); + } + + EXPECT_EQ(kv_cache_manager->maxAvailableTokensNum(), expected_logical_capacity); + EXPECT_GT(kv_cache_manager->maxAvailableTokensNum(), physical_capacity); +} + TEST_F(KVCacheManagerTest, GetKVCacheInfo_IncludesMemoryBlocksInTotalAndAvailable) { auto cache_config = makeSimpleMhaCacheConfig(1, 8, 2, rtp_llm::DataType::TYPE_INT8); KVCacheConfig kv_cache_config; @@ -481,5 +1148,508 @@ TEST_F(KVCacheManagerTest, GetKVCacheInfo_IncludesMemoryBlocksInTotalAndAvailabl EXPECT_GE(info.available_kv_cache, device_only_available); } +TEST_F(KVCacheManagerTest, DSV4EvictionTriggeredWhenPoolExhaustedByCache) { + // This test verifies that when block pools are exhausted by cached (but freed) requests, + // a new allocation correctly triggers LRU eviction from each group's independent BlockCache. + // + // Setup: block_num=8 → 7 usable blocks per group (block 0 reserved). + // Request seq_len = 3*spb. FULL groups allocate 3 blocks. Reusable SWA groups allocate + // linear-step blocks (step=1 here, so all 3), while HCA_STATE keeps only its active tail block. 
+ // insertIntoCache drops the active tail slot, so each completed request caches: + // FULL groups: 2 blocks per group + // SWA/state groups: fixed-window cached blocks; HCA_STATE skips reuse. + // + // After 3 requests are cached and request-freed: + // FULL groups (0,1,2): 6 blocks cached, 1 free → new request needs 3, triggers eviction + // SWA/state groups (3,4,5,6): reusable groups may also evict under their independent pools. + // + // The fourth allocation MUST succeed via eviction on FULL groups. + auto manager_config = makeCompactDSV4ManagerConfig(/*block_num=*/8); + auto manager = std::make_shared(manager_config, /*warmup=*/false); + ASSERT_TRUE(manager->init()); + + const int spb = static_cast(manager_config.seq_size_per_block); + const int seq_len = 3 * spb; + const size_t free_before = manager->freeBlocksNum(); + // 7 groups × 7 usable blocks = 49 total free. + EXPECT_EQ(free_before, 7u * 7u); + + // Helper: create CompleteTokenIds with a token-value offset so each request gets unique cache keys. 
+ auto makeTokens = [&](int offset) { + auto input_ids = torch::arange(offset, offset + seq_len, torch::kInt32); + auto gi = std::make_shared(); + gi->input_ids = input_ids; + gi->generate_config = std::make_shared(); + auto cti = std::make_shared(1, 1, seq_len + 16, spb); + cti->init(gi); + cti->setSeqLength(seq_len); + return cti; + }; + + // --- Request A: allocate, cache, free request reference --- + auto res_a = makeDSV4BatchResource(manager_config); + auto tokens_a = makeTokens(/*offset=*/0); + MallocInfo malloc_a{res_a, tokens_a}; + malloc_a.reuse_cache = true; + malloc_a.enable_device_cache = false; + ASSERT_TRUE(manager->malloc(malloc_a).success); + + InsertInfo insert_a{res_a, tokens_a, /*is_resident=*/false}; + manager->insertIntoCache(insert_a); + FreeInfo free_a{res_a, tokens_a}; + manager->free(free_a); + + const size_t free_after_a = manager->freeBlocksNum(); + EXPECT_LT(free_after_a, free_before); + + // --- Request B: different tokens → different cache keys --- + auto res_b = makeDSV4BatchResource(manager_config); + auto tokens_b = makeTokens(/*offset=*/10000); + MallocInfo malloc_b{res_b, tokens_b}; + malloc_b.reuse_cache = true; + malloc_b.enable_device_cache = false; + ASSERT_TRUE(manager->malloc(malloc_b).success); + + InsertInfo insert_b{res_b, tokens_b, /*is_resident=*/false}; + manager->insertIntoCache(insert_b); + FreeInfo free_b{res_b, tokens_b}; + manager->free(free_b); + + const size_t free_after_b = manager->freeBlocksNum(); + EXPECT_LT(free_after_b, free_after_a); + + // --- Request C: still fits, but leaves FULL groups with only one free block --- + auto res_c = makeDSV4BatchResource(manager_config); + auto tokens_c = makeTokens(/*offset=*/20000); + MallocInfo malloc_c{res_c, tokens_c}; + malloc_c.reuse_cache = true; + malloc_c.enable_device_cache = false; + ASSERT_TRUE(manager->malloc(malloc_c).success); + + InsertInfo insert_c{res_c, tokens_c, /*is_resident=*/false}; + manager->insertIntoCache(insert_c); + FreeInfo free_c{res_c, 
tokens_c}; + manager->free(free_c); + + const size_t free_after_c = manager->freeBlocksNum(); + EXPECT_LE(free_after_c, free_after_b); + + // --- Request D: triggers eviction on FULL groups --- + auto res_d = makeDSV4BatchResource(manager_config); + auto tokens_d = makeTokens(/*offset=*/30000); + MallocInfo malloc_d{res_d, tokens_d}; + malloc_d.reuse_cache = true; + malloc_d.enable_device_cache = false; + + // This allocation MUST succeed — FULL groups trigger ensureFreeBlocks → evict from cache. + auto result_d = manager->malloc(malloc_d); + ASSERT_TRUE(result_d.success) << "Fourth allocation should succeed via eviction"; + + // Verify block structure for request D. + ASSERT_EQ(res_d->groupNums(), kDsv4PoolNum); + for (int gid = 0; gid < kDsv4PoolNum; ++gid) { + ASSERT_EQ(res_d->blocksNum(0, gid), 3) << "group " << gid; + const auto& blocks = res_d->blocks(0, gid); + if (isFullGroup(manager_config, gid)) { + for (int i = 0; i < 3; ++i) { + EXPECT_FALSE(isNullBlockIdx(blocks[i])) << "FULL group " << gid << " pos " << i; + } + } else { + expectDsv4SwaAllocatedBlocks(manager_config, blocks, gid, "fixed group", /*enable_reuse_cache=*/true); + } + } + + EXPECT_LE(manager->freeBlocksNum(), free_after_c) << "Pool should be tighter after D allocated"; + + // --- Free D and verify blocks return to pool --- + FreeInfo free_d{res_d, tokens_d}; + manager->free(free_d); + + // After freeing D, its blocks (request_ref→0, cache_ref=0 since we did not insert D into cache) + // return to the free pool. + // But cached blocks from eviction of A are fully freed (both refs=0) so they also count. + // Expect freeBlocksNum >= free_after_c (at least as good as before D was allocated). 
+ EXPECT_GE(manager->freeBlocksNum(), free_after_c); + + // --- Pop all remaining cached blocks and verify full pool recovery --- + auto evicted = manager->popBlocksFromCache(/*min_blocks_to_free=*/100); + if (evicted) { + manager->blockCacheFree(evicted); + } + EXPECT_EQ(manager->freeBlocksNum(), free_before); +} + +TEST_F(KVCacheManagerTest, DSV4MaxConcurrencyOneReuseOneBlockAndAllocTwoTailBlocks) { + auto manager_config = + makeProductionDSV4Config(/*full_block_num=*/8, /*max_concurrency=*/1, /*hca_state_pool_blocks=*/12); + ASSERT_EQ(manager_config.groupBlockNumsSnapshot().size(), static_cast(kDsv4PoolNum)); + for (int gid : dsv4FixedTailGroupIds(manager_config)) { + const uint32_t expected = isHcaStateGroup(manager_config, gid) ? 12u : 8u; + ASSERT_EQ(manager_config.blockNumForGroup(static_cast(gid)), expected) << "group " << gid; + } + + auto manager = std::make_shared(manager_config, /*warmup=*/false); + ASSERT_TRUE(manager->init()); + + const size_t free_before = manager->freeBlocksNum(); + EXPECT_EQ(free_before, 6u * 7u + 11u); + const int spb = static_cast(manager_config.seq_size_per_block); + + auto makeTokens = [&](int seq_len) { + auto input_ids = torch::arange(0, seq_len, torch::kInt32); + auto gi = std::make_shared(); + gi->input_ids = input_ids; + gi->generate_config = std::make_shared(); + auto cti = std::make_shared(1, 1, /*max_seq_len=*/4 * spb, spb); + cti->init(gi); + cti->setSeqLength(seq_len); + return cti; + }; + + // Seed one reusable SWA/state block per independent pool. For a 2-block request, + // insertIntoCache keeps only the first full block; the active tail is not cached. 
+ auto seed_res = makeDSV4BatchResource(manager_config); + auto seed_tokens = makeTokens(2 * spb); + MallocInfo seed_malloc{seed_res, seed_tokens}; + seed_malloc.reuse_cache = false; + seed_malloc.enable_device_cache = false; + ASSERT_TRUE(manager->malloc(seed_malloc).success); + + for (int gid : dsv4FixedTailGroupIds(manager_config)) { + ASSERT_EQ(seed_res->blocksNum(0, gid), 2) << "seed group " << gid; + expectDsv4SwaAllocatedBlocks(manager_config, seed_res->blocks(0, gid), gid, "seed group"); + } + + manager->insertIntoCache(InsertInfo{seed_res, seed_tokens, /*is_resident=*/false}); + manager->free(FreeInfo{seed_res, seed_tokens}); + + // Same prefix, one more block. This hits one cached independent-pool block and + // must still have room for the two fresh tail blocks. The matched block is + // then skipped out of the active SWA tail by the decode allocation path. + auto reuse_res = makeDSV4BatchResource(manager_config); + auto reuse_tokens = makeTokens(3 * spb); + MallocInfo reuse_malloc{reuse_res, reuse_tokens}; + reuse_malloc.reuse_cache = true; + reuse_malloc.enable_device_cache = true; + auto reuse_result = manager->malloc(reuse_malloc); + ASSERT_TRUE(reuse_result.success); + EXPECT_EQ(reuse_result.reuse_len, 2 * spb); + + for (int gid : dsv4FixedTailGroupIds(manager_config)) { + if (manager_config.policyForGroup(static_cast(gid)).reuse_policy == CacheReusePolicy::NON_REUSABLE) { + continue; + } + const auto& blocks = reuse_res->blocks(0, gid); + ASSERT_EQ(blocks.size(), 3u) << "reuse group " << gid; + EXPECT_TRUE(isNullBlockIdx(blocks[0])) << "reuse group " << gid << " skipped reused prefix"; + EXPECT_FALSE(isNullBlockIdx(blocks[1])) << "reuse group " << gid << " tail block 1"; + EXPECT_FALSE(isNullBlockIdx(blocks[2])) << "reuse group " << gid << " tail block 2"; + } + + manager->free(FreeInfo{reuse_res, reuse_tokens}); + auto evicted = manager->popBlocksFromCache(/*min_blocks_to_free=*/100); + if (evicted) { + manager->blockCacheFree(evicted); + } + 
EXPECT_EQ(manager->freeBlocksNum(), free_before); +} + +TEST_F(KVCacheManagerTest, DSV4EvictionOnSWAGroupsDuringInferenceWithDecodeContinuation) { + // This test simulates full DSV4 inference including SWA group eviction. + // + // Tight stress layout: + // FULL groups (0,1,2): large paged pool (block_num=8, 7 usable) + // SWA groups (3,4,5,6): small independent pool with 3 usable blocks + // + // SWA pools are sized by concurrency, NOT by global block_num. This test verifies that + // eviction is triggered independently on SWA groups when concurrent requests exhaust + // the independent pool, and that decode-phase removeSkippedBlocks interacts correctly with eviction. + // + // Lifecycle: + // Phase 1: 2 requests complete and get cached → SWA pools nearly full (2 of 3 cached) + // Phase 2: 3rd request triggers eviction on SWA groups + // Phase 3: Decode-phase incrKVBlock triggers further FULL/SWA eviction + removeSkippedBlocks + // Phase 4: Free and verify pool recovery + auto manager_config = makeDSV4ConfigWithConcurrencyPool(/*full_block_num=*/8, /*swa_batch_size=*/4); + auto manager = std::make_shared(manager_config, /*warmup=*/false); + ASSERT_TRUE(manager->init()); + + const int spb = static_cast(manager_config.seq_size_per_block); + const int seq_len = 3 * spb; + + // Verify differentiated pool sizes. + const size_t free_before = manager->freeBlocksNum(); + EXPECT_EQ(free_before, 3u * 7u + 4u * 7u); + + // Helper: create tokens with unique offset for distinct cache keys. 
+ auto makeTokens = [&](int offset) { + auto input_ids = torch::arange(offset, offset + seq_len, torch::kInt32); + auto gi = std::make_shared(); + gi->input_ids = input_ids; + gi->generate_config = std::make_shared(); + auto cti = std::make_shared(1, 1, /*max_seq_len=*/10 * spb, spb); + cti->init(gi); + cti->setSeqLength(seq_len); + return cti; + }; + + // === Phase 1: Fill caches with 2 completed requests === + auto res_a = makeDSV4BatchResource(manager_config); + auto tokens_a = makeTokens(/*offset=*/0); + MallocInfo malloc_a{res_a, tokens_a}; + malloc_a.reuse_cache = true; + malloc_a.enable_device_cache = false; + ASSERT_TRUE(manager->malloc(malloc_a).success); + InsertInfo insert_a{res_a, tokens_a, /*is_resident=*/false}; + manager->insertIntoCache(insert_a); + manager->free(FreeInfo{res_a, tokens_a}); + + auto res_b = makeDSV4BatchResource(manager_config); + auto tokens_b = makeTokens(/*offset=*/10000); + MallocInfo malloc_b{res_b, tokens_b}; + malloc_b.reuse_cache = true; + malloc_b.enable_device_cache = false; + ASSERT_TRUE(manager->malloc(malloc_b).success); + InsertInfo insert_b{res_b, tokens_b, /*is_resident=*/false}; + manager->insertIntoCache(insert_b); + manager->free(FreeInfo{res_b, tokens_b}); + + const size_t free_after_cache = manager->freeBlocksNum(); + EXPECT_LT(free_after_cache, free_before); + + // === Phase 2: 3rd request triggers eviction on SWA groups === + auto res_c = makeDSV4BatchResource(manager_config); + auto tokens_c = makeTokens(/*offset=*/20000); + MallocInfo malloc_c{res_c, tokens_c}; + malloc_c.reuse_cache = true; + malloc_c.enable_device_cache = false; + + // FULL needs 3, has exactly 3 free → no FULL eviction yet. + // SWA needs 2, only 1 free → ensureFreeBlocks evicts 1 from SWA cache. + auto result_c = manager->malloc(malloc_c); + ASSERT_TRUE(result_c.success) << "3rd allocation must succeed via SWA eviction"; + + // Verify block structure. 
+ ASSERT_EQ(res_c->groupNums(), kDsv4PoolNum); + for (int gid = 0; gid < kDsv4PoolNum; ++gid) { + ASSERT_EQ(res_c->blocksNum(0, gid), 3) << "group " << gid; + const auto& blocks = res_c->blocks(0, gid); + if (isFullGroup(manager_config, gid)) { + for (int i = 0; i < 3; ++i) { + EXPECT_FALSE(isNullBlockIdx(blocks[i])) << "FULL group " << gid << " pos " << i; + } + } else { + expectDsv4SwaAllocatedBlocks(manager_config, blocks, gid, "SWA group", /*enable_reuse_cache=*/true); + } + } + + // === Phase 3: Decode incrKVBlock → SWA removeSkippedBlocks + further SWA eviction === + + // --- Incr to 4*spb --- + // Non-HCA SWA state starts from the reusable linear-step allocation and then keeps the active tail window. + // HCA_STATE skips reuse and keeps only its active tail block. + // FULL pool after Phase 2: 4 cached + 3 request = 7 used, 0 free → ensureFreeBlocks evicts 1. + tokens_c->setSeqLength(4 * spb); + MallocInfo incr1{res_c, tokens_c}; + incr1.reuse_cache = false; + incr1.enable_device_cache = false; + ASSERT_TRUE(manager->malloc(incr1).success) << "First incr must succeed via eviction"; + + for (int gid = 0; gid < kDsv4PoolNum; ++gid) { + ASSERT_EQ(res_c->blocksNum(0, gid), 4) << "group " << gid << " after incr to 4*spb"; + } + // SWA/state fixed groups retain the current tail window. + for (int gid : dsv4FixedTailGroupIds(manager_config)) { + expectDsv4SwaAllocatedBlocks(manager_config, res_c->blocks(0, gid), gid, "SWA group"); + } + + // --- Incr to 5*spb --- + // Non-HCA SWA removes blocks before the active two-block tail; HCA_STATE keeps a one-block tail. + // SWA pools may need another eviction if no free block remains. 
+ tokens_c->setSeqLength(5 * spb); + MallocInfo incr2{res_c, tokens_c}; + incr2.reuse_cache = false; + incr2.enable_device_cache = false; + ASSERT_TRUE(manager->malloc(incr2).success) << "Second incr must succeed (removeSkipped frees block)"; + + for (int gid = 0; gid < kDsv4PoolNum; ++gid) { + ASSERT_EQ(res_c->blocksNum(0, gid), 5) << "group " << gid << " after incr to 5*spb"; + } + // SWA/state fixed groups keep only the active tail window. + for (int gid : dsv4FixedTailGroupIds(manager_config)) { + expectDsv4SwaAllocatedBlocks(manager_config, res_c->blocks(0, gid), gid, "SWA group"); + } + + // === Phase 4: Free all and verify full pool recovery === + manager->free(FreeInfo{res_c, tokens_c}); + + // Pop remaining cached blocks to restore pool. + auto evicted = manager->popBlocksFromCache(/*min_blocks_to_free=*/100); + if (evicted) { + manager->blockCacheFree(evicted); + } + EXPECT_EQ(manager->freeBlocksNum(), free_before); +} +TEST_F(KVCacheManagerTest, DSV4InitThenIncrWithRemoveSkippedBlocksFullLifecycle) { + // This test exercises the full lifecycle of a DSV4 request: + // 1. initKVBlock (first malloc with 4 blocks) + // 2. Multiple incrKVBlock calls (decode phase) that trigger removeSkippedBlocks + // 3. Verify SWA groups free old non-tail blocks during incr + // 4. 
Final free returns all blocks to pool + auto manager_config = makeCompactDSV4ManagerConfig(/*block_num=*/32); + auto manager = std::make_shared(manager_config, /*warmup=*/false); + ASSERT_TRUE(manager->init()); + + const size_t free_before = manager->freeBlocksNum(); + const int spb = static_cast(manager_config.seq_size_per_block); + auto resource = makeDSV4BatchResource(manager_config); + + // --- Phase 1: initKVBlock with 4 blocks (simulates prefill completion) --- + const int init_seq_len = 4 * spb; + auto tokens = makeDSV4CompleteTokenIds(init_seq_len, /*max_seq_len=*/10 * spb, spb); + + MallocInfo init_info{resource, tokens}; + init_info.reuse_cache = false; + init_info.enable_device_cache = false; + auto init_result = manager->malloc(init_info); + ASSERT_TRUE(init_result.success); + ASSERT_EQ(resource->groupNums(), kDsv4PoolNum); + + // After init: FULL groups (0,1,2) have 4 real blocks each. + // SWA groups keep the active tail window; HCA_STATE keeps a one-block tail. + for (int gid = 0; gid < kDsv4PoolNum; ++gid) { + ASSERT_EQ(resource->blocksNum(0, gid), 4) << "group " << gid; + const auto& blocks = resource->blocks(0, gid); + if (isFullGroup(manager_config, gid)) { + for (int i = 0; i < 4; ++i) { + EXPECT_FALSE(isNullBlockIdx(blocks[i])) << "FULL group " << gid << " pos " << i; + } + } else { + expectDsv4SwaAllocatedBlocks(manager_config, blocks, gid, "SWA group"); + } + } + + // Record block IDs allocated after init for later validation. + std::vector init_blocks(kDsv4PoolNum); + for (int gid = 0; gid < kDsv4PoolNum; ++gid) { + init_blocks[gid] = resource->blocks(0, gid); + } + const size_t free_after_init = manager->freeBlocksNum(); + + // --- Phase 2: First incrKVBlock (4 → 5 blocks) --- + // removeSkippedBlocks on SWA groups: [NULL, NULL, A, B] → keep_begin=2, loop i=0..1 both NULL → no free. + // Then allocate 1 new block per group. 
+ tokens->setSeqLength(5 * spb); + MallocInfo incr1_info{resource, tokens}; + incr1_info.reuse_cache = false; + incr1_info.enable_device_cache = false; + ASSERT_TRUE(manager->malloc(incr1_info).success); + + for (int gid = 0; gid < kDsv4PoolNum; ++gid) { + ASSERT_EQ(resource->blocksNum(0, gid), 5) << "group " << gid << " after incr1"; + } + // FULL groups: all 5 blocks should be real. + for (int gid : dsv4GroupIdsByType(manager_config, CacheGroupType::FULL)) { + const auto& blocks = resource->blocks(0, gid); + for (int i = 0; i < 5; ++i) { + EXPECT_FALSE(isNullBlockIdx(blocks[i])) << "FULL group " << gid << " pos " << i << " after incr1"; + } + // Original init blocks should be preserved. + for (int i = 0; i < 4; ++i) { + EXPECT_EQ(blocks[i], init_blocks[gid][i]) << "FULL group " << gid << " pos " << i << " changed"; + } + } + // SWA/state fixed groups keep the current tail window. + for (int gid : dsv4FixedTailGroupIds(manager_config)) { + const auto& blocks = resource->blocks(0, gid); + expectDsv4SwaAllocatedBlocks(manager_config, blocks, gid, "SWA group after incr1"); + if (!isHcaStateGroup(manager_config, gid)) { + EXPECT_EQ(blocks[3], init_blocks[gid][3]) << "SWA group " << gid << " old tail pos 3"; + } + } + + // Four fixed groups freed one stale block and all seven groups allocated one new block. + EXPECT_EQ(manager->freeBlocksNum(), free_after_init - 7 + 4); + const size_t free_after_incr1 = manager->freeBlocksNum(); + + // Record SWA tail blocks after incr1 for the next step. + std::vector swa_new_C(static_cast(manager_config.groupNums()), NULL_BLOCK_IDX); + for (int gid : dsv4FixedTailGroupIds(manager_config)) { + swa_new_C[static_cast(gid)] = resource->blocks(0, gid)[4]; + } + + // --- Phase 3: Second incrKVBlock (5 → 6 blocks) — triggers removeSkippedBlocks --- + // SWA removeSkippedBlocks on [NULL, NULL, A, B, C] (size=5): keep_begin = 5-2 = 3. + // Loop i=0: NULL → skip. + // Loop i=1: NULL → skip. + // Loop i=2: A (real block) → FREE it, set to NULL. 
+ // After remove: [NULL, NULL, NULL, B, C] + // Then malloc allocates 1 new block D → [NULL, NULL, NULL, B, C, D] + tokens->setSeqLength(6 * spb); + MallocInfo incr2_info{resource, tokens}; + incr2_info.reuse_cache = false; + incr2_info.enable_device_cache = false; + ASSERT_TRUE(manager->malloc(incr2_info).success); + + for (int gid = 0; gid < kDsv4PoolNum; ++gid) { + ASSERT_EQ(resource->blocksNum(0, gid), 6) << "group " << gid << " after incr2"; + } + + // FULL groups: all 6 blocks real, first 4 unchanged. + for (int gid : dsv4GroupIdsByType(manager_config, CacheGroupType::FULL)) { + const auto& blocks = resource->blocks(0, gid); + for (int i = 0; i < 6; ++i) { + EXPECT_FALSE(isNullBlockIdx(blocks[i])) << "FULL group " << gid << " pos " << i << " after incr2"; + } + for (int i = 0; i < 4; ++i) { + EXPECT_EQ(blocks[i], init_blocks[gid][i]) << "FULL group " << gid << " init block preserved"; + } + } + + // SWA/state fixed groups after incr2 keep their configured active tail window. + for (int gid : dsv4FixedTailGroupIds(manager_config)) { + const auto& blocks = resource->blocks(0, gid); + expectDsv4SwaAllocatedBlocks(manager_config, blocks, gid, "SWA group after incr2"); + if (!isHcaStateGroup(manager_config, gid)) { + EXPECT_EQ(blocks[4], swa_new_C[static_cast(gid)]) << "SWA group " << gid << " pos 4 = old C"; + } + } + + // Free block accounting: SWA freed 1 block per SWA group (4 groups) at removeSkippedBlocks, + // then allocated 1 new block per group (7 groups). Net change: -7 + 4 = -3. + EXPECT_EQ(manager->freeBlocksNum(), free_after_incr1 - 7 + 4); + const size_t free_after_incr2 = manager->freeBlocksNum(); + + // --- Phase 4: Third incrKVBlock (6 → 7 blocks) — triggers another removeSkippedBlocks --- + // SWA removeSkippedBlocks on [NULL, NULL, NULL, B, C, D] (size=6): keep_begin = 6-2 = 4. + // Loop i=0..2: all NULL → skip. + // Loop i=3: B (real block) → FREE it, set to NULL. 
+ // After remove: [NULL, NULL, NULL, NULL, C, D] + // Then malloc allocates 1 new block E → [NULL, NULL, NULL, NULL, C, D, E] + tokens->setSeqLength(7 * spb); + MallocInfo incr3_info{resource, tokens}; + incr3_info.reuse_cache = false; + incr3_info.enable_device_cache = false; + ASSERT_TRUE(manager->malloc(incr3_info).success); + + for (int gid = 0; gid < kDsv4PoolNum; ++gid) { + ASSERT_EQ(resource->blocksNum(0, gid), 7) << "group " << gid << " after incr3"; + } + + // SWA/state fixed groups after incr3 keep their configured active tail window. + for (int gid : dsv4FixedTailGroupIds(manager_config)) { + const auto& blocks = resource->blocks(0, gid); + expectDsv4SwaAllocatedBlocks(manager_config, blocks, gid, "SWA group after incr3"); + } + + // SWA freed 1 block per SWA group (4) and allocated 1 per all groups (7). Net: -7+4 = -3. + EXPECT_EQ(manager->freeBlocksNum(), free_after_incr2 - 7 + 4); + + // --- Phase 5: Free all — all blocks should return to pool --- + FreeInfo free_info{resource, tokens}; + manager->free(free_info); + EXPECT_EQ(manager->freeBlocksNum(), free_before); +} + } // namespace test } // namespace rtp_llm diff --git a/rtp_llm/cpp/cache/test/KVCacheResourceLocalCacheKeysTest.cc b/rtp_llm/cpp/cache/test/KVCacheResourceLocalCacheKeysTest.cc new file mode 100644 index 0000000000..634021e130 --- /dev/null +++ b/rtp_llm/cpp/cache/test/KVCacheResourceLocalCacheKeysTest.cc @@ -0,0 +1,75 @@ +#include + +#include "rtp_llm/cpp/cache/KVCacheResource.h" + +namespace rtp_llm { +namespace test { + +class LocalCacheKeysTest: public ::testing::Test { +protected: + KVCacheResource make(const CacheKeysType& keys) { + KVCacheResource r; + r.cacheKeys() = keys; + return r; + } +}; + +TEST_F(LocalCacheKeysTest, CpSize1Passthrough) { + auto r = make({10, 20, 30, 40}); + auto out = r.localCacheKeys(0, 1); + ASSERT_EQ(out.size(), 4u); + EXPECT_EQ(out[0], 10); + EXPECT_EQ(out[1], 20); + EXPECT_EQ(out[2], 30); + EXPECT_EQ(out[3], 40); +} + +TEST_F(LocalCacheKeysTest, 
CpSize2EvenLengthLastRank) { + auto r = make({100, 101, 200, 201, 300, 301, 400, 401}); + // last-rank stride: rank=1, size=2 → idx 1,3,5,7 + auto out = r.localCacheKeys(1, 2); + ASSERT_EQ(out.size(), 4u); + EXPECT_EQ(out[0], 101); + EXPECT_EQ(out[1], 201); + EXPECT_EQ(out[2], 301); + EXPECT_EQ(out[3], 401); +} + +TEST_F(LocalCacheKeysTest, CpSize2Rank0) { + auto r = make({100, 101, 200, 201}); + auto out = r.localCacheKeys(0, 2); + ASSERT_EQ(out.size(), 2u); + EXPECT_EQ(out[0], 100); + EXPECT_EQ(out[1], 200); +} + +TEST_F(LocalCacheKeysTest, CpSize4NonDivisibleLastRankShorter) { + // 10 keys, cp_size=4 → last-rank (3) takes idx 3, 7 → length 2 (vs blocks=ceil(10/4)=3) + auto r = make({0, 1, 2, 3, 4, 5, 6, 7, 8, 9}); + auto out = r.localCacheKeys(3, 4); + ASSERT_EQ(out.size(), 2u); + EXPECT_EQ(out[0], 3); + EXPECT_EQ(out[1], 7); +} + +TEST_F(LocalCacheKeysTest, EmptyKeys) { + auto r = make({}); + auto out = r.localCacheKeys(0, 4); + EXPECT_TRUE(out.empty()); +} + +TEST_F(LocalCacheKeysTest, KeysShorterThanCpSizeReturnsEmptyForLastRank) { + auto r = make({42}); + auto out = r.localCacheKeys(3, 4); // last-rank starts at idx 3, but only 1 key + EXPECT_TRUE(out.empty()); +} + +TEST_F(LocalCacheKeysTest, KeysShorterThanCpSizeRank0HasOne) { + auto r = make({42}); + auto out = r.localCacheKeys(0, 4); + ASSERT_EQ(out.size(), 1u); + EXPECT_EQ(out[0], 42); +} + +} // namespace test +} // namespace rtp_llm diff --git a/rtp_llm/cpp/cache/test/KVCacheResourceTest.cc b/rtp_llm/cpp/cache/test/KVCacheResourceTest.cc index 283461ddbf..d4b57012d4 100644 --- a/rtp_llm/cpp/cache/test/KVCacheResourceTest.cc +++ b/rtp_llm/cpp/cache/test/KVCacheResourceTest.cc @@ -59,7 +59,7 @@ TEST(KVCacheResourceTest, InitGroups_RespectsGroupTypesAndBlocksPerKvBlock) { KVCacheResource resource; resource.initGroups(/*group_num=*/2, /*layer_num=*/3, - /*layer_to_group_id=*/{0, 1, 0}, + /*layer_group_ids=*/{{0}, {1}, {0}}, /*kernel_blocks_per_kv_block=*/4, /*group_types=*/{CacheGroupType::FULL, 
CacheGroupType::LINEAR}); @@ -82,6 +82,38 @@ TEST(KVCacheResourceTest, InitGroups_RespectsGroupTypesAndBlocksPerKvBlock) { ASSERT_EQ(resource.kernelBlocks(1), (BlockIndicesType{1})); } +TEST(KVCacheResourceTest, CacheKeysMaintainLinearDependencies) { + KVCacheResource resource; + resource.setCacheKeys(CacheKeysType{10, 20, 30}); + + ASSERT_EQ(resource.blockDependencies().size(), 3u); + EXPECT_FALSE(resource.blockDependencies()[0].has_parent); + EXPECT_EQ(resource.blockDependencies()[0].ordinal, 0u); + EXPECT_TRUE(resource.blockDependencies()[1].has_parent); + EXPECT_EQ(resource.blockDependencies()[1].parent_key, 10); + EXPECT_EQ(resource.blockDependencies()[1].ordinal, 1u); + EXPECT_TRUE(resource.blockDependencies()[2].has_parent); + EXPECT_EQ(resource.blockDependencies()[2].parent_key, 20); + EXPECT_EQ(resource.blockDependencies()[2].ordinal, 2u); + + BlockDependenciesType custom = { + BlockDependency{false, 0, 7}, + BlockDependency{true, 100, 8}, + }; + resource.setCacheKeys(CacheKeysType{100, 200}); + resource.setBlockDependencies(custom); + resource.ensureLinearBlockDependencies(); + ASSERT_EQ(resource.blockDependencies().size(), 2u); + EXPECT_EQ(resource.blockDependencies()[0].ordinal, 7u); + EXPECT_EQ(resource.blockDependencies()[1].parent_key, 100); + + resource.cacheKeys().push_back(300); + resource.ensureLinearBlockDependencies(); + ASSERT_EQ(resource.blockDependencies().size(), 3u); + EXPECT_EQ(resource.blockDependencies()[2].parent_key, 200); + EXPECT_EQ(resource.blockDependencies()[2].ordinal, 2u); +} + TEST(CacheConfigTest, KernelBlocksPerKvBlockSafeByDefault) { CacheConfig config; config.seq_size_per_block = 1; @@ -98,7 +130,7 @@ TEST(BatchKVCacheResourceTest, BasicBatchOperations_WorkAsExpected) { batch.resetBatchSize(2); batch.initGroups(/*group_nums=*/2, /*layer_num=*/3, - /*layer_to_group_id=*/{0, 1, 0}, + /*layer_group_ids=*/{{0}, {1}, {0}}, /*kernel_blocks_per_kv_block=*/4, /*group_types=*/{CacheGroupType::FULL, CacheGroupType::LINEAR}); @@ 
-139,7 +171,7 @@ TEST(BatchKVCacheResourceTest, BasicBatchOperations_WorkAsExpected) { KVCacheResource moved; moved.initGroups(/*group_num=*/1, /*layer_num=*/1, - /*layer_to_group_id=*/{0}, + /*layer_group_ids=*/{{0}}, /*kernel_blocks_per_kv_block=*/2, /*group_types=*/{CacheGroupType::FULL}); moved.mutableBlockIds(0).add(BlockIndicesType{3}); diff --git a/rtp_llm/cpp/cache/test/LinearKVCacheGroupTest.cc b/rtp_llm/cpp/cache/test/LinearKVCacheGroupTest.cc index 4a1db05b18..ba2207f585 100644 --- a/rtp_llm/cpp/cache/test/LinearKVCacheGroupTest.cc +++ b/rtp_llm/cpp/cache/test/LinearKVCacheGroupTest.cc @@ -3,8 +3,8 @@ #include #include -#include "rtp_llm/cpp/cache/BlockCache.h" -#include "rtp_llm/cpp/cache/LinearKVCacheGroup.h" +#include "rtp_llm/cpp/cache/SharedBlockCache.h" +#include "rtp_llm/cpp/cache/group/LinearKVCacheGroup.h" #include "rtp_llm/cpp/cache/test/BlockPoolTestHelper.h" namespace rtp_llm { @@ -14,7 +14,6 @@ static std::shared_ptr makeLinearSpec(uint32_t seq_size_per_b auto spec = std::make_shared(); spec->type = KVCacheSpecType::LinearAttention; spec->dtype = rtp_llm::DataType::TYPE_FP16; - spec->layer_num = 2; spec->local_num_k_heads = 1; spec->local_num_v_heads = 1; spec->head_k_dim = 1; @@ -27,7 +26,24 @@ static std::shared_ptr makeLinearSpec(uint32_t seq_size_per_b class LinearKVCacheGroupTest: public ::testing::Test {}; -TEST_F(LinearKVCacheGroupTest, GetNeedBlocksReuseDisabledCountsOnlyReserveStep) { +TEST_F(LinearKVCacheGroupTest, DefaultPolicyDrivesBehaviorInterfaces) { + auto block_pool = createBlockPool(); + ASSERT_TRUE(block_pool->init()); + + auto spec = makeLinearSpec(/*seq_size_per_block=*/4); + LinearKVCacheGroup group(/*layer_ids=*/{}, spec, block_pool, /*group_id=*/0, /*linear_step=*/2); + + EXPECT_FALSE(group.prefixReusable()); + EXPECT_FALSE(group.isCpShardable()); + EXPECT_TRUE(group.hasSparseSlots()); + EXPECT_FALSE(group.hasKernelBlockSubdiv()); + EXPECT_TRUE(group.transferTailBlocks()); + EXPECT_FALSE(group.cpCompactTailBlocks()); 
+ EXPECT_TRUE(group.isReservable()); + EXPECT_FALSE(group.usesPinnedCpuBacking()); +} + +TEST_F(LinearKVCacheGroupTest, GetNeedBlocksReuseDisabledCountsLastTwoTailAndReserveStep) { auto block_pool = createBlockPool(); ASSERT_TRUE(block_pool->init()); @@ -35,11 +51,11 @@ TEST_F(LinearKVCacheGroupTest, GetNeedBlocksReuseDisabledCountsOnlyReserveStep) LinearKVCacheGroup group(/*layer_ids=*/{}, spec, block_pool, /*group_id=*/0, /*linear_step=*/2); ASSERT_TRUE(group.init()); - // common_slots=2, seq_slots=3, total_slots=5 => when reuse disabled, common=1(tail), - // extra=1(tail)+1(reserve_step-1)=2 + // common_slots=2, seq_slots=3, total_slots=4 => common phase materializes + // its last two slots; incremental phase adds final tail and reserve slots. const auto need = group.getNeedBlocks(/*common_seq_len=*/8, /*seq_len=*/12, /*reserve_step=*/2, /*reuse_blocks_len=*/0, false); - EXPECT_EQ(need.common_blocks, 1); + EXPECT_EQ(need.common_blocks, 2); EXPECT_EQ(need.extra_blocks, 2); } @@ -51,8 +67,8 @@ TEST_F(LinearKVCacheGroupTest, GetNeedBlocksReuseEnabledUsesSparseCountingAndRes LinearKVCacheGroup group(/*layer_ids=*/{}, spec, block_pool, /*group_id=*/0, /*linear_step=*/2); ASSERT_TRUE(group.init()); - // common_slots=2: - // count(0,2]=2; count(2,3]=1; reserve_step=2 => extra=2 + // common_slots=2, seq_slots=3, total_slots=4. Reuse enabled keeps step + // hits plus the last two seq slots, so this matches the disabled case here. 
const auto need = group.getNeedBlocks(/*common_seq_len=*/8, /*seq_len=*/12, /*reserve_step=*/2, /*reuse_blocks_len=*/0, true); EXPECT_EQ(need.common_blocks, 2); @@ -74,14 +90,14 @@ TEST_F(LinearKVCacheGroupTest, MallocAllocatesStepHitsAndTailWhenReuseEnabled) { ASSERT_EQ(blocks.blocksNum(), 4u); EXPECT_TRUE(isNullBlockIdx(blocks.blocks()[0])); EXPECT_FALSE(isNullBlockIdx(blocks.blocks()[1])); - EXPECT_TRUE(isNullBlockIdx(blocks.blocks()[2])); + EXPECT_FALSE(isNullBlockIdx(blocks.blocks()[2])); // tail-1 protects causal_conv1d boundary read EXPECT_FALSE(isNullBlockIdx(blocks.blocks()[3])); - // Only 2 real blocks allocated. - EXPECT_EQ(block_pool->freeBlocksNum(), 7u); + // Step hit + tail-1 + tail. + EXPECT_EQ(block_pool->freeBlocksNum(), 6u); } -TEST_F(LinearKVCacheGroupTest, MallocAllocatesOnlyTailWhenReuseDisabled) { +TEST_F(LinearKVCacheGroupTest, MallocAllocatesLastTwoTailBlocksWhenReuseDisabled) { auto block_pool = createBlockPool(); ASSERT_TRUE(block_pool->init()); ASSERT_EQ(block_pool->freeBlocksNum(), 9u); @@ -96,11 +112,10 @@ TEST_F(LinearKVCacheGroupTest, MallocAllocatesOnlyTailWhenReuseDisabled) { ASSERT_EQ(blocks.blocksNum(), 4u); EXPECT_TRUE(isNullBlockIdx(blocks.blocks()[0])); EXPECT_TRUE(isNullBlockIdx(blocks.blocks()[1])); - EXPECT_TRUE(isNullBlockIdx(blocks.blocks()[2])); + EXPECT_FALSE(isNullBlockIdx(blocks.blocks()[2])); // tail-1 protects causal_conv1d boundary read EXPECT_FALSE(isNullBlockIdx(blocks.blocks()[3])); - // Only 1 real block allocated. 
- EXPECT_EQ(block_pool->freeBlocksNum(), 8u); + EXPECT_EQ(block_pool->freeBlocksNum(), 7u); } TEST_F(LinearKVCacheGroupTest, MallocAllocatesReserveTailBlocksWhenReuseDisabled) { @@ -119,12 +134,95 @@ TEST_F(LinearKVCacheGroupTest, MallocAllocatesReserveTailBlocksWhenReuseDisabled ASSERT_EQ(blocks.blocksNum(), 5u); EXPECT_TRUE(isNullBlockIdx(blocks.blocks()[0])); EXPECT_TRUE(isNullBlockIdx(blocks.blocks()[1])); - EXPECT_TRUE(isNullBlockIdx(blocks.blocks()[2])); + EXPECT_FALSE(isNullBlockIdx(blocks.blocks()[2])); // tail-1 protects causal_conv1d boundary read EXPECT_FALSE(isNullBlockIdx(blocks.blocks()[3])); // seq tail EXPECT_FALSE(isNullBlockIdx(blocks.blocks()[4])); // reserve tail - // Tail + reserve_step blocks are allocated. - EXPECT_EQ(block_pool->freeBlocksNum(), 7u); + EXPECT_EQ(block_pool->freeBlocksNum(), 6u); +} + +TEST_F(LinearKVCacheGroupTest, MallocBackfillsExistingNullReadSlot) { + auto block_pool = createBlockPool(); + ASSERT_TRUE(block_pool->init()); + ASSERT_EQ(block_pool->freeBlocksNum(), 9u); + + auto spec = makeLinearSpec(/*seq_size_per_block=*/4); + LinearKVCacheGroup group(/*layer_ids=*/{}, spec, block_pool, /*group_id=*/0, /*linear_step=*/2); + ASSERT_TRUE(group.init()); + + auto allocated = block_pool->malloc(2); + ASSERT_EQ(allocated.size(), 2u); + + BlockIds blocks; + blocks.assign(BlockIndicesType{allocated[0], NULL_BLOCK_IDX, allocated[1]}); + const size_t free_before = block_pool->freeBlocksNum(); + + // seq_len=12 => seq_slots=3. Position 1 is tail-1 and is the read slot + // for sequence_length=13, so it must be materialized even though no new + // slots are appended. 
+ ASSERT_TRUE(group.malloc(blocks, /*seq_len=*/12, /*enable_reuse_cache=*/false)); + + ASSERT_EQ(blocks.blocksNum(), 3u); + EXPECT_EQ(blocks.blocks()[0], allocated[0]); + EXPECT_FALSE(isNullBlockIdx(blocks.blocks()[1])); + EXPECT_EQ(blocks.blocks()[2], allocated[1]); + EXPECT_EQ(block_pool->freeBlocksNum(), free_before - 1); +} + +TEST_F(LinearKVCacheGroupTest, MallocMaterializesCausalConvReadSlotAtBoundaries) { + const std::vector seq_lens = {4, 5, 8, 9}; + + for (bool enable_reuse_cache : {false, true}) { + for (int seq_len : seq_lens) { + auto block_pool = createBlockPool(); + ASSERT_TRUE(block_pool->init()); + + auto spec = makeLinearSpec(/*seq_size_per_block=*/4); + LinearKVCacheGroup group(/*layer_ids=*/{}, spec, block_pool, /*group_id=*/0, /*linear_step=*/2); + ASSERT_TRUE(group.init()); + + BlockIds blocks; + ASSERT_TRUE(group.malloc(blocks, seq_len, enable_reuse_cache)) << "seq_len=" << seq_len; + + const int read_pos = (seq_len - 2) / 4; + ASSERT_GE(read_pos, 0); + ASSERT_LT(static_cast(read_pos), blocks.blocksNum()); + EXPECT_FALSE(isNullBlockIdx(blocks.blocks()[static_cast(read_pos)])) + << "seq_len=" << seq_len << " reuse=" << enable_reuse_cache << " read_pos=" << read_pos; + } + } +} + +TEST_F(LinearKVCacheGroupTest, GetNeedBlocksMatchesMallocForReserveSteps) { + for (bool enable_reuse_cache : {false, true}) { + for (int reserve_step : {0, 1, 2, 3}) { + auto block_pool = createBlockPool(); + ASSERT_TRUE(block_pool->init()); + + auto spec = makeLinearSpec(/*seq_size_per_block=*/4); + LinearKVCacheGroup group(/*layer_ids=*/{}, spec, block_pool, /*group_id=*/0, /*linear_step=*/2); + ASSERT_TRUE(group.init()); + + const auto need = group.getNeedBlocks(/*common_seq_len=*/8, + /*seq_len=*/12, + reserve_step, + /*reuse_blocks_len=*/0, + enable_reuse_cache); + + BlockIds blocks; + ASSERT_TRUE(group.malloc(blocks, /*seq_len=*/8, enable_reuse_cache)); + ASSERT_TRUE(group.malloc(blocks, /*seq_len=*/12, enable_reuse_cache, reserve_step)); + + size_t valid_count = 
0; + for (auto block : blocks.blocks()) { + if (!isNullBlockIdx(block)) { + valid_count++; + } + } + EXPECT_EQ(valid_count, static_cast(need.common_blocks + need.extra_blocks)) + << "reserve_step=" << reserve_step << " reuse=" << enable_reuse_cache; + } + } } TEST_F(LinearKVCacheGroupTest, RemoveSkippedBlocksFreesNonStepBlocksButKeepsLastTwo) { @@ -158,53 +256,51 @@ TEST_F(LinearKVCacheGroupTest, RemoveSkippedBlocksFreesNonStepBlocksButKeepsLast EXPECT_EQ(block_pool->freeBlocksNum(), free_before + 2); } -TEST_F(LinearKVCacheGroupTest, InsertIntoCacheSkipsNullBlocks) { +TEST_F(LinearKVCacheGroupTest, PutIntoCacheSkipsNullBlocks) { auto block_pool = createBlockPool(); ASSERT_TRUE(block_pool->init()); - auto spec = makeLinearSpec(/*seq_size_per_block=*/4); - LinearKVCacheGroup group(/*layer_ids=*/{}, spec, block_pool, /*group_id=*/3, /*linear_step=*/2); - ASSERT_TRUE(group.init()); + auto shared_cache = std::make_shared(); + std::vector group_pools(4, block_pool); + shared_cache->init(4, group_pools); - auto block_cache = block_pool->blockCache(); - ASSERT_NE(block_cache, nullptr); + auto block1 = block_pool->malloc(1)[0]; + auto block2 = block_pool->malloc(1)[0]; - BlockIndicesType blocks; - blocks.push_back(NULL_BLOCK_IDX); - blocks.push_back(block_pool->malloc(1)[0]); - blocks.push_back(NULL_BLOCK_IDX); - blocks.push_back(block_pool->malloc(1)[0]); + // Only put entries with non-NULL blocks (simulating allocator-level filtering) + std::vector slots1(4, NULL_BLOCK_IDX); + slots1[3] = block1; + shared_cache->put(101, slots1, /*is_resident=*/false); - CacheKeysType keys = {100, 101, 102, 103}; - group.insertIntoCache(keys, blocks, /*is_resident=*/false); + std::vector slots2(4, NULL_BLOCK_IDX); + slots2[3] = block2; + shared_cache->put(103, slots2, /*is_resident=*/false); - EXPECT_FALSE(block_cache->contains(100, /*group_id=*/3)); - EXPECT_TRUE(block_cache->contains(101, /*group_id=*/3)); - EXPECT_FALSE(block_cache->contains(102, /*group_id=*/3)); - 
EXPECT_TRUE(block_cache->contains(103, /*group_id=*/3)); + EXPECT_FALSE(shared_cache->contains(100)); + EXPECT_TRUE(shared_cache->contains(101)); + EXPECT_FALSE(shared_cache->contains(102)); + EXPECT_TRUE(shared_cache->contains(103)); } TEST_F(LinearKVCacheGroupTest, MatchSingleKeyReturnsMatchedBlockOrEmpty) { auto block_pool = createBlockPool(); ASSERT_TRUE(block_pool->init()); + auto shared_cache = std::make_shared(); + std::vector group_pools(8, block_pool); + shared_cache->init(8, group_pools); + auto spec = makeLinearSpec(/*seq_size_per_block=*/4); - LinearKVCacheGroup group(/*layer_ids=*/{}, spec, block_pool, /*group_id=*/7, /*linear_step=*/2); + LinearKVCacheGroup group(/*layer_ids=*/{}, spec, block_pool, /*group_id=*/7, /*linear_step=*/2, shared_cache.get()); ASSERT_TRUE(group.init()); - auto block_cache = block_pool->blockCache(); - ASSERT_NE(block_cache, nullptr); - // Allocate a block, then put it into cache for group_id=7. auto blocks = block_pool->malloc(1); ASSERT_EQ(blocks.size(), 1u); - BlockCache::CacheItem item; - item.cache_key = 123; - item.group_id = 7; - item.block_index = blocks[0]; - item.is_resident = false; - ASSERT_TRUE(block_cache->put(item)); + std::vector group_slots(8, NULL_BLOCK_IDX); + group_slots[7] = blocks[0]; + shared_cache->put(123, group_slots, /*is_resident=*/false); auto hit = group.matchSingleKey(123); ASSERT_EQ(hit.block_indices.size(), 1u); @@ -259,14 +355,19 @@ TEST_F(LinearKVCacheGroupTest, MallocEnsuresFreeBlocksByEvictingCache) { ASSERT_TRUE(block_pool->init()); ASSERT_EQ(block_pool->freeBlocksNum(), 9u); + auto shared_cache = std::make_shared(); + std::vector group_pools = {block_pool}; + shared_cache->init(1, group_pools); + auto spec = makeLinearSpec(/*seq_size_per_block=*/4); - LinearKVCacheGroup group(/*layer_ids=*/{}, spec, block_pool, /*group_id=*/0, /*linear_step=*/2); + LinearKVCacheGroup group(/*layer_ids=*/{}, spec, block_pool, /*group_id=*/0, /*linear_step=*/2, shared_cache.get()); 
ASSERT_TRUE(group.init()); // Put one block into cache (non-resident) and release request reference so it becomes evictable. auto cached = block_pool->malloc(1); ASSERT_EQ(cached.size(), 1u); - group.insertIntoCache(CacheKeysType{123}, cached, /*is_resident=*/false); + std::vector slots = {cached[0]}; + shared_cache->put(123, slots, /*is_resident=*/false); block_pool->requestFree(cached); // Exhaust the remaining free blocks so malloc must evict from cache to proceed. @@ -357,23 +458,6 @@ TEST_F(LinearKVCacheGroupTest, ReferenceAppendsAndIncrementsRefCountForValidBloc EXPECT_EQ(block_pool->freeBlocksNum(), free_before + 1); } -TEST_F(LinearKVCacheGroupTest, InsertIntoCacheWithEmptyInputsIsNoop) { - auto block_pool = createBlockPool(); - ASSERT_TRUE(block_pool->init()); - - auto spec = makeLinearSpec(/*seq_size_per_block=*/4); - LinearKVCacheGroup group(/*layer_ids=*/{}, spec, block_pool, /*group_id=*/3, /*linear_step=*/2); - ASSERT_TRUE(group.init()); - - auto block_cache = block_pool->blockCache(); - ASSERT_NE(block_cache, nullptr); - ASSERT_EQ(block_cache->size(), 0u); - - group.insertIntoCache(CacheKeysType{}, BlockIndicesType{1, 2}, /*is_resident=*/false); - group.insertIntoCache(CacheKeysType{100, 101}, BlockIndicesType{}, /*is_resident=*/false); - EXPECT_EQ(block_cache->size(), 0u); -} - } // namespace test } // namespace rtp_llm diff --git a/rtp_llm/cpp/cache/test/MemoryLayoutStrategyTest.cc b/rtp_llm/cpp/cache/test/MemoryLayoutStrategyTest.cc index 2407179f83..7354ee4b59 100644 --- a/rtp_llm/cpp/cache/test/MemoryLayoutStrategyTest.cc +++ b/rtp_llm/cpp/cache/test/MemoryLayoutStrategyTest.cc @@ -1,5 +1,6 @@ #include #include +#include #include #include #include "rtp_llm/cpp/cache/MemoryLayoutStrategy.h" @@ -9,6 +10,7 @@ #include "rtp_llm/models_py/bindings/core/ExecOps.h" #include "rtp_llm/cpp/config/ConfigModules.h" #include "rtp_llm/cpp/config/ModelConfig.h" +#include "rtp_llm/cpp/config/StaticConfig.h" #include "rtp_llm/cpp/utils/Logger.h" namespace 
rtp_llm { @@ -29,6 +31,8 @@ class MemoryLayoutStrategyTest: public ::testing::Test { }; void SetUp() override { + old_core_dump_on_exception_ = StaticConfig::user_ft_core_dump_on_exception; + StaticConfig::user_ft_core_dump_on_exception = false; rtp_llm::initLogger(); torch::manual_seed(114514); @@ -39,7 +43,9 @@ class MemoryLayoutStrategyTest: public ::testing::Test { ASSERT_TRUE(rtp_llm::isRuntimeInitialized()); } - void TearDown() override {} + void TearDown() override { + StaticConfig::user_ft_core_dump_on_exception = old_core_dump_on_exception_; + } static KVCacheSpecPtr createTestKvCacheSpec(uint32_t layer_num, rtp_llm::DataType dtype, @@ -70,8 +76,7 @@ class MemoryLayoutStrategyTest: public ::testing::Test { auto spec = std::make_shared(); spec->type = KVCacheSpecType::MultiHeadAttention; spec->dtype = dtype; - spec->layer_num = layer_num; - spec->local_head_num_kv = local_head_num_kv; + spec->local_head_num_kv = local_head_num_kv; spec->seq_size_per_block = seq_size_per_block; spec->size_per_head = static_cast(k_elems / denom); return spec; @@ -79,8 +84,7 @@ class MemoryLayoutStrategyTest: public ::testing::Test { auto spec = std::make_shared(); spec->type = KVCacheSpecType::MultiHeadLatentAttention; spec->dtype = dtype; - spec->layer_num = layer_num; - spec->local_head_num_kv = local_head_num_kv; + spec->local_head_num_kv = local_head_num_kv; spec->seq_size_per_block = seq_size_per_block; spec->kv_lora_rank = static_cast(k_elems / denom); spec->rope_head_dim = static_cast(v_elems / denom); @@ -99,14 +103,14 @@ class MemoryLayoutStrategyTest: public ::testing::Test { /*k_block_stride_bytes=*/k_block_bytes, /*v_block_stride_bytes=*/v_block_bytes); - // Create CacheConfig with the spec rtp_llm::CacheConfig cache_config; - cache_config.cache_specs = {spec}; cache_config.layer_num = layer_num; + cache_config.layer_all_num = layer_num; cache_config.block_num = block_num; cache_config.dtype = rtp_llm::DataType::TYPE_INT8; cache_config.seq_size_per_block = 1; 
cache_config.kv_block_stride_bytes = spec->block_size_bytes(); + initializeSingleGroup(cache_config, spec); auto pool_cfg = BlockPoolConfigHelper::createConfig(cache_config); auto layout_cfg = pool_cfg.memory_layouts[0]; @@ -121,6 +125,12 @@ class MemoryLayoutStrategyTest: public ::testing::Test { return layout_cfg; } + static void initializeSingleGroup(rtp_llm::CacheConfig& cache_config, const KVCacheSpecPtr& spec) { + std::vector layer_ids(cache_config.layer_num); + std::iota(layer_ids.begin(), layer_ids.end(), 0); + cache_config.fromGroupedSpecs({spec}, {layer_ids}, {CacheGroupType::FULL}, {"default"}); + } + static MemoryLayoutConfig createTestConfig(size_t k_block_bytes = 512, size_t v_block_bytes = 512) { return createTestConfig(/*layer_num=*/4, /*block_num=*/8, k_block_bytes, v_block_bytes); } @@ -166,6 +176,8 @@ class MemoryLayoutStrategyTest: public ::testing::Test { BufferInitMode init_mode = BufferInitMode::Zeros) { return createTestContext(createTestConfig(k_block_bytes, v_block_bytes), device, init_mode); } + + bool old_core_dump_on_exception_{false}; }; TEST_F(MemoryLayoutStrategyTest, Initialization) { @@ -186,15 +198,15 @@ TEST_F(MemoryLayoutStrategyTest, InitializationWithScaleTensor) { /*seq_size_per_block=*/4, /*k_block_stride_bytes=*/512, /*v_block_stride_bytes=*/512); - // Create CacheConfig with the spec rtp_llm::CacheConfig cache_config; - cache_config.cache_specs = {spec}; cache_config.layer_num = 4; + cache_config.layer_all_num = 4; cache_config.block_num = 8; cache_config.dtype = rtp_llm::DataType::TYPE_INT8; cache_config.seq_size_per_block = 4; cache_config.kv_block_stride_bytes = spec->block_size_bytes(); cache_config.kv_scale_stride_bytes = spec->scale_block_size_bytes(); + initializeSingleGroup(cache_config, spec); auto pool_cfg = BlockPoolConfigHelper::createConfig(cache_config); auto config = pool_cfg.memory_layouts[0]; // keep enable_kv_scale=true @@ -351,14 +363,14 @@ TEST_F(MemoryLayoutStrategyTest, 
ConvertIndexToBufferPartitionedByHeadFp16UsesBy /*seq_size_per_block=*/64, /*k_block_stride_bytes=*/1024, /*v_block_stride_bytes=*/1024); - // Create CacheConfig with the spec rtp_llm::CacheConfig cache_config; - cache_config.cache_specs = {spec}; cache_config.layer_num = 4; + cache_config.layer_all_num = 4; cache_config.block_num = 8; cache_config.dtype = rtp_llm::DataType::TYPE_FP16; cache_config.seq_size_per_block = 64; cache_config.kv_block_stride_bytes = spec->block_size_bytes(); + initializeSingleGroup(cache_config, spec); auto pool_cfg = BlockPoolConfigHelper::createConfig(cache_config); auto config = pool_cfg.memory_layouts[0]; @@ -422,15 +434,15 @@ TEST_F(MemoryLayoutStrategyTest, ConvertIndexToBufferPartitionedByHeadWithScale) /*seq_size_per_block=*/64, /*k_block_stride_bytes=*/512, /*v_block_stride_bytes=*/512); - // Create CacheConfig with the spec rtp_llm::CacheConfig cache_config; - cache_config.cache_specs = {spec}; cache_config.layer_num = 4; + cache_config.layer_all_num = 4; cache_config.block_num = 8; cache_config.dtype = rtp_llm::DataType::TYPE_INT8; cache_config.seq_size_per_block = 64; cache_config.kv_block_stride_bytes = spec->block_size_bytes(); cache_config.kv_scale_stride_bytes = spec->scale_block_size_bytes(); + initializeSingleGroup(cache_config, spec); auto pool_cfg = BlockPoolConfigHelper::createConfig(cache_config); auto config = pool_cfg.memory_layouts[0]; // keep enable_kv_scale=true @@ -582,6 +594,32 @@ TEST_F(MemoryLayoutStrategyTest, AddressSequentiality) { EXPECT_EQ(addr2_val - addr1_val, ctx.config.kv_block_stride_bytes); } +TEST_F(MemoryLayoutStrategyTest, ConvertIndexToBufferUsesPhysicalStrideForKernelView) { + auto config = createTestConfig(/*layer_num=*/2, /*block_num=*/4, 64, 64); + config.kernel_blocks_per_kv_block = 4; + auto ctx = createTestContext(std::move(config), torch::kCPU, BufferInitMode::Arange); + + auto strategy = std::make_unique(); + torch::Tensor empty_scale; + ASSERT_TRUE(strategy->init(ctx.config, 
ctx.kv_cache_buffer, empty_scale, ctx.cache_ptr)); + + auto layer_tensors = strategy->getLayerCacheTensors(); + ASSERT_EQ(layer_tensors[0].size(0), static_cast(ctx.config.block_num * 4)); + ASSERT_EQ(static_cast(layer_tensors[0].stride(0) * layer_tensors[0].element_size()), + ctx.config.kv_block_stride_bytes / 4); + + auto block0 = strategy->convertIndexToBuffer(/*layer_id=*/0, /*block_id=*/0); + auto block1 = strategy->convertIndexToBuffer(/*layer_id=*/0, /*block_id=*/1); + ASSERT_EQ(block0.size(), 1u); + ASSERT_EQ(block1.size(), 1u); + EXPECT_EQ(block0[0].size_bytes, ctx.config.kv_block_stride_bytes); + EXPECT_EQ(block1[0].size_bytes, ctx.config.kv_block_stride_bytes); + + const auto addr0 = reinterpret_cast(block0[0].addr); + const auto addr1 = reinterpret_cast(block1[0].addr); + EXPECT_EQ(addr1 - addr0, ctx.config.kv_block_stride_bytes); +} + // Layout Comparison Test class LayoutComparisonTest: public MemoryLayoutStrategyTest {}; diff --git a/rtp_llm/cpp/cache/test/SWAKVCacheGroupMallocRangeTest.cc b/rtp_llm/cpp/cache/test/SWAKVCacheGroupMallocRangeTest.cc new file mode 100644 index 0000000000..547fde3aac --- /dev/null +++ b/rtp_llm/cpp/cache/test/SWAKVCacheGroupMallocRangeTest.cc @@ -0,0 +1,122 @@ +#include + +#include +#include +#include +#include + +#include "rtp_llm/cpp/cache/BlockPool.h" +#include "rtp_llm/cpp/cache/group/SWAKVCacheGroup.h" + +namespace rtp_llm { +namespace test { +namespace { + +class ScopedEnvVar { +public: + ScopedEnvVar(const char* name, const char* value): name_(name) { + const char* old_value = std::getenv(name_); + if (old_value != nullptr) { + old_value_ = old_value; + had_value_ = true; + } + setenv(name_, value, 1); + } + + ~ScopedEnvVar() { + if (had_value_) { + setenv(name_, old_value_.c_str(), 1); + } else { + unsetenv(name_); + } + } + +private: + const char* name_; + std::string old_value_; + bool had_value_ = false; +}; + +BlockPoolConfig makeHostBlockPoolConfig() { + constexpr uint32_t kLayerNum = 1; + constexpr uint32_t 
kBlockNum = 4; + constexpr size_t kKvBlockStride = 1024; + constexpr size_t kHalfBlockStride = kKvBlockStride / 2; + + MemoryLayoutConfig layout; + layout.layer_num = kLayerNum; + layout.block_num = kBlockNum; + layout.dtype = rtp_llm::DataType::TYPE_FP16; + layout.kv_cache_offset_bytes = 0; + layout.kv_scale_offset_bytes = kLayerNum * kBlockNum * kKvBlockStride; + layout.kv_block_stride_bytes = kKvBlockStride; + layout.k_block_stride_bytes = kHalfBlockStride; + layout.v_block_stride_bytes = kHalfBlockStride; + layout.kv_block_pool_size_bytes = kLayerNum * kBlockNum * kKvBlockStride; + layout.kv_scale_pool_size_bytes = 0; + layout.total_size_bytes = layout.kv_block_pool_size_bytes; + + BlockPoolConfig config; + config.block_num = kBlockNum; + config.total_size_bytes = layout.total_size_bytes; + config.memory_layouts = {layout}; + return config; +} + +BlockPoolPtr createHostBlockPool() { + auto block_pool = std::make_shared(makeHostBlockPoolConfig(), AllocationType::HOST); + RTP_LLM_CHECK_WITH_INFO(block_pool->init(), "init host block pool failed"); + return block_pool; +} + +std::shared_ptr makeMHASpec(int seq_size_per_block) { + auto spec = std::make_shared(); + spec->seq_size_per_block = seq_size_per_block; + return spec; +} + +} // namespace + +TEST(SWAKVCacheGroupMallocRangeTest, EmptyBlockIdsKeepTailBlocksForSeqLenUpTo1M) { + constexpr int kSeqSizePerBlock = 256; + constexpr int kMaxSeqLen = 1000000; + + ScopedEnvVar disable_pin_host_pool("RTP_LLM_PIN_HOST_BLOCK_POOL", "0"); + auto block_pool = createHostBlockPool(); + SWAKVCacheGroup group({}, makeMHASpec(kSeqSizePerBlock), block_pool, 0); + + auto check_seq_len = [&](int seq_len) { + BlockIds block_ids; + ASSERT_EQ(block_ids.blocksNum(), 0u) << "seq_len=" << seq_len; + + ASSERT_TRUE(group.malloc(block_ids, seq_len, /*enable_reuse_cache=*/false, /*reserve_step=*/0)) + << "seq_len=" << seq_len; + + const auto& blocks = block_ids.blocks(); + ASSERT_EQ(blocks.size(), static_cast((seq_len + kSeqSizePerBlock - 1) 
/ kSeqSizePerBlock)) + << "seq_len=" << seq_len; + if (blocks.size() == 1) { + EXPECT_FALSE(isNullBlockIdx(blocks[0])) << "seq_len=" << seq_len; + } else { + EXPECT_FALSE(isNullBlockIdx(blocks[blocks.size() - 2])) << "seq_len=" << seq_len; + EXPECT_FALSE(isNullBlockIdx(blocks[blocks.size() - 1])) << "seq_len=" << seq_len; + } + + group.free(blocks); + }; + + // SWA malloc depends on seq_slots=ceil(seq_len / block_size). The first + // and last seq_len in each slot cover all behavior classes from 1..1M. + const int max_seq_slots = (kMaxSeqLen + kSeqSizePerBlock - 1) / kSeqSizePerBlock; + for (int seq_slots = 1; seq_slots <= max_seq_slots; ++seq_slots) { + const int first_seq_len = (seq_slots - 1) * kSeqSizePerBlock + 1; + const int last_seq_len = std::min(seq_slots * kSeqSizePerBlock, kMaxSeqLen); + check_seq_len(first_seq_len); + if (last_seq_len != first_seq_len) { + check_seq_len(last_seq_len); + } + } +} + +} // namespace test +} // namespace rtp_llm diff --git a/rtp_llm/cpp/cache/test/SWAKVCacheGroupTest.cc b/rtp_llm/cpp/cache/test/SWAKVCacheGroupTest.cc new file mode 100644 index 0000000000..6fc05678b4 --- /dev/null +++ b/rtp_llm/cpp/cache/test/SWAKVCacheGroupTest.cc @@ -0,0 +1,740 @@ +#include +#include +#include +#include +#include +#include + +#include "rtp_llm/cpp/cache/spec/OpaqueKVCacheSpec.h" +#include "rtp_llm/cpp/cache/group/SWAKVCacheGroup.h" +#include "rtp_llm/cpp/cache/SharedBlockCache.h" +#include "rtp_llm/cpp/cache/test/BlockPoolTestHelper.h" +#include "rtp_llm/cpp/config/StaticConfig.h" + +namespace rtp_llm { +namespace test { + +namespace { + +class ScopedEnvVar { +public: + ScopedEnvVar(const char* name, const char* value): name_(name) { + const char* old_value = std::getenv(name_); + if (old_value != nullptr) { + old_value_ = old_value; + had_value_ = true; + } + setenv(name_, value, 1); + } + + ~ScopedEnvVar() { + if (had_value_) { + setenv(name_, old_value_.c_str(), 1); + } else { + unsetenv(name_); + } + } + +private: + const char* name_; 
+ std::string old_value_; + bool had_value_ = false; +}; + +std::shared_ptr makeDsv4StateSpec(const std::string& tag, int seq_size_per_block) { + return std::make_shared(tag, + /*state_elements=*/1024, + /*block_entries=*/128, + DataType::TYPE_FP32, + seq_size_per_block); +} + +CacheGroupPolicy makePolicy(const KVCacheSpecPtr& spec) { + return CacheConfig::cacheGroupPolicyForSpec(spec, CacheGroupType::SWA); +} + +size_t validBlockCount(const BlockIndicesType& blocks) { + return static_cast(std::count_if(blocks.begin(), blocks.end(), [](BlockIdxType block) { + return !isNullBlockIdx(block); + })); +} + +} // namespace + +class SWAKVCacheGroupTest: public ::testing::Test { +protected: + void SetUp() override { + old_core_dump_on_exception_ = StaticConfig::user_ft_core_dump_on_exception; + StaticConfig::user_ft_core_dump_on_exception = false; + block_pool_ = createBlockPool(); + block_pool_->init(); + total_blocks_ = block_pool_->freeBlocksNum(); + shared_cache_ = std::make_shared(); + std::vector group_pools = {block_pool_}; + shared_cache_->init(1, group_pools); + } + + void TearDown() override { + StaticConfig::user_ft_core_dump_on_exception = old_core_dump_on_exception_; + } + + SWAKVCacheGroup makeGroup(int seq_size_per_block) { + auto spec = std::make_shared(); + spec->seq_size_per_block = seq_size_per_block; + return SWAKVCacheGroup({}, spec, block_pool_, 0, 0, shared_cache_.get()); + } + + SWAKVCacheGroup makeGroupWithStep(int seq_size_per_block, int linear_step) { + auto spec = std::make_shared(); + spec->seq_size_per_block = seq_size_per_block; + return SWAKVCacheGroup({}, spec, block_pool_, 0, linear_step, shared_cache_.get()); + } + + BlockPoolPtr block_pool_; + SharedBlockCachePtr shared_cache_; + size_t total_blocks_ = 0; + bool old_core_dump_on_exception_{false}; +}; + +TEST_F(SWAKVCacheGroupTest, DefaultPolicyDrivesBehaviorInterfaces) { + auto spec = std::make_shared(); + spec->seq_size_per_block = 4; + SWAKVCacheGroup group({}, spec, block_pool_, 0, 
0, shared_cache_.get()); + + EXPECT_FALSE(group.prefixReusable()); + EXPECT_FALSE(group.isCpShardable()); + EXPECT_TRUE(group.hasSparseSlots()); + EXPECT_FALSE(group.hasKernelBlockSubdiv()); + EXPECT_TRUE(group.transferTailBlocks()); + EXPECT_TRUE(group.cpCompactTailBlocks()); + EXPECT_TRUE(group.isReservable()); + EXPECT_FALSE(group.usesPinnedCpuBacking()); +} + +// ==================== needBlocksNum ==================== + +TEST_F(SWAKVCacheGroupTest, NeedBlocksNum_Basic) { + auto group = makeGroup(4); + EXPECT_EQ(group.needBlocksNum(1, 0), 1); + EXPECT_EQ(group.needBlocksNum(4, 0), 1); + EXPECT_EQ(group.needBlocksNum(5, 0), 2); + EXPECT_EQ(group.needBlocksNum(8, 0), 2); + EXPECT_EQ(group.needBlocksNum(9, 0), 3); +} + +TEST_F(SWAKVCacheGroupTest, NeedBlocksNum_WithCurrentBlocks) { + auto group = makeGroup(4); + EXPECT_EQ(group.needBlocksNum(10, 1), 2); + EXPECT_EQ(group.needBlocksNum(10, 3), 0); + EXPECT_EQ(group.needBlocksNum(10, 5), 0); +} + +TEST_F(SWAKVCacheGroupTest, NeedBlocksNum_WithReserveStep) { + auto group = makeGroup(4); + // reserve_step formula: ceil((seq_len + reserve_step) / block_size) - current + EXPECT_EQ(group.needBlocksNum(8, 0, 0), 2); // ceil((8+0)/4) = 2 + EXPECT_EQ(group.needBlocksNum(8, 0, 1), 3); // ceil((8+1)/4) = 3 + EXPECT_EQ(group.needBlocksNum(8, 0, 2), 3); // ceil((8+2)/4) = 3 + EXPECT_EQ(group.needBlocksNum(8, 0, 5), 4); // ceil((8+5)/4) = 4 +} + +// ==================== getNeedBlocks ==================== + +TEST_F(SWAKVCacheGroupTest, GetNeedBlocks_SeqLenZero) { + auto group = makeGroup(4); + auto need = group.getNeedBlocks(0, 0, 0, 0, false); + EXPECT_EQ(need.common_blocks, 0); + EXPECT_EQ(need.extra_blocks, 0); +} + +TEST_F(SWAKVCacheGroupTest, GetNeedBlocks_ReuseDisabledCountsActiveTail) { + auto group = makeGroupWithStep(4, 2); + // seq_len=12 => seq_slots=3, reuse disabled => last two active tail blocks. 
+ auto need = group.getNeedBlocks(0, 12, 0, 0, false); + EXPECT_EQ(need.common_blocks, 0); + EXPECT_EQ(need.extra_blocks, 2); +} + +TEST_F(SWAKVCacheGroupTest, GetNeedBlocks_ReuseEnabledUsesSparse) { + auto group = makeGroupWithStep(4, 2); + // seq_len=12 => seq_slots=3 + // count_sparse(0,3): eligible=(3+1)/2-(0+1)/2=2-0=2, tail=(3+1)%2==0 => 0, total=2 + auto need = group.getNeedBlocks(0, 12, 0, 0, true); + EXPECT_EQ(need.common_blocks, 0); + EXPECT_EQ(need.extra_blocks, 2); +} + +TEST_F(SWAKVCacheGroupTest, GetNeedBlocks_HCAStateReuseEnabledCountsTailOnly) { + auto spec = makeDsv4StateSpec("hca_state", 4); + spec->skip_prefix_reuse = true; + auto group = SWAKVCacheGroup({}, spec, block_pool_, 5, /*linear_step=*/3, shared_cache_.get(), nullptr, makePolicy(spec)); + + // seq_len=40 => seq_slots=10. If reuse sparse allocation were enabled, step hits + // would keep positions 2/5/8 plus tail position 9. HCA_STATE skips reuse and keeps only tail 9. + auto need = group.getNeedBlocks(0, 40, 0, 0, true); + EXPECT_EQ(need.common_blocks, 0); + EXPECT_EQ(need.extra_blocks, 1); +} + +TEST_F(SWAKVCacheGroupTest, GetNeedBlocks_CSAStateReuseEnabledStillUsesSparse) { + auto spec = makeDsv4StateSpec("csa_state", 4); + auto group = SWAKVCacheGroup({}, spec, block_pool_, 4, /*linear_step=*/3, shared_cache_.get(), nullptr, makePolicy(spec)); + + auto need = group.getNeedBlocks(0, 40, 0, 0, true); + EXPECT_EQ(need.common_blocks, 0); + EXPECT_EQ(need.extra_blocks, 4); +} + +TEST_F(SWAKVCacheGroupTest, GetNeedBlocks_WithReserveStep) { + auto group = makeGroupWithStep(4, 2); + // seq_len=8 => two active tail blocks, plus one reserve block. 
+ auto need = group.getNeedBlocks(0, 8, 2, 0, false); + EXPECT_EQ(need.extra_blocks, 3); +} + +TEST_F(SWAKVCacheGroupTest, GetNeedBlocks_ReusePartialOverlap) { + auto group = makeGroupWithStep(4, 2); + // seq_len=12 => seq_slots=3 + // reuse_blocks_len=2: count_sparse(2,3) + // eligible=(3+1)/2-(2+1)/2=2-1=1, tail=(3+1)%2==0 => 0, total=1 + auto need = group.getNeedBlocks(0, 12, 0, 2, true); + EXPECT_EQ(need.extra_blocks, 1); +} + +TEST_F(SWAKVCacheGroupTest, GetNeedBlocks_ReuseFullOverlap) { + auto group = makeGroupWithStep(4, 2); + // seq_len=12 => seq_slots=3 + // reuse_blocks_len=3: count_sparse(3,3) = 0 + auto need = group.getNeedBlocks(0, 12, 0, 3, true); + EXPECT_EQ(need.extra_blocks, 0); +} + +TEST_F(SWAKVCacheGroupTest, GetNeedBlocks_CommonSeqLenIgnored) { + auto group = makeGroup(4); + auto need1 = group.getNeedBlocks(0, 20, 0, 0, false); + auto need2 = group.getNeedBlocks(20, 20, 0, 0, false); + auto need3 = group.getNeedBlocks(100, 20, 0, 0, false); + EXPECT_EQ(need1.extra_blocks, need2.extra_blocks); + EXPECT_EQ(need2.extra_blocks, need3.extra_blocks); + EXPECT_EQ(need1.common_blocks, 0); +} + +// ==================== match ==================== + +TEST_F(SWAKVCacheGroupTest, MatchAlwaysThrows) { + auto group = makeGroup(4); + EXPECT_THROW(group.match({101, 102, 103}), std::exception); +} + +TEST_F(SWAKVCacheGroupTest, MatchSingleKey_NotFound) { + auto group = makeGroup(4); + auto result = group.matchSingleKey(999); + EXPECT_TRUE(result.block_indices.empty()); +} + +TEST_F(SWAKVCacheGroupTest, MatchSingleKey_Found) { + auto group = makeGroup(4); + std::vector group_slots = {1}; // group_id=0, block_index=1 + shared_cache_->put(101, group_slots, false); + + auto result = group.matchSingleKey(101); + ASSERT_EQ(result.block_indices.size(), 1u); + EXPECT_EQ(result.block_indices[0], 1); +} + +// ==================== malloc (default step=0, acts like step=1, tail-only) ==================== + +TEST_F(SWAKVCacheGroupTest, Malloc_ShortSeq_OnlyOneBlock) { + auto 
group = makeGroup(4); + BlockIds block_ids(1); + ASSERT_TRUE(group.malloc(block_ids, 3)); + EXPECT_EQ(block_ids.blocksNum(), 1u); + EXPECT_FALSE(isNullBlockIdx(block_ids.blocks()[0])); + EXPECT_EQ(block_pool_->freeBlocksNum(), total_blocks_ - 1); +} + +TEST_F(SWAKVCacheGroupTest, Malloc_ManyBlocks_LastTwoActiveBlocksReal) { + auto group = makeGroup(4); + BlockIds block_ids(1); + ASSERT_TRUE(group.malloc(block_ids, 20)); + // reuse_cache=false still keeps the last two active blocks. + ASSERT_EQ(block_ids.blocksNum(), 5u); + for (int i = 0; i < 3; ++i) { + EXPECT_TRUE(isNullBlockIdx(block_ids.blocks()[i])) << "position " << i << " should be NULL"; + } + EXPECT_FALSE(isNullBlockIdx(block_ids.blocks()[3])); + EXPECT_FALSE(isNullBlockIdx(block_ids.blocks()[4])); + EXPECT_EQ(block_pool_->freeBlocksNum(), total_blocks_ - 2); +} + +TEST_F(SWAKVCacheGroupTest, Malloc_DSV4PromptTailKeepsPenultimateBlock) { + auto group = makeGroup(256); + BlockIds block_ids(1); + + ASSERT_TRUE(group.malloc(block_ids, 5121, /*enable_reuse_cache=*/false, /*reserve_step=*/0)); + + ASSERT_EQ(block_ids.blocksNum(), 21u); + for (int i = 0; i < 19; ++i) { + EXPECT_TRUE(isNullBlockIdx(block_ids.blocks()[i])) << "position " << i << " should be NULL"; + } + EXPECT_FALSE(isNullBlockIdx(block_ids.blocks()[19])); + EXPECT_FALSE(isNullBlockIdx(block_ids.blocks()[20])); + EXPECT_EQ(block_pool_->freeBlocksNum(), total_blocks_ - 2); +} + +TEST_F(SWAKVCacheGroupTest, Malloc_NoOpWhenEnoughBlocks) { + auto group = makeGroup(4); + BlockIds block_ids(1); + ASSERT_TRUE(group.malloc(block_ids, 8)); + size_t free_after_first = block_pool_->freeBlocksNum(); + + ASSERT_TRUE(group.malloc(block_ids, 8)); + EXPECT_EQ(block_ids.blocksNum(), 2u); + EXPECT_EQ(block_pool_->freeBlocksNum(), free_after_first); +} + +TEST_F(SWAKVCacheGroupTest, Malloc_DSV4TrapSkipsHCAStateNullTail) { + ScopedEnvVar env("DSV4_TRAP_INVALID_KV_ACCESS", "1"); + auto spec = makeDsv4StateSpec("hca_state", 4); + spec->skip_prefix_reuse = true; + auto 
group = SWAKVCacheGroup({}, spec, block_pool_, 5, 0, shared_cache_.get(), nullptr, makePolicy(spec)); + BlockIds block_ids(1); + block_ids.assign(BlockIndicesType{NULL_BLOCK_IDX, NULL_BLOCK_IDX, NULL_BLOCK_IDX}); + + EXPECT_NO_THROW((void)group.malloc(block_ids, 12)); +} + +TEST_F(SWAKVCacheGroupTest, Malloc_HCAStateReuseEnabledAllocatesTailOnly) { + auto spec = makeDsv4StateSpec("hca_state", 4); + spec->skip_prefix_reuse = true; + auto group = SWAKVCacheGroup({}, spec, block_pool_, 5, /*linear_step=*/3, shared_cache_.get(), nullptr, makePolicy(spec)); + BlockIds block_ids(1); + + ASSERT_TRUE(group.malloc(block_ids, 40, /*enable_reuse_cache=*/true, /*reserve_step=*/0)); + + ASSERT_EQ(block_ids.blocksNum(), 10u); + EXPECT_EQ(validBlockCount(block_ids.blocks()), 1u); + EXPECT_TRUE(isNullBlockIdx(block_ids.blocks()[8])); + EXPECT_FALSE(isNullBlockIdx(block_ids.blocks()[9])); + EXPECT_EQ(block_pool_->freeBlocksNum(), total_blocks_ - 1); +} + +TEST_F(SWAKVCacheGroupTest, Malloc_CSAStateReuseEnabledKeepsSparseBlocks) { + auto spec = makeDsv4StateSpec("csa_state", 4); + auto group = SWAKVCacheGroup({}, spec, block_pool_, 4, /*linear_step=*/3, shared_cache_.get(), nullptr, makePolicy(spec)); + BlockIds block_ids(1); + + ASSERT_TRUE(group.malloc(block_ids, 40, /*enable_reuse_cache=*/true, /*reserve_step=*/0)); + + ASSERT_EQ(block_ids.blocksNum(), 10u); + EXPECT_EQ(validBlockCount(block_ids.blocks()), 4u); + EXPECT_FALSE(isNullBlockIdx(block_ids.blocks()[2])); + EXPECT_FALSE(isNullBlockIdx(block_ids.blocks()[5])); + EXPECT_FALSE(isNullBlockIdx(block_ids.blocks()[8])); + EXPECT_FALSE(isNullBlockIdx(block_ids.blocks()[9])); + EXPECT_EQ(block_pool_->freeBlocksNum(), total_blocks_ - 4); +} + +TEST_F(SWAKVCacheGroupTest, Malloc_DSV4TrapChecksSWAKVNullTail) { + ScopedEnvVar env("DSV4_TRAP_INVALID_KV_ACCESS", "1"); + auto spec = makeDsv4StateSpec("swa_kv", 4); + auto group = SWAKVCacheGroup({}, spec, block_pool_, 6, 0, shared_cache_.get(), nullptr, makePolicy(spec)); + BlockIds 
block_ids(1); + block_ids.assign(BlockIndicesType{NULL_BLOCK_IDX, NULL_BLOCK_IDX, NULL_BLOCK_IDX}); + + EXPECT_THROW((void)group.malloc(block_ids, 12), std::exception); +} + +TEST_F(SWAKVCacheGroupTest, Malloc_DSV4TrapChecksNonSkipStateNullTail) { + ScopedEnvVar env("DSV4_TRAP_INVALID_KV_ACCESS", "1"); + auto spec = makeDsv4StateSpec("csa_state", 4); + auto group = SWAKVCacheGroup({}, spec, block_pool_, 4, 0, shared_cache_.get(), nullptr, makePolicy(spec)); + BlockIds block_ids(1); + block_ids.assign(BlockIndicesType{NULL_BLOCK_IDX, NULL_BLOCK_IDX, NULL_BLOCK_IDX}); + + EXPECT_THROW((void)group.malloc(block_ids, 12), std::exception); +} + +TEST_F(SWAKVCacheGroupTest, Malloc_WithReserveStep) { + auto group = makeGroup(4); + BlockIds block_ids(1); + // seq_len=4 => seq_slots=1, reserve_step=2 => total=2 (1 + (2-1)) + // index 0: seq_tail => REAL, index 1: reserve => REAL + ASSERT_TRUE(group.malloc(block_ids, 4, false, 2)); + ASSERT_EQ(block_ids.blocksNum(), 2u); + EXPECT_FALSE(isNullBlockIdx(block_ids.blocks()[0])); + EXPECT_FALSE(isNullBlockIdx(block_ids.blocks()[1])); + EXPECT_EQ(block_pool_->freeBlocksNum(), total_blocks_ - 2); +} + +TEST_F(SWAKVCacheGroupTest, Malloc_FailsWhenPoolExhausted) { + auto group = makeGroup(4); + std::vector holders; + for (size_t i = 0; i < total_blocks_; ++i) { + holders.emplace_back(1); + if (!group.malloc(holders.back(), 4)) { + break; + } + } + EXPECT_EQ(block_pool_->freeBlocksNum(), 0u); + + BlockIds block_ids(1); + EXPECT_FALSE(group.malloc(block_ids, 4)); +} + +// ==================== malloc with linear_step ==================== + +TEST_F(SWAKVCacheGroupTest, Malloc_WithStep_ReuseEnabled) { + auto group = makeGroupWithStep(4, 2); + BlockIds block_ids(1); + // seq_len=16 => 4 slots; keep step hits plus the last two active blocks. 
+ ASSERT_TRUE(group.malloc(block_ids, 16, /*enable_reuse_cache=*/true)); + ASSERT_EQ(block_ids.blocksNum(), 4u); + EXPECT_TRUE(isNullBlockIdx(block_ids.blocks()[0])); + EXPECT_FALSE(isNullBlockIdx(block_ids.blocks()[1])); + EXPECT_FALSE(isNullBlockIdx(block_ids.blocks()[2])); + EXPECT_FALSE(isNullBlockIdx(block_ids.blocks()[3])); + EXPECT_EQ(block_pool_->freeBlocksNum(), total_blocks_ - 3); +} + +TEST_F(SWAKVCacheGroupTest, Malloc_WithStep_ReuseDisabled) { + auto group = makeGroupWithStep(4, 2); + BlockIds block_ids(1); + // seq_len=16 => 4 slots, reuse_cache=false => active tail indices 2 and 3. + ASSERT_TRUE(group.malloc(block_ids, 16, /*enable_reuse_cache=*/false)); + ASSERT_EQ(block_ids.blocksNum(), 4u); + EXPECT_TRUE(isNullBlockIdx(block_ids.blocks()[0])); + EXPECT_TRUE(isNullBlockIdx(block_ids.blocks()[1])); + EXPECT_FALSE(isNullBlockIdx(block_ids.blocks()[2])); + EXPECT_FALSE(isNullBlockIdx(block_ids.blocks()[3])); + EXPECT_EQ(block_pool_->freeBlocksNum(), total_blocks_ - 2); +} + +TEST_F(SWAKVCacheGroupTest, Malloc_WithStep_ReserveAllocated) { + auto group = makeGroupWithStep(4, 2); + BlockIds block_ids(1); + // seq_len=16 => seq_slots=4, reserve_step=2 => total_slots=5 + // reuse disabled: active tail(2,3) and reserve(4) allocated + ASSERT_TRUE(group.malloc(block_ids, 16, /*enable_reuse_cache=*/false, /*reserve_step=*/2)); + ASSERT_EQ(block_ids.blocksNum(), 5u); + EXPECT_TRUE(isNullBlockIdx(block_ids.blocks()[0])); + EXPECT_TRUE(isNullBlockIdx(block_ids.blocks()[1])); + EXPECT_FALSE(isNullBlockIdx(block_ids.blocks()[2])); + EXPECT_FALSE(isNullBlockIdx(block_ids.blocks()[3])); + EXPECT_FALSE(isNullBlockIdx(block_ids.blocks()[4])); + EXPECT_EQ(block_pool_->freeBlocksNum(), total_blocks_ - 3); +} + +// ==================== removeSkippedBlocks ==================== + +TEST_F(SWAKVCacheGroupTest, RemoveSkippedBlocks_TwoOrFewer_NoOp) { + auto group = makeGroup(4); + BlockIds block_ids(1); + ASSERT_TRUE(group.malloc(block_ids, 5)); + 
ASSERT_EQ(block_ids.blocksNum(), 2u); + + group.removeSkippedBlocks(block_ids); + EXPECT_EQ(block_ids.blocksNum(), 2u); +} + +TEST_F(SWAKVCacheGroupTest, RemoveSkippedBlocks_FreesNonTailReal) { + auto group = makeGroupWithStep(4, 2); + BlockIds block_ids(1); + // First: 2 blocks with reuse + ASSERT_TRUE(group.malloc(block_ids, 5, true)); + // Extend to 5 blocks with reuse + ASSERT_TRUE(group.malloc(block_ids, 20, true)); + ASSERT_EQ(block_ids.blocksNum(), 5u); + size_t free_before = block_pool_->freeBlocksNum(); + + group.removeSkippedBlocks(block_ids, true); + + // step=2: keep step_hit blocks + last 2 + // step_hit: index 1 ((1+1)%2==0), index 3 ((3+1)%2==0) + // last 2: index 3, 4 + // loop i from block_size-3=2 down to 0: + // i=2: not null, not step_hit => free + // i=1: not null, step_hit => continue + // i=0: not null, not step_hit => free + // But wait, with reuse_cache=true for the first malloc (5 tokens), blocks at 0,1 are: + // active tail at 0,1 and step_hit at 1 => both REAL + // Then extending to 20 tokens with reuse: new blocks at 2,3,4 + // step_hit at 3 and active tail at 3,4 => REAL. index 2: NULL + // So blocks are: [REAL, REAL, NULL, REAL, REAL] + // removeSkippedBlocks: loop from i=2 down: + // i=2: NULL => break (stops on first null going backward) + // No blocks freed. + EXPECT_EQ(block_pool_->freeBlocksNum(), free_before); +} + +TEST_F(SWAKVCacheGroupTest, RemoveSkippedBlocks_WithStep_FreesNonStepBlocks) { + auto block_pool = createBlockPool(); + ASSERT_TRUE(block_pool->init()); + ASSERT_EQ(block_pool->freeBlocksNum(), 9u); + + auto spec = std::make_shared(); + spec->seq_size_per_block = 4; + SWAKVCacheGroup group({}, spec, block_pool, 0, 2); + + // Start with 6 allocated blocks (no NULLs). 
+ auto allocated = block_pool->malloc(6); + ASSERT_EQ(allocated.size(), 6u); + BlockIds blocks; + blocks.assign(allocated); + + const size_t free_before = block_pool->freeBlocksNum(); + group.removeSkippedBlocks(blocks, true); + + // step=2, size=6: keep step_hit + last 2 + // step_hit: index 1 ((1+1)%2==0), 3 ((3+1)%2==0), 5 ((5+1)%2==0 but in last 2) + // last 2: index 4, 5 + // loop from i=3 down: (block_size-3=3) + // i=3: step_hit => continue + // i=2: not step_hit => free + // i=1: step_hit => continue + // i=0: not step_hit => free + ASSERT_EQ(blocks.blocksNum(), 6u); + EXPECT_TRUE(isNullBlockIdx(blocks.blocks()[0])); + EXPECT_FALSE(isNullBlockIdx(blocks.blocks()[1])); + EXPECT_TRUE(isNullBlockIdx(blocks.blocks()[2])); + EXPECT_FALSE(isNullBlockIdx(blocks.blocks()[3])); + EXPECT_FALSE(isNullBlockIdx(blocks.blocks()[4])); + EXPECT_FALSE(isNullBlockIdx(blocks.blocks()[5])); + + EXPECT_EQ(block_pool->freeBlocksNum(), free_before + 2); +} + +TEST_F(SWAKVCacheGroupTest, RemoveSkippedBlocks_HCAStateReuseEnabledKeepsTailOnly) { + auto block_pool = createBlockPool(); + ASSERT_TRUE(block_pool->init()); + ASSERT_EQ(block_pool->freeBlocksNum(), 9u); + + auto spec = makeDsv4StateSpec("hca_state", 4); + spec->skip_prefix_reuse = true; + auto group = SWAKVCacheGroup({}, spec, block_pool, 5, /*linear_step=*/2, nullptr, nullptr, makePolicy(spec)); + + auto allocated = block_pool->malloc(6); + ASSERT_EQ(allocated.size(), 6u); + BlockIds blocks; + blocks.assign(allocated); + + const size_t free_before = block_pool->freeBlocksNum(); + group.removeSkippedBlocks(blocks, /*enable_reuse_cache=*/true); + + ASSERT_EQ(blocks.blocksNum(), 6u); + for (int i = 0; i < 5; ++i) { + EXPECT_TRUE(isNullBlockIdx(blocks.blocks()[i])) << "position " << i << " should be freed"; + } + EXPECT_FALSE(isNullBlockIdx(blocks.blocks()[5])); + EXPECT_EQ(block_pool->freeBlocksNum(), free_before + 5); +} + +TEST_F(SWAKVCacheGroupTest, RemoveSkippedBlocks_WithReserveStep) { + auto block_pool = 
createBlockPool(); + ASSERT_TRUE(block_pool->init()); + ASSERT_EQ(block_pool->freeBlocksNum(), 9u); + + auto spec = std::make_shared(); + spec->seq_size_per_block = 4; + SWAKVCacheGroup group({}, spec, block_pool, 0, 2); + + auto allocated = block_pool->malloc(6); + ASSERT_EQ(allocated.size(), 6u); + BlockIds blocks; + blocks.assign(allocated); + + const size_t free_before = block_pool->freeBlocksNum(); + // reserve_step=1: keep last 2 + 1 more (index 3) + group.removeSkippedBlocks(blocks, false, 1); + + // reuse_cache=false so no step_hit check + // loop from i=block_size-3-1=2 down: + // i=2: free, i=1: free, i=0: free + ASSERT_EQ(blocks.blocksNum(), 6u); + EXPECT_TRUE(isNullBlockIdx(blocks.blocks()[0])); + EXPECT_TRUE(isNullBlockIdx(blocks.blocks()[1])); + EXPECT_TRUE(isNullBlockIdx(blocks.blocks()[2])); + EXPECT_FALSE(isNullBlockIdx(blocks.blocks()[3])); + EXPECT_FALSE(isNullBlockIdx(blocks.blocks()[4])); + EXPECT_FALSE(isNullBlockIdx(blocks.blocks()[5])); + + EXPECT_EQ(block_pool->freeBlocksNum(), free_before + 3); +} + +// ==================== free ==================== + +TEST_F(SWAKVCacheGroupTest, Free_ReleasesRealBlocks) { + auto group = makeGroup(4); + BlockIds block_ids(1); + ASSERT_TRUE(group.malloc(block_ids, 20)); + EXPECT_LT(block_pool_->freeBlocksNum(), total_blocks_); + + group.free(block_ids.blocks()); + EXPECT_EQ(block_pool_->freeBlocksNum(), total_blocks_); +} + +TEST_F(SWAKVCacheGroupTest, Free_Empty) { + auto group = makeGroup(4); + group.free({}); + EXPECT_EQ(block_pool_->freeBlocksNum(), total_blocks_); +} + +TEST_F(SWAKVCacheGroupTest, Free_SkipsNullBlocks) { + auto group = makeGroup(4); + BlockIds block_ids(1); + ASSERT_TRUE(group.malloc(block_ids, 20)); + EXPECT_LT(block_pool_->freeBlocksNum(), total_blocks_); + + group.free(block_ids.blocks()); + EXPECT_EQ(block_pool_->freeBlocksNum(), total_blocks_); +} + +// ==================== reference ==================== + +TEST_F(SWAKVCacheGroupTest, Reference_AddsAndRefsBlocks) { + auto group = 
makeGroup(4); + BlockIds block_ids(1); + ASSERT_TRUE(group.malloc(block_ids, 5)); + auto original = block_ids.blocks(); + + BlockIds block_ids2(1); + group.reference(block_ids2, original); + EXPECT_EQ(block_ids2.blocksNum(), original.size()); + EXPECT_EQ(block_ids2.blocks(), original); +} + +TEST_F(SWAKVCacheGroupTest, Reference_NullBlocksNotReffed) { + auto group = makeGroup(4); + BlockIds block_ids(1); + ASSERT_TRUE(group.malloc(block_ids, 20)); + auto original = block_ids.blocks(); + + BlockIds block_ids2(1); + group.reference(block_ids2, original); + EXPECT_EQ(block_ids2.blocksNum(), original.size()); +} + +// ==================== put into cache (allocator-level) ==================== + +TEST_F(SWAKVCacheGroupTest, PutIntoCache_SkipsNullBlocks) { + auto group = makeGroup(4); + BlockIds block_ids(1); + ASSERT_TRUE(group.malloc(block_ids, 20)); + CacheKeysType keys = {101, 102, 103, 104, 105}; + + // Simulate allocator-level insertIntoCache: only put non-NULL blocks + for (size_t i = 0; i < keys.size() && i < block_ids.blocksNum(); ++i) { + if (!isNullBlockIdx(block_ids.blocks()[i])) { + std::vector slots = {block_ids.blocks()[i]}; + shared_cache_->put(keys[i], slots, false); + } + } + + auto result1 = group.matchSingleKey(101); + EXPECT_TRUE(result1.block_indices.empty()); + + // The last two active tail blocks are real. + auto result4 = group.matchSingleKey(104); + ASSERT_EQ(result4.block_indices.size(), 1u); + EXPECT_EQ(result4.block_indices[0], block_ids.blocks()[3]); + + auto result5 = group.matchSingleKey(105); + ASSERT_EQ(result5.block_indices.size(), 1u); + EXPECT_EQ(result5.block_indices[0], block_ids.blocks()[4]); +} + +// ==================== batch allocation atomicity (regression: mid-loop leak) ==================== + +// Reproduces the historical bug where SWAKVCacheGroup::malloc called block_pool_->malloc(1) +// repeatedly inside a loop. If a later iteration failed (e.g. 
concurrent allocators raced for +// the last free blocks), the previously allocated blocks were leaked because they had only +// been recorded in a stack-local vector and were never written back to block_ids; the upper +// rollback in HybridKVCacheAllocator::initMallocForCommonLen could not see them. +// +// After the fix, SWAKVCacheGroup::malloc performs a single atomic batch malloc on the pool, +// so a failed allocation must leave the pool's free counter unchanged. +TEST_F(SWAKVCacheGroupTest, Malloc_FailsAtomicallyWithoutLeak) { + auto group = makeGroupWithStep(4, 2); + + // Hold 7 blocks so that only 2 free blocks remain. shared_cache_ is empty here, so + // ensureFreeBlocks() cannot evict and refill the pool. + auto pre_alloc = block_pool_->malloc(7); + ASSERT_EQ(pre_alloc.size(), 7u); + const size_t free_before = block_pool_->freeBlocksNum(); + ASSERT_EQ(free_before, total_blocks_ - 7); + + // seq_len=16, step=2, reuse=true => seq_slots=4. The group needs 3 real blocks at + // positions {1, 2, 3}, which exceeds the 2 free blocks currently in the pool. + BlockIds block_ids(1); + EXPECT_FALSE(group.malloc(block_ids, 16, /*enable_reuse_cache=*/true)); + + // Free count must stay identical to the pre-call value (no stranded blocks). + EXPECT_EQ(block_pool_->freeBlocksNum(), free_before); + // No partial state should have leaked into block_ids either. + EXPECT_EQ(block_ids.blocksNum(), 0u); + + // The pre-allocated blocks must still be releasable, proving that BlockPool ref + // counters were not corrupted by the failed malloc path. + block_pool_->requestFree(pre_alloc); + EXPECT_EQ(block_pool_->freeBlocksNum(), total_blocks_); +} + +// Verifies the new behavior: SWAKVCacheGroup::malloc reserves all required physical blocks +// via a single batch BlockPool::malloc(N) call instead of N individual malloc(1) calls. 
+TEST_F(SWAKVCacheGroupTest, Malloc_AllocatesAtomicallyAsBatch) { + auto group = makeGroupWithStep(4, 2); + const size_t free_before = block_pool_->freeBlocksNum(); + + // seq_len=16, step=2, reuse=true => 4 slots. Real blocks expected at positions {1, 2, 3}. + BlockIds block_ids(1); + ASSERT_TRUE(group.malloc(block_ids, 16, /*enable_reuse_cache=*/true)); + ASSERT_EQ(block_ids.blocksNum(), 4u); + EXPECT_TRUE(isNullBlockIdx(block_ids.blocks()[0])); + EXPECT_FALSE(isNullBlockIdx(block_ids.blocks()[1])); + EXPECT_FALSE(isNullBlockIdx(block_ids.blocks()[2])); + EXPECT_FALSE(isNullBlockIdx(block_ids.blocks()[3])); + + // The pool's free count must drop by exactly the number of physical blocks (3). + EXPECT_EQ(block_pool_->freeBlocksNum(), free_before - 3); + + group.free(block_ids.blocks()); + EXPECT_EQ(block_pool_->freeBlocksNum(), total_blocks_); +} + +// Larger sparse layout: with linear_step=2 and seq_len=24 (=> 6 slots) and reuse enabled, +// the active-tail-2 plus step-hits set {1, 3, 4, 5} forms 4 physical blocks. Validates +// that the batch path correctly distributes the 4 allocated indices across NULL/REAL slots. +TEST_F(SWAKVCacheGroupTest, Malloc_BatchPlacementMatchesShouldAllocate) { + auto group = makeGroupWithStep(4, 2); + const size_t free_before = block_pool_->freeBlocksNum(); + + BlockIds block_ids(1); + ASSERT_TRUE(group.malloc(block_ids, 24, /*enable_reuse_cache=*/true)); + ASSERT_EQ(block_ids.blocksNum(), 6u); + // Expected: idx0=NULL, idx1=REAL(step), idx2=NULL, idx3=REAL(step+tail), idx4=REAL(tail), idx5=REAL(tail). + EXPECT_TRUE(isNullBlockIdx(block_ids.blocks()[0])); + EXPECT_FALSE(isNullBlockIdx(block_ids.blocks()[1])); + EXPECT_TRUE(isNullBlockIdx(block_ids.blocks()[2])); + EXPECT_FALSE(isNullBlockIdx(block_ids.blocks()[3])); + EXPECT_FALSE(isNullBlockIdx(block_ids.blocks()[4])); + EXPECT_FALSE(isNullBlockIdx(block_ids.blocks()[5])); + + // All 4 real blocks must be distinct (the batch BlockPool::malloc returns unique ids). 
+ std::vector reals = { + block_ids.blocks()[1], block_ids.blocks()[3], block_ids.blocks()[4], block_ids.blocks()[5]}; + std::sort(reals.begin(), reals.end()); + EXPECT_EQ(std::adjacent_find(reals.begin(), reals.end()), reals.end()); + + EXPECT_EQ(block_pool_->freeBlocksNum(), free_before - 4); + + group.free(block_ids.blocks()); + EXPECT_EQ(block_pool_->freeBlocksNum(), total_blocks_); +} + +} // namespace test +} // namespace rtp_llm + +int main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/rtp_llm/cpp/cache/test/SharedBlockCacheTest.cc b/rtp_llm/cpp/cache/test/SharedBlockCacheTest.cc new file mode 100644 index 0000000000..8d56e31abd --- /dev/null +++ b/rtp_llm/cpp/cache/test/SharedBlockCacheTest.cc @@ -0,0 +1,419 @@ +#include "gtest/gtest.h" + +#include "rtp_llm/cpp/cache/SharedBlockCache.h" + +namespace rtp_llm::test { +namespace { + +BlockDependency rootDep(uint32_t ordinal = 0) { + BlockDependency dep; + dep.ordinal = ordinal; + return dep; +} + +BlockDependency childDep(CacheKeyType parent, uint32_t ordinal) { + BlockDependency dep; + dep.has_parent = true; + dep.parent_key = parent; + dep.ordinal = ordinal; + return dep; +} + +void putOne(SharedBlockCache& cache, + CacheKeyType key, + BlockIdxType block, + const BlockDependency& dep, + SharedBlockCache::NamespaceId namespace_id = SharedBlockCache::kGpuLogicalNamespace, + bool resident = false) { + cache.put(key, std::vector{block}, resident, namespace_id, dep); +} + +} // namespace + +TEST(SharedBlockCacheTest, PrefixTreeEvictsCollectedChainInParentFirstOrderWithDependencies) { + SharedBlockCache cache; + putOne(cache, 1, 101, rootDep(0)); + putOne(cache, 2, 102, childDep(1, 1)); + putOne(cache, 3, 103, childDep(2, 2)); + + auto evicted = cache.selectAndEvict(/*min_blocks=*/1); + + ASSERT_EQ(evicted.evicted_keys, (CacheKeysType{1, 2, 3})); + ASSERT_EQ(evicted.evicted_slots.at(1), (std::vector{101})); + 
ASSERT_FALSE(evicted.evicted_dependencies.at(1).has_parent); + ASSERT_TRUE(evicted.evicted_dependencies.at(2).has_parent); + ASSERT_EQ(evicted.evicted_dependencies.at(2).parent_key, 1); + ASSERT_TRUE(evicted.evicted_dependencies.at(3).has_parent); + ASSERT_EQ(evicted.evicted_dependencies.at(3).parent_key, 2); + EXPECT_TRUE(cache.empty()); +} + +TEST(SharedBlockCacheTest, PrefixTreeStopsAtBranchPoint) { + SharedBlockCache cache; + putOne(cache, 1, 101, rootDep(0)); + putOne(cache, 2, 102, childDep(1, 1)); + putOne(cache, 3, 103, childDep(1, 2)); + + auto evicted = cache.selectAndEvict(/*min_blocks=*/1); + + ASSERT_EQ(evicted.evicted_keys, (CacheKeysType{2})); + EXPECT_FALSE(cache.contains(2)); + EXPECT_TRUE(cache.contains(1)); + EXPECT_TRUE(cache.contains(3)); +} + +TEST(SharedBlockCacheTest, PrefixTreeLinksChildInsertedBeforeParent) { + SharedBlockCache cache; + putOne(cache, 2, 102, childDep(1, 1)); + putOne(cache, 1, 101, rootDep(0)); + + ASSERT_EQ(cache.matchGroup(2, 0), 102); + + auto evicted = cache.selectAndEvict(/*min_blocks=*/1); + + ASSERT_EQ(evicted.evicted_keys, (CacheKeysType{1, 2})); + EXPECT_TRUE(cache.empty()); +} + +TEST(SharedBlockCacheTest, PrefixTreeEvictsOrphanLeafWithMissingParentDependency) { + SharedBlockCache cache; + putOne(cache, 2, 102, childDep(1, 1)); + + auto evicted = cache.selectAndEvict(/*min_blocks=*/1); + + ASSERT_EQ(evicted.evicted_keys, (CacheKeysType{2})); + ASSERT_TRUE(evicted.evicted_dependencies.count(2)); + EXPECT_TRUE(evicted.evicted_dependencies.at(2).has_parent); + EXPECT_EQ(evicted.evicted_dependencies.at(2).parent_key, 1); + EXPECT_TRUE(cache.empty()); +} + +TEST(SharedBlockCacheTest, PrefixTreeAttachesMultiplePendingChildrenAndStopsAtBranch) { + SharedBlockCache cache; + putOne(cache, 2, 102, childDep(1, 1)); + putOne(cache, 3, 103, childDep(1, 2)); + putOne(cache, 1, 101, rootDep(0)); + + auto evicted = cache.selectAndEvict(/*min_blocks=*/1); + + ASSERT_EQ(evicted.evicted_keys, (CacheKeysType{2})); + 
EXPECT_FALSE(cache.contains(2)); + EXPECT_TRUE(cache.contains(1)); + EXPECT_TRUE(cache.contains(3)); + + evicted = cache.selectAndEvict(/*min_blocks=*/1); + ASSERT_EQ(evicted.evicted_keys, (CacheKeysType{1, 3})); + EXPECT_TRUE(cache.empty()); +} + +TEST(SharedBlockCacheTest, PrefixTreeStopsAtResidentParent) { + SharedBlockCache cache; + putOne(cache, 1, 101, rootDep(0), SharedBlockCache::kGpuLogicalNamespace, /*resident=*/true); + putOne(cache, 2, 102, childDep(1, 1)); + + auto evicted = cache.selectAndEvict(/*min_blocks=*/1); + + ASSERT_EQ(evicted.evicted_keys, (CacheKeysType{2})); + ASSERT_TRUE(evicted.evicted_dependencies.count(2)); + EXPECT_TRUE(evicted.evicted_dependencies.at(2).has_parent); + EXPECT_EQ(evicted.evicted_dependencies.at(2).parent_key, 1); + EXPECT_TRUE(cache.contains(1)); + EXPECT_FALSE(cache.contains(2)); +} + +TEST(SharedBlockCacheTest, MatchGroupTouchesPrefixTreeLeafLru) { + SharedBlockCache cache; + putOne(cache, 1, 101, rootDep(0)); + putOne(cache, 2, 102, childDep(1, 1)); + putOne(cache, 3, 103, rootDep(0)); + + ASSERT_EQ(cache.matchGroup(2, 0), 102); + + auto evicted = cache.selectAndEvict(/*min_blocks=*/1); + + ASSERT_EQ(evicted.evicted_keys, (CacheKeysType{3})); + EXPECT_TRUE(cache.contains(1)); + EXPECT_TRUE(cache.contains(2)); + EXPECT_FALSE(cache.contains(3)); +} + +TEST(SharedBlockCacheTest, ResidentIsStickyAcrossPuts) { + SharedBlockCache cache; + putOne(cache, 1, 101, rootDep(0), SharedBlockCache::kGpuLogicalNamespace, /*resident=*/false); + putOne(cache, 1, NULL_BLOCK_IDX, rootDep(0), SharedBlockCache::kGpuLogicalNamespace, /*resident=*/true); + putOne(cache, 1, NULL_BLOCK_IDX, rootDep(0), SharedBlockCache::kGpuLogicalNamespace, /*resident=*/false); + + auto evicted = cache.selectAndEvict(/*min_blocks=*/1); + + EXPECT_TRUE(evicted.evicted_keys.empty()); + EXPECT_TRUE(cache.contains(1)); +} + +TEST(SharedBlockCacheTest, ResidentIsStickyAcrossNamespaceAliases) { + SharedBlockCache cache; + putOne(cache, 1, 101, rootDep(0), 
SharedBlockCache::kGpuLogicalNamespace, /*resident=*/false); + putOne(cache, 1, NULL_BLOCK_IDX, rootDep(0), SharedBlockCache::kGpuCpCanonicalNamespace, /*resident=*/true); + + auto evicted = cache.selectAndEvict(/*min_blocks=*/1); + + EXPECT_TRUE(evicted.evicted_keys.empty()); + EXPECT_TRUE(cache.contains(1)); +} + +TEST(SharedBlockCacheTest, PrefixTreeEvictionReportsNamespace) { + SharedBlockCache cache; + putOne(cache, 1, 101, rootDep(0), SharedBlockCache::kGpuCpCanonicalNamespace); + + auto evicted = cache.selectAndEvict(/*min_blocks=*/1); + + ASSERT_EQ(evicted.evicted_keys, (CacheKeysType{1})); + ASSERT_TRUE(evicted.evicted_namespaces.count(1)); + EXPECT_EQ(evicted.evicted_namespaces.at(1), SharedBlockCache::kGpuCpCanonicalNamespace); +} + +TEST(SharedBlockCacheTest, PrefixTreeEvictionKeepsCanonicalDependencyWhenLogicalAliasUpdatesSameKey) { + SharedBlockCache cache; + putOne(cache, 8, 108, rootDep(0), SharedBlockCache::kGpuCpCanonicalNamespace); + putOne(cache, 8, NULL_BLOCK_IDX, childDep(7, 7), SharedBlockCache::kGpuLogicalNamespace); + + auto evicted = cache.selectAndEvict(/*min_blocks=*/1); + + ASSERT_EQ(evicted.evicted_keys, (CacheKeysType{8})); + ASSERT_TRUE(evicted.evicted_dependencies.count(8)); + EXPECT_FALSE(evicted.evicted_dependencies.at(8).has_parent); + EXPECT_EQ(evicted.evicted_dependencies.at(8).ordinal, 0u); + ASSERT_TRUE(evicted.evicted_namespaces.count(8)); + EXPECT_EQ(evicted.evicted_namespaces.at(8), SharedBlockCache::kGpuCpCanonicalNamespace); +} + +TEST(SharedBlockCacheTest, CanonicalAliasOwnsEvictionWhenLogicalAliasIsOlder) { + SharedBlockCache cache; + putOne(cache, 100, 1000, rootDep(0), SharedBlockCache::kGpuLogicalNamespace); + putOne(cache, 101, 1010, childDep(100, 1), SharedBlockCache::kGpuLogicalNamespace); + putOne(cache, 102, 1020, childDep(101, 2), SharedBlockCache::kGpuLogicalNamespace); + putOne(cache, 103, 1030, childDep(102, 3), SharedBlockCache::kGpuLogicalNamespace); + + putOne(cache, 101, NULL_BLOCK_IDX, rootDep(0), 
SharedBlockCache::kGpuCpCanonicalNamespace); + putOne(cache, 103, NULL_BLOCK_IDX, childDep(101, 1), SharedBlockCache::kGpuCpCanonicalNamespace); + + auto evicted = cache.selectAndEvict(/*min_blocks=*/1); + + ASSERT_EQ(evicted.evicted_keys, (CacheKeysType{101, 103})); + ASSERT_TRUE(evicted.evicted_dependencies.count(101)); + EXPECT_FALSE(evicted.evicted_dependencies.at(101).has_parent); + ASSERT_TRUE(evicted.evicted_dependencies.count(103)); + EXPECT_TRUE(evicted.evicted_dependencies.at(103).has_parent); + EXPECT_EQ(evicted.evicted_dependencies.at(103).parent_key, 101); + EXPECT_EQ(evicted.evicted_namespaces.at(101), SharedBlockCache::kGpuCpCanonicalNamespace); + EXPECT_EQ(evicted.evicted_namespaces.at(103), SharedBlockCache::kGpuCpCanonicalNamespace); + EXPECT_TRUE(cache.contains(100)); + EXPECT_TRUE(cache.contains(102)); +} + +TEST(SharedBlockCacheTest, FlatFallbackKeepsCanonicalDependencyWhenLogicalAliasUpdatesSameKey) { + SharedBlockCache cache; + cache.setPrefixTreeEnabled(false); + + putOne(cache, 8, 108, rootDep(0), SharedBlockCache::kGpuCpCanonicalNamespace); + putOne(cache, 8, NULL_BLOCK_IDX, childDep(7, 7), SharedBlockCache::kGpuLogicalNamespace); + + auto evicted = cache.selectAndEvict(/*min_blocks=*/1); + + ASSERT_EQ(evicted.evicted_keys, (CacheKeysType{8})); + ASSERT_TRUE(evicted.evicted_dependencies.count(8)); + EXPECT_FALSE(evicted.evicted_dependencies.at(8).has_parent); + EXPECT_EQ(evicted.evicted_dependencies.at(8).ordinal, 0u); + ASSERT_TRUE(evicted.evicted_namespaces.count(8)); + EXPECT_EQ(evicted.evicted_namespaces.at(8), SharedBlockCache::kGpuCpCanonicalNamespace); +} + +TEST(SharedBlockCacheTest, NonMatchableSlotStillEvictsButDoesNotMatchGroup) { + SharedBlockCache cache; + cache.put(1, + std::vector{101, 201}, + /*is_resident=*/false, + SharedBlockCache::kGpuLogicalNamespace, + rootDep(0), + std::vector{true, false}); + + EXPECT_EQ(cache.matchGroup(1, 0), 101); + EXPECT_TRUE(isNullBlockIdx(cache.matchGroup(1, 1))); + + auto evicted = 
cache.selectAndEvict(/*min_blocks=*/2); + ASSERT_EQ(evicted.evicted_keys, (CacheKeysType{1})); + ASSERT_EQ(evicted.evicted_slots.at(1), (std::vector{101, 201})); +} + +TEST(SharedBlockCacheTest, StateIndependentEvictionDropsDeepestNonLeafStateFirst) { + SharedBlockCache cache; + cache.setIndependentGroupEviction(/*enabled=*/true, {3}); + + cache.put(1, std::vector{101, NULL_BLOCK_IDX, NULL_BLOCK_IDX, 301}, false, + SharedBlockCache::kGpuLogicalNamespace, rootDep(0)); + cache.put(2, std::vector{102, NULL_BLOCK_IDX, NULL_BLOCK_IDX, 302}, false, + SharedBlockCache::kGpuLogicalNamespace, childDep(1, 1)); + cache.put(3, std::vector{103, NULL_BLOCK_IDX, NULL_BLOCK_IDX, 303}, false, + SharedBlockCache::kGpuLogicalNamespace, childDep(2, 2)); + + auto evicted = cache.selectAndEvictForGroup(/*group_id=*/3, /*min_blocks=*/1); + + ASSERT_EQ(evicted.evicted_keys, (CacheKeysType{2})); + ASSERT_EQ(evicted.evicted_slots.at(2), + (std::vector{NULL_BLOCK_IDX, NULL_BLOCK_IDX, NULL_BLOCK_IDX, 302})); + ASSERT_TRUE(evicted.evicted_independent_group.count(2)); + EXPECT_EQ(evicted.evicted_independent_group.at(2), 3); + EXPECT_EQ(cache.matchGroup(2, 0), 102); + EXPECT_TRUE(isNullBlockIdx(cache.matchGroup(2, 3))); + EXPECT_EQ(cache.matchGroup(3, 3), 303); +} + +TEST(SharedBlockCacheTest, StateIndependentEvictionScansMultipleLeavesSafely) { + SharedBlockCache cache; + cache.setIndependentGroupEviction(/*enabled=*/true, {3}); + + cache.put(1, std::vector{101, NULL_BLOCK_IDX, NULL_BLOCK_IDX, 301}, false, + SharedBlockCache::kGpuLogicalNamespace, rootDep(0)); + cache.put(2, std::vector{102, NULL_BLOCK_IDX, NULL_BLOCK_IDX, 302}, false, + SharedBlockCache::kGpuLogicalNamespace, childDep(1, 1)); + cache.put(3, std::vector{103, NULL_BLOCK_IDX, NULL_BLOCK_IDX, 303}, false, + SharedBlockCache::kGpuLogicalNamespace, childDep(2, 2)); + cache.put(10, std::vector{110, NULL_BLOCK_IDX, NULL_BLOCK_IDX, 310}, false, + SharedBlockCache::kGpuLogicalNamespace, rootDep(0)); + cache.put(11, std::vector{111, 
NULL_BLOCK_IDX, NULL_BLOCK_IDX, 311}, false, + SharedBlockCache::kGpuLogicalNamespace, childDep(10, 1)); + cache.put(12, std::vector{112, NULL_BLOCK_IDX, NULL_BLOCK_IDX, 312}, false, + SharedBlockCache::kGpuLogicalNamespace, childDep(11, 2)); + + auto evicted = cache.selectAndEvictForGroup(/*group_id=*/3, /*min_blocks=*/2); + + ASSERT_EQ(evicted.evicted_keys, (CacheKeysType{2, 11})); + EXPECT_TRUE(isNullBlockIdx(cache.matchGroup(2, 3))); + EXPECT_TRUE(isNullBlockIdx(cache.matchGroup(11, 3))); + EXPECT_EQ(cache.matchGroup(3, 3), 303); + EXPECT_EQ(cache.matchGroup(12, 3), 312); +} + +TEST(SharedBlockCacheTest, StateIndependentEvictionFallsBackToWholeChainWhenOnlyLeafStateRemains) { + SharedBlockCache cache; + cache.setIndependentGroupEviction(/*enabled=*/true, {3}); + + cache.put(1, std::vector{101, NULL_BLOCK_IDX, NULL_BLOCK_IDX, NULL_BLOCK_IDX}, false, + SharedBlockCache::kGpuLogicalNamespace, rootDep(0)); + cache.put(2, std::vector{102, NULL_BLOCK_IDX, NULL_BLOCK_IDX, 302}, false, + SharedBlockCache::kGpuLogicalNamespace, childDep(1, 1)); + + auto evicted = cache.selectAndEvictForGroup(/*group_id=*/3, /*min_blocks=*/1); + + ASSERT_EQ(evicted.evicted_keys, (CacheKeysType{1, 2})); + ASSERT_FALSE(evicted.evicted_independent_group.count(2)); + EXPECT_TRUE(cache.empty()); +} + +TEST(SharedBlockCacheTest, SelectAndEvictForGroupSkipsChainsWithoutTargetSlot) { + SharedBlockCache cache; + cache.setIndependentGroupEviction(/*enabled=*/true, {3}); + + cache.put(1, std::vector{101, NULL_BLOCK_IDX, NULL_BLOCK_IDX, NULL_BLOCK_IDX}, false, + SharedBlockCache::kGpuLogicalNamespace, rootDep(0)); + cache.put(2, std::vector{102, NULL_BLOCK_IDX, NULL_BLOCK_IDX, NULL_BLOCK_IDX}, false, + SharedBlockCache::kGpuLogicalNamespace, childDep(1, 1)); + cache.put(10, std::vector{110, NULL_BLOCK_IDX, NULL_BLOCK_IDX, NULL_BLOCK_IDX}, false, + SharedBlockCache::kGpuLogicalNamespace, rootDep(0)); + cache.put(11, std::vector{111, NULL_BLOCK_IDX, NULL_BLOCK_IDX, 311}, false, + 
SharedBlockCache::kGpuLogicalNamespace, childDep(10, 1)); + + auto evicted = cache.selectAndEvictForGroup(/*group_id=*/3, /*min_blocks=*/1); + + ASSERT_EQ(evicted.evicted_keys, (CacheKeysType{10, 11})); + EXPECT_FALSE(cache.contains(10)); + EXPECT_FALSE(cache.contains(11)); + EXPECT_TRUE(cache.contains(1)); + EXPECT_TRUE(cache.contains(2)); +} + +TEST(SharedBlockCacheTest, SelectAndEvictForGroupPrunesBranchUntilTargetAncestorIsEvictable) { + SharedBlockCache cache; + cache.put(1, + std::vector{101, 201}, + /*is_resident=*/false, + SharedBlockCache::kGpuLogicalNamespace, + rootDep(0)); + cache.put(2, + std::vector{102, NULL_BLOCK_IDX}, + /*is_resident=*/false, + SharedBlockCache::kGpuLogicalNamespace, + childDep(1, 1)); + cache.put(3, + std::vector{103, NULL_BLOCK_IDX}, + /*is_resident=*/false, + SharedBlockCache::kGpuLogicalNamespace, + childDep(1, 2)); + + auto evicted = cache.selectAndEvictForGroup(/*group_id=*/1, /*min_blocks=*/1); + + ASSERT_EQ(evicted.evicted_keys, (CacheKeysType{2, 1, 3})); + ASSERT_EQ(evicted.evicted_slots.at(1), (std::vector{101, 201})); + EXPECT_TRUE(isNullBlockIdx(evicted.evicted_slots.at(2)[1])); + EXPECT_TRUE(isNullBlockIdx(evicted.evicted_slots.at(3)[1])); + EXPECT_TRUE(cache.empty()); +} + +TEST(SharedBlockCacheTest, SelectAndEvictForGroupDoesNotPruneWhenTargetAncestorBlockedByResidentSibling) { + SharedBlockCache cache; + cache.put(1, + std::vector{101, 201}, + /*is_resident=*/false, + SharedBlockCache::kGpuLogicalNamespace, + rootDep(0)); + cache.put(2, + std::vector{102, NULL_BLOCK_IDX}, + /*is_resident=*/false, + SharedBlockCache::kGpuLogicalNamespace, + childDep(1, 1)); + cache.put(3, + std::vector{103, NULL_BLOCK_IDX}, + /*is_resident=*/true, + SharedBlockCache::kGpuLogicalNamespace, + childDep(1, 2)); + + auto evicted = cache.selectAndEvictForGroup(/*group_id=*/1, /*min_blocks=*/1); + + EXPECT_TRUE(evicted.evicted_keys.empty()); + EXPECT_TRUE(cache.contains(1)); + EXPECT_TRUE(cache.contains(2)); + 
EXPECT_TRUE(cache.contains(3)); +} + +TEST(SharedBlockCacheTest, SelectAndEvictForGroupDoesNotPruneWhenTargetAncestorBlockedByResidentDescendant) { + SharedBlockCache cache; + cache.put(1, + std::vector{101, 201}, + /*is_resident=*/false, + SharedBlockCache::kGpuLogicalNamespace, + rootDep(0)); + cache.put(2, + std::vector{102, NULL_BLOCK_IDX}, + /*is_resident=*/false, + SharedBlockCache::kGpuLogicalNamespace, + childDep(1, 1)); + cache.put(3, + std::vector{103, NULL_BLOCK_IDX}, + /*is_resident=*/false, + SharedBlockCache::kGpuLogicalNamespace, + childDep(1, 2)); + cache.put(4, + std::vector{104, NULL_BLOCK_IDX}, + /*is_resident=*/true, + SharedBlockCache::kGpuLogicalNamespace, + childDep(3, 3)); + + auto evicted = cache.selectAndEvictForGroup(/*group_id=*/1, /*min_blocks=*/1); + + EXPECT_TRUE(evicted.evicted_keys.empty()); + EXPECT_TRUE(cache.contains(1)); + EXPECT_TRUE(cache.contains(2)); + EXPECT_TRUE(cache.contains(3)); + EXPECT_TRUE(cache.contains(4)); +} + +} // namespace rtp_llm::test diff --git a/rtp_llm/cpp/cache/test/SingleTypeKVCacheAllocatorTest.cc b/rtp_llm/cpp/cache/test/SingleTypeKVCacheAllocatorTest.cc index d9693afbd1..fc6fcf338f 100644 --- a/rtp_llm/cpp/cache/test/SingleTypeKVCacheAllocatorTest.cc +++ b/rtp_llm/cpp/cache/test/SingleTypeKVCacheAllocatorTest.cc @@ -5,13 +5,15 @@ #include #include #include "rtp_llm/cpp/utils/Logger.h" -#include "rtp_llm/cpp/cache/SingleTypeKVCacheAllocator.h" +#include "rtp_llm/cpp/cache/allocator/SingleTypeKVCacheAllocator.h" #include "rtp_llm/cpp/cache/CacheConfig.h" -#include "rtp_llm/cpp/cache/CacheConfigCreator.h" +#include "rtp_llm/cpp/cache/config_creator/CacheConfigCreator.h" +#include "rtp_llm/cpp/cache/CPSlotMapper.h" #include "rtp_llm/models_py/bindings/core/ExecOps.h" #include "rtp_llm/cpp/cache/test/BlockPoolTestHelper.h" #include "rtp_llm/cpp/cache/test/CacheConfigTestUtils.h" #include "rtp_llm/cpp/cache/BatchKVCacheResource.h" +#include "rtp_llm/cpp/cache/SharedBlockCache.h" #include 
"rtp_llm/cpp/engine_base/stream/CompleteTokenIds.h" namespace rtp_llm { @@ -41,6 +43,7 @@ static rtp_llm::ModelConfig makeTestModelConfig(uint32_t num_layers) { m.attn_config.kv_lora_rank = 0; m.attn_config.rope_head_dim = 0; m.attn_config.head_num = 2; + setDefaultKvCacheSpec(m); return m; } @@ -94,8 +97,8 @@ BatchKVCacheResourcePtr createBatchKVCacheResource(int batch_size, int layer_num auto resource = std::make_shared(); resource->resetBatchSize(batch_size); for (int i = 0; i < batch_size; ++i) { - std::vector layer_to_group_id(layer_num, 0); - resource->initBatchGroups(i, 1, layer_num, layer_to_group_id); + std::vector> layer_group_ids(static_cast(layer_num), std::vector{0}); + resource->initBatchGroups(i, 1, layer_num, layer_group_ids); resource->setBatchBlocks(i, 0, std::vector(block_num_per_batch)); resource->setBatchCacheKeys(i, CacheKeysType(block_num_per_batch, static_cast(i * 100))); } @@ -220,6 +223,7 @@ TEST_F(SingleTypeKVCacheAllocatorTest, ReserveBlocksOnlyAppliedToInitMalloc) { TEST_F(SingleTypeKVCacheAllocatorTest, ReserveBlocksCheckHappensAfterReuseReferenceInInitMallocForCommonLen) { auto config = createSingleTypeTestConfig(/*layer_num=*/4, /*block_num=*/10, /*seq_size_per_block=*/4); allocator_ = std::make_shared(config); + allocator_->setSharedBlockCache(std::make_shared()); ASSERT_TRUE(allocator_->init()); allocator_->setReserveBlockNum(2); @@ -477,7 +481,7 @@ TEST_F(SingleTypeKVCacheAllocatorTest, LayerCacheBase) { auto layout = allocator_->allLayerCacheBase(); EXPECT_EQ(layout.layers_to_kv_buffer_ptrs.size(), config.layer_num); EXPECT_EQ(layout.layers_to_scale_buffer_ptrs.size(), config.layer_num); - EXPECT_EQ((std::vector(4, 0)), layout.layer_to_groups); + EXPECT_EQ((std::vector>(4, std::vector{0})), layout.layer_to_group_ids); for (size_t i = 0; i < layout.layers_to_kv_buffer_ptrs.size(); ++i) { EXPECT_TRUE(layout.layers_to_kv_buffer_ptrs[i].defined()); @@ -494,7 +498,7 @@ TEST_F(SingleTypeKVCacheAllocatorTest, BlockCopySingle) { int 
src_block = 0; int dst_block = 1; - auto& spec = config.cache_specs[0]; + auto& spec = config.specForGroup(0); size_t k_block_size = spec->k_block_size(); size_t v_block_size = spec->v_block_size(); @@ -550,7 +554,7 @@ TEST_F(SingleTypeKVCacheAllocatorTest, BlockBatchCopyVector) { copy_mapping.push_back({2, 3}); copy_mapping.push_back({4, 5}); - auto& spec = config.cache_specs[0]; + auto& spec = config.specForGroup(0); size_t k_block_size = spec->k_block_size(); size_t v_block_size = spec->v_block_size(); @@ -616,7 +620,7 @@ TEST_F(SingleTypeKVCacheAllocatorTest, BlockBatchCopyPointers) { BlockIdPair pairs[] = {{0, 1}, {2, 3}}; - auto& spec = config.cache_specs[0]; + auto& spec = config.specForGroup(0); size_t k_block_size = spec->k_block_size(); size_t v_block_size = spec->v_block_size(); @@ -665,7 +669,7 @@ TEST_F(SingleTypeKVCacheAllocatorTest, BlockBatchCopyBuffer) { std::vector data = {0, 1, 2, 3, 4, 5}; // 3 pairs: (0->1, 2->3, 4->5) auto tensor = torch::from_blob(data.data(), {3, 2}, torch::kInt32).clone(); - auto& spec = config.cache_specs[0]; + auto& spec = config.specForGroup(0); size_t k_block_size = spec->k_block_size(); size_t v_block_size = spec->v_block_size(); @@ -740,7 +744,7 @@ TEST_F(SingleTypeKVCacheAllocatorTest, IncrKVCacheRefReferencesMatchedBlocksOnly EXPECT_EQ(allocator_->freeBlocksNum(), total_free_before - 4); KVCacheResource resource; - resource.initGroups(1, config.layer_all_num, config.layer_to_group_id); + resource.initGroups(1, config.layer_all_num, config.layerGroupIdsSnapshot()); resource.cacheKeys() = CacheKeysType{100, 101, 102, 103}; resource.mutableBlockIds(0).assign(BlockIndicesType{blocks[0], blocks[1], 0, blocks[2]}); @@ -760,6 +764,38 @@ TEST_F(SingleTypeKVCacheAllocatorTest, IncrKVCacheRefReferencesMatchedBlocksOnly EXPECT_EQ(allocator_->freeBlocksNum(), total_free_before); } +TEST_F(SingleTypeKVCacheAllocatorTest, IncrKVCacheRefPreservesConnectorDummyTail) { + auto config = createSingleTypeTestConfig(/*layer_num=*/4, 
/*block_num=*/10, /*seq_size_per_block=*/8); + allocator_ = std::make_shared(config, AllocationType::HOST); + ASSERT_TRUE(allocator_->init()); + + auto block_pool = allocator_->getBlockPool(); + ASSERT_NE(block_pool, nullptr); + + const size_t total_free_before = allocator_->freeBlocksNum(); + auto blocks = block_pool->malloc(2); + ASSERT_EQ(blocks.size(), 2); + + KVCacheResource resource; + resource.initGroups(1, config.layer_all_num, config.layerGroupIdsSnapshot()); + resource.cacheKeys() = CacheKeysType{101, 103, 999}; + resource.rebuildLinearBlockDependencies(); + resource.setLastBlockAligned(false); + resource.mutableBlockIds(0).assign(BlockIndicesType{blocks[0], blocks[1]}); + + auto ref_resource = allocator_->incrKVCacheRef(resource, CacheKeysType{101, 103, 999}, /*is_connector=*/true); + ASSERT_NE(ref_resource, nullptr); + EXPECT_FALSE(ref_resource->lastBlockAligned()); + EXPECT_EQ(ref_resource->cacheKeys(), (CacheKeysType{101, 103, 999})); + EXPECT_EQ(ref_resource->blocks(0), (BlockIndicesType{blocks[0], blocks[1], NULL_BLOCK_IDX})); + + block_pool->requestFree(blocks); + EXPECT_EQ(allocator_->freeBlocksNum(), total_free_before - 2); + + ref_resource.reset(); + EXPECT_EQ(allocator_->freeBlocksNum(), total_free_before); +} + TEST_F(SingleTypeKVCacheAllocatorTest, IncrKVCacheRefEmptyInputNoEffect) { auto config = createSingleTypeTestConfig(/*layer_num=*/4, /*block_num=*/10, /*seq_size_per_block=*/8); allocator_ = std::make_shared(config, AllocationType::HOST); @@ -774,7 +810,7 @@ TEST_F(SingleTypeKVCacheAllocatorTest, IncrKVCacheRefEmptyInputNoEffect) { EXPECT_EQ(allocator_->freeBlocksNum(), total_free_before - 2); KVCacheResource resource; - resource.initGroups(1, config.layer_all_num, config.layer_to_group_id); + resource.initGroups(1, config.layer_all_num, config.layerGroupIdsSnapshot()); resource.cacheKeys() = CacheKeysType{100, 101}; resource.mutableBlockIds(0).assign(BlockIndicesType{blocks[0], blocks[1]}); @@ -801,6 +837,21 @@ 
TEST_F(SingleTypeKVCacheAllocatorTest, MaxSeqLen) { EXPECT_EQ(allocator_->maxAvailableTokensNum(), (10 - 1) * 8); // block_num * seq_size_per_block } +TEST_F(SingleTypeKVCacheAllocatorTest, CapacityAndNeedBlocksUseCPVirtualBlockSize) { + auto config = createSingleTypeTestConfig(/*layer_num=*/4, /*block_num=*/10, /*seq_size_per_block=*/8); + allocator_ = std::make_shared(config); + ASSERT_TRUE(allocator_->init()); + + allocator_->setCPSlotMapper( + std::make_shared(/*cp_rank=*/0, /*cp_size=*/2, /*block_size=*/8)); + + EXPECT_EQ(allocator_->maxAvailableTokensNum(), (10u - 1u) * 16u); + EXPECT_EQ(allocator_->availableTokensNum(), (10u - 1u) * 16u); + + auto batch_resource = createBatchKVCacheResource(/*batch_size=*/1, config.layer_num); + EXPECT_EQ(allocator_->singleBatchNeedBlocks(batch_resource, /*seq_len=*/65, /*reserve_step=*/0), 5); +} + // Test boundary conditions TEST_F(SingleTypeKVCacheAllocatorTest, MallocWithZeroSeqLength) { @@ -832,6 +883,7 @@ TEST_F(SingleTypeKVCacheAllocatorTest, FreeEmptyBatchResource) { TEST_F(SingleTypeKVCacheAllocatorTest, InitMallocRollbackWhenInitMallocForCommonLenFails) { auto config = createSingleTypeTestConfig(/*layer_num=*/4, /*block_num=*/6, /*seq_size_per_block=*/4); allocator_ = std::make_shared(config, AllocationType::HOST); + allocator_->setSharedBlockCache(std::make_shared()); ASSERT_TRUE(allocator_->init()); auto seed_resource = createBatchKVCacheResource(/*batch_size=*/1, config.layer_num); diff --git a/rtp_llm/cpp/cache/test/mock/MockKVCacheAllocator.h b/rtp_llm/cpp/cache/test/mock/MockKVCacheAllocator.h index 6b80aae4fa..1f86935664 100644 --- a/rtp_llm/cpp/cache/test/mock/MockKVCacheAllocator.h +++ b/rtp_llm/cpp/cache/test/mock/MockKVCacheAllocator.h @@ -2,7 +2,7 @@ #include -#include "rtp_llm/cpp/cache/KVCacheAllocator.h" +#include "rtp_llm/cpp/cache/allocator/KVCacheAllocator.h" namespace rtp_llm { diff --git a/rtp_llm/cpp/config/BUILD b/rtp_llm/cpp/config/BUILD index fcec06b5be..764c5c1d12 100644 --- 
a/rtp_llm/cpp/config/BUILD +++ b/rtp_llm/cpp/config/BUILD @@ -55,7 +55,8 @@ cc_library( ":config_modules", "//rtp_llm/cpp/model_utils:model_utils", "//rtp_llm/models_py/bindings/core:types", - "//rtp_llm/models_py/bindings/core:type_convert" + "//rtp_llm/models_py/bindings/core:type_convert", + "//rtp_llm/cpp/cache:kv_cache_spec_desc_types", ], visibility = ["//visibility:public"], copts = copts(), diff --git a/rtp_llm/cpp/config/ConfigModules.h b/rtp_llm/cpp/config/ConfigModules.h index 0f4bbf1deb..1a8c1bdce1 100644 --- a/rtp_llm/cpp/config/ConfigModules.h +++ b/rtp_llm/cpp/config/ConfigModules.h @@ -29,7 +29,9 @@ enum class CPRotateMethod { struct PrefillCPConfig { CPRotateMethod method = CPRotateMethod::DISABLED; size_t comm_buffer_size = 512 * 1024 * 1024; // 512MB - bool is_enabled() const { + bool kv_cache_sharded = false; + int64_t prefill_cp_size = 0; + bool is_enabled() const { return method != CPRotateMethod::DISABLED && method != CPRotateMethod::UNKNOWN && method != CPRotateMethod::PREFILL_CP; } @@ -69,6 +71,8 @@ struct ParallelismConfig { bool enable_sp = false; bool use_ub_comm = false; + RoleType role_type = RoleType::PDFUSION; + FfnDisAggregateConfig ffn_disaggregate_config; // FFN disaggregate configuration // Context Parallel configuration @@ -165,7 +169,10 @@ struct KVCacheConfig { bool enable_memory_cache_sm_copy = false; bool enable_remote_cache = false; bool write_cache_sync = false; - bool enable_tiered_memory_cache = false; + bool enable_tiered_memory_cache = false; + bool enable_gpu_prefix_tree = false; + bool enable_prefix_tree_memory_cache = false; + bool enable_independent_group_eviction = false; int64_t device_cache_min_free_blocks = 0; int load_cache_retry_times = 1; // Maximum retry attempts for load cache transfer failures @@ -538,7 +545,8 @@ enum class HybridAttentionType { }; struct HybridAttentionConfig { - bool enable_hybrid_attention = false; + bool enable_hybrid_attention = false; + bool enable_independent_kv_cache_pools = 
false; std::vector hybrid_attention_types; std::string to_string() const; }; diff --git a/rtp_llm/cpp/config/ModelConfig.h b/rtp_llm/cpp/config/ModelConfig.h index 15981932b2..5cb0b836cc 100644 --- a/rtp_llm/cpp/config/ModelConfig.h +++ b/rtp_llm/cpp/config/ModelConfig.h @@ -11,12 +11,15 @@ #include "rtp_llm/cpp/config/EplbConfig.h" #include "rtp_llm/cpp/config/ConfigModules.h" #include "rtp_llm/cpp/config/SpecialTokens.h" +#include "rtp_llm/cpp/cache/spec/KVCacheSpecDescTypes.h" #include #include #include namespace rtp_llm { +using LayerKVCacheSpecDescs = std::vector>; + enum TaskType { DENSE_EMBEDDING = 0, ALL_EMBEDDING = 1, @@ -122,6 +125,9 @@ class ModelConfig { // Multimodal model configuration MMModelConfig mm_model_config; + // Declarative per-model KV cache layout + LayerKVCacheSpecDescs kv_cache_spec_descs; + // Fields merged from PyModelConfig std::string extra_data_path = ""; std::string local_extra_data_path = ""; diff --git a/rtp_llm/cpp/distribute/BUILD b/rtp_llm/cpp/distribute/BUILD new file mode 100644 index 0000000000..5c338e77b5 --- /dev/null +++ b/rtp_llm/cpp/distribute/BUILD @@ -0,0 +1,24 @@ +load("//:def.bzl", "copts") + +package(default_visibility = ["//visibility:public"]) + +cc_library( + name = "rpc_cpu_tp_broadcaster_hdr", + hdrs = ["RpcCpuTpBroadcaster.h"], + deps = [ + "//rtp_llm/cpp/model_rpc:broadcast_manager", + "//rtp_llm/cpp/model_rpc/proto:model_rpc_service_cc_proto", + ], + copts = copts(), +) + +cc_library( + name = "rpc_cpu_tp_broadcaster", + srcs = ["RpcCpuTpBroadcaster.cc"], + hdrs = ["RpcCpuTpBroadcaster.h"], + deps = [ + ":rpc_cpu_tp_broadcaster_hdr", + "//rtp_llm/cpp/utils:core_utils", + ], + copts = copts(), +) diff --git a/rtp_llm/cpp/distribute/RpcCpuTpBroadcaster.cc b/rtp_llm/cpp/distribute/RpcCpuTpBroadcaster.cc new file mode 100644 index 0000000000..016b6acfaa --- /dev/null +++ b/rtp_llm/cpp/distribute/RpcCpuTpBroadcaster.cc @@ -0,0 +1,288 @@ +#include "rtp_llm/cpp/distribute/RpcCpuTpBroadcaster.h" + +#include 
"rtp_llm/cpp/utils/AssertUtils.h" +#include "rtp_llm/cpp/utils/Logger.h" + +#include +#include +#include + +namespace rtp_llm { + +namespace { + +constexpr int kDefaultTimeoutMs = 30000; + +int normalizeTimeoutMs(int timeout_ms) { + return timeout_ms > 0 ? timeout_ms : kDefaultTimeoutMs; +} + +} // namespace + +RpcCpuTpBroadcaster& RpcCpuTpBroadcaster::instance() { + static RpcCpuTpBroadcaster i; + return i; +} + +std::size_t RpcCpuTpBroadcaster::InboxKeyHash::operator()(const InboxKey& key) const { + std::size_t h = std::hash{}(key.group_key); + h ^= std::hash{}(key.seq) + 0x9e3779b97f4a7c15ULL + (h << 6) + (h >> 2); + h ^= std::hash{}(key.dst_tp_rank) + 0x9e3779b97f4a7c15ULL + (h << 6) + (h >> 2); + return h; +} + +std::string RpcCpuTpBroadcaster::makeGroupKey(int dp_rank, int tp_size, int world_size) const { + std::ostringstream oss; + oss << "tp_cpu_broadcast:dp=" << dp_rank << ":tp=" << tp_size << ":world=" << world_size; + return oss.str(); +} + +void RpcCpuTpBroadcaster::initialize(int tp_rank, + int tp_size, + int dp_rank, + int world_size, + const std::vector& worker_grpc_addrs, + int timeout_ms) { + std::lock_guard lock(mu_); + timeout_ms = normalizeTimeoutMs(timeout_ms); + + if (initialized_.load(std::memory_order_acquire)) { + const std::string new_group_key = makeGroupKey(dp_rank, tp_size, world_size); + RTP_LLM_CHECK_WITH_INFO(tp_rank_ == tp_rank && tp_size_ == tp_size && dp_rank_ == dp_rank + && world_size_ == world_size && group_key_ == new_group_key, + "RpcCpuTpBroadcaster re-init mismatch: was rank=%d size=%d dp=%d world=%d group=%s, " + "now rank=%d size=%d dp=%d world=%d group=%s", + tp_rank_, + tp_size_, + dp_rank_, + world_size_, + group_key_.c_str(), + tp_rank, + tp_size, + dp_rank, + world_size, + new_group_key.c_str()); + return; + } + + if (tp_size <= 1) { + tp_rank_ = tp_rank; + tp_size_ = tp_size; + dp_rank_ = dp_rank; + world_size_ = world_size; + timeout_ms_ = timeout_ms; + group_key_ = makeGroupKey(dp_rank, tp_size, world_size); + 
initialized_.store(true, std::memory_order_release); + return; + } + + RTP_LLM_CHECK_WITH_INFO(tp_rank >= 0 && tp_rank < tp_size, + "RpcCpuTpBroadcaster bad tp_rank=%d tp_size=%d", + tp_rank, + tp_size); + RTP_LLM_CHECK_WITH_INFO(static_cast(worker_grpc_addrs.size()) >= world_size, + "RpcCpuTpBroadcaster worker_grpc_addrs too small: addrs=%zu world_size=%d", + worker_grpc_addrs.size(), + world_size); + + tp_rank_ = tp_rank; + tp_size_ = tp_size; + dp_rank_ = dp_rank; + world_size_ = world_size; + timeout_ms_ = timeout_ms; + group_key_ = makeGroupKey(dp_rank, tp_size, world_size); + seq_.store(0, std::memory_order_release); + inbox_.clear(); + peer_addrs_.clear(); + peer_tp_ranks_.clear(); + broadcast_manager_.reset(); + + if (tp_rank_ == 0) { + peer_addrs_.reserve(tp_size - 1); + peer_tp_ranks_.reserve(tp_size - 1); + for (int peer_tp_rank = 1; peer_tp_rank < tp_size; ++peer_tp_rank) { + const int world_rank = dp_rank * tp_size + peer_tp_rank; + RTP_LLM_CHECK_WITH_INFO(world_rank >= 0 && world_rank < static_cast(worker_grpc_addrs.size()), + "RpcCpuTpBroadcaster bad peer world_rank=%d addrs=%zu", + world_rank, + worker_grpc_addrs.size()); + peer_addrs_.push_back(worker_grpc_addrs[world_rank]); + peer_tp_ranks_.push_back(peer_tp_rank); + } + broadcast_manager_ = std::make_shared(peer_addrs_); + RTP_LLM_CHECK_WITH_INFO(broadcast_manager_->init(), + "RpcCpuTpBroadcaster BroadcastManager init failed for %zu peer(s)", + peer_addrs_.size()); + } + + initialized_.store(true, std::memory_order_release); + cv_.notify_all(); + RTP_LLM_LOG_INFO("Initialized RpcCpuTpBroadcaster rank=%d tp_size=%d dp_rank=%d world_size=%d peers=%zu timeout_ms=%d", + tp_rank_, + tp_size_, + dp_rank_, + world_size_, + peer_addrs_.size(), + timeout_ms_); +} + +void RpcCpuTpBroadcaster::reset() { + { + std::lock_guard lock(mu_); + inbox_.clear(); + peer_addrs_.clear(); + peer_tp_ranks_.clear(); + broadcast_manager_.reset(); + tp_rank_ = 0; + tp_size_ = 1; + dp_rank_ = 0; + world_size_ = 1; + 
timeout_ms_ = kDefaultTimeoutMs; + group_key_.clear(); + seq_.store(0, std::memory_order_release); + initialized_.store(false, std::memory_order_release); + } + cv_.notify_all(); +} + +uint64_t RpcCpuTpBroadcaster::nextSeq() { + return seq_.fetch_add(1, std::memory_order_acq_rel); +} + +void RpcCpuTpBroadcaster::broadcast(void* buf, std::size_t nbytes, int root) { + RTP_LLM_CHECK_WITH_INFO(initialized_.load(std::memory_order_acquire), + "RpcCpuTpBroadcaster::broadcast called before initialize"); + if (tp_size_ <= 1 || nbytes == 0) { + return; + } + RTP_LLM_CHECK_WITH_INFO(root == 0, "RpcCpuTpBroadcaster supports only root=0; got %d", root); + + const uint64_t seq = nextSeq(); + if (tp_rank_ == 0) { + std::shared_ptr manager; + std::vector peer_tp_ranks; + std::string group_key; + int timeout_ms = kDefaultTimeoutMs; + { + std::lock_guard lock(mu_); + manager = broadcast_manager_; + peer_tp_ranks = peer_tp_ranks_; + group_key = group_key_; + timeout_ms = timeout_ms_; + } + RTP_LLM_CHECK_WITH_INFO(manager != nullptr, "RpcCpuTpBroadcaster root has no BroadcastManager"); + + std::vector requests; + requests.reserve(peer_tp_ranks.size()); + for (int peer_tp_rank : peer_tp_ranks) { + CpuTpBroadcastRequestPB request; + request.set_group_key(group_key); + request.set_seq(seq); + request.set_root(root); + request.set_src_tp_rank(tp_rank_); + request.set_dst_tp_rank(peer_tp_rank); + request.set_nbytes(static_cast(nbytes)); + request.set_payload(buf, nbytes); + requests.push_back(std::move(request)); + } + + auto rpc_call = [](std::shared_ptr& stub, + std::shared_ptr& ctx, + const CpuTpBroadcastRequestPB& request, + grpc::CompletionQueue* cq) { + return stub->AsyncCpuTpBroadcast(ctx.get(), request, cq); + }; + + auto result = manager->broadcast( + requests, timeout_ms, rpc_call); + RTP_LLM_CHECK_WITH_INFO(result != nullptr, + "RpcCpuTpBroadcaster broadcast setup failed seq=%lu nbytes=%zu", + seq, + nbytes); + RTP_LLM_CHECK_WITH_INFO(result->waitDone(timeout_ms), + 
"RpcCpuTpBroadcaster broadcast wait timeout seq=%lu timeout_ms=%d", + seq, + timeout_ms); + RTP_LLM_CHECK_WITH_INFO(result->success(), "RpcCpuTpBroadcaster broadcast RPC failed seq=%lu", seq); + for (const auto& response : result->responses()) { + RTP_LLM_CHECK_WITH_INFO(response.success(), + "RpcCpuTpBroadcaster peer rejected seq=%lu: %s", + seq, + response.error_message().c_str()); + } + return; + } + + InboxKey key; + std::string payload; + int timeout_ms = kDefaultTimeoutMs; + { + std::unique_lock lock(mu_); + key = InboxKey{group_key_, seq, tp_rank_}; + timeout_ms = timeout_ms_; + const bool ready = cv_.wait_for(lock, std::chrono::milliseconds(timeout_ms), [&] { + return !initialized_.load(std::memory_order_acquire) || inbox_.find(key) != inbox_.end(); + }); + RTP_LLM_CHECK_WITH_INFO(ready && initialized_.load(std::memory_order_acquire), + "RpcCpuTpBroadcaster receive timeout seq=%lu rank=%d timeout_ms=%d", + seq, + tp_rank_, + timeout_ms); + auto it = inbox_.find(key); + RTP_LLM_CHECK_WITH_INFO(it != inbox_.end(), "RpcCpuTpBroadcaster missing inbox payload seq=%lu", seq); + payload = std::move(it->second); + inbox_.erase(it); + } + + RTP_LLM_CHECK_WITH_INFO(payload.size() == nbytes, + "RpcCpuTpBroadcaster size mismatch seq=%lu rank=%d expected=%zu actual=%zu", + seq, + tp_rank_, + nbytes, + payload.size()); + std::memcpy(buf, payload.data(), nbytes); +} + +bool RpcCpuTpBroadcaster::handleBroadcastRequest(const CpuTpBroadcastRequestPB& request, + CpuTpBroadcastResponsePB* response) { + auto fail = [&](const std::string& message) { + response->set_success(false); + response->set_error_message(message); + RTP_LLM_LOG_WARNING("RpcCpuTpBroadcaster rejected request: %s", message.c_str()); + return false; + }; + + std::unique_lock lock(mu_); + if (!initialized_.load(std::memory_order_acquire)) { + cv_.wait_for(lock, std::chrono::milliseconds(kDefaultTimeoutMs), [&] { + return initialized_.load(std::memory_order_acquire); + }); + } + if 
(!initialized_.load(std::memory_order_acquire)) { + return fail("broadcaster is not initialized"); + } + if (request.group_key() != group_key_) { + return fail("group_key mismatch: got " + request.group_key() + ", expected " + group_key_); + } + if (request.root() != 0 || request.src_tp_rank() != 0) { + return fail("only root tp_rank 0 is supported"); + } + if (request.dst_tp_rank() != tp_rank_) { + return fail("dst_tp_rank mismatch"); + } + if (request.nbytes() != request.payload().size()) { + return fail("payload size mismatch"); + } + + InboxKey key{request.group_key(), request.seq(), request.dst_tp_rank()}; + if (inbox_.find(key) != inbox_.end()) { + return fail("duplicate payload"); + } + inbox_.emplace(std::move(key), request.payload()); + response->set_success(true); + response->clear_error_message(); + cv_.notify_all(); + return true; +} + +} // namespace rtp_llm diff --git a/rtp_llm/cpp/distribute/RpcCpuTpBroadcaster.h b/rtp_llm/cpp/distribute/RpcCpuTpBroadcaster.h new file mode 100644 index 0000000000..01537629d5 --- /dev/null +++ b/rtp_llm/cpp/distribute/RpcCpuTpBroadcaster.h @@ -0,0 +1,86 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "rtp_llm/cpp/model_rpc/BroadcastManager.h" +#include "rtp_llm/cpp/model_rpc/proto/model_rpc_service.pb.h" + +namespace rtp_llm { + +// Cross-node CPU TP broadcaster over RpcService. Root rank fanouts bytes to TP +// peers; non-root ranks wait on a local inbox filled by the gRPC server thread. +// The logical API intentionally matches CpuTpBroadcaster so execBroadcastCpu can +// choose this path without changing tpSyncModelInputs' packing/unpacking logic. 
+class RpcCpuTpBroadcaster { +public: + static RpcCpuTpBroadcaster& instance(); + + void initialize(int tp_rank, + int tp_size, + int dp_rank, + int world_size, + const std::vector& worker_grpc_addrs, + int timeout_ms); + + void reset(); + + bool isInitialized() const { + return initialized_.load(std::memory_order_acquire); + } + + void broadcast(void* buf, std::size_t nbytes, int root); + + bool handleBroadcastRequest(const CpuTpBroadcastRequestPB& request, CpuTpBroadcastResponsePB* response); + +private: + struct InboxKey { + std::string group_key; + uint64_t seq = 0; + int dst_tp_rank = 0; + + bool operator==(const InboxKey& other) const { + return group_key == other.group_key && seq == other.seq && dst_tp_rank == other.dst_tp_rank; + } + }; + + struct InboxKeyHash { + std::size_t operator()(const InboxKey& key) const; + }; + + RpcCpuTpBroadcaster() = default; + ~RpcCpuTpBroadcaster() = default; + RpcCpuTpBroadcaster(const RpcCpuTpBroadcaster&) = delete; + RpcCpuTpBroadcaster& operator=(const RpcCpuTpBroadcaster&) = delete; + + uint64_t nextSeq(); + std::string makeGroupKey(int dp_rank, int tp_size, int world_size) const; + +private: + mutable std::mutex mu_; + std::condition_variable cv_; + std::atomic initialized_{false}; + std::atomic seq_{0}; + + int tp_rank_ = 0; + int tp_size_ = 1; + int dp_rank_ = 0; + int world_size_ = 1; + int timeout_ms_ = 3000; + std::string group_key_; + + std::vector peer_addrs_; + std::vector peer_tp_ranks_; + std::shared_ptr broadcast_manager_; + + std::unordered_map inbox_; +}; + +} // namespace rtp_llm diff --git a/rtp_llm/cpp/engine_base/stream/StreamCacheResource.cc b/rtp_llm/cpp/engine_base/stream/StreamCacheResource.cc index 20a601552d..778a28bae8 100644 --- a/rtp_llm/cpp/engine_base/stream/StreamCacheResource.cc +++ b/rtp_llm/cpp/engine_base/stream/StreamCacheResource.cc @@ -196,18 +196,18 @@ static bool applyP2PSideChannelToStream(const std::shared_ptrresetBatchSize(batch_size); - int group_nums = 1; - int layer_all_num = 
0; - std::vector layer_to_group = {}; - std::vector group_types = {}; + int group_nums = 1; + int layer_all_num = 0; + std::vector> layer_to_group = {}; + std::vector group_types = {}; size_t kernel_blocks_per_kv_block = 1; if (resource_context_.cache_manager) { // cache manager is null when warmup const auto& cache_config = resource_context_.cache_manager->cacheConfig(); group_nums = cache_config.groupNums(); layer_all_num = static_cast(cache_config.layer_all_num); - layer_to_group = cache_config.layer_to_group_id; - group_types = cache_config.group_types; + layer_to_group = cache_config.layerGroupIdsSnapshot(); + group_types = cache_config.groupTypesSnapshot(); if (cache_config.kernel_seq_size_per_block > 0 && cache_config.seq_size_per_block > 0) { kernel_blocks_per_kv_block = cache_config.seq_size_per_block / cache_config.kernel_seq_size_per_block; } @@ -522,18 +522,18 @@ const CacheKeysType& StreamCacheResource::cacheKeys(int32_t batch_id) const { void StreamCacheResource::fakeInitKVBlock(size_t reserved_blocks) { fake_inited_ = true; batch_kv_cache_resource_->resetBatchSize(stream_->maxBatchSize()); - int group_nums = 1; - int layer_all_num = 0; - size_t kernel_blocks_per_kv_block = 1; - std::vector layer_to_group = {}; - std::vector group_types = {}; + int group_nums = 1; + int layer_all_num = 0; + size_t kernel_blocks_per_kv_block = 1; + std::vector> layer_to_group = {}; + std::vector group_types = {}; if (resource_context_.cache_manager) { const auto& cache_config = resource_context_.cache_manager->cacheConfig(); group_nums = cache_config.groupNums(); layer_all_num = static_cast(cache_config.layer_all_num); - layer_to_group = cache_config.layer_to_group_id; - group_types = cache_config.group_types; + layer_to_group = cache_config.layerGroupIdsSnapshot(); + group_types = cache_config.groupTypesSnapshot(); kernel_blocks_per_kv_block = cache_config.kernelBlocksPerKvBlock(); } batch_kv_cache_resource_->initGroups( @@ -705,7 +705,7 @@ void 
StreamCacheResource::swapLinearBlocks(int32_t batch_id, size_t rhs, size_t return; } - auto type_list = resource_context_.cache_manager->cacheConfig().group_types; + auto type_list = resource_context_.cache_manager->cacheConfig().groupTypesSnapshot(); for (size_t i = 0; i < type_list.size(); i++) { if (type_list[i] == CacheGroupType::LINEAR) { diff --git a/rtp_llm/cpp/engine_base/stream/test/PdSepKVCacheReleaseTest.cc b/rtp_llm/cpp/engine_base/stream/test/PdSepKVCacheReleaseTest.cc index 5468f396a7..f99e63afa9 100644 --- a/rtp_llm/cpp/engine_base/stream/test/PdSepKVCacheReleaseTest.cc +++ b/rtp_llm/cpp/engine_base/stream/test/PdSepKVCacheReleaseTest.cc @@ -5,23 +5,281 @@ #define protected public #include "rtp_llm/cpp/cache/KVCacheManager.h" #include "rtp_llm/cpp/cache/CacheConfig.h" +#include "rtp_llm/cpp/cache/config_creator/CacheConfigCreator.h" +#include "rtp_llm/cpp/cache/config_creator/HybridPoolConfigCreator.h" +#include "rtp_llm/cpp/cache/KVCacheTransferPlanner.h" #include "rtp_llm/cpp/cache/KVCacheResource.h" #include "rtp_llm/cpp/cache/test/CacheConfigTestUtils.h" +#include "rtp_llm/cpp/disaggregate/cache_store/RequestBlockBufferStore.h" #include "rtp_llm/cpp/engine_base/stream/GenerateStream.h" #include "rtp_llm/cpp/engine_base/stream/GenerateTypes.h" #include "rtp_llm/cpp/engine_base/stream/StreamCacheResource.h" +#include "rtp_llm/cpp/model_rpc/DecodeRpcServer.h" +#include "rtp_llm/cpp/model_rpc/PrefillGenerateContext.h" #include "rtp_llm/cpp/normal_engine/NormalGenerateStream.h" #include "rtp_llm/cpp/testing/TestBase.h" #include "rtp_llm/cpp/config/ConfigModules.h" #include "rtp_llm/cpp/config/RoleTypes.h" +#include "rtp_llm/models_py/bindings/common/WriteCacheStoreOp.h" +#include "rtp_llm/models_py/bindings/core/ExecOps.h" #include #include +#include #include +#include #include +#include namespace rtp_llm { +using test::setDsv4KvCacheSpecs; + +namespace { + +constexpr int kDsv4PoolNum = 7; +constexpr int kDsv4TokensPerBlock = 256; + +class 
DummyMemoryUtil: public MemoryUtil { +public: + bool regUserMr(void*, uint64_t, bool, uint64_t = 0) override { + return true; + } + bool deregUserMr(void*, bool) override { + return true; + } + bool isMemoryMr(void*, uint64_t, bool, bool) override { + return true; + } + bool findMemoryMr(void*, void*, uint64_t, bool, bool) override { + return true; + } + bool isRdmaMode() override { + return false; + } +}; + +class MemoryBackedCacheStore: public NormalCacheStore { +public: + MemoryBackedCacheStore() { + memory_util_ = std::make_shared(); + request_block_buffer_store_ = std::make_shared(memory_util_); + } + + void store(const std::shared_ptr& request_block_buffer, + CacheStoreStoreDoneCallback callback) override { + runtimeSyncAndCheck(); + for (const auto& [key, block] : request_block_buffer->getBlocks()) { + auto src_options = torch::TensorOptions(torch::kUInt8).device(block->gpu_mem ? torch::kCUDA : torch::kCPU); + auto src = torch::from_blob(block->addr.get(), {(int64_t)block->len}, src_options); + auto host = block->gpu_mem ? 
src.cpu().contiguous() : src.contiguous(); + std::vector bytes(static_cast(block->len)); + std::memcpy(bytes.data(), host.data_ptr(), bytes.size()); + stored_blocks_[key] = std::move(bytes); + } + store_request_keys_.push_back(request_block_buffer->getRequestKey()); + store_buffer_requests_.push_back(request_block_buffer); + callback(true, CacheStoreErrorCode::None); + } + + void load(const std::shared_ptr& request_block_buffer, + CacheStoreLoadDoneCallback callback, + const std::string&, + uint32_t, + uint32_t, + uint32_t = 1000, + int = 1, + int = 0) override { + bool ok = true; + for (const auto& [key, block] : request_block_buffer->getBlocks()) { + auto it = stored_blocks_.find(key); + if (it == stored_blocks_.end() || it->second.size() != block->len) { + ok = false; + continue; + } + auto host = torch::from_blob(const_cast(it->second.data()), + {(int64_t)it->second.size()}, + torch::TensorOptions(torch::kUInt8).device(torch::kCPU)) + .clone(); + auto dst_options = torch::TensorOptions(torch::kUInt8).device(block->gpu_mem ? torch::kCUDA : torch::kCPU); + auto dst = torch::from_blob(block->addr.get(), {(int64_t)block->len}, dst_options); + dst.copy_(host); + } + runtimeSyncAndCheck(); + load_request_keys_.push_back(request_block_buffer->getRequestKey()); + callback(ok, ok ? 
CacheStoreErrorCode::None : CacheStoreErrorCode::LoadErrorUnknown); + } + + std::shared_ptr + loadBuffers(const std::vector>& request_block_buffers, + const std::string& ip, + uint32_t port, + uint32_t rdma_port, + int64_t timeout_ms, + LoadContext::CheckCancelFunc check_cancel_func, + int partition_count, + int partition_id) override { + load_buffer_requests_.insert( + load_buffer_requests_.end(), request_block_buffers.begin(), request_block_buffers.end()); + auto context = std::make_shared(shared_from_this(), false); + context->load( + request_block_buffers, ip, port, rdma_port, timeout_ms, check_cancel_func, partition_count, partition_id); + return context; + } + + std::unordered_map> stored_blocks_; + std::vector store_request_keys_; + std::vector load_request_keys_; + std::vector> store_buffer_requests_; + std::vector> load_buffer_requests_; +}; + +class MinimalEngine: public EngineBase { +public: + MinimalEngine(const EngineInitParams& params, std::shared_ptr cache_manager): EngineBase(params) { + resource_context_.cache_manager = std::move(cache_manager); + } + + std::shared_ptr enqueue(const std::shared_ptr&) override { + return nullptr; + } + void enqueue(std::shared_ptr&) override {} + absl::Status stop() override { + return absl::OkStatus(); + } + absl::StatusOr preRun(const std::shared_ptr&, preRunMode) override { + return absl::UnimplementedError("unused in test"); + } + KVCacheInfo getCacheStatusInfo(int64_t, bool) override { + return KVCacheInfo(); + } +}; + +void fillDsv4RegionBytes(const std::shared_ptr& manager, + int block_id, + int layer_id, + int group_id, + uint8_t value) { + auto parts = manager->convertIndexToBuffer(block_id, layer_id, group_id); + ASSERT_EQ(parts.size(), 1u); + auto device = torch::from_blob( + parts[0].addr, {(int64_t)parts[0].size_bytes}, torch::TensorOptions(torch::kUInt8).device(torch::kCUDA)); + auto host = + torch::full({(int64_t)parts[0].size_bytes}, value, torch::TensorOptions(torch::kUInt8).device(torch::kCPU)); + 
device.copy_(host); +} + +void expectDsv4RegionBytes(const std::shared_ptr& manager, + int block_id, + int layer_id, + int group_id, + uint8_t value) { + auto parts = manager->convertIndexToBuffer(block_id, layer_id, group_id); + ASSERT_EQ(parts.size(), 1u); + auto device = torch::from_blob( + parts[0].addr, {(int64_t)parts[0].size_bytes}, torch::TensorOptions(torch::kUInt8).device(torch::kCUDA)); + auto host = device.cpu().contiguous(); + const auto* ptr = host.data_ptr(); + for (size_t i = 0; i < parts[0].size_bytes; ++i) { + ASSERT_EQ(ptr[i], value) << "byte=" << i << " layer=" << layer_id << " block=" << block_id + << " group=" << group_id; + } +} + +uint8_t dsv4PdPattern(int layer_id, int gid, size_t block_pos) { + return static_cast(17 + layer_id * 19 + gid * 11 + block_pos); +} + +void setGroupBlockNumsForTest(CacheConfig& config, uint32_t block_num) { + const auto group_num = static_cast(config.groupNums()); + std::vector block_nums(group_num, block_num); + std::vector kv_strides; + std::vector scale_strides; + kv_strides.reserve(group_num); + scale_strides.reserve(group_num); + for (size_t gid = 0; gid < group_num; ++gid) { + kv_strides.push_back(config.kvBlockStrideBytesForGroup(gid)); + scale_strides.push_back(config.kvScaleStrideBytesForGroup(gid)); + } + config.setGroupBlockLayout(block_nums, kv_strides, scale_strides); +} + +std::vector dsv4BlockPositionsForCacheTransfer(const CacheConfig& config, + int gid, + size_t block_num, + size_t reuse_block_size) { + const auto policy = config.policyForGroup(static_cast(gid)); + const size_t tail_block_count = + policy.active_tail_blocks > 0 ? 
static_cast(policy.active_tail_blocks) : 0; + return blockPositionsForCacheTransfer(block_num, + reuse_block_size, + true, + tail_block_count > 0, + tail_block_count, + /*hybrid_full_from_begin=*/true); +} + +size_t expectedDsv4StoredBlocks(const CacheConfig& config, int layer_num, int block_num, size_t reuse_block_size) { + size_t expected = 0; + const auto layer_group_ids = config.layerGroupIdsSnapshot(); + for (int layer_id = 0; layer_id < layer_num; ++layer_id) { + for (int gid : layer_group_ids[layer_id]) { + expected += dsv4BlockPositionsForCacheTransfer(config, gid, block_num, reuse_block_size).size(); + } + } + return expected; +} + +torch::Tensor groupTypesTensorForConfig(const CacheConfig& config) { + std::vector group_types; + for (auto group_type : config.groupTypesSnapshot()) { + group_types.push_back(static_cast(group_type)); + } + return torch::from_blob(group_types.data(), + {static_cast(group_types.size())}, + torch::TensorOptions(torch::kInt32)) + .clone(); +} + +torch::Tensor blockIdsTensor(const BatchKVCacheResourcePtr& resource, int gid) { + const auto& blocks = resource->blocks(0, gid); + return torch::from_blob(const_cast(blocks.data()), {1, static_cast(blocks.size())}, torch::kInt32) + .clone(); +} + +CacheStoreInputs makeSingleBlockWriteInputs(const std::string& cache_key_string, + int request_id_val, + int tokens_per_block, + int kv_stride, + int kv_scale_stride, + bool use_opaque_kv_cache_store, + int group_id, + const std::string& tag) { + CacheStoreInputs inputs; + inputs.input_lengths_host = torch::tensor({tokens_per_block}, torch::kInt32); + inputs.prefix_lengths_host = torch::tensor({0}, torch::kInt32); + inputs.host_kv_cache_offset = torch::tensor({{1}}, torch::kInt32); + inputs.context_batch_size = 1; + inputs.decoder_batch_size = 0; + inputs.request_id = torch::tensor({(int64_t)request_id_val}, torch::kInt64); + inputs.request_pd_separation = torch::tensor({true}, torch::kBool); + inputs.cache_keys = {cache_key_string}; + 
inputs.tokens_per_block = tokens_per_block; + inputs.kv_block_stride_bytes = kv_stride; + inputs.kv_scale_stride_bytes = kv_scale_stride; + inputs.pd_separation = true; + inputs.model_id = 0; + inputs.decode_entrance = false; + inputs.warmup = false; + inputs.use_opaque_kv_cache_store = use_opaque_kv_cache_store; + inputs.layer_id = 0; + inputs.group_id = group_id; + inputs.tag = tag; + return inputs; +} + +} // namespace + // ============================================================================= // Test fixture: PD sep KV cache release correctness // Validates that holdKVCacheForPDSep / releaseKVCacheForPDSep / releaseResource @@ -44,10 +302,56 @@ class PdSepKVCacheReleaseTest: public DeviceTestBase { rtp_llm::DataType::TYPE_INT8); } + CacheConfig makeDsv4Config(uint32_t block_num = 16, + uint32_t seq_size_per_block = kDsv4TokensPerBlock, + uint32_t kernel_seq_size_per_blk = kDsv4TokensPerBlock) { + ModelConfig mc; + mc.num_layers = 43; + mc.hidden_size = 4096; + mc.attn_config.head_num = 64; + mc.attn_config.kv_head_num = 1; + mc.attn_config.size_per_head = 512; + mc.attn_config.rope_head_dim = 64; + mc.attn_config.sliding_window = 128; + mc.attn_config.indexer_head_dim = 128; + mc.attn_config.indexer_head_num = 64; + mc.attn_config.indexer_topk = 512; + mc.attn_config.o_groups = 8; + mc.attn_config.o_lora_rank = 1024; + std::vector ratios = {0, 0}; + for (int i = 2; i < 43; ++i) { + ratios.push_back((i % 2 == 0) ? 4 : 128); + } + ratios.push_back(0); // MTP tail marker. 
+ mc.attn_config.layer_compress_ratios = ratios; + mc.hybrid_attention_config.enable_hybrid_attention = true; + mc.hybrid_attention_config.enable_independent_kv_cache_pools = true; + setDsv4KvCacheSpecs(mc); + + ParallelismConfig pc; + KVCacheConfig kv_config; + kv_config.seq_size_per_block = seq_size_per_block; + kv_config.kernel_seq_size_per_block = kernel_seq_size_per_blk; + auto config = CacheConfigCreator::createBasicConfig(mc, pc, kv_config, false, 0); + config.block_num = block_num; + setGroupBlockNumsForTest(config, block_num); + return config; + } + // Build a PREFILL stream with reuse_cache enabled void prepareStream(const std::vector& input_tokens) { - auto cache_config = makeConfig(); - cache_manager_ = std::make_shared(cache_config, /*warmup=*/false, nullptr); + prepareStreamWithConfig(input_tokens, makeConfig(), /*tokens_per_block=*/8, RoleType::PREFILL); + } + + void prepareDsv4Stream(const std::vector& input_tokens, RoleType role_type = RoleType::PREFILL) { + prepareStreamWithConfig(input_tokens, makeDsv4Config(), static_cast(kDsv4TokensPerBlock), role_type); + } + + void prepareStreamWithConfig(const std::vector& input_tokens, + const CacheConfig& cache_config, + int tokens_per_block, + RoleType role_type) { + cache_manager_ = std::make_shared(cache_config, /*warmup=*/false, nullptr); ASSERT_TRUE(cache_manager_->init()); initial_free_blocks_ = cache_manager_->freeBlocksNum(); @@ -55,7 +359,7 @@ class PdSepKVCacheReleaseTest: public DeviceTestBase { resource_context.cache_manager = cache_manager_; resource_context.reuse_cache = true; resource_context.enable_device_cache = true; - resource_context.role_type = RoleType::PREFILL; + resource_context.role_type = role_type; auto generate_input = std::make_shared(); auto generate_config = std::make_shared(); @@ -67,8 +371,8 @@ class PdSepKVCacheReleaseTest: public DeviceTestBase { generate_input->generate_config = generate_config; ModelConfig model_config; - model_config.attn_config.tokens_per_block = 8; - 
model_config.max_seq_len = 2048; + model_config.attn_config.tokens_per_block = tokens_per_block; + model_config.max_seq_len = std::max(2048, input_tokens.size() + tokens_per_block); RuntimeConfig runtime_config; stream_ = std::make_shared( @@ -328,4 +632,842 @@ TEST_F(PdSepKVCacheReleaseTest, testHoldWithoutReleasePDSep_ResourceReleasedStil << "Blocks should be freed once pd_kvcache_ref_ is dropped (minus device cache refs)"; } +TEST_F(PdSepKVCacheReleaseTest, testPrefillContextStopStream_ReleasesPDSepHold) { + prepareStream({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14}); + allocateAndFinish(); + + auto& resource = stream_->streamCacheResource(); + resource.holdKVCacheForPDSep(); + ASSERT_NE(resource.pd_kvcache_ref_, nullptr); + ASSERT_GT(cache_manager_->allocator_->connectorRefBlocksNum(), 0); + + RemoteServerResource remote_resource; + remote_resource.workers = {"local"}; + remote_resource.cache_store = std::make_shared(); + + GenerateInputPB request; + request.set_request_id(1001); + RPCContext rpc_context{&request, nullptr}; + grpc::ServerContext server_context; + kmonitor::MetricsReporterPtr metrics_reporter; + auto meta = std::make_shared(); + + { + PrefillGenerateContext prefill_context( + &remote_resource, rpc_context, /*timeout_ms=*/0, &server_context, metrics_reporter, meta); + prefill_context.setStream(stream_); + } + + EXPECT_EQ(resource.pd_kvcache_ref_, nullptr); + EXPECT_EQ(cache_manager_->allocator_->connectorRefBlocksNum(), 0); +} + +TEST_F(PdSepKVCacheReleaseTest, testDsv4PDSepPrefillReleaseInsertsSevenGroupDeviceCache) { + const int spb = static_cast(kDsv4TokensPerBlock); + std::vector tokens(3 * spb + 17); + std::iota(tokens.begin(), tokens.end(), 1); + + auto config = makeDsv4Config(); + config.linear_step = 4; + prepareStreamWithConfig(tokens, config, spb, RoleType::PREFILL); + allocateAndFinish(); + + auto& resource = stream_->streamCacheResource(); + ASSERT_EQ(resource.kvCache().groupNums(), kDsv4PoolNum); + 
ASSERT_GT(resource.curBlocksNum(), 0); + for (int gid = 0; gid < kDsv4PoolNum; ++gid) { + ASSERT_EQ(resource.kvCache().blocksNum(0, gid), 4) << "group " << gid; + const auto& blocks = resource.kvCache().blocks(0, gid); + if (config.typeForGroup(static_cast(gid)) == CacheGroupType::FULL) { + EXPECT_FALSE(isNullBlockIdx(blocks[0])) << "paged group " << gid; + } else { + const int active_tail_blocks = config.policyForGroup(static_cast(gid)).active_tail_blocks; + const int tail_begin = std::max(0, static_cast(blocks.size()) - active_tail_blocks); + for (int block_idx = 0; block_idx < static_cast(blocks.size()); ++block_idx) { + const bool expect_tail = block_idx >= tail_begin; + EXPECT_EQ(isNullBlockIdx(blocks[block_idx]), !expect_tail) + << "tail group " << gid << " block " << block_idx; + } + } + } + + resource.holdKVCacheForPDSep(); + ASSERT_NE(resource.pd_kvcache_ref_, nullptr); + + stream_->releaseResource(); + EXPECT_TRUE(resource.resource_released_); + resource.releaseKVCacheForPDSep(); + EXPECT_EQ(resource.pd_kvcache_ref_, nullptr); + + ResourceContext resource_context2; + resource_context2.cache_manager = cache_manager_; + resource_context2.reuse_cache = true; + resource_context2.enable_device_cache = true; + resource_context2.role_type = RoleType::PREFILL; + + auto generate_input2 = std::make_shared(); + auto generate_config2 = std::make_shared(); + generate_config2->num_return_sequences = 1; + generate_config2->reuse_cache = true; + generate_config2->enable_device_cache = true; + generate_input2->input_ids = torch::tensor(std::vector(tokens.begin(), tokens.end()), torch::kInt32); + generate_input2->generate_config = generate_config2; + + ModelConfig model_config; + model_config.attn_config.tokens_per_block = spb; + model_config.max_seq_len = 4096; + RuntimeConfig runtime_config; + + auto stream2 = std::make_shared( + generate_input2, model_config, runtime_config, resource_context2, nullptr); + stream2->generate_status_->status = StreamState::RUNNING; + + 
auto& resource2 = stream2->streamCacheResource(); + ASSERT_TRUE(resource2.initKVBlock().ok()); + EXPECT_GE(stream2->reuseLength(), spb) << "DSV4 prefill should reuse cached 7-group prefix blocks"; + EXPECT_EQ(resource2.kvCache().groupNums(), kDsv4PoolNum); + + stream2->generate_status_->status = StreamState::FINISHED; + stream2->fillSubGenerateStatus(StreamState::FINISHED); + stream2->releaseResource(); +} + +TEST_F(PdSepKVCacheReleaseTest, testDsv4DecodeFirstMallocBypassesLocalDeviceReuseInPDSep) { + const int spb = static_cast(kDsv4TokensPerBlock); + std::vector tokens(3 * spb + 17); + std::iota(tokens.begin(), tokens.end(), 1); + + prepareDsv4Stream(tokens, RoleType::PREFILL); + allocateAndFinish(); + auto& prefill_resource = stream_->streamCacheResource(); + prefill_resource.holdKVCacheForPDSep(); + stream_->releaseResource(); + prefill_resource.releaseKVCacheForPDSep(); + + ResourceContext decode_resource_context; + decode_resource_context.cache_manager = cache_manager_; + decode_resource_context.reuse_cache = true; + decode_resource_context.enable_device_cache = true; + decode_resource_context.role_type = RoleType::DECODE; + + auto decode_input = std::make_shared(); + auto decode_config = std::make_shared(); + decode_config->num_return_sequences = 1; + decode_config->reuse_cache = true; + decode_config->enable_device_cache = true; + decode_input->input_ids = torch::tensor(std::vector(tokens.begin(), tokens.end()), torch::kInt32); + decode_input->generate_config = decode_config; + + ModelConfig model_config; + model_config.attn_config.tokens_per_block = spb; + model_config.max_seq_len = 4096; + RuntimeConfig runtime_config; + + auto decode_stream = std::make_shared( + decode_input, model_config, runtime_config, decode_resource_context, nullptr); + decode_stream->generate_status_->status = StreamState::RUNNING; + + auto& decode_resource = decode_stream->streamCacheResource(); + ASSERT_TRUE(decode_resource.initKVBlock().ok()); + + 
EXPECT_EQ(decode_stream->reuseLength(), 0) + << "Hybrid DSV4 decode first malloc must not consume local device-cache reuse; PD load owns reuse."; + EXPECT_EQ(decode_resource.kvCache().groupNums(), kDsv4PoolNum); + for (int gid = 0; gid < kDsv4PoolNum; ++gid) { + EXPECT_EQ(decode_resource.kvCache().blocksNum(0, gid), 4) << "group " << gid; + } + + decode_stream->releaseResource(); +} + +TEST_F(PdSepKVCacheReleaseTest, testDsv4CacheStorePDSepTransfersAllLayerRegions) { + const int spb = static_cast(kDsv4TokensPerBlock); + const int block_num = 4; + const int64_t request_id = 9017; + const size_t model_id = 77; + + auto config = makeDsv4Config(/*block_num=*/24); + + auto makeResource = [&config]() { + auto resource = std::make_shared(); + resource->resetBatchSize(1); + resource->initGroups(config.groupNums(), + static_cast(config.layer_all_num), + config.layerGroupIdsSnapshot(), + config.kernelBlocksPerKvBlock(), + config.groupTypesSnapshot()); + return resource; + }; + auto makeCompleteTokens = [spb, block_num](int max_seq_len) { + auto input = std::make_shared(); + input->input_ids = torch::arange(max_seq_len, torch::kInt32); + input->generate_config = std::make_shared(); + auto complete_token_ids = std::make_shared(1, 1, max_seq_len + spb, spb); + complete_token_ids->init(input); + complete_token_ids->setSeqLength(block_num * spb); + return complete_token_ids; + }; + + auto prefill_manager = std::make_shared(config, /*warmup=*/false, nullptr); + auto decode_manager = std::make_shared(config, /*warmup=*/false, nullptr); + ASSERT_TRUE(prefill_manager->init()); + ASSERT_TRUE(decode_manager->init()); + + auto prefill_resource = makeResource(); + auto decode_resource = makeResource(); + ASSERT_TRUE( + prefill_manager->malloc({prefill_resource, makeCompleteTokens(block_num * spb), request_id, true, false, false}) + .success); + ASSERT_TRUE( + decode_manager->malloc({decode_resource, makeCompleteTokens(block_num * spb), request_id, true, false, false}) + .success); + + 
std::vector cache_keys; + std::vector cache_key_strings; + for (int i = 0; i < block_num; ++i) { + cache_keys.push_back(10000 + i); + cache_key_strings.push_back(std::to_string(cache_keys.back())); + } + + for (int layer_id = 0; layer_id < 4; ++layer_id) { + for (int gid : config.groupIdsForLayer(layer_id)) { + auto positions = dsv4BlockPositionsForCacheTransfer(config, gid, block_num, /*reuse_block_size=*/0); + for (auto block_pos : positions) { + auto prefill_block_id = prefill_resource->blocks(0, gid)[block_pos]; + auto decode_block_id = decode_resource->blocks(0, gid)[block_pos]; + ASSERT_FALSE(isNullBlockIdx(prefill_block_id)) << "prefill gid=" << gid << " pos=" << block_pos; + ASSERT_FALSE(isNullBlockIdx(decode_block_id)) << "decode gid=" << gid << " pos=" << block_pos; + fillDsv4RegionBytes( + prefill_manager, prefill_block_id, layer_id, gid, dsv4PdPattern(layer_id, gid, block_pos)); + fillDsv4RegionBytes(decode_manager, decode_block_id, layer_id, gid, 0xEE); + } + } + } + runtimeSyncAndCheck(); + + auto group_types_tensor = groupTypesTensorForConfig(config); + + auto cache_store = std::make_shared(); + auto layout = prefill_manager->getMainModelCacheLayerLayout(); + for (int layer_id = 0; layer_id < 4; ++layer_id) { + for (int gid : config.groupIdsForLayer(layer_id)) { + auto tag = config.tagForGroup(static_cast(gid)); + auto group_idx = static_cast(gid); + ASSERT_TRUE(layout.layers_to_kv_buffer_ptrs_by_group[layer_id][group_idx].defined()) + << "layer=" << layer_id << " region=" << group_idx; + + CacheStoreInputs inputs; + inputs.input_lengths_host = torch::tensor({block_num * spb}, torch::kInt32); + inputs.prefix_lengths_host = torch::tensor({0}, torch::kInt32); + inputs.host_kv_cache_offset = blockIdsTensor(prefill_resource, gid); + inputs.kv_cache_group_types_host = group_types_tensor; + inputs.context_batch_size = 1; + inputs.decoder_batch_size = 0; + inputs.request_id = torch::tensor({request_id}, torch::kInt64); + inputs.request_pd_separation = 
torch::tensor({true}, torch::kBool); + inputs.cache_keys = cache_key_strings; + inputs.tokens_per_block = spb; + inputs.kv_block_stride_bytes = config.kvBlockStrideBytesForGroup(static_cast(gid)); + inputs.kv_scale_stride_bytes = 0; + inputs.pd_separation = true; + inputs.model_id = model_id; + inputs.decode_entrance = false; + inputs.warmup = false; + inputs.use_opaque_kv_cache_store = config.use_opaque_kv_cache_store; + inputs.layer_id = layer_id; + inputs.group_id = gid; + inputs.tag = tag; + + KvCacheInfo kv_cache_info; + kv_cache_info.kv_cache_buffer = layout.layers_to_kv_buffer_ptrs_by_group[layer_id][group_idx]; + runtimeWriteCacheStore(inputs, kv_cache_info, /*mla_kvcache=*/false, cache_store); + } + } + ASSERT_EQ(cache_store->store_request_keys_.size(), 10u); + ASSERT_EQ(cache_store->stored_blocks_.size(), + expectedDsv4StoredBlocks(config, /*layer_num=*/4, block_num, /*reuse_block_size=*/0)); + + EngineInitParams params; + params.model_id = model_id; + params.model_config_.num_layers = 4; + params.parallelism_config = ParallelismConfig(); + + DecodeRpcServer server; + server.engine_ = std::make_shared(params, decode_manager); + server.maga_init_params_ = params; + server.propose_maga_init_params_ = nullptr; + server.resource_.cache_store = cache_store; + + std::vector peer_addrs = {"127.0.0.1:12345:12346"}; + grpc::ServerContext server_context; + DecodeRpcServer::LoadKVCacheContext load_context(request_id, + "dsv4-cache-store-pd", + peer_addrs, + cache_keys, + decode_resource->groupBlocks(), + /*reuse_block_size=*/0, + /*timeout_ms=*/5000, + /*partition_count=*/1, + /*partition_id=*/0, + &server_context); + auto status = server.loadCache(load_context); + ASSERT_TRUE(status.ok()) << status.ToString(); + + EXPECT_EQ(cache_store->load_buffer_requests_.size(), 10u); + EXPECT_EQ(cache_store->load_request_keys_.size(), 10u); + for (int layer_id = 0; layer_id < 4; ++layer_id) { + for (int gid : config.groupIdsForLayer(layer_id)) { + auto positions = 
dsv4BlockPositionsForCacheTransfer(config, gid, block_num, /*reuse_block_size=*/0); + for (auto block_pos : positions) { + auto decode_block_id = decode_resource->blocks(0, gid)[block_pos]; + ASSERT_FALSE(isNullBlockIdx(decode_block_id)); + expectDsv4RegionBytes( + decode_manager, decode_block_id, layer_id, gid, dsv4PdPattern(layer_id, gid, block_pos)); + } + } + } +} + +TEST_F(PdSepKVCacheReleaseTest, testDsv4DecoupledCacheStoreTransfersPhysicalBlocks) { + const int spb = 8192; + const int kernel_spb = 128; + const int block_num = 2; + const int64_t request_id = 9020; + const size_t model_id = 80; + + auto config = makeDsv4Config(/*block_num=*/8, spb, kernel_spb); + + auto makeResource = [&config]() { + auto resource = std::make_shared(); + resource->resetBatchSize(1); + resource->initGroups(config.groupNums(), + static_cast(config.layer_all_num), + config.layerGroupIdsSnapshot(), + config.kernelBlocksPerKvBlock(), + config.groupTypesSnapshot()); + return resource; + }; + auto makeCompleteTokens = [spb, block_num](int max_seq_len) { + auto input = std::make_shared(); + input->input_ids = torch::arange(max_seq_len, torch::kInt32); + input->generate_config = std::make_shared(); + auto complete_token_ids = std::make_shared(1, 1, max_seq_len + spb, spb); + complete_token_ids->init(input); + complete_token_ids->setSeqLength(block_num * spb); + return complete_token_ids; + }; + + auto prefill_manager = std::make_shared(config, /*warmup=*/false, nullptr); + auto decode_manager = std::make_shared(config, /*warmup=*/false, nullptr); + ASSERT_TRUE(prefill_manager->init()); + ASSERT_TRUE(decode_manager->init()); + + auto prefill_resource = makeResource(); + auto decode_resource = makeResource(); + ASSERT_TRUE( + prefill_manager->malloc({prefill_resource, makeCompleteTokens(block_num * spb), request_id, true, false, false}) + .success); + ASSERT_TRUE( + decode_manager->malloc({decode_resource, makeCompleteTokens(block_num * spb), request_id, true, false, false}) + .success); 
+ + std::vector cache_keys; + std::vector cache_key_strings; + for (int i = 0; i < block_num; ++i) { + cache_keys.push_back(20000 + i); + cache_key_strings.push_back(std::to_string(cache_keys.back())); + } + + for (int layer_id = 0; layer_id < 4; ++layer_id) { + for (int gid : config.groupIdsForLayer(layer_id)) { + auto positions = dsv4BlockPositionsForCacheTransfer(config, gid, block_num, /*reuse_block_size=*/0); + for (auto block_pos : positions) { + auto prefill_block_id = prefill_resource->blocks(0, gid)[block_pos]; + auto decode_block_id = decode_resource->blocks(0, gid)[block_pos]; + ASSERT_FALSE(isNullBlockIdx(prefill_block_id)) << "prefill gid=" << gid << " pos=" << block_pos; + ASSERT_FALSE(isNullBlockIdx(decode_block_id)) << "decode gid=" << gid << " pos=" << block_pos; + fillDsv4RegionBytes( + prefill_manager, prefill_block_id, layer_id, gid, dsv4PdPattern(layer_id, gid, block_pos)); + fillDsv4RegionBytes(decode_manager, decode_block_id, layer_id, gid, 0xEE); + } + } + } + runtimeSyncAndCheck(); + + auto group_types_tensor = groupTypesTensorForConfig(config); + + auto cache_store = std::make_shared(); + auto layout = prefill_manager->getMainModelCacheLayerLayout(); + for (int layer_id = 0; layer_id < 4; ++layer_id) { + for (int gid : config.groupIdsForLayer(layer_id)) { + auto tag = config.tagForGroup(static_cast(gid)); + auto group_idx = static_cast(gid); + ASSERT_TRUE(layout.layers_to_kv_buffer_ptrs_by_group[layer_id][group_idx].defined()) + << "layer=" << layer_id << " group=" << group_idx; + + torch_ext::PyCacheStoreInputs inputs; + inputs.context_batch_size = 1; + inputs.decoder_batch_size = 0; + inputs.request_id = torch::tensor({request_id}, torch::kInt64); + inputs.request_pd_separation = torch::tensor({true}, torch::kBool); + inputs.kv_cache_group_types = group_types_tensor; + inputs.cache_keys = cache_key_strings; + inputs.input_lengths_host = torch::tensor({block_num * spb}, torch::kInt32); + inputs.prefix_lengths_host = torch::tensor({0}, 
torch::kInt32); + inputs.tokens_per_block = spb; + inputs.kv_block_stride_bytes = config.kv_block_stride_bytes; + inputs.kv_scale_stride_bytes = 0; + inputs.pd_separation = true; + inputs.model_id = model_id; + inputs.decode_entrance = false; + inputs.warmup = false; + inputs.use_opaque_kv_cache_store = config.use_opaque_kv_cache_store; + inputs.mla_kvcache = false; + inputs.cache_store = cache_store; + + torch_ext::LayerKVCache layer_cache; + layer_cache.kv_cache_base = layout.layers_to_kv_buffer_ptrs_by_group[layer_id][group_idx]; + layer_cache.seq_size_per_block = config.typeForGroup(static_cast(gid)) == CacheGroupType::FULL ? kernel_spb : spb; + layer_cache.layer_id = layer_id; + layer_cache.group_id = gid; + layer_cache.tag = tag; + + WriteCacheStoreOp(inputs.input_lengths_host, + inputs.prefix_lengths_host, + blockIdsTensor(prefill_resource, gid), + inputs, + layer_cache); + } + } + + const auto first_csa_key = "kv_" + makeCacheKey(model_id, cache_key_strings[0], /*layer_id=*/2, "csa_kv"); + ASSERT_NE(cache_store->stored_blocks_.find(first_csa_key), cache_store->stored_blocks_.end()); + EXPECT_EQ(cache_store->stored_blocks_[first_csa_key].size(), + config.kvBlockStrideBytesForGroup(static_cast(config.groupIdForTag("csa_kv")))); + + EngineInitParams params; + params.model_id = model_id; + params.model_config_.num_layers = 4; + params.parallelism_config = ParallelismConfig(); + + DecodeRpcServer server; + server.engine_ = std::make_shared(params, decode_manager); + server.maga_init_params_ = params; + server.propose_maga_init_params_ = nullptr; + server.resource_.cache_store = cache_store; + + std::vector peer_addrs = {"127.0.0.1:12345:12346"}; + grpc::ServerContext server_context; + DecodeRpcServer::LoadKVCacheContext load_context(request_id, + "dsv4-decoupled-cache-store-pd", + peer_addrs, + cache_keys, + decode_resource->groupBlocks(), + /*reuse_block_size=*/0, + /*timeout_ms=*/5000, + /*partition_count=*/1, + /*partition_id=*/0, + &server_context); + auto 
status = server.loadCache(load_context); + ASSERT_TRUE(status.ok()) << status.ToString(); + + for (int layer_id = 0; layer_id < 4; ++layer_id) { + for (int gid : config.groupIdsForLayer(layer_id)) { + auto positions = dsv4BlockPositionsForCacheTransfer(config, gid, block_num, /*reuse_block_size=*/0); + for (auto block_pos : positions) { + auto decode_block_id = decode_resource->blocks(0, gid)[block_pos]; + ASSERT_FALSE(isNullBlockIdx(decode_block_id)); + expectDsv4RegionBytes( + decode_manager, decode_block_id, layer_id, gid, dsv4PdPattern(layer_id, gid, block_pos)); + } + } + } +} + +TEST_F(PdSepKVCacheReleaseTest, testDsv4CacheStorePDSepTransfersAllLayerRegionsWithPrefixReuse) { + const int spb = static_cast(kDsv4TokensPerBlock); + const int block_num = 4; + const int reuse_num = 1; + const int64_t request_id = 9018; + const size_t model_id = 78; + + auto config = makeDsv4Config(/*block_num=*/24); + + auto makeResource = [&config]() { + auto resource = std::make_shared(); + resource->resetBatchSize(1); + resource->initGroups(config.groupNums(), + static_cast(config.layer_all_num), + config.layerGroupIdsSnapshot(), + config.kernelBlocksPerKvBlock(), + config.groupTypesSnapshot()); + return resource; + }; + auto makeCompleteTokens = [spb, block_num](int max_seq_len) { + auto input = std::make_shared(); + input->input_ids = torch::arange(max_seq_len, torch::kInt32); + input->generate_config = std::make_shared(); + auto complete_token_ids = std::make_shared(1, 1, max_seq_len + spb, spb); + complete_token_ids->init(input); + complete_token_ids->setSeqLength(block_num * spb); + return complete_token_ids; + }; + + auto prefill_manager = std::make_shared(config, /*warmup=*/false, nullptr); + auto decode_manager = std::make_shared(config, /*warmup=*/false, nullptr); + ASSERT_TRUE(prefill_manager->init()); + ASSERT_TRUE(decode_manager->init()); + + auto prefill_resource = makeResource(); + auto decode_resource = makeResource(); + ASSERT_TRUE( + 
prefill_manager->malloc({prefill_resource, makeCompleteTokens(block_num * spb), request_id, true, false, false}) + .success); + ASSERT_TRUE( + decode_manager->malloc({decode_resource, makeCompleteTokens(block_num * spb), request_id, true, false, false}) + .success); + + std::vector cache_keys; + std::vector cache_key_strings; + for (int i = 0; i < block_num; ++i) { + cache_keys.push_back(11000 + i); + cache_key_strings.push_back(std::to_string(cache_keys.back())); + } + + for (int layer_id = 0; layer_id < 4; ++layer_id) { + for (int gid : config.groupIdsForLayer(layer_id)) { + auto positions = dsv4BlockPositionsForCacheTransfer(config, gid, block_num, reuse_num); + for (auto block_pos : positions) { + auto prefill_block_id = prefill_resource->blocks(0, gid)[block_pos]; + auto decode_block_id = decode_resource->blocks(0, gid)[block_pos]; + ASSERT_FALSE(isNullBlockIdx(prefill_block_id)) << "prefill gid=" << gid << " pos=" << block_pos; + ASSERT_FALSE(isNullBlockIdx(decode_block_id)) << "decode gid=" << gid << " pos=" << block_pos; + fillDsv4RegionBytes( + prefill_manager, prefill_block_id, layer_id, gid, dsv4PdPattern(layer_id, gid, block_pos)); + fillDsv4RegionBytes(decode_manager, decode_block_id, layer_id, gid, 0xEE); + } + } + } + runtimeSyncAndCheck(); + + auto group_types_tensor = groupTypesTensorForConfig(config); + + auto cache_store = std::make_shared(); + auto layout = prefill_manager->getMainModelCacheLayerLayout(); + for (int layer_id = 0; layer_id < 4; ++layer_id) { + for (int gid : config.groupIdsForLayer(layer_id)) { + auto tag = config.tagForGroup(static_cast(gid)); + auto group_idx = static_cast(gid); + ASSERT_TRUE(layout.layers_to_kv_buffer_ptrs_by_group[layer_id][group_idx].defined()) + << "layer=" << layer_id << " group=" << group_idx; + + CacheStoreInputs inputs; + inputs.input_lengths_host = torch::tensor({(block_num - reuse_num) * spb}, torch::kInt32); + inputs.prefix_lengths_host = torch::tensor({reuse_num * spb}, torch::kInt32); + 
inputs.host_kv_cache_offset = blockIdsTensor(prefill_resource, gid); + inputs.kv_cache_group_types_host = group_types_tensor; + inputs.context_batch_size = 1; + inputs.decoder_batch_size = 0; + inputs.request_id = torch::tensor({request_id}, torch::kInt64); + inputs.request_pd_separation = torch::tensor({true}, torch::kBool); + inputs.cache_keys = cache_key_strings; + inputs.tokens_per_block = spb; + inputs.kv_block_stride_bytes = config.kvBlockStrideBytesForGroup(static_cast(gid)); + inputs.kv_scale_stride_bytes = 0; + inputs.pd_separation = true; + inputs.model_id = model_id; + inputs.decode_entrance = false; + inputs.warmup = false; + inputs.use_opaque_kv_cache_store = config.use_opaque_kv_cache_store; + inputs.layer_id = layer_id; + inputs.group_id = gid; + inputs.tag = tag; + + KvCacheInfo kv_cache_info; + kv_cache_info.kv_cache_buffer = layout.layers_to_kv_buffer_ptrs_by_group[layer_id][group_idx]; + runtimeWriteCacheStore(inputs, kv_cache_info, /*mla_kvcache=*/false, cache_store); + } + } + ASSERT_EQ(cache_store->store_request_keys_.size(), 10u); + ASSERT_EQ(cache_store->stored_blocks_.size(), + expectedDsv4StoredBlocks(config, /*layer_num=*/4, block_num, reuse_num)); + + EngineInitParams params; + params.model_id = model_id; + params.model_config_.num_layers = 4; + params.parallelism_config = ParallelismConfig(); + + DecodeRpcServer server; + server.engine_ = std::make_shared(params, decode_manager); + server.maga_init_params_ = params; + server.propose_maga_init_params_ = nullptr; + server.resource_.cache_store = cache_store; + + std::vector peer_addrs = {"127.0.0.1:12345:12346"}; + grpc::ServerContext server_context; + DecodeRpcServer::LoadKVCacheContext load_context(request_id, + "dsv4-cache-store-pd-prefix-reuse", + peer_addrs, + cache_keys, + decode_resource->groupBlocks(), + reuse_num, + /*timeout_ms=*/5000, + /*partition_count=*/1, + /*partition_id=*/0, + &server_context); + auto status = server.loadCache(load_context); + ASSERT_TRUE(status.ok()) << 
status.ToString(); + + EXPECT_EQ(cache_store->load_buffer_requests_.size(), 10u); + EXPECT_EQ(cache_store->load_request_keys_.size(), 10u); + for (int layer_id = 0; layer_id < 4; ++layer_id) { + for (int gid : config.groupIdsForLayer(layer_id)) { + auto positions = dsv4BlockPositionsForCacheTransfer(config, gid, block_num, reuse_num); + for (auto block_pos : positions) { + auto decode_block_id = decode_resource->blocks(0, gid)[block_pos]; + ASSERT_FALSE(isNullBlockIdx(decode_block_id)); + expectDsv4RegionBytes( + decode_manager, decode_block_id, layer_id, gid, dsv4PdPattern(layer_id, gid, block_pos)); + } + } + } +} + +// ============================================================================= +// Test: runtimeWriteCacheStore with pinned-host metadata + event sync +// Verifies that when metadata tensors (input_lengths, prefix_lengths) are +// prepared on pinned host via async D2H and a pre_created_event is attached, +// runtimeWriteCacheStore waits for the event and reads metadata correctly — +// the same path used by the optimized WriteCacheStoreOp that avoids +// synchronous .cpu() calls on background threads. +// ============================================================================= +TEST_F(PdSepKVCacheReleaseTest, testWriteCacheStoreWithPinnedHostMetadataAndEvent) { + auto config = makeConfig(); // 3 layers, 16 blocks, 8 tokens/block, INT8 + auto manager = std::make_shared(config, /*warmup=*/false, nullptr); + ASSERT_TRUE(manager->init()); + + const int spb = 8; + const int block_num = 2; + const int input_length = block_num * spb; + const int request_id_val = 42; + + // Allocate KV blocks. 
+ auto resource = std::make_shared(); + resource->resetBatchSize(1); + resource->initGroups(config.groupNums(), + static_cast(config.layer_all_num), + config.layerGroupIdsSnapshot(), + config.kernelBlocksPerKvBlock(), + config.groupTypesSnapshot()); + + auto input = std::make_shared(); + input->input_ids = torch::arange(input_length, torch::kInt32); + input->generate_config = std::make_shared(); + auto complete_token_ids = std::make_shared(1, 1, input_length + spb, spb); + complete_token_ids->init(input); + complete_token_ids->setSeqLength(input_length); + + auto result = manager->malloc({resource, complete_token_ids, request_id_val, true, false, false}); + ASSERT_TRUE(result.success); + + // Fill KV cache blocks with a known pattern so MemoryBackedCacheStore can + // verify the transfer. + auto layout = manager->getMainModelCacheLayerLayout(); + for (int layer_id = 0; layer_id < 3; ++layer_id) { + auto buf = layout.layers_to_kv_buffer_ptrs[layer_id]; + ASSERT_TRUE(buf.defined()); + for (int b = 0; b < block_num; ++b) { + auto bid = resource->blocks(0, 0)[b]; + auto kv_stride = config.kv_block_stride_bytes; + ASSERT_FALSE(isNullBlockIdx(bid)); + auto device_slice = torch::from_blob((uint8_t*)buf.data_ptr() + bid * kv_stride, + {(int64_t)kv_stride}, + torch::TensorOptions(torch::kUInt8).device(torch::kCUDA)); + device_slice.fill_(static_cast(layer_id * 10 + b)); + } + } + runtimeSyncAndCheck(); + + // Prepare cache key strings (one per block). + std::vector cache_key_strings; + for (int i = 0; i < block_num; ++i) { + cache_key_strings.push_back(std::to_string(10000 + i)); + } + + // --- Core of the test: async D2H to pinned host, then event --- + // Create device tensors (mimicking what buildPyAttentionInputs produces). + auto input_lengths_device = torch::tensor({input_length}, torch::kInt32).cuda(); + auto prefix_lengths_device = torch::tensor({0}, torch::kInt32).cuda(); + + // Async-copy to pinned host (mimicking prepareWriteCacheParams). 
+ auto pinned_i32 = torch::TensorOptions(torch::kInt32).pinned_memory(true); + auto input_lengths_host = torch::empty({1}, pinned_i32); + auto prefix_lengths_host = torch::empty({1}, pinned_i32); + input_lengths_host.copy_(input_lengths_device, /*non_blocking=*/true); + prefix_lengths_host.copy_(prefix_lengths_device, /*non_blocking=*/true); + + // Record event AFTER async D2H on the current stream. + auto event = runtimeCreateEvent(); + + // --- Call runtimeWriteCacheStore (event->synchronize() inside) --- + auto cache_store = std::make_shared(); + auto block_ids = torch::from_blob(const_cast(resource->blocks(0, 0).data()), + {1, (int64_t)resource->blocks(0, 0).size()}, + torch::kInt32) + .clone(); + + for (int layer_id = 0; layer_id < 3; ++layer_id) { + CacheStoreInputs inputs; + inputs.input_lengths_host = input_lengths_host; + inputs.prefix_lengths_host = prefix_lengths_host; + inputs.host_kv_cache_offset = block_ids; + inputs.context_batch_size = 1; + inputs.decoder_batch_size = 0; + inputs.request_id = torch::tensor({(int64_t)request_id_val}, torch::kInt64); + inputs.request_pd_separation = torch::tensor({true}, torch::kBool); + inputs.cache_keys = cache_key_strings; + inputs.tokens_per_block = spb; + inputs.kv_block_stride_bytes = config.kv_block_stride_bytes; + inputs.kv_scale_stride_bytes = 0; + inputs.pd_separation = true; + inputs.model_id = 0; + inputs.decode_entrance = false; + inputs.warmup = false; + inputs.use_opaque_kv_cache_store = false; + inputs.layer_id = layer_id; + inputs.group_id = 0; + inputs.tag = ""; + inputs.pre_created_event = event; + + KvCacheInfo kv_cache_info; + kv_cache_info.kv_cache_buffer = layout.layers_to_kv_buffer_ptrs[layer_id]; + runtimeWriteCacheStore(inputs, kv_cache_info, /*mla_kvcache=*/false, cache_store); + } + + // Verify: cache store received correct request key for all 3 layers. 
+ EXPECT_EQ(cache_store->store_request_keys_.size(), 3u); + // MHA (non-opaque, non-mla) splits each block into k + v → 2 entries per block. + EXPECT_EQ(cache_store->stored_blocks_.size(), 3u * block_num * 2u); + + // Verify stored data matches the pattern we filled. + for (int layer_id = 0; layer_id < 3; ++layer_id) { + for (int b = 0; b < block_num; ++b) { + auto k_key = "k_" + makeCacheKey(0, cache_key_strings[b], layer_id); + auto it = cache_store->stored_blocks_.find(k_key); + ASSERT_NE(it, cache_store->stored_blocks_.end()) << "missing key: " << k_key; + uint8_t expected = static_cast(layer_id * 10 + b); + EXPECT_EQ(it->second[0], expected) << "layer=" << layer_id << " block=" << b << " first byte mismatch"; + } + } +} + +TEST_F(PdSepKVCacheReleaseTest, testWriteCacheStoreUsesTensorDeviceForCpuKvBuffer) { + const int spb = 8; + const int kv_stride = 64; + const int request_id_val = 4242; + const std::string cache_key_string = "10000"; + + auto kv_options = torch::TensorOptions(torch::kUInt8).device(torch::kCPU).pinned_memory(true); + auto kv_buffer = torch::empty({2, kv_stride}, kv_options); + kv_buffer[1].fill_(static_cast(123)); + + auto inputs = + makeSingleBlockWriteInputs(cache_key_string, request_id_val, spb, kv_stride, 0, true, 0, "csa_state"); + + KvCacheInfo kv_cache_info; + kv_cache_info.kv_cache_buffer = kv_buffer; + + auto cache_store = std::make_shared(); + runtimeWriteCacheStore(inputs, kv_cache_info, /*mla_kvcache=*/false, cache_store); + + const auto key = "kv_" + makeCacheKey(0, cache_key_string, 0, "csa_state"); + auto it = cache_store->stored_blocks_.find(key); + ASSERT_NE(it, cache_store->stored_blocks_.end()); + ASSERT_EQ(it->second.size(), static_cast(kv_stride)); + EXPECT_EQ(it->second[0], static_cast(123)); + + ASSERT_EQ(cache_store->store_buffer_requests_.size(), 1u); + auto blocks = cache_store->store_buffer_requests_.front()->getBlocks(); + auto block_it = blocks.find(key); + ASSERT_NE(block_it, blocks.end()); + 
EXPECT_FALSE(block_it->second->gpu_mem); +} + +TEST_F(PdSepKVCacheReleaseTest, testWriteCacheStoreUsesTensorDeviceForCpuSplitKvBuffer) { + const int spb = 8; + const int kv_stride = 64; + const int kv_half = kv_stride / 2; + const int request_id_val = 4243; + const std::string cache_key_string = "10001"; + + auto kv_options = torch::TensorOptions(torch::kUInt8).device(torch::kCPU).pinned_memory(true); + auto kv_buffer = torch::empty({2, kv_stride}, kv_options); + auto block = kv_buffer[1]; + block.slice(0, 0, kv_half).fill_(static_cast(17)); + block.slice(0, kv_half, kv_stride).fill_(static_cast(29)); + + auto inputs = makeSingleBlockWriteInputs(cache_key_string, request_id_val, spb, kv_stride, 0, false, 0, ""); + + KvCacheInfo kv_cache_info; + kv_cache_info.kv_cache_buffer = kv_buffer; + + auto cache_store = std::make_shared(); + runtimeWriteCacheStore(inputs, kv_cache_info, /*mla_kvcache=*/false, cache_store); + + const auto cache_key = makeCacheKey(0, cache_key_string, 0); + const auto k_key = "k_" + cache_key; + const auto v_key = "v_" + cache_key; + auto k_it = cache_store->stored_blocks_.find(k_key); + auto v_it = cache_store->stored_blocks_.find(v_key); + ASSERT_NE(k_it, cache_store->stored_blocks_.end()); + ASSERT_NE(v_it, cache_store->stored_blocks_.end()); + ASSERT_EQ(k_it->second.size(), static_cast(kv_half)); + ASSERT_EQ(v_it->second.size(), static_cast(kv_half)); + EXPECT_EQ(k_it->second[0], static_cast(17)); + EXPECT_EQ(v_it->second[0], static_cast(29)); + + ASSERT_EQ(cache_store->store_buffer_requests_.size(), 1u); + auto k_block = cache_store->store_buffer_requests_.front()->getBlock(k_key); + auto v_block = cache_store->store_buffer_requests_.front()->getBlock(v_key); + ASSERT_NE(k_block, nullptr); + ASSERT_NE(v_block, nullptr); + EXPECT_FALSE(k_block->gpu_mem); + EXPECT_FALSE(v_block->gpu_mem); +} + +TEST_F(PdSepKVCacheReleaseTest, testWriteCacheStoreUsesTensorDeviceForCpuKvScaleBuffer) { + const int spb = 8; + const int kv_stride = 64; + const 
int scale_stride = 16; + const int request_id_val = 4244; + const std::string cache_key_string = "10002"; + + auto cpu_options = torch::TensorOptions(torch::kUInt8).device(torch::kCPU).pinned_memory(true); + auto kv_buffer = torch::empty({2, kv_stride}, cpu_options); + auto kv_scale_buffer = torch::empty({2, scale_stride}, cpu_options); + kv_buffer[1].fill_(static_cast(41)); + kv_scale_buffer[1].fill_(static_cast(73)); + + auto inputs = + makeSingleBlockWriteInputs(cache_key_string, request_id_val, spb, kv_stride, scale_stride, true, 0, "csa_state"); + + KvCacheInfo kv_cache_info; + kv_cache_info.kv_cache_buffer = kv_buffer; + kv_cache_info.kv_scale_buffer = kv_scale_buffer; + + auto cache_store = std::make_shared(); + runtimeWriteCacheStore(inputs, kv_cache_info, /*mla_kvcache=*/false, cache_store); + + const auto scale_key = "kv_scale_" + makeCacheKey(0, cache_key_string, 0, "csa_state"); + auto scale_it = cache_store->stored_blocks_.find(scale_key); + ASSERT_NE(scale_it, cache_store->stored_blocks_.end()); + ASSERT_EQ(scale_it->second.size(), static_cast(scale_stride)); + EXPECT_EQ(scale_it->second[0], static_cast(73)); + + ASSERT_EQ(cache_store->store_buffer_requests_.size(), 1u); + auto scale_block = cache_store->store_buffer_requests_.front()->getBlock(scale_key); + ASSERT_NE(scale_block, nullptr); + EXPECT_FALSE(scale_block->gpu_mem); +} + } // namespace rtp_llm diff --git a/rtp_llm/cpp/metrics/RtpLLMMetrics.cc b/rtp_llm/cpp/metrics/RtpLLMMetrics.cc index c097471529..e0727c9939 100644 --- a/rtp_llm/cpp/metrics/RtpLLMMetrics.cc +++ b/rtp_llm/cpp/metrics/RtpLLMMetrics.cc @@ -14,6 +14,8 @@ AUTIL_LOG_SETUP(rtp_llm, RtpEmbeddingGlobalMetrics); AUTIL_LOG_SETUP(rtp_llm, RtpEmbeddingStreamMetrics); AUTIL_LOG_SETUP(rtp_llm, RtpLLMSchedulerMetrics); AUTIL_LOG_SETUP(rtp_llm, RtpLLMCacheMetrics); +AUTIL_LOG_SETUP(rtp_llm, RtpLLMCachePoolMetrics); +AUTIL_LOG_SETUP(rtp_llm, RtpLLMCacheEvictionMetrics); AUTIL_LOG_SETUP(rtp_llm, RtpLLMCacheReuseMetrics); 
AUTIL_LOG_SETUP(rtp_llm, RtpLLMDeviceCacheReuseMetrics); AUTIL_LOG_SETUP(rtp_llm, RtpLLMExecutorMetrics); @@ -352,6 +354,42 @@ void RtpLLMCacheMetrics::report(const kmonitor::MetricsTags* tags, RtpLLMCacheMe REPORT_MUTABLE_METRIC(mr_cost_time_ms_metric, collector->mr_cost_time_ms); } +bool RtpLLMCachePoolMetrics::init(kmonitor::MetricsGroupManager* manager) { + REGISTER_GAUGE_MUTABLE_METRIC(free_blocks_metric, "rtp_llm_kv_cache_pool_free_blocks"); + REGISTER_GAUGE_MUTABLE_METRIC(available_blocks_metric, "rtp_llm_kv_cache_pool_available_blocks"); + REGISTER_GAUGE_MUTABLE_METRIC(request_ref_blocks_metric, "rtp_llm_kv_cache_pool_request_ref_blocks"); + REGISTER_GAUGE_MUTABLE_METRIC(connector_ref_blocks_metric, "rtp_llm_kv_cache_pool_connector_ref_blocks"); + REGISTER_GAUGE_MUTABLE_METRIC(total_blocks_metric, "rtp_llm_kv_cache_pool_total_blocks"); + REGISTER_GAUGE_MUTABLE_METRIC(used_ratio_metric, "rtp_llm_kv_cache_pool_used_ratio"); + return true; +} + +void RtpLLMCachePoolMetrics::report(const kmonitor::MetricsTags* tags, RtpLLMCachePoolMetricsCollector* collector) { + REPORT_MUTABLE_METRIC(free_blocks_metric, collector->free_blocks); + REPORT_MUTABLE_METRIC(available_blocks_metric, collector->available_blocks); + REPORT_MUTABLE_METRIC(request_ref_blocks_metric, collector->request_ref_blocks); + REPORT_MUTABLE_METRIC(connector_ref_blocks_metric, collector->connector_ref_blocks); + REPORT_MUTABLE_METRIC(total_blocks_metric, collector->total_blocks); + REPORT_MUTABLE_METRIC(used_ratio_metric, collector->used_ratio); +} + +bool RtpLLMCacheEvictionMetrics::init(kmonitor::MetricsGroupManager* manager) { + REGISTER_GAUGE_MUTABLE_METRIC(evicted_block_lifetime_ms_metric, + "rtp_llm_kv_cache_evicted_block_lifetime_ms"); + REGISTER_GAUGE_MUTABLE_METRIC(evicted_block_count_metric, "rtp_llm_kv_cache_evicted_block_count"); + return true; +} + +void RtpLLMCacheEvictionMetrics::report(const kmonitor::MetricsTags* tags, + RtpLLMCacheEvictionMetricsCollector* collector) { + if 
(collector->lifetime_ms >= 0) { + REPORT_MUTABLE_METRIC(evicted_block_lifetime_ms_metric, collector->lifetime_ms); + } + if (collector->evicted_block_count >= 0) { + REPORT_MUTABLE_METRIC(evicted_block_count_metric, collector->evicted_block_count); + } +} + bool RtpLLMRemoteCacheMatchMetrics::init(kmonitor::MetricsGroupManager* manager) { REGISTER_QPS_MUTABLE_METRIC(remote_match_qps_metric, "rtp_llm_remote_match_qps"); REGISTER_QPS_MUTABLE_METRIC(remote_match_fail_qps_metric, "rtp_llm_remote_match_fail_qps"); diff --git a/rtp_llm/cpp/metrics/RtpLLMMetrics.h b/rtp_llm/cpp/metrics/RtpLLMMetrics.h index 3c25b70bd8..b82c164924 100644 --- a/rtp_llm/cpp/metrics/RtpLLMMetrics.h +++ b/rtp_llm/cpp/metrics/RtpLLMMetrics.h @@ -434,6 +434,53 @@ class RtpLLMCacheMetrics: public kmonitor::MetricsGroup { AUTIL_LOG_DECLARE(); }; +class RtpLLMCachePoolMetricsCollector final { +public: + int64_t free_blocks = 0; + int64_t available_blocks = 0; + int64_t request_ref_blocks = 0; + int64_t connector_ref_blocks = 0; + int64_t total_blocks = 0; + int64_t reserve_blocks = 0; + float used_ratio = 0; +}; + +class RtpLLMCachePoolMetrics: public kmonitor::MetricsGroup { +public: + bool init(kmonitor::MetricsGroupManager* manager) override; + void report(const kmonitor::MetricsTags* tags, RtpLLMCachePoolMetricsCollector* collector); + +public: + kmonitor::MutableMetric* free_blocks_metric = nullptr; + kmonitor::MutableMetric* available_blocks_metric = nullptr; + kmonitor::MutableMetric* request_ref_blocks_metric = nullptr; + kmonitor::MutableMetric* connector_ref_blocks_metric = nullptr; + kmonitor::MutableMetric* total_blocks_metric = nullptr; + kmonitor::MutableMetric* used_ratio_metric = nullptr; + +private: + AUTIL_LOG_DECLARE(); +}; + +class RtpLLMCacheEvictionMetricsCollector final { +public: + int64_t lifetime_ms = -1; + int64_t evicted_block_count = -1; +}; + +class RtpLLMCacheEvictionMetrics: public kmonitor::MetricsGroup { +public: + bool init(kmonitor::MetricsGroupManager* manager) 
override; + void report(const kmonitor::MetricsTags* tags, RtpLLMCacheEvictionMetricsCollector* collector); + +public: + kmonitor::MutableMetric* evicted_block_lifetime_ms_metric = nullptr; + kmonitor::MutableMetric* evicted_block_count_metric = nullptr; + +private: + AUTIL_LOG_DECLARE(); +}; + class RtpLLMCacheReuseMetricsCollector final { public: int64_t kv_cache_reuse_length = 0; @@ -816,8 +863,10 @@ class RtpLLMMemoryCacheCopyMetricsCollector final { class RtpLLMMemoryCacheStatusMetricsCollector final { public: int64_t total_block_num = 0; - int64_t allocated_block_num = 0; // 在cache中的block数量 - int64_t available_block_num = 0; // 可用的block数量 + int64_t allocated_block_num = 0; + int64_t available_block_num = 0; + int64_t item_num = 0; + float used_ratio = 0; }; class RtpLLMMemoryCacheMetrics: public kmonitor::MetricsGroup { diff --git a/rtp_llm/cpp/model_rpc/DecodeRpcServer.cc b/rtp_llm/cpp/model_rpc/DecodeRpcServer.cc index 6f2596a6f5..32bd9cde29 100644 --- a/rtp_llm/cpp/model_rpc/DecodeRpcServer.cc +++ b/rtp_llm/cpp/model_rpc/DecodeRpcServer.cc @@ -5,7 +5,7 @@ #include #include -#include "rtp_llm/cpp/cache/CacheGroupType.h" +#include "rtp_llm/cpp/cache/spec/CacheGroupType.h" #include "rtp_llm/cpp/utils/KVCacheUtils.h" #include "rtp_llm/cpp/model_rpc/QueryConverter.h" #include "rtp_llm/cpp/model_rpc/DecodeRpcServer.h" @@ -579,7 +579,7 @@ ErrorInfo DecodeRpcServer::loadCache(const LoadKVCacheContext& load_context) { const bool use_mla = cache_config.use_mla; const bool use_hybrid = cache_config.groupNums() > 1; - const auto& spec = cache_config.cache_specs[0]; + const auto& spec = cache_config.specForGroup(0); const size_t k_total_bytes = spec->k_block_size_bytes(); const size_t v_total_bytes = spec->v_block_size_bytes(); @@ -596,6 +596,8 @@ ErrorInfo DecodeRpcServer::loadCache(const LoadKVCacheContext& load_context) { auto cancel_check_func = [&load_context]() -> bool { return load_context.server_context->IsCancelled(); }; auto start_load_time_us = 
currentTimeUs(); + const auto flat_layer_to_group = cache_config.flatLayerToGroupId(); + const auto cache_group_types = cache_config.groupTypesSnapshot(); std::vector> load_contexts; for (int i = 0; i < load_context.peer_addrs.size(); i++) { auto& peer_addr = load_context.peer_addrs[i]; @@ -607,8 +609,8 @@ ErrorInfo DecodeRpcServer::loadCache(const LoadKVCacheContext& load_context) { auto load_layer_cache = std::make_shared(std::to_string(load_context.request_id), request_key); size_t gid = 0; - if (use_hybrid && layer_id < cache_config.layer_to_group_id.size()) { - const int mapped_gid = cache_config.layer_to_group_id[layer_id]; + if (use_hybrid && layer_id < flat_layer_to_group.size()) { + const int mapped_gid = flat_layer_to_group[layer_id]; if (mapped_gid >= 0) { gid = static_cast(mapped_gid); } @@ -627,10 +629,10 @@ ErrorInfo DecodeRpcServer::loadCache(const LoadKVCacheContext& load_context) { block_pos_list.reserve(block_num); if (use_hybrid && block_num > 0) { CacheGroupType group_type = CacheGroupType::FULL; - if (layer_id < cache_config.layer_to_group_id.size() && !cache_config.group_types.empty()) { - const int gid = cache_config.layer_to_group_id[layer_id]; - if (gid >= 0 && static_cast(gid) < cache_config.group_types.size()) { - group_type = cache_config.group_types[static_cast(gid)]; + if (layer_id < flat_layer_to_group.size() && !cache_group_types.empty()) { + const int gid = flat_layer_to_group[layer_id]; + if (gid >= 0 && static_cast(gid) < cache_group_types.size()) { + group_type = cache_group_types[static_cast(gid)]; } } if (group_type == CacheGroupType::LINEAR) { @@ -707,9 +709,12 @@ ErrorInfo DecodeRpcServer::loadCache(const LoadKVCacheContext& load_context) { "mtp layer_num mismatch: engine=" + std::to_string(layer_num) + " cache_cfg=" + std::to_string(mtp_cache_cfg.layer_num) + " (mtp_model_id=" + std::to_string(mtp_model_id) + ")"); + const auto mtp_global_layer_ids = mtp_cache_cfg.globalLayerIdsSnapshot(); + const auto mtp_flat_layer_to_grp 
= mtp_cache_cfg.flatLayerToGroupId(); + const auto mtp_group_types = mtp_cache_cfg.groupTypesSnapshot(); RTP_LLM_CHECK_WITH_INFO( - !mtp_cache_cfg.global_layer_ids.empty(), - "mtp_cache_cfg.global_layer_ids is empty (mtp_model_id=" + std::to_string(mtp_model_id) + ")"); + !mtp_global_layer_ids.empty(), + "mtp_cache_cfg.globalLayerIdsSnapshot() is empty (mtp_model_id=" + std::to_string(mtp_model_id) + ")"); for (size_t layer_id = 0; layer_id < layer_num; layer_id++) { auto request_key = std::to_string(load_context.request_id) + "-" + std::to_string(layer_id); @@ -717,8 +722,8 @@ ErrorInfo DecodeRpcServer::loadCache(const LoadKVCacheContext& load_context) { std::make_shared(std::to_string(load_context.request_id), request_key); size_t gid = 0; const bool mtp_use_hybrid = mtp_cache_cfg.groupNums() > 1; - if (mtp_use_hybrid && layer_id < mtp_cache_cfg.layer_to_group_id.size()) { - const int mapped_gid = mtp_cache_cfg.layer_to_group_id[layer_id]; + if (mtp_use_hybrid && layer_id < mtp_flat_layer_to_grp.size()) { + const int mapped_gid = mtp_flat_layer_to_grp[layer_id]; if (mapped_gid >= 0) { gid = static_cast(mapped_gid); } @@ -734,18 +739,18 @@ ErrorInfo DecodeRpcServer::loadCache(const LoadKVCacheContext& load_context) { size_t model_id = mtp_base_model_id; // Use per-module global_layer_ids for address lookup. - const int global_layer_id = mtp_cache_cfg.global_layer_ids[0][layer_id]; + const int global_layer_id = mtp_global_layer_ids[0][layer_id]; // Hybrid cache: Linear group only needs the last block; Full group needs all blocks. 
std::vector block_pos_list; block_pos_list.reserve(block_num); if (mtp_use_hybrid && block_num > 0) { CacheGroupType group_type = CacheGroupType::FULL; - if (layer_id < mtp_cache_cfg.layer_to_group_id.size() - && !mtp_cache_cfg.group_types.empty()) { - const int gid = mtp_cache_cfg.layer_to_group_id[layer_id]; - if (gid >= 0 && static_cast(gid) < mtp_cache_cfg.group_types.size()) { - group_type = mtp_cache_cfg.group_types[static_cast(gid)]; + if (layer_id < mtp_flat_layer_to_grp.size() + && !mtp_group_types.empty()) { + const int gid = mtp_flat_layer_to_grp[layer_id]; + if (gid >= 0 && static_cast(gid) < mtp_group_types.size()) { + group_type = mtp_group_types[static_cast(gid)]; } } if (group_type == CacheGroupType::LINEAR) { diff --git a/rtp_llm/cpp/model_rpc/proto/model_rpc_service.proto b/rtp_llm/cpp/model_rpc/proto/model_rpc_service.proto index b006f8dc32..5812477a1b 100644 --- a/rtp_llm/cpp/model_rpc/proto/model_rpc_service.proto +++ b/rtp_llm/cpp/model_rpc/proto/model_rpc_service.proto @@ -451,11 +451,17 @@ message MemoryOperationRequestPB { message CopyItem { int32 mem_block = 1; repeated int32 gpu_blocks = 2; + bool is_complete = 3; + BackingType backing_type = 4; } enum CopyDirection { H2D = 0; D2H = 1; } + enum BackingType { + MEMORY = 0; + DISK = 1; + } repeated CopyItem copy_items = 1; CopyDirection copy_direction = 2; } diff --git a/rtp_llm/cpp/model_utils/AttentionConfig.h b/rtp_llm/cpp/model_utils/AttentionConfig.h index 5eb1f2aff6..18a2b798c7 100644 --- a/rtp_llm/cpp/model_utils/AttentionConfig.h +++ b/rtp_llm/cpp/model_utils/AttentionConfig.h @@ -2,6 +2,7 @@ #include "rtp_llm/cpp/model_utils/RopeConfig.h" #include +#include namespace rtp_llm { @@ -58,6 +59,10 @@ struct AttentionConfigs { int indexer_head_dim = 0; int indexer_head_num = 0; int indexer_topk = 0; + int sliding_window = 0; + int o_groups = 0; + int o_lora_rank = 0; + std::vector layer_compress_ratios; // data type for attention computation c10::ScalarType dtype = c10::ScalarType::Half; 
diff --git a/rtp_llm/cpp/models/PyWrappedModel.h b/rtp_llm/cpp/models/PyWrappedModel.h index 06c62a0c7e..7d42441d3f 100644 --- a/rtp_llm/cpp/models/PyWrappedModel.h +++ b/rtp_llm/cpp/models/PyWrappedModel.h @@ -169,7 +169,7 @@ inline PyWrappedModel::PyWrappedModel(const GptModelInitParams& params, kv_cache.kv_scale_base_by_layer.push_back(t); } - kv_cache.layer_attn_types = layout.layer_attn_types; + kv_cache.layer_group_types = layout.layer_group_types; init_resources.kv_cache = kv_cache; } diff --git a/rtp_llm/cpp/normal_engine/NormalBatchStreamProcessor.cc b/rtp_llm/cpp/normal_engine/NormalBatchStreamProcessor.cc index f628d2910d..2d0f6ab71a 100644 --- a/rtp_llm/cpp/normal_engine/NormalBatchStreamProcessor.cc +++ b/rtp_llm/cpp/normal_engine/NormalBatchStreamProcessor.cc @@ -24,8 +24,8 @@ NormalBatchStreamProcessor::NormalBatchStreamProcessor( model_input_gatherer_config_.kernel_seq_size_per_block = cache_config.kernel_seq_size_per_block; model_input_gatherer_config_.kernel_blocks_per_kv_block = cache_config.kernelBlocksPerKvBlock(); model_input_gatherer_config_.kv_cache_group_nums = cache_config.groupNums(); - model_input_gatherer_config_.layer_to_kv_cache_group_id = cache_config.layer_to_group_id; - model_input_gatherer_config_.kv_cache_group_types = cache_config.group_types; + model_input_gatherer_config_.layer_to_kv_cache_group_id = cache_config.flatLayerToGroupId(); + model_input_gatherer_config_.kv_cache_group_types = cache_config.groupTypesSnapshot(); model_input_gatherer_config_.warm_up = warm_up; model_input_gatherer_config_.enable_detail_log = profiling_debug_logging_config.enable_detail_log; diff --git a/rtp_llm/cpp/normal_engine/NormalEngine.cc b/rtp_llm/cpp/normal_engine/NormalEngine.cc index 9ff9caa273..083e8decdb 100644 --- a/rtp_llm/cpp/normal_engine/NormalEngine.cc +++ b/rtp_llm/cpp/normal_engine/NormalEngine.cc @@ -6,7 +6,7 @@ #include "rtp_llm/cpp/utils/StatusUtil.h" #include "rtp_llm/cpp/engine_base/schedulers/FIFOScheduler.h" #include 
"rtp_llm/cpp/engine_base/schedulers/BatchDecodeScheduler.h" -#include "rtp_llm/cpp/cache/CacheConfigCreator.h" +#include "rtp_llm/cpp/cache/config_creator/CacheConfigCreator.h" #include "rtp_llm/cpp/engine_base/system_prompt/SystemPromptConstructor.h" #include "rtp_llm/cpp/utils/Logger.h" #include "rtp_llm/cpp/utils/AssertUtils.h" @@ -250,7 +250,7 @@ WarmUpResult NormalEngine::decodeWarmUp(const EngineInitParams& params) { fake_input->generate_config->calculate_loss = int(runtime_config.warm_up_with_loss); rtp_llm::setTraceMemory(true); - auto cache_config = CacheConfigCreator::createBasicConfig(model_config_, parallelism_config); + auto cache_config = CacheConfigCreator::createBasicConfig(model_config_, parallelism_config, KVCacheConfig{}, false, 0); cache_config.seq_size_per_block = model_config_.attn_config.tokens_per_block; cache_config.block_num = 5; ParallelismConfig temp_parallelism_config; @@ -322,7 +322,7 @@ void NormalEngine::initCacheManager(std::optional warm_up_result) const auto& cache_cfg = resource_context_.cache_manager->cacheConfig(); kv_cache_group_num_ = cache_cfg.groupNums(); - kv_cache_layer_to_group_ = cache_cfg.layer_to_group_id; + kv_cache_layer_to_group_ = cache_cfg.flatLayerToGroupId(); } else { auto result = CacheConfigCreator::createConfig( model_config_, parallelism_config, runtime_config, kv_cache_config, warm_up_result); @@ -339,7 +339,7 @@ void NormalEngine::initCacheManager(std::optional warm_up_result) } const auto& cache_cfg = resource_context_.cache_manager->cacheConfig(); kv_cache_group_num_ = cache_cfg.groupNums(); - kv_cache_layer_to_group_ = cache_cfg.layer_to_group_id; + kv_cache_layer_to_group_ = cache_cfg.flatLayerToGroupId(); } } diff --git a/rtp_llm/cpp/normal_engine/NormalModelInputGatherer.h b/rtp_llm/cpp/normal_engine/NormalModelInputGatherer.h index ea638ff49a..1bb33c2032 100644 --- a/rtp_llm/cpp/normal_engine/NormalModelInputGatherer.h +++ b/rtp_llm/cpp/normal_engine/NormalModelInputGatherer.h @@ -4,7 +4,7 @@ 
#include #include "absl/status/status.h" #include "absl/status/statusor.h" -#include "rtp_llm/cpp/cache/CacheGroupType.h" +#include "rtp_llm/cpp/cache/spec/CacheGroupType.h" #include "rtp_llm/cpp/cache/Types.h" #include "rtp_llm/cpp/config/ConfigModules.h" #include "rtp_llm/models_py/bindings/core/OpData.h" diff --git a/rtp_llm/cpp/normal_engine/speculative/MtpExecutor.cc b/rtp_llm/cpp/normal_engine/speculative/MtpExecutor.cc index 9d9a1fe79d..8ed4cd8b59 100644 --- a/rtp_llm/cpp/normal_engine/speculative/MtpExecutor.cc +++ b/rtp_llm/cpp/normal_engine/speculative/MtpExecutor.cc @@ -7,7 +7,7 @@ #include "rtp_llm/cpp/utils/StatusUtil.h" #include "rtp_llm/cpp/engine_base/schedulers/FIFOScheduler.h" #include "rtp_llm/cpp/engine_base/schedulers/BatchDecodeScheduler.h" -#include "rtp_llm/cpp/cache/CacheConfigCreator.h" +#include "rtp_llm/cpp/cache/config_creator/CacheConfigCreator.h" #include "rtp_llm/cpp/engine_base/system_prompt/SystemPromptConstructor.h" #include "rtp_llm/cpp/utils/Logger.h" #include "rtp_llm/cpp/utils/AssertUtils.h" @@ -22,6 +22,17 @@ namespace rtp_llm { +namespace { +std::vector flattenLayerToGroupIds(const std::vector>& ids) { + std::vector flat; + flat.reserve(ids.size()); + for (const auto& v : ids) { + flat.push_back(v.empty() ? 
-1 : v[0]); + } + return flat; +} +} // namespace + bool MtpExecutor::isTpRank0() const { return tp_rank_ == 0; } @@ -197,7 +208,7 @@ MtpExecutor::MtpExecutor(const EngineInitParams& params, if (!params.py_model.is_none()) { RTP_LLM_LOG_INFO("init executor with python model"); model_.reset(new PyWrappedModel( - model_init_params, params.py_model, false, true, target_cache_layer_layout.layer_to_groups)); + model_init_params, params.py_model, false, true, flattenLayerToGroupIds(target_cache_layer_layout.layer_to_group_ids))); } // when warmup, cache manager maybe nullptr @@ -239,7 +250,7 @@ MtpExecutor::MtpExecutor(const EngineInitParams& params, if (!params.py_sp_model.is_none()) { RTP_LLM_LOG_INFO("[speculative decoding] using py model"); draft_model_.reset(new PyWrappedModel( - model_params, params.py_sp_model, false, false, draft_cache_layer_layout.layer_to_groups)); + model_params, params.py_sp_model, false, false, flattenLayerToGroupIds(draft_cache_layer_layout.layer_to_group_ids))); // Create separate model for speculative prefill with CUDA graph if enabled (from params) const bool enable_cuda_graph = params.hw_kernel_config.enable_cuda_graph; RTP_LLM_LOG_INFO( @@ -249,7 +260,7 @@ MtpExecutor::MtpExecutor(const EngineInitParams& params, RTP_LLM_LOG_INFO( "[speculative decoding] creating separate prefill draft model with CUDA graph support"); sp_prefill_draft_model_.reset(new PyWrappedModel( - model_params, params.py_sp_model, true, false, draft_cache_layer_layout.layer_to_groups)); + model_params, params.py_sp_model, true, false, flattenLayerToGroupIds(draft_cache_layer_layout.layer_to_group_ids))); } } break; // NOTE: only support one mtp model now @@ -260,12 +271,14 @@ MtpExecutor::MtpExecutor(const EngineInitParams& params, draft_kv_cache_layer_to_group = torch::empty({(int64_t)draft_cache_layer_layout.layers_to_kv_buffer_ptrs.size()}, torch::kInt32); + auto target_flat_ids = flattenLayerToGroupIds(target_cache_layer_layout.layer_to_group_ids); + auto 
draft_flat_ids = flattenLayerToGroupIds(draft_cache_layer_layout.layer_to_group_ids); memcpy(target_kv_cache_layer_to_group.data_ptr(), - target_cache_layer_layout.layer_to_groups.data(), - target_cache_layer_layout.layer_to_groups.size() * sizeof(int)); + target_flat_ids.data(), + target_flat_ids.size() * sizeof(int)); memcpy(draft_kv_cache_layer_to_group.data_ptr(), - draft_cache_layer_layout.layer_to_groups.data(), - draft_cache_layer_layout.layer_to_groups.size() * sizeof(int)); + draft_flat_ids.data(), + draft_flat_ids.size() * sizeof(int)); } /* diff --git a/rtp_llm/cpp/normal_engine/speculative/test/MtpBatchStreamProcessorTest.cc b/rtp_llm/cpp/normal_engine/speculative/test/MtpBatchStreamProcessorTest.cc index 63c4a7496e..3a13a802c3 100644 --- a/rtp_llm/cpp/normal_engine/speculative/test/MtpBatchStreamProcessorTest.cc +++ b/rtp_llm/cpp/normal_engine/speculative/test/MtpBatchStreamProcessorTest.cc @@ -26,6 +26,19 @@ std::vector toVec(const torch::Tensor& t) { class MtpBatchStreamProcessorTest: public DeviceTestBase { public: + static void initSingleGroupCacheConfig(CacheConfig& config, int layer_num = 1) { + config.layer_num = static_cast(layer_num); + config.layer_all_num = static_cast(layer_num); + auto spec = std::make_shared(); + spec->type = KVCacheSpecType::MultiHeadAttention; + spec->seq_size_per_block = 8; + spec->local_head_num_kv = 1; + spec->size_per_head = 1; + std::vector layer_ids(layer_num); + std::iota(layer_ids.begin(), layer_ids.end(), 0); + config.fromGroupedSpecs({spec}, {layer_ids}, {CacheGroupType::FULL}, {"default"}); + } + GenerateStreamPtr createContextStream(const ModelConfig& model_config, const RuntimeConfig& runtime_config, const ResourceContext& resource_context, @@ -39,7 +52,7 @@ class MtpBatchStreamProcessorTest: public DeviceTestBase { BatchKVCacheResource addr; // New (refactored) BatchKVCacheResource: [batch_id][group_id] -> block_indices addr.resetBatchSize(1); - addr.initGroups(1, 1, {0}); + addr.initGroups(1, 1, {{0}}); 
addr.setBatchBlocks(0, 0, {block_id}); stream->setKVCache(addr); @@ -84,7 +97,7 @@ TEST_F(MtpBatchStreamProcessorTest, testPrefillDispatch) { PDSepConfig pd_sep_config; ProfilingDebugLoggingConfig profiling_debug_logging_config; CacheConfig cache_config; - cache_config.group_types = {CacheGroupType::FULL}; + initSingleGroupCacheConfig(cache_config); model_config.max_seq_len = 2048; model_config.vocab_size = 4; @@ -167,7 +180,7 @@ TEST_F(MtpBatchStreamProcessorTest, testDispatchDecodeStream) { draft_prefill_output.sampler_output.all_probs = torch::tensor({0.2f, 0.1f, 0.3f, 0.5f, 0.3f, 0.1f, 0.4f, 0.2f}, torch::kFloat32).reshape({2, 4}); - cache_config.group_types = {CacheGroupType::FULL}; + initSingleGroupCacheConfig(cache_config); MtpBatchStreamProcessor processor( model_config, pd_sep_config, profiling_debug_logging_config, cache_config, sp_config, false); @@ -216,7 +229,7 @@ TEST_F(MtpBatchStreamProcessorTest, testGatherDecodeModelInput) { auto stream_groups = StreamGroups({stream1, stream2}); - cache_config.group_types = {CacheGroupType::FULL}; + initSingleGroupCacheConfig(cache_config); auto processor = MtpBatchStreamProcessor( model_config, pd_sep_config, profiling_debug_logging_config, cache_config, sp_config, false); auto model_input = processor.gatherDecodeModelInput(stream_groups); @@ -293,7 +306,7 @@ TEST_F(MtpBatchStreamProcessorTest, testPrepareOneStepSpecDecodeModelInput) { auto stream_groups = StreamGroups({stream1, stream2}); - cache_config.group_types = {CacheGroupType::FULL}; + initSingleGroupCacheConfig(cache_config); auto processor = MtpBatchStreamProcessor( model_config, pd_sep_config, profiling_debug_logging_config, cache_config, sp_config, false); auto model_input_status = processor.gatherDecodeModelInput(stream_groups); @@ -391,7 +404,7 @@ TEST_F(MtpBatchStreamProcessorTest, testprepareDecodeDraftModelInput) { auto stream_groups = StreamGroups({stream1, stream2}); - cache_config.group_types = {CacheGroupType::FULL}; + 
initSingleGroupCacheConfig(cache_config); auto processor = MtpBatchStreamProcessor( model_config, pd_sep_config, profiling_debug_logging_config, cache_config, sp_config, false); auto model_input_status = processor.gatherDecodeModelInput(stream_groups); @@ -446,7 +459,7 @@ TEST_F(MtpBatchStreamProcessorTest, testUpdatePrefillPostDraftModelInput) { auto stream_groups = StreamGroups({stream1, stream2}); - cache_config.group_types = {CacheGroupType::FULL}; + initSingleGroupCacheConfig(cache_config); auto processor = MtpBatchStreamProcessor( model_config, pd_sep_config, profiling_debug_logging_config, cache_config, sp_config, false); auto model_input_status = processor.gatherModelInput(stream_groups); @@ -504,7 +517,7 @@ TEST_F(MtpBatchStreamProcessorTest, testUpdateDecodePostDraftModelInput) { auto stream_groups = StreamGroups({stream1, stream2}); - cache_config.group_types = {CacheGroupType::FULL}; + initSingleGroupCacheConfig(cache_config); auto processor = MtpBatchStreamProcessor( model_config, pd_sep_config, profiling_debug_logging_config, cache_config, sp_config, false); auto model_input_status = processor.gatherModelInput(stream_groups); diff --git a/rtp_llm/cpp/normal_engine/speculative/test/MtpExecutorTest.cc b/rtp_llm/cpp/normal_engine/speculative/test/MtpExecutorTest.cc index c28eea6bbd..9c993f0700 100644 --- a/rtp_llm/cpp/normal_engine/speculative/test/MtpExecutorTest.cc +++ b/rtp_llm/cpp/normal_engine/speculative/test/MtpExecutorTest.cc @@ -342,7 +342,9 @@ class MtpExecutorTest: public DeviceTestBase { rtp_llm::TYPE_INT8, /*local_head_num_kv=*/128, /*size_per_head=*/256); - cache_config.mtp_sub_configs.push_back(std::make_shared(mtp_config)); + cache_config.layer_all_num = cache_config.layer_num + mtp_config.layer_num; + auto sub_cfg = cache_config.mergeMTPModule(mtp_config, /*module_index=*/0, /*main_layer_num=*/cache_config.layer_num); + cache_config.mtp_sub_configs.push_back(sub_cfg); EngineInitParams params = createEngineInitParams(config, model_config, 
runtime_config, kv_cache_config); params.sp_config = sp_config; diff --git a/rtp_llm/cpp/normal_engine/test/BUILD b/rtp_llm/cpp/normal_engine/test/BUILD index 839060f0ec..c9b43594d0 100644 --- a/rtp_llm/cpp/normal_engine/test/BUILD +++ b/rtp_llm/cpp/normal_engine/test/BUILD @@ -44,6 +44,7 @@ cc_test( "//rtp_llm/models_py/bindings/cuda/ops:cuda_impl", "//rtp_llm/cpp/normal_engine:normal_engine", "//rtp_llm/cpp/models:models", + "//rtp_llm/cpp/cache/test:cache_config_test_utils", "@com_google_googletest//:gtest", "@com_google_googletest//:gtest_main", "@local_config_cuda//cuda:cuda_headers", diff --git a/rtp_llm/cpp/normal_engine/test/MockEngine.h b/rtp_llm/cpp/normal_engine/test/MockEngine.h index abf7b4959d..031a0ace3c 100644 --- a/rtp_llm/cpp/normal_engine/test/MockEngine.h +++ b/rtp_llm/cpp/normal_engine/test/MockEngine.h @@ -16,6 +16,7 @@ #include "rtp_llm/cpp/testing/TestBase.h" #include "rtp_llm/cpp/models/models_weight/W.h" #include "rtp_llm/cpp/config/ConfigModules.h" +#include "rtp_llm/cpp/cache/spec/KVCacheSpecDescTypes.h" using namespace std; namespace W = rtp_llm::W; @@ -73,6 +74,19 @@ rtp_llm::EngineInitParams createEngineInitParams(const CustomConfig& config, const size_t inter_size = 512; // inter_size is now calculated in ModelDeployWeightInfo, not in ModelConfig model_config.attn_config.tokens_per_block = 2; + kv_cache_config.seq_size_per_block = model_config.attn_config.tokens_per_block; + + DataType kv_dtype = config.kv_cache_data_type == DataType::TYPE_INT8 ? DataType::TYPE_INT8 + : config.kv_cache_data_type == DataType::TYPE_FP8_E4M3 ? 
DataType::TYPE_FP8_E4M3 + : DataType::TYPE_FP16; + KVCacheSpecDesc mha_desc; + mha_desc.tag = "default"; + mha_desc.cache_type = CacheType::MHA; + mha_desc.num_kv_heads = model_config.attn_config.kv_head_num; + mha_desc.seq_size_per_block = model_config.attn_config.tokens_per_block; + mha_desc.dtype = kv_dtype; + mha_desc.size_per_head = model_config.attn_config.size_per_head; + model_config.kv_cache_spec_descs.resize(model_config.num_layers, {mha_desc}); runtime_config.reserve_runtime_mem_mb = 1024; const size_t hidden_units = 128; diff --git a/rtp_llm/cpp/normal_engine/test/NormalBatchStreamProcessorTest.cc b/rtp_llm/cpp/normal_engine/test/NormalBatchStreamProcessorTest.cc index ed4a6107ee..8ef0c8bde6 100644 --- a/rtp_llm/cpp/normal_engine/test/NormalBatchStreamProcessorTest.cc +++ b/rtp_llm/cpp/normal_engine/test/NormalBatchStreamProcessorTest.cc @@ -2,6 +2,8 @@ #include "torch/all.h" #include "gtest/gtest.h" +#include "rtp_llm/cpp/cache/test/CacheConfigTestUtils.h" + #define private public #include "rtp_llm/cpp/normal_engine/NormalBatchStreamProcessor.h" #include "rtp_llm/cpp/normal_engine/NormalGenerateStream.h" @@ -24,7 +26,21 @@ static torch::Tensor hostIntBuffer(std::vector data) { return torch::tensor(data, torch::kInt32); } -class NormalBatchStreamProcessorTest: public DeviceTestBase {}; +class NormalBatchStreamProcessorTest: public DeviceTestBase { +public: + static void initSingleGroupCacheConfig(CacheConfig& config, int layer_num = 1) { + config.layer_num = static_cast(layer_num); + config.layer_all_num = static_cast(layer_num); + auto spec = std::make_shared(); + spec->type = KVCacheSpecType::MultiHeadAttention; + spec->seq_size_per_block = 8; + spec->local_head_num_kv = 1; + spec->size_per_head = 1; + std::vector layer_ids(layer_num); + std::iota(layer_ids.begin(), layer_ids.end(), 0); + config.fromGroupedSpecs({spec}, {layer_ids}, {CacheGroupType::FULL}, {"default"}); + } +}; TEST_F(NormalBatchStreamProcessorTest, testSimpleAssemble) { 
ResourceContext resource_context; @@ -36,7 +52,7 @@ TEST_F(NormalBatchStreamProcessorTest, testSimpleAssemble) { PDSepConfig pd_sep_config; ProfilingDebugLoggingConfig profiling_debug_logging_config; CacheConfig cache_config; - cache_config.group_types = {CacheGroupType::FULL}; + initSingleGroupCacheConfig(cache_config, model_config.num_layers); RuntimeConfig runtime_config; NormalBatchStreamProcessor processor( @@ -50,7 +66,7 @@ TEST_F(NormalBatchStreamProcessorTest, testSimpleAssemble) { query1->input_ids = hostIntBuffer({1}); BatchKVCacheResource addr1; addr1.resetBatchSize(1); - addr1.initGroups(1, 3, {0, 0, 0}); + addr1.initGroups(1, 3, {{0}, {0}, {0}}); addr1.setBatchBlocks(0, 0, {1, 2, 3, 4}); stream1->setKVCache(addr1); stream1->setIsContextStream(false); @@ -63,7 +79,7 @@ TEST_F(NormalBatchStreamProcessorTest, testSimpleAssemble) { query2->input_ids = hostIntBuffer({1, 2}); BatchKVCacheResource addr2; addr2.resetBatchSize(1); - addr2.initGroups(1, 3, {0, 0, 0}); + addr2.initGroups(1, 3, {{0}, {0}, {0}}); addr2.setBatchBlocks(0, 0, {5, 6, 7, 8}); stream2->setKVCache(addr2); stream2->setIsContextStream(false); @@ -75,7 +91,7 @@ TEST_F(NormalBatchStreamProcessorTest, testSimpleAssemble) { make_shared(query3, model_config, runtime_config, resource_context, nullptr); BatchKVCacheResource addr3; addr3.resetBatchSize(1); - addr3.initGroups(1, 3, {0, 0, 0}); + addr3.initGroups(1, 3, {{0}, {0}, {0}}); addr3.setBatchBlocks(0, 0, {9, 10}); stream3->setKVCache(addr3); @@ -86,7 +102,7 @@ TEST_F(NormalBatchStreamProcessorTest, testSimpleAssemble) { make_shared(query4, model_config, runtime_config, resource_context, nullptr); BatchKVCacheResource addr4; addr4.resetBatchSize(1); - addr4.initGroups(1, 3, {0, 0, 0}); + addr4.initGroups(1, 3, {{0}, {0}, {0}}); addr4.setBatchBlocks(0, 0, {11, 12, 13, 14}); stream4->setKVCache(addr4); stream4->setReuseLength(1); @@ -152,7 +168,7 @@ TEST_F(NormalBatchStreamProcessorTest, testSoftmaxProbs) { make_shared(query1, model_config, 
runtime_config, resource_context, nullptr); BatchKVCacheResource addr1; addr1.resetBatchSize(1); - addr1.initGroups(1, 3, {0, 0, 0}); + addr1.initGroups(1, 3, {{0}, {0}, {0}}); addr1.setBatchBlocks(0, 0, {1}); stream1->setKVCache(addr1); @@ -162,7 +178,7 @@ TEST_F(NormalBatchStreamProcessorTest, testSoftmaxProbs) { for (const auto& stream : streams) { stream->generate_status_->status = StreamState::RUNNING; } - cache_config.group_types = {CacheGroupType::FULL}; + initSingleGroupCacheConfig(cache_config, model_config.num_layers); NormalBatchStreamProcessor processor( model_config, pd_sep_config, profiling_debug_logging_config, cache_config, false); @@ -205,7 +221,7 @@ TEST_F(NormalBatchStreamProcessorTest, testLoss) { make_shared(query1, model_config, runtime_config, resource_context, nullptr); BatchKVCacheResource addr1; addr1.resetBatchSize(1); - addr1.initGroups(1, 3, {0, 0, 0}); + addr1.initGroups(1, 3, {{0}, {0}, {0}}); addr1.setBatchBlocks(0, 0, {1}); stream1->setKVCache(addr1); @@ -217,7 +233,7 @@ TEST_F(NormalBatchStreamProcessorTest, testLoss) { make_shared(query3, model_config, runtime_config, resource_context, nullptr); BatchKVCacheResource addr3; addr3.resetBatchSize(1); - addr3.initGroups(1, 3, {0, 0, 0}); + addr3.initGroups(1, 3, {{0}, {0}, {0}}); addr3.setBatchBlocks(0, 0, {9}); stream3->setKVCache(addr3); @@ -229,7 +245,7 @@ TEST_F(NormalBatchStreamProcessorTest, testLoss) { make_shared(query4, model_config, runtime_config, resource_context, nullptr); BatchKVCacheResource addr4; addr4.resetBatchSize(1); - addr4.initGroups(1, 3, {0, 0, 0}); + addr4.initGroups(1, 3, {{0}, {0}, {0}}); addr4.setBatchBlocks(0, 0, {11, 12}); stream4->setKVCache(addr4); @@ -241,7 +257,7 @@ TEST_F(NormalBatchStreamProcessorTest, testLoss) { for (const auto& stream : streams) { stream->generate_status_->status = StreamState::RUNNING; } - cache_config.group_types = {CacheGroupType::FULL}; + initSingleGroupCacheConfig(cache_config, model_config.num_layers); 
NormalBatchStreamProcessor processor( model_config, pd_sep_config, profiling_debug_logging_config, cache_config, false); @@ -288,7 +304,7 @@ TEST_F(NormalBatchStreamProcessorTest, testMultimodalGatherBatch) { PDSepConfig pd_sep_config; ProfilingDebugLoggingConfig profiling_debug_logging_config; CacheConfig cache_config; - cache_config.group_types = {CacheGroupType::FULL}; + initSingleGroupCacheConfig(cache_config, model_config.num_layers); RuntimeConfig runtime_config; NormalBatchStreamProcessor processor( model_config, pd_sep_config, profiling_debug_logging_config, cache_config, false); diff --git a/rtp_llm/cpp/pybind/ConfigInit.cc b/rtp_llm/cpp/pybind/ConfigInit.cc index 66d31fb189..9540076456 100644 --- a/rtp_llm/cpp/pybind/ConfigInit.cc +++ b/rtp_llm/cpp/pybind/ConfigInit.cc @@ -722,11 +722,13 @@ PYBIND11_MODULE(libth_transformer_config, m) { .value("SLIDING_WINDOW", HybridAttentionType::SLIDING_WINDOW); pybind11::class_(m, "HybridAttentionConfig") - .def(pybind11::init>(), - pybind11::arg("enable_hybrid_attention") = false, - pybind11::arg("hybrid_attention_types") = std::vector{}) + .def(pybind11::init>(), + pybind11::arg("enable_hybrid_attention") = false, + pybind11::arg("enable_independent_kv_cache_pools") = false, + pybind11::arg("hybrid_attention_types") = std::vector{}) .def("to_string", &HybridAttentionConfig::to_string) .def_readwrite("enable_hybrid_attention", &HybridAttentionConfig::enable_hybrid_attention) + .def_readwrite("enable_independent_kv_cache_pools", &HybridAttentionConfig::enable_independent_kv_cache_pools) .def_readwrite("hybrid_attention_types", &HybridAttentionConfig::hybrid_attention_types); // Register SpeculativeType enum diff --git a/rtp_llm/cpp/testing/TestBase.h b/rtp_llm/cpp/testing/TestBase.h index 8663049f0b..24d834a8c6 100644 --- a/rtp_llm/cpp/testing/TestBase.h +++ b/rtp_llm/cpp/testing/TestBase.h @@ -203,7 +203,7 @@ class DeviceTestBase: public EngineBaseTest { auto batch_kv_cache = std::make_shared(); 
batch_kv_cache->resetBatchSize(batch_size); - batch_kv_cache->initGroups(1, cache_config.layer_all_num, cache_config.layer_to_group_id); + batch_kv_cache->initGroups(1, cache_config.layer_all_num, cache_config.layerGroupIdsSnapshot()); auto complete_token_ids = std::make_shared(static_cast(batch_size), @@ -261,13 +261,13 @@ class DeviceTestBase: public EngineBaseTest { torch::indexing::Slice()}) .reshape({2, static_cast(cache_config.seq_size_per_block), - static_cast(cache_config.cache_specs[0]->local_head_num_kv), + static_cast(cache_config.specForGroup(0)->local_head_num_kv), static_cast( - static_cast(*cache_config.cache_specs[0]) + static_cast(*cache_config.specForGroup(0)) .size_per_head)}) .transpose(2, 1) .contiguous(); - // vblock is not used in setKVBlockValue in this case + // vblock is not used in writeKVBlockForTest in this case vblock = kvCache .index({torch::indexing::Slice(), static_cast(i), @@ -275,17 +275,17 @@ class DeviceTestBase: public EngineBaseTest { torch::indexing::Slice(block_start, block_end), torch::indexing::Slice()}) .reshape({static_cast(cache_config.seq_size_per_block), - static_cast(cache_config.cache_specs[0]->local_head_num_kv), + static_cast(cache_config.specForGroup(0)->local_head_num_kv), static_cast( - static_cast(*cache_config.cache_specs[0]) + static_cast(*cache_config.specForGroup(0)) .size_per_head)}) .transpose(1, 0) .contiguous(); } // std::cout << "index: " << k << " start: " << block_start << " end: " << block_end << std::endl; // std::cout << "block index: " << k_indexs[k] << std::endl; - if (!cache_manager_->setKVBlockValue(k_indexs[k], kblock, vblock)) { - std::cout << "setKVBlockValue failed for block index: " << k_indexs[k] << std::endl; + if (!cache_manager_->writeKVBlockForTest(k_indexs[k], kblock, vblock)) { + std::cout << "writeKVBlockForTest failed for block index: " << k_indexs[k] << std::endl; return torch::Tensor(); } } diff --git a/rtp_llm/models_py/bindings/NoBlockCopy.h 
b/rtp_llm/models_py/bindings/NoBlockCopy.h index 978dc23739..ddd524b112 100644 --- a/rtp_llm/models_py/bindings/NoBlockCopy.h +++ b/rtp_llm/models_py/bindings/NoBlockCopy.h @@ -1,5 +1,6 @@ #pragma once +#include #include #include @@ -16,12 +17,73 @@ struct MultiCopyParams { size_t split_kv_scale_stride_bytes = 0; }; +struct BatchedMemoryCopyTile { + void* dst = nullptr; + const void* src = nullptr; + size_t bytes = 0; +}; + +struct BatchedMemoryCopyParams { + std::vector tiles; + int device_index = -1; +}; + +enum class StagedMemoryCopyDirection { + H2D = 0, + D2H = 1, +}; + +struct StagedMemoryCopyTile { + void* gpu = nullptr; + size_t host_offset = 0; + size_t bytes = 0; +}; + +struct StagedMemoryCopyHostSegment { + void* host = nullptr; + size_t host_offset = 0; + size_t bytes = 0; +}; + +struct StagedMemoryCopyParams { + void* host_base = nullptr; + size_t host_bytes = 0; + std::vector host_segments; + std::vector tiles; + int device_index = -1; + StagedMemoryCopyDirection direction = StagedMemoryCopyDirection::H2D; +}; + +struct StagedMemoryCopyScratch { + void* host_staging = nullptr; + size_t host_capacity = 0; + void* device_staging = nullptr; + size_t device_capacity = 0; + void* device_ptrs = nullptr; + void* device_offsets = nullptr; + void* device_sizes = nullptr; + size_t meta_capacity = 0; + int device_index = -1; +}; + // Multi-tensor non-blocking copy with device-specific implementation. // CUDA: uses a dedicated stream + optional split-KV SM scatter path. // ROCm: plain tensor copy_. // Other devices: not supported (will abort). void execNoBlockCopy(const MultiCopyParams& params); +// One CUDA runtime call copy executor for regular host/device pointers. +// CUDA 12.8+ uses cudaMemcpyBatchAsync to avoid per-tile cudaMemcpyAsync launches. +bool execBatchedMemoryCopy(const BatchedMemoryCopyParams& params); + +// Stages compact host payload in GPU memory, then uses one SM gather/scatter kernel. 
+// host_segments may describe non-contiguous host blocks; they are packed/unpacked on CPU. +// scratch is optional; passing one lets callers reuse pinned host staging and device metadata buffers. +// H2D: compact host payload -> GPU staging -> tile.gpu by tile.host_offset. +// D2H: tile.gpu -> GPU staging by tile.host_offset -> compact host payload. +bool execStagedMemoryCopy(const StagedMemoryCopyParams& params, StagedMemoryCopyScratch* scratch = nullptr); +void releaseStagedMemoryCopyScratch(StagedMemoryCopyScratch& scratch); + // Warmup split-KV copy kernels. No-op on non-CUDA / PPU devices. // Must be called after cudaSetDevice + setCurrentCUDAStream. void warmupNoBlockCopy(); diff --git a/rtp_llm/models_py/bindings/NoBlockCopyDefault.cc b/rtp_llm/models_py/bindings/NoBlockCopyDefault.cc index dae679dab0..307fc039eb 100644 --- a/rtp_llm/models_py/bindings/NoBlockCopyDefault.cc +++ b/rtp_llm/models_py/bindings/NoBlockCopyDefault.cc @@ -14,6 +14,16 @@ void execNoBlockCopy(const MultiCopyParams& params) { } } +bool execBatchedMemoryCopy(const BatchedMemoryCopyParams& params) { + return params.tiles.empty(); +} + +bool execStagedMemoryCopy(const StagedMemoryCopyParams& params, StagedMemoryCopyScratch*) { + return params.tiles.empty(); +} + +void releaseStagedMemoryCopyScratch(StagedMemoryCopyScratch&) {} + void warmupNoBlockCopy() {} } // namespace rtp_llm diff --git a/rtp_llm/models_py/bindings/OpDefs.cc b/rtp_llm/models_py/bindings/OpDefs.cc index 67bfe051d6..e104bc3ac4 100644 --- a/rtp_llm/models_py/bindings/OpDefs.cc +++ b/rtp_llm/models_py/bindings/OpDefs.cc @@ -28,8 +28,8 @@ void registerPyOpDefs(pybind11::module& m) { .def_readwrite("use_mla", &KVCache::use_mla, "Whether MLA cache layout is used") .def_readwrite("kv_lora_rank", &KVCache::kv_lora_rank, "MLA KV LoRA rank") .def_readwrite("rope_head_dim", &KVCache::rope_head_dim, "MLA RoPE head dimension") - .def_readwrite("layer_attn_types", - &KVCache::layer_attn_types, + .def_readwrite("layer_group_types", 
+ &KVCache::layer_group_types, "Per-layer attention type (CacheGroupType::FULL or LINEAR). " "Empty = all layers treated as FULL (backward compatibility).") .def("get_layer_cache", diff --git a/rtp_llm/models_py/bindings/OpDefs.h b/rtp_llm/models_py/bindings/OpDefs.h index adf0d41656..37fa931b05 100644 --- a/rtp_llm/models_py/bindings/OpDefs.h +++ b/rtp_llm/models_py/bindings/OpDefs.h @@ -4,7 +4,7 @@ #include #include #include -#include "rtp_llm/cpp/cache/CacheGroupType.h" +#include "rtp_llm/cpp/cache/spec/CacheGroupType.h" #include "rtp_llm/cpp/model_utils/AttentionConfig.h" #include "rtp_llm/models_py/bindings/ParamsBase.h" #include "rtp_llm/cpp/utils/Logger.h" @@ -44,14 +44,14 @@ struct KVCache { int rope_head_dim = 0; // Per-layer attention type (CacheGroupType::FULL or LINEAR). - std::vector layer_attn_types; + std::vector layer_group_types; LayerKVCache getLayerCache(int idx) { LayerKVCache layer_cache; layer_cache.layer_id = idx; // Determine whether this layer is a full-attention layer. - if (idx < 0 || static_cast(idx) >= layer_attn_types.size()) + if (idx < 0 || static_cast(idx) >= layer_group_types.size()) throw std::runtime_error("Invalid layer index: " + std::to_string(idx)); auto base = kv_cache_base_by_layer[idx]; torch::Tensor scale; @@ -59,7 +59,7 @@ struct KVCache { scale = kv_scale_base_by_layer[idx]; } - const bool is_full = layer_attn_types[static_cast(idx)] == rtp_llm::CacheGroupType::FULL; + const bool is_full = layer_group_types[static_cast(idx)] == rtp_llm::CacheGroupType::FULL; if (!is_full) { // Linear/SSM attention layer: return the raw cache tensor unchanged. 
diff --git a/rtp_llm/models_py/bindings/core/ExecOps.cc b/rtp_llm/models_py/bindings/core/ExecOps.cc index 4ba2a154ce..ad98ca2713 100644 --- a/rtp_llm/models_py/bindings/core/ExecOps.cc +++ b/rtp_llm/models_py/bindings/core/ExecOps.cc @@ -2,7 +2,7 @@ #include "rtp_llm/models_py/bindings/core/CommonDefines.h" #include "rtp_llm/cpp/disaggregate/cache_store/CacheStore.h" #include "rtp_llm/cpp/utils/Logger.h" -#include "rtp_llm/cpp/cache/CacheGroupType.h" +#include "rtp_llm/cpp/cache/spec/CacheGroupType.h" #include "rtp_llm/cpp/utils/KVCacheUtils.h" #include "rtp_llm/cpp/utils/ErrorCode.h" #include "rtp_llm/cpp/utils/StackTrace.h" diff --git a/rtp_llm/models_py/bindings/core/TensorHolder.h b/rtp_llm/models_py/bindings/core/TensorHolder.h new file mode 100644 index 0000000000..c0db0e7009 --- /dev/null +++ b/rtp_llm/models_py/bindings/core/TensorHolder.h @@ -0,0 +1,40 @@ +#pragma once + +#include +#include + +#include + +namespace rtp_llm { + +struct TensorHolder { + static constexpr size_t kReleasedHoldRounds = 2; + + std::vector tensors; + std::queue> clear_tensors; + + void hold_host(const torch::Tensor& tensor) { + if (tensor.defined() && tensor.device().is_cpu()) { + tensors.push_back(tensor); + } + } + + void hold(const torch::Tensor& tensor) { + if (tensor.defined()) { + tensors.push_back(tensor); + } + } + + void release() { + // Move the current hold set into clear_tensors. Keep two released + // rounds alive so tensors created for async H2D/D2H copies or CUDA + // kernels are not freed until the third release point. 
+ clear_tensors.push(std::move(tensors)); + tensors.clear(); + while (clear_tensors.size() > kReleasedHoldRounds) { + clear_tensors.pop(); + } + } +}; + +} // namespace rtp_llm diff --git a/rtp_llm/models_py/bindings/cuda/NoBlockCopy.cc b/rtp_llm/models_py/bindings/cuda/NoBlockCopy.cc index 7b5fdcc70f..8e72ace7ed 100644 --- a/rtp_llm/models_py/bindings/cuda/NoBlockCopy.cc +++ b/rtp_llm/models_py/bindings/cuda/NoBlockCopy.cc @@ -1,7 +1,10 @@ #include "rtp_llm/models_py/bindings/NoBlockCopy.h" +#include "rtp_llm/models_py/bindings/common/kernels/sm_copy_kernel.h" #include "rtp_llm/models_py/bindings/cuda/SplitKvCacheCopy.h" #include "rtp_llm/models_py/bindings/cuda/cuda_host_utils.h" +#include +#include #include #include #include @@ -15,8 +18,189 @@ at::cuda::CUDAStream& getNoBlockCopyStream() { return stream; } +enum class HostCoverage { + Invalid, + Partial, + Full, +}; + +HostCoverage checkHostCoverage(const StagedMemoryCopyParams& params) { + std::vector> ranges; + ranges.reserve(params.tiles.size()); + for (const auto& tile : params.tiles) { + if (tile.bytes == 0) { + continue; + } + if (tile.host_offset > params.host_bytes || tile.bytes > params.host_bytes - tile.host_offset) { + return HostCoverage::Invalid; + } + ranges.emplace_back(tile.host_offset, tile.bytes); + } + std::sort(ranges.begin(), ranges.end()); + + size_t covered = 0; + bool has_gap = false; + for (const auto& [offset, bytes] : ranges) { + if (bytes == 0 || offset < covered) { + return HostCoverage::Invalid; + } + if (offset > covered) { + has_gap = true; + } + covered = offset + bytes; + } + if (covered > params.host_bytes) { + return HostCoverage::Invalid; + } + return (!has_gap && covered == params.host_bytes) ? 
HostCoverage::Full : HostCoverage::Partial; +} + +bool checkHostSegments(const StagedMemoryCopyParams& params) { + if (params.host_segments.empty()) { + return params.host_base != nullptr && params.host_bytes > 0; + } + + std::vector> ranges; + ranges.reserve(params.host_segments.size()); + for (const auto& segment : params.host_segments) { + if (segment.host == nullptr || segment.bytes == 0) { + return false; + } + if (segment.host_offset > params.host_bytes || segment.bytes > params.host_bytes - segment.host_offset) { + return false; + } + ranges.emplace_back(segment.host_offset, segment.bytes); + } + std::sort(ranges.begin(), ranges.end()); + + size_t covered = 0; + for (const auto& [offset, bytes] : ranges) { + if (offset < covered) { + return false; + } + covered = offset + bytes; + } + return covered <= params.host_bytes; +} + +void packHostSegments(const StagedMemoryCopyParams& params, void* host_staging) { + auto* base = static_cast(host_staging); + for (const auto& segment : params.host_segments) { + std::memcpy(base + segment.host_offset, segment.host, segment.bytes); + } +} + +void unpackHostSegments(const StagedMemoryCopyParams& params, const void* host_staging) { + const auto* base = static_cast(host_staging); + for (const auto& segment : params.host_segments) { + std::memcpy(segment.host, base + segment.host_offset, segment.bytes); + } +} + +void copyHostToPinnedStaging(const StagedMemoryCopyParams& params, void* host_staging) { + if (params.host_segments.empty()) { + std::memcpy(host_staging, params.host_base, params.host_bytes); + return; + } + packHostSegments(params, host_staging); +} + +void copyPinnedStagingToHost(const StagedMemoryCopyParams& params, const void* host_staging) { + if (params.host_segments.empty()) { + std::memcpy(params.host_base, host_staging, params.host_bytes); + return; + } + unpackHostSegments(params, host_staging); +} + +void releaseDevicePointer(void*& ptr) { + if (ptr != nullptr) { + (void)cudaFree(ptr); + ptr = nullptr; 
+ } +} + +void releaseMetadataScratch(StagedMemoryCopyScratch& scratch) { + releaseDevicePointer(scratch.device_ptrs); + releaseDevicePointer(scratch.device_offsets); + releaseDevicePointer(scratch.device_sizes); + scratch.meta_capacity = 0; +} + +bool ensureStagedMemoryCopyScratch(StagedMemoryCopyScratch& scratch, + int device_index, + size_t host_bytes, + size_t tile_num) { + if (scratch.device_index >= 0 && scratch.device_index != device_index) { + releaseStagedMemoryCopyScratch(scratch); + } + check_cuda_value(cudaSetDevice(device_index)); + scratch.device_index = device_index; + + if (scratch.host_capacity < host_bytes) { + if (scratch.host_staging != nullptr) { + (void)cudaFreeHost(scratch.host_staging); + scratch.host_staging = nullptr; + scratch.host_capacity = 0; + } + auto err = cudaHostAlloc(&scratch.host_staging, host_bytes, cudaHostAllocDefault); + if (err != cudaSuccess) { + RTP_LLM_LOG_WARNING("execStagedMemoryCopy failed to allocate pinned host staging: %s", + cudaGetErrorString(err)); + return false; + } + scratch.host_capacity = host_bytes; + } + + if (scratch.device_capacity < host_bytes) { + releaseDevicePointer(scratch.device_staging); + auto err = cudaMalloc(&scratch.device_staging, host_bytes); + if (err != cudaSuccess) { + scratch.device_capacity = 0; + RTP_LLM_LOG_WARNING("execStagedMemoryCopy failed to allocate device staging: %s", + cudaGetErrorString(err)); + return false; + } + scratch.device_capacity = host_bytes; + } + + if (scratch.meta_capacity < tile_num) { + releaseMetadataScratch(scratch); + auto err = cudaMalloc(&scratch.device_ptrs, tile_num * sizeof(void*)); + if (err == cudaSuccess) { + err = cudaMalloc(&scratch.device_offsets, tile_num * sizeof(size_t)); + } + if (err == cudaSuccess) { + err = cudaMalloc(&scratch.device_sizes, tile_num * sizeof(size_t)); + } + if (err != cudaSuccess) { + releaseMetadataScratch(scratch); + RTP_LLM_LOG_WARNING("execStagedMemoryCopy failed to allocate device metadata: %s", + 
cudaGetErrorString(err)); + return false; + } + scratch.meta_capacity = tile_num; + } + return true; +} + } // namespace +void releaseStagedMemoryCopyScratch(StagedMemoryCopyScratch& scratch) { + if (scratch.device_index >= 0) { + (void)cudaSetDevice(scratch.device_index); + } + if (scratch.host_staging != nullptr) { + (void)cudaFreeHost(scratch.host_staging); + } + releaseDevicePointer(scratch.device_staging); + releaseMetadataScratch(scratch); + scratch.host_staging = nullptr; + scratch.host_capacity = 0; + scratch.device_capacity = 0; + scratch.device_index = -1; +} + void execNoBlockCopy(const MultiCopyParams& params) { RTP_LLM_CHECK_WITH_INFO(params.multi_src.size() == params.multi_dst.size(), "multi_src.size(%zu) != multi_dst.size(%zu)", @@ -61,6 +245,203 @@ void execNoBlockCopy(const MultiCopyParams& params) { check_cuda_error(); } +bool execBatchedMemoryCopy(const BatchedMemoryCopyParams& params) { + if (params.tiles.empty()) { + return true; + } + if (params.device_index < 0) { + RTP_LLM_LOG_WARNING("execBatchedMemoryCopy failed: invalid device_index=%d", params.device_index); + return false; + } + +#if CUDART_VERSION >= 12080 + check_cuda_value(cudaSetDevice(params.device_index)); + auto stream = getNoBlockCopyStream().stream(); + + const size_t tile_num = params.tiles.size(); + std::vector dsts; + std::vector srcs; + std::vector sizes; + dsts.reserve(tile_num); + srcs.reserve(tile_num); + sizes.reserve(tile_num); + for (const auto& tile : params.tiles) { + if (tile.dst == nullptr || tile.src == nullptr || tile.bytes == 0) { + continue; + } + dsts.push_back(tile.dst); + srcs.push_back(tile.src); + sizes.push_back(tile.bytes); + } + if (dsts.empty()) { + return true; + } + + cudaMemcpyAttributes attr{}; + attr.srcAccessOrder = cudaMemcpySrcAccessOrderStream; + size_t attr_idx = 0; +#if CUDART_VERSION >= 13000 + auto err = cudaMemcpyBatchAsync(dsts.data(), srcs.data(), sizes.data(), dsts.size(), &attr, &attr_idx, 1, stream); +#else + std::vector 
mutable_srcs; + mutable_srcs.reserve(srcs.size()); + for (auto* src : srcs) { + mutable_srcs.push_back(const_cast(src)); + } + size_t fail_idx = 0; + auto err = cudaMemcpyBatchAsync( + dsts.data(), mutable_srcs.data(), sizes.data(), dsts.size(), &attr, &attr_idx, 1, &fail_idx, stream); +#endif + if (err == cudaSuccess) { + err = cudaStreamSynchronize(stream); + } + if (err != cudaSuccess) { + RTP_LLM_LOG_WARNING("execBatchedMemoryCopy failed: tiles=%zu, error=%s", dsts.size(), cudaGetErrorString(err)); + return false; + } + check_cuda_error(); + return true; +#else + RTP_LLM_LOG_DEBUG("execBatchedMemoryCopy unavailable: CUDART_VERSION=%d", CUDART_VERSION); + return false; +#endif +} + +bool execStagedMemoryCopy(const StagedMemoryCopyParams& params, StagedMemoryCopyScratch* scratch) { + if (params.tiles.empty()) { + return true; + } + if (params.device_index < 0 || params.host_bytes == 0 || !checkHostSegments(params)) { + RTP_LLM_LOG_WARNING("execStagedMemoryCopy failed: device=%d host_base=%p host_bytes=%zu host_segments=%zu", + params.device_index, + params.host_base, + params.host_bytes, + params.host_segments.size()); + return false; + } + const auto host_coverage = checkHostCoverage(params); + if (host_coverage == HostCoverage::Invalid) { + RTP_LLM_LOG_WARNING("execStagedMemoryCopy failed: invalid/overlapping host coverage, tiles=%zu bytes=%zu", + params.tiles.size(), + params.host_bytes); + return false; + } + + check_cuda_value(cudaSetDevice(params.device_index)); + auto stream = getNoBlockCopyStream().stream(); + + std::vector h_ptrs; + std::vector h_offsets; + std::vector h_sizes; + h_ptrs.reserve(params.tiles.size()); + h_offsets.reserve(params.tiles.size()); + h_sizes.reserve(params.tiles.size()); + for (const auto& tile : params.tiles) { + if (tile.gpu == nullptr || tile.bytes == 0) { + continue; + } + if (tile.host_offset > params.host_bytes || tile.bytes > params.host_bytes - tile.host_offset) { + RTP_LLM_LOG_WARNING("execStagedMemoryCopy failed: tile 
out of host span, off=%zu bytes=%zu host=%zu", + tile.host_offset, + tile.bytes, + params.host_bytes); + return false; + } + h_ptrs.push_back(tile.gpu); + h_offsets.push_back(tile.host_offset); + h_sizes.push_back(tile.bytes); + } + if (h_ptrs.empty()) { + return true; + } + + StagedMemoryCopyScratch local_scratch; + auto* work_scratch = scratch != nullptr ? scratch : &local_scratch; + auto cleanup_local_scratch = [&]() { + if (scratch == nullptr) { + releaseStagedMemoryCopyScratch(local_scratch); + } + }; + + const size_t tile_num = h_ptrs.size(); + if (!ensureStagedMemoryCopyScratch(*work_scratch, params.device_index, params.host_bytes, tile_num)) { + cleanup_local_scratch(); + return false; + } + + auto err = cudaMemcpyAsync( + work_scratch->device_ptrs, h_ptrs.data(), tile_num * sizeof(void*), cudaMemcpyHostToDevice, stream); + if (err == cudaSuccess) { + err = cudaMemcpyAsync(work_scratch->device_offsets, + h_offsets.data(), + tile_num * sizeof(size_t), + cudaMemcpyHostToDevice, + stream); + } + if (err == cudaSuccess) { + err = cudaMemcpyAsync( + work_scratch->device_sizes, h_sizes.data(), tile_num * sizeof(size_t), cudaMemcpyHostToDevice, stream); + } + + if (err == cudaSuccess && params.direction == StagedMemoryCopyDirection::H2D) { + copyHostToPinnedStaging(params, work_scratch->host_staging); + err = cudaMemcpyAsync(work_scratch->device_staging, + work_scratch->host_staging, + params.host_bytes, + cudaMemcpyHostToDevice, + stream); + if (err == cudaSuccess) { + sDevMPS::launch_scatter_copy_var_nooffset( + work_scratch->device_staging, + reinterpret_cast(work_scratch->device_offsets), + reinterpret_cast(work_scratch->device_sizes), + reinterpret_cast(work_scratch->device_ptrs), + static_cast(tile_num), + 0, + stream); + err = cudaGetLastError(); + } + } else if (err == cudaSuccess) { + sDevMPS::launch_gather_copy_var_nooffset( + reinterpret_cast(work_scratch->device_ptrs), + reinterpret_cast(work_scratch->device_sizes), + 
reinterpret_cast(work_scratch->device_offsets), + work_scratch->device_staging, + static_cast(tile_num), + 0, + stream); + err = cudaGetLastError(); + if (err == cudaSuccess) { + err = cudaMemcpyAsync(work_scratch->host_staging, + work_scratch->device_staging, + params.host_bytes, + cudaMemcpyDeviceToHost, + stream); + } + } + + if (err == cudaSuccess) { + err = cudaStreamSynchronize(stream); + } else { + (void)cudaStreamSynchronize(stream); + } + if (err == cudaSuccess && params.direction == StagedMemoryCopyDirection::D2H) { + copyPinnedStagingToHost(params, work_scratch->host_staging); + } + if (err != cudaSuccess) { + RTP_LLM_LOG_WARNING("execStagedMemoryCopy failed: tiles=%zu bytes=%zu direction=%s error=%s", + tile_num, + params.host_bytes, + params.direction == StagedMemoryCopyDirection::H2D ? "H2D" : "D2H", + cudaGetErrorString(err)); + cleanup_local_scratch(); + return false; + } + cleanup_local_scratch(); + check_cuda_error(); + return true; +} + void warmupNoBlockCopy() { if (!warmupSplitKvCopyKernels(at::cuda::getCurrentCUDAStream().stream())) { RTP_LLM_LOG_WARNING("warmupSplitKvCopyKernels failed; split-KV copy may JIT on first use"); diff --git a/rtp_llm/models_py/bindings/cuda/kernels/attention_input_metadata.cu b/rtp_llm/models_py/bindings/cuda/kernels/attention_input_metadata.cu new file mode 100644 index 0000000000..599eda7709 --- /dev/null +++ b/rtp_llm/models_py/bindings/cuda/kernels/attention_input_metadata.cu @@ -0,0 +1,107 @@ +#include "rtp_llm/models_py/bindings/cuda/kernels/attention_input_metadata.h" + +#include +#include +#include + +namespace rtp_llm { + +namespace { + +__global__ void buildAttentionInputMetadataKernel(const int32_t* __restrict__ input_lengths, + const int32_t* __restrict__ prefix_lengths, + int32_t* __restrict__ cu_seqlens, + int32_t* __restrict__ cu_kv_seqlens, + int32_t* __restrict__ padding_offset, + int32_t batch_size, + int32_t total_tokens) { + if (blockIdx.x != 0 || threadIdx.x != 0) { + return; + } + + int32_t 
max_input_len = 0; + int32_t q_acc = 0; + int32_t kv_acc = 0; + cu_seqlens[0] = 0; + cu_kv_seqlens[0] = 0; + + for (int32_t b = 0; b < batch_size; ++b) { + const int32_t input_len = input_lengths[b]; + const int32_t prefix_len = prefix_lengths ? prefix_lengths[b] : 0; + max_input_len = max_input_len > input_len ? max_input_len : input_len; + q_acc += input_len; + kv_acc += input_len + prefix_len; + cu_seqlens[b + 1] = q_acc; + cu_kv_seqlens[b + 1] = kv_acc; + } + + if (!padding_offset || total_tokens <= 0) { + return; + } + + int32_t out_idx = 0; + int32_t cum_offset = 0; + for (int32_t b = 0; b < batch_size; ++b) { + const int32_t input_len = input_lengths[b]; + for (int32_t j = 0; j < input_len && out_idx < total_tokens; ++j) { + padding_offset[out_idx++] = cum_offset; + } + cum_offset += max_input_len - input_len; + } +} + +} // namespace + +void invokeBuildAttentionInputMetadata(const at::Tensor& input_lengths, + const at::Tensor& prefix_lengths, + at::Tensor& cu_seqlens, + at::Tensor& cu_kv_seqlens, + at::Tensor& padding_offset, + cudaStream_t stream) { + TORCH_CHECK(input_lengths.defined(), "input_lengths must be defined"); + TORCH_CHECK(input_lengths.is_cuda(), "input_lengths must be a CUDA tensor"); + TORCH_CHECK(input_lengths.scalar_type() == at::kInt, "input_lengths must be int32"); + TORCH_CHECK(input_lengths.is_contiguous(), "input_lengths must be contiguous"); + TORCH_CHECK(!prefix_lengths.defined() || prefix_lengths.numel() == 0 || prefix_lengths.is_cuda(), + "prefix_lengths must be CUDA or empty"); + TORCH_CHECK(!prefix_lengths.defined() || prefix_lengths.numel() == 0 || prefix_lengths.scalar_type() == at::kInt, + "prefix_lengths must be int32"); + TORCH_CHECK(cu_seqlens.is_cuda() && cu_seqlens.scalar_type() == at::kInt, "cu_seqlens must be CUDA int32"); + TORCH_CHECK(cu_kv_seqlens.is_cuda() && cu_kv_seqlens.scalar_type() == at::kInt, "cu_kv_seqlens must be CUDA int32"); + TORCH_CHECK(!padding_offset.defined() || padding_offset.is_cuda(), 
"padding_offset must be CUDA"); + + const auto batch_size = static_cast(input_lengths.size(0)); + const auto total_tokens = padding_offset.defined() ? static_cast(padding_offset.numel()) : 0; + if (batch_size == 0) { + if (cu_seqlens.numel() > 0) { + cu_seqlens.zero_(); + } + if (cu_kv_seqlens.numel() > 0) { + cu_kv_seqlens.zero_(); + } + if (padding_offset.defined() && padding_offset.numel() > 0) { + padding_offset.zero_(); + } + return; + } + + const int32_t* prefix_ptr = nullptr; + if (prefix_lengths.defined() && prefix_lengths.numel() > 0) { + TORCH_CHECK(prefix_lengths.is_contiguous(), "prefix_lengths must be contiguous"); + TORCH_CHECK(prefix_lengths.size(0) >= batch_size, "prefix_lengths size must cover input_lengths"); + prefix_ptr = prefix_lengths.data_ptr(); + } + + buildAttentionInputMetadataKernel<<<1, 1, 0, stream>>>( + input_lengths.data_ptr(), + prefix_ptr, + cu_seqlens.data_ptr(), + cu_kv_seqlens.data_ptr(), + padding_offset.defined() && padding_offset.numel() > 0 ? padding_offset.data_ptr() : nullptr, + batch_size, + total_tokens); + const auto result = cudaGetLastError(); + TORCH_CHECK(result == cudaSuccess, "build attention input metadata kernel failed: ", cudaGetErrorString(result)); +} + +} // namespace rtp_llm diff --git a/rtp_llm/models_py/bindings/cuda/kernels/attention_input_metadata.h b/rtp_llm/models_py/bindings/cuda/kernels/attention_input_metadata.h new file mode 100644 index 0000000000..866223e499 --- /dev/null +++ b/rtp_llm/models_py/bindings/cuda/kernels/attention_input_metadata.h @@ -0,0 +1,15 @@ +#pragma once + +#include +#include + +namespace rtp_llm { + +void invokeBuildAttentionInputMetadata(const at::Tensor& input_lengths, + const at::Tensor& prefix_lengths, + at::Tensor& cu_seqlens, + at::Tensor& cu_kv_seqlens, + at::Tensor& padding_offset, + cudaStream_t stream); + +} // namespace rtp_llm diff --git a/rtp_llm/models_py/bindings/cuda/kernels/cuda_graph_prepare.cu b/rtp_llm/models_py/bindings/cuda/kernels/cuda_graph_prepare.cu 
new file mode 100644 index 0000000000..db07d768a0 --- /dev/null +++ b/rtp_llm/models_py/bindings/cuda/kernels/cuda_graph_prepare.cu @@ -0,0 +1,373 @@ +#include "rtp_llm/models_py/bindings/cuda/kernels/cuda_graph_prepare.h" + +#include +#include +#include + +namespace rtp_llm { + +namespace { + +__global__ void cudaGraphPrepareFillKernel(CudaGraphPrepareFillParams params) { + const int64_t tid = static_cast(blockIdx.x) * blockDim.x + threadIdx.x; + const int64_t stride = static_cast(blockDim.x) * gridDim.x; + + for (int32_t region_idx = 0; region_idx < params.region_count; ++region_idx) { + const auto region = params.regions[region_idx]; + if (region.ptr == nullptr || region.count <= 0) { + continue; + } + for (int64_t i = tid; i < region.count; i += stride) { + region.ptr[i] = region.value; + } + } +} + +__global__ void prepareFlashInferDecodeParamsKernel(const int32_t* sequence_lengths_plus_1, + const int32_t* block_ids, + int32_t* batch_indice, + int32_t* page_indice, + int32_t* decode_page_indptr, + int32_t* paged_kv_last_page_len, + int32_t* qo_indptr, + int32_t* kvlen, + int32_t* positions, + int64_t* slot_mapping, + int32_t batch_size, + int32_t max_blocks_per_batch, + int32_t seq_size_per_block, + int32_t captured_batch_capacity) { + // Replay path is small-batch metadata; one CUDA block avoids any host prefix-sum. + if (threadIdx.x != 0 || blockIdx.x != 0) { + return; + } + + int32_t page_offset = 0; + decode_page_indptr[0] = 0; + qo_indptr[0] = 0; + const int32_t safe_page_sz = seq_size_per_block > 0 ? seq_size_per_block : 1; + + for (int32_t batch = 0; batch < batch_size; ++batch) { + const int32_t seq_len = sequence_lengths_plus_1[batch] > 1 ? 
sequence_lengths_plus_1[batch] : 1; + const int32_t pages = (seq_len + safe_page_sz - 1) / safe_page_sz; + + batch_indice[batch] = batch; + positions[batch] = seq_len - 1; + kvlen[batch] = seq_len; + paged_kv_last_page_len[batch] = (seq_len - 1) % safe_page_sz + 1; + const int32_t block_index = (seq_len - 1) / safe_page_sz; + const int32_t block_offset = (seq_len - 1) % safe_page_sz; + const int32_t block_number = + block_index < max_blocks_per_batch ? block_ids[batch * max_blocks_per_batch + block_index] : 0; + slot_mapping[batch] = static_cast(block_number) * safe_page_sz + static_cast(block_offset); + + const int32_t pages_to_copy = pages < max_blocks_per_batch ? pages : max_blocks_per_batch; + for (int32_t page = 0; page < pages_to_copy; ++page) { + page_indice[page_offset + page] = block_ids[batch * max_blocks_per_batch + page]; + } + page_offset += pages_to_copy; + decode_page_indptr[batch + 1] = page_offset; + qo_indptr[batch + 1] = batch + 1; + } + + // Decode CUDA graph replay can use a graph captured for a larger batch + // than the current live batch. Clear stale entries so the captured kernels + // do not process phantom rows with old kvlen/page metadata and block_id=0. + for (int32_t batch = batch_size; batch < captured_batch_capacity; ++batch) { + batch_indice[batch] = 0; + positions[batch] = 0; + kvlen[batch] = 0; + paged_kv_last_page_len[batch] = 0; + slot_mapping[batch] = -1; + decode_page_indptr[batch + 1] = page_offset; + qo_indptr[batch + 1] = batch_size; + } +} + +// Generic prefill cuda graph metadata kernel. Used by both: +// - target verify (SparseMla, with sparse-specific outputs) +// - draft prefill (FlashInfer, sparse-specific outputs as nullptr) +// Pass nullptr for ks/ke/expanded_seq_lens/topk_indices_offset to skip those. 
+__global__ void prepareSparseMlaTargetVerifyParamsKernel(const int32_t* input_lengths, + const int32_t* prefix_lengths, + const int32_t* block_ids, + int32_t* batch_indice, + int32_t* page_indice, + int32_t* decode_page_indptr, + int32_t* paged_kv_last_page_len, + int32_t* qo_indptr, + int32_t* prefill_ragged_kv_len_indptr, + int32_t* kvlen, + int32_t* positions, + int64_t* slot_mapping, + int32_t* expanded_seq_lens, + int32_t* topk_indices_offset, + int32_t* ks, + int32_t* ke, + int32_t batch_size, + int32_t max_blocks_per_batch, + int32_t seq_size_per_block, + int32_t captured_batch_capacity, + int32_t captured_total_tokens) { + if (threadIdx.x != 0 || blockIdx.x != 0) { + return; + } + + const int32_t safe_page_sz = seq_size_per_block > 0 ? seq_size_per_block : 1; + int32_t token_offset = 0; + int32_t page_offset = 0; + int32_t accu_kv_len = 0; + int32_t k_offset = 0; + + decode_page_indptr[0] = 0; + qo_indptr[0] = 0; + prefill_ragged_kv_len_indptr[0] = 0; + + for (int32_t i = 0; i < batch_size; ++i) { + const int32_t input_len = input_lengths[i]; + const int32_t prefix_len = prefix_lengths[i]; + const int32_t kv_len = input_len + prefix_len; + + for (int32_t j = 0; j < input_len; ++j) { + const int32_t position = j + prefix_len; + batch_indice[token_offset] = i; + positions[token_offset] = position; + const int32_t seq_len_value = kv_len - input_len + 1 + j; + if (expanded_seq_lens != nullptr) { + expanded_seq_lens[token_offset] = seq_len_value; + } + if (topk_indices_offset != nullptr) { + topk_indices_offset[token_offset] = 0; + } + if (ks != nullptr) { + ks[token_offset] = k_offset; + } + if (ke != nullptr) { + ke[token_offset] = k_offset + seq_len_value; + } + + // slot_mapping: physical KV cache slot for this token + const int32_t block_index = position / safe_page_sz; + const int32_t block_offset = position % safe_page_sz; + const int32_t block_number = + block_index < max_blocks_per_batch ? 
block_ids[i * max_blocks_per_batch + block_index] : 0; + slot_mapping[token_offset] = + static_cast(block_number) * safe_page_sz + static_cast(block_offset); + + token_offset++; + } + k_offset += kv_len; + accu_kv_len += kv_len; + + kvlen[i] = kv_len; + paged_kv_last_page_len[i] = (kv_len - 1) % safe_page_sz + 1; + const int32_t pages = (kv_len + safe_page_sz - 1) / safe_page_sz; + const int32_t pages_to_copy = pages < max_blocks_per_batch ? pages : max_blocks_per_batch; + for (int32_t p = 0; p < pages_to_copy; ++p) { + page_indice[page_offset + p] = block_ids[i * max_blocks_per_batch + p]; + } + page_offset += pages_to_copy; + + decode_page_indptr[i + 1] = page_offset; + qo_indptr[i + 1] = token_offset; + prefill_ragged_kv_len_indptr[i + 1] = accu_kv_len; + } + + // Zero-fill stale entries beyond the active batch to prevent CUDA graph + // replay from processing phantom batch elements with stale metadata. + for (int32_t i = batch_size; i < captured_batch_capacity; ++i) { + kvlen[i] = 0; + paged_kv_last_page_len[i] = 0; + decode_page_indptr[i + 1] = page_offset; + qo_indptr[i + 1] = token_offset; + prefill_ragged_kv_len_indptr[i + 1] = accu_kv_len; + } + for (int32_t t = token_offset; t < captured_total_tokens; ++t) { + batch_indice[t] = 0; + positions[t] = 0; + if (slot_mapping != nullptr) + slot_mapping[t] = -1; + if (expanded_seq_lens != nullptr) + expanded_seq_lens[t] = 0; + if (topk_indices_offset != nullptr) + topk_indices_offset[t] = 0; + if (ks != nullptr) + ks[t] = 0; + if (ke != nullptr) + ke[t] = 0; + } +} + +} // namespace + +void invokeCudaGraphPrepareFill(CudaGraphPrepareFillParams params, cudaStream_t stream) { + TORCH_CHECK(params.region_count >= 0 && params.region_count <= kMaxCudaGraphPrepareFillRegions, + "invalid cuda graph prepare fill region count: ", + params.region_count); + + int64_t total_count = 0; + for (int32_t i = 0; i < params.region_count; ++i) { + total_count += params.regions[i].count > 0 ? 
params.regions[i].count : 0; + } + if (total_count <= 0) { + return; + } + + constexpr int block_size = 256; + const int blocks = static_cast(std::min((total_count + block_size - 1) / block_size, 1024)); + cudaGraphPrepareFillKernel<<>>(params); + const auto result = cudaGetLastError(); + TORCH_CHECK(result == cudaSuccess, "cuda graph prepare fill kernel failed: ", cudaGetErrorString(result)); +} + +void invokePrepareFlashInferDecodeParams(const int32_t* sequence_lengths_plus_1, + const int32_t* block_ids, + int32_t* batch_indice, + int32_t* page_indice, + int32_t* decode_page_indptr, + int32_t* paged_kv_last_page_len, + int32_t* qo_indptr, + int32_t* kvlen, + int32_t* positions, + int64_t* slot_mapping, + int32_t batch_size, + int32_t max_blocks_per_batch, + int32_t seq_size_per_block, + int32_t captured_batch_capacity, + cudaStream_t stream) { + TORCH_CHECK(sequence_lengths_plus_1 != nullptr, "sequence_lengths_plus_1 is null"); + TORCH_CHECK(block_ids != nullptr, "block_ids is null"); + TORCH_CHECK(batch_indice != nullptr && page_indice != nullptr && decode_page_indptr != nullptr + && paged_kv_last_page_len != nullptr && qo_indptr != nullptr && kvlen != nullptr + && positions != nullptr && slot_mapping != nullptr, + "FlashInfer decode metadata output buffer is null"); + if (batch_size <= 0 || max_blocks_per_batch <= 0) { + return; + } + prepareFlashInferDecodeParamsKernel<<<1, 1, 0, stream>>>(sequence_lengths_plus_1, + block_ids, + batch_indice, + page_indice, + decode_page_indptr, + paged_kv_last_page_len, + qo_indptr, + kvlen, + positions, + slot_mapping, + batch_size, + max_blocks_per_batch, + seq_size_per_block, + captured_batch_capacity); + const auto result = cudaGetLastError(); + TORCH_CHECK( + result == cudaSuccess, "FlashInfer decode CUDA graph prepare kernel failed: ", cudaGetErrorString(result)); +} + +// Non-sparse prefill cuda graph kernel — sparse-specific outputs nullptr. 
+void invokePrepareFlashInferPrefillParams(const int32_t* input_lengths, + const int32_t* prefix_lengths, + const int32_t* block_ids, + int32_t* batch_indice, + int32_t* page_indice, + int32_t* decode_page_indptr, + int32_t* paged_kv_last_page_len, + int32_t* qo_indptr, + int32_t* prefill_ragged_kv_len_indptr, + int32_t* kvlen, + int32_t* positions, + int64_t* slot_mapping, + int32_t batch_size, + int32_t max_blocks_per_batch, + int32_t seq_size_per_block, + int32_t captured_total_tokens, + cudaStream_t stream) { + TORCH_CHECK(input_lengths != nullptr, "input_lengths is null"); + TORCH_CHECK(prefix_lengths != nullptr, "prefix_lengths is null"); + TORCH_CHECK(block_ids != nullptr, "block_ids is null"); + TORCH_CHECK(slot_mapping != nullptr, "slot_mapping is null"); + if (batch_size <= 0 || max_blocks_per_batch <= 0) { + return; + } + prepareSparseMlaTargetVerifyParamsKernel<<<1, 1, 0, stream>>>(input_lengths, + prefix_lengths, + block_ids, + batch_indice, + page_indice, + decode_page_indptr, + paged_kv_last_page_len, + qo_indptr, + prefill_ragged_kv_len_indptr, + kvlen, + positions, + slot_mapping, + /*expanded_seq_lens=*/nullptr, + /*topk_indices_offset=*/nullptr, + /*ks=*/nullptr, + /*ke=*/nullptr, + batch_size, + max_blocks_per_batch, + seq_size_per_block, + batch_size, + captured_total_tokens); + const auto result = cudaGetLastError(); + TORCH_CHECK( + result == cudaSuccess, "FlashInfer prefill CUDA graph prepare kernel failed: ", cudaGetErrorString(result)); +} + +void invokePrepareSparseMlaTargetVerifyParams(const int32_t* input_lengths, + const int32_t* prefix_lengths, + const int32_t* block_ids, + int32_t* batch_indice, + int32_t* page_indice, + int32_t* decode_page_indptr, + int32_t* paged_kv_last_page_len, + int32_t* qo_indptr, + int32_t* prefill_ragged_kv_len_indptr, + int32_t* kvlen, + int32_t* positions, + int64_t* slot_mapping, + int32_t* expanded_seq_lens, + int32_t* topk_indices_offset, + int32_t* ks, + int32_t* ke, + int32_t batch_size, + int32_t 
max_blocks_per_batch, + int32_t seq_size_per_block, + int32_t captured_batch_capacity, + int32_t captured_total_tokens, + cudaStream_t stream) { + TORCH_CHECK(input_lengths != nullptr, "input_lengths is null"); + TORCH_CHECK(prefix_lengths != nullptr, "prefix_lengths is null"); + TORCH_CHECK(block_ids != nullptr, "block_ids is null"); + TORCH_CHECK(slot_mapping != nullptr, "slot_mapping is null"); + if (batch_size <= 0 || max_blocks_per_batch <= 0) { + return; + } + prepareSparseMlaTargetVerifyParamsKernel<<<1, 1, 0, stream>>>(input_lengths, + prefix_lengths, + block_ids, + batch_indice, + page_indice, + decode_page_indptr, + paged_kv_last_page_len, + qo_indptr, + prefill_ragged_kv_len_indptr, + kvlen, + positions, + slot_mapping, + expanded_seq_lens, + topk_indices_offset, + ks, + ke, + batch_size, + max_blocks_per_batch, + seq_size_per_block, + captured_batch_capacity, + captured_total_tokens); + const auto result = cudaGetLastError(); + TORCH_CHECK(result == cudaSuccess, + "SparseMLA target verify CUDA graph prepare kernel failed: ", + cudaGetErrorString(result)); +} + +} // namespace rtp_llm diff --git a/rtp_llm/models_py/bindings/cuda/kernels/cuda_graph_prepare.h b/rtp_llm/models_py/bindings/cuda/kernels/cuda_graph_prepare.h new file mode 100644 index 0000000000..50de2495d9 --- /dev/null +++ b/rtp_llm/models_py/bindings/cuda/kernels/cuda_graph_prepare.h @@ -0,0 +1,80 @@ +#pragma once + +#include +#include + +namespace rtp_llm { + +constexpr int kMaxCudaGraphPrepareFillRegions = 32; + +struct CudaGraphPrepareFillRegion { + int32_t* ptr = nullptr; + int64_t count = 0; + int32_t value = 0; +}; + +struct CudaGraphPrepareFillParams { + int32_t region_count = 0; + CudaGraphPrepareFillRegion regions[kMaxCudaGraphPrepareFillRegions]; +}; + +void invokeCudaGraphPrepareFill(CudaGraphPrepareFillParams params, cudaStream_t stream); + +void invokePrepareFlashInferDecodeParams(const int32_t* sequence_lengths_plus_1, + const int32_t* block_ids, + int32_t* batch_indice, + 
int32_t* page_indice, + int32_t* decode_page_indptr, + int32_t* paged_kv_last_page_len, + int32_t* qo_indptr, + int32_t* kvlen, + int32_t* positions, + int64_t* slot_mapping, + int32_t batch_size, + int32_t max_blocks_per_batch, + int32_t seq_size_per_block, + int32_t captured_batch_capacity, + cudaStream_t stream); + +void invokePrepareFlashInferPrefillParams(const int32_t* input_lengths, + const int32_t* prefix_lengths, + const int32_t* block_ids, + int32_t* batch_indice, + int32_t* page_indice, + int32_t* decode_page_indptr, + int32_t* paged_kv_last_page_len, + int32_t* qo_indptr, + int32_t* prefill_ragged_kv_len_indptr, + int32_t* kvlen, + int32_t* positions, + int64_t* slot_mapping, + int32_t batch_size, + int32_t max_blocks_per_batch, + int32_t seq_size_per_block, + int32_t captured_total_tokens, + cudaStream_t stream); + +void invokePrepareSparseMlaTargetVerifyParams(const int32_t* input_lengths, + const int32_t* prefix_lengths, + const int32_t* block_ids, + int32_t* batch_indice, + int32_t* page_indice, + int32_t* decode_page_indptr, + int32_t* paged_kv_last_page_len, + int32_t* qo_indptr, + int32_t* prefill_ragged_kv_len_indptr, + int32_t* kvlen, + int32_t* positions, + int64_t* slot_mapping, + int32_t* expanded_seq_lens, + int32_t* topk_indices_offset, + int32_t* ks, + int32_t* ke, + int32_t batch_size, + int32_t max_blocks_per_batch, + int32_t seq_size_per_block, + int32_t captured_batch_capacity, + int32_t captured_total_tokens, + cudaStream_t stream); + +} // namespace rtp_llm diff --git a/rtp_llm/models_py/bindings/cuda/kernels/mtp_target_verify_prepare.cu b/rtp_llm/models_py/bindings/cuda/kernels/mtp_target_verify_prepare.cu new file mode 100644 index 0000000000..4e36819bc3 --- /dev/null +++ b/rtp_llm/models_py/bindings/cuda/kernels/mtp_target_verify_prepare.cu @@ -0,0 +1,333 @@ +#include "rtp_llm/models_py/bindings/cuda/kernels/mtp_target_verify_prepare.h" + +#include "rtp_llm/cpp/utils/AssertUtils.h" + +#include + +namespace rtp_llm { + +namespace 
{ + +__global__ void mtpTargetVerifyPrepareKernel(const int32_t* __restrict__ sequence_lengths, + int32_t* __restrict__ input_lengths, + int32_t* __restrict__ prefix_lengths, + int32_t* __restrict__ sequence_lengths_plus_1, + int32_t* __restrict__ lm_output_indexes, + int32_t tokens_per_batch, + int32_t batch_size) { + const int32_t idx = static_cast(blockIdx.x * blockDim.x + threadIdx.x); + if (idx >= batch_size) { + return; + } + input_lengths[idx] = tokens_per_batch; + prefix_lengths[idx] = sequence_lengths[idx]; + sequence_lengths_plus_1[idx] = sequence_lengths[idx] + 1; + lm_output_indexes[idx] = idx * tokens_per_batch; +} + +__global__ void mtpSpecDecodeMetadataPrepareKernel(int32_t* __restrict__ input_lengths, + int32_t* __restrict__ lm_output_indexes, + int32_t tokens_per_batch, + int32_t batch_size) { + const int32_t total_tokens = batch_size * tokens_per_batch; + const int32_t idx = static_cast(blockIdx.x * blockDim.x + threadIdx.x); + if (idx < batch_size) { + input_lengths[idx] = tokens_per_batch; + } + if (idx < total_tokens) { + lm_output_indexes[idx] = idx; + } +} + +__global__ void mtpSpecDecodeTokensMetadataPrepareKernel(const int32_t* __restrict__ token0, + const int32_t* __restrict__ token1, + const int32_t* __restrict__ token2, + const int32_t* __restrict__ token3, + const int32_t* __restrict__ token4, + const int32_t* __restrict__ token5, + const int32_t* __restrict__ token6, + const int32_t* __restrict__ token7, + int32_t* __restrict__ spec_tokens, + int32_t* __restrict__ input_lengths, + int32_t* __restrict__ lm_output_indexes, + int32_t tokens_per_batch, + int32_t batch_size) { + const int32_t total_tokens = batch_size * tokens_per_batch; + const int32_t idx = static_cast(blockIdx.x * blockDim.x + threadIdx.x); + if (idx >= total_tokens) { + return; + } + + const int32_t batch_idx = idx / tokens_per_batch; + const int32_t token_idx = idx - batch_idx * tokens_per_batch; + const int32_t* src = nullptr; + switch (token_idx) { + case 0: + src = 
token0; + break; + case 1: + src = token1; + break; + case 2: + src = token2; + break; + case 3: + src = token3; + break; + case 4: + src = token4; + break; + case 5: + src = token5; + break; + case 6: + src = token6; + break; + case 7: + src = token7; + break; + } + + spec_tokens[idx] = src[batch_idx]; + lm_output_indexes[idx] = idx; + if (token_idx == 0) { + input_lengths[batch_idx] = tokens_per_batch; + } +} + +__global__ void mtpPrefillShiftAppendKernel(const int32_t* __restrict__ combo_tokens_in, + const int32_t* __restrict__ input_lengths, + const int32_t* __restrict__ batch_offsets, + const int32_t* __restrict__ new_all_token_ids, + int32_t* __restrict__ combo_tokens_out, + int32_t token_stride, + int32_t batch_size, + int32_t total_tokens) { + const int32_t global_idx = static_cast(blockIdx.x * blockDim.x + threadIdx.x); + if (global_idx >= total_tokens) { + return; + } + // Binary search for the batch this token belongs to. batch_offsets[b] holds + // the exclusive end offset for batch b (i.e. cumulative input_lengths up to b+1). + int32_t lo = 0; + int32_t hi = batch_size - 1; + while (lo < hi) { + const int32_t mid = lo + ((hi - lo) >> 1); + if (batch_offsets[mid] <= global_idx) { + lo = mid + 1; + } else { + hi = mid; + } + } + const int32_t batch_idx = lo; + const int32_t batch_start = (batch_idx == 0) ? 0 : batch_offsets[batch_idx - 1]; + const int32_t position_in_batch = global_idx - batch_start; + const int32_t input_length = input_lengths[batch_idx]; + + if (position_in_batch == input_length - 1) { + // Last position: write the new accepted token (last column of new_all_token_ids). + combo_tokens_out[global_idx] = new_all_token_ids[batch_idx * token_stride + token_stride - 1]; + } else if (position_in_batch < input_length - 1) { + // Shift left by 1: out[i] = in[i+1] within the batch. 
+ combo_tokens_out[global_idx] = combo_tokens_in[global_idx + 1]; + } +} + +void checkCudaI32Vector(const torch::Tensor& tensor, const char* name, int64_t batch_size) { + RTP_LLM_CHECK_WITH_INFO(tensor.defined(), "%s must be defined", name); + RTP_LLM_CHECK_WITH_INFO(tensor.is_cuda(), "%s must be CUDA", name); + RTP_LLM_CHECK_WITH_INFO(tensor.scalar_type() == torch::kInt32, "%s must be int32", name); + RTP_LLM_CHECK_WITH_INFO(tensor.is_contiguous(), "%s must be contiguous", name); + RTP_LLM_CHECK_WITH_INFO( + tensor.numel() >= batch_size, "%s numel %ld is smaller than batch_size %ld", name, tensor.numel(), batch_size); +} + +} // namespace + +void invokeMtpTargetVerifyPrepare(const torch::Tensor& sequence_lengths, + torch::Tensor& input_lengths, + torch::Tensor& prefix_lengths, + torch::Tensor& sequence_lengths_plus_1, + torch::Tensor& lm_output_indexes, + int32_t tokens_per_batch, + cudaStream_t stream) { + const int64_t batch_size = input_lengths.numel(); + if (batch_size <= 0) { + return; + } + checkCudaI32Vector(sequence_lengths, "sequence_lengths", batch_size); + checkCudaI32Vector(input_lengths, "input_lengths", batch_size); + checkCudaI32Vector(prefix_lengths, "prefix_lengths", batch_size); + checkCudaI32Vector(sequence_lengths_plus_1, "sequence_lengths_plus_1", batch_size); + checkCudaI32Vector(lm_output_indexes, "lm_output_indexes", batch_size); + + constexpr int block_size = 256; + const int grid_size = static_cast((batch_size + block_size - 1) / block_size); + mtpTargetVerifyPrepareKernel<<>>(sequence_lengths.data_ptr(), + input_lengths.data_ptr(), + prefix_lengths.data_ptr(), + sequence_lengths_plus_1.data_ptr(), + lm_output_indexes.data_ptr(), + tokens_per_batch, + static_cast(batch_size)); +} + +void invokeMtpSpecDecodeMetadataPrepare(torch::Tensor& input_lengths, + torch::Tensor& lm_output_indexes, + int32_t tokens_per_batch, + cudaStream_t stream) { + const int64_t batch_size = input_lengths.numel(); + if (batch_size <= 0) { + return; + } + 
checkCudaI32Vector(input_lengths, "input_lengths", batch_size); + const int64_t total_tokens = batch_size * tokens_per_batch; + checkCudaI32Vector(lm_output_indexes, "lm_output_indexes", total_tokens); + + constexpr int block_size = 256; + const int64_t work_items = std::max(batch_size, total_tokens); + const int grid_size = static_cast((work_items + block_size - 1) / block_size); + mtpSpecDecodeMetadataPrepareKernel<<>>(input_lengths.data_ptr(), + lm_output_indexes.data_ptr(), + tokens_per_batch, + static_cast(batch_size)); +} + +void invokeMtpSpecDecodeTokensMetadataPrepare(const std::vector& token_columns, + torch::Tensor& spec_tokens, + torch::Tensor& input_lengths, + torch::Tensor& lm_output_indexes, + int32_t tokens_per_batch, + cudaStream_t stream) { + RTP_LLM_CHECK_WITH_INFO(tokens_per_batch > 0, "tokens_per_batch must be positive"); + RTP_LLM_CHECK_WITH_INFO(tokens_per_batch <= 8, "tokens_per_batch %d exceeds fused kernel max 8", tokens_per_batch); + RTP_LLM_CHECK_WITH_INFO(static_cast(token_columns.size()) == tokens_per_batch, + "token_columns size %ld must equal tokens_per_batch %d", + token_columns.size(), + tokens_per_batch); + + const int64_t batch_size = input_lengths.numel(); + if (batch_size <= 0) { + return; + } + const int64_t total_tokens = batch_size * tokens_per_batch; + checkCudaI32Vector(spec_tokens, "spec_tokens", total_tokens); + checkCudaI32Vector(input_lengths, "input_lengths", batch_size); + checkCudaI32Vector(lm_output_indexes, "lm_output_indexes", total_tokens); + for (size_t i = 0; i < token_columns.size(); ++i) { + checkCudaI32Vector(token_columns[i], "token_columns", batch_size); + } + + const int32_t* ptrs[8] = {}; + for (size_t i = 0; i < token_columns.size(); ++i) { + ptrs[i] = token_columns[i].data_ptr(); + } + + constexpr int block_size = 256; + const int grid_size = static_cast((total_tokens + block_size - 1) / block_size); + mtpSpecDecodeTokensMetadataPrepareKernel<<>>( + ptrs[0], + ptrs[1], + ptrs[2], + ptrs[3], + ptrs[4], 
+ ptrs[5], + ptrs[6], + ptrs[7], + spec_tokens.data_ptr(), + input_lengths.data_ptr(), + lm_output_indexes.data_ptr(), + tokens_per_batch, + static_cast(batch_size)); +} + +// Fused kernel: next_seq_len[i] = prev_seq_len[i] + accept_len[i] +// hidden_idx[i] = (int64_t)(accept_len[i] - 1) +__global__ void mtpDispatchStatePrepareKernel(const int32_t* __restrict__ accept_len, + const int32_t* __restrict__ prev_seq_len, + int32_t* __restrict__ next_seq_len, + int64_t* __restrict__ hidden_idx, + int32_t batch_size) { + const int32_t idx = static_cast(blockIdx.x * blockDim.x + threadIdx.x); + if (idx >= batch_size) { + return; + } + const int32_t al = accept_len[idx]; + next_seq_len[idx] = prev_seq_len[idx] + al; + hidden_idx[idx] = static_cast(al - 1); +} + +void invokeMtpDispatchStatePrepare(const torch::Tensor& accept_len, + const torch::Tensor& prev_seq_len, + torch::Tensor& next_seq_len, + torch::Tensor& hidden_idx, + int64_t batch_size, + cudaStream_t stream) { + if (batch_size <= 0) { + return; + } + checkCudaI32Vector(accept_len, "accept_len", batch_size); + checkCudaI32Vector(prev_seq_len, "prev_seq_len", batch_size); + checkCudaI32Vector(next_seq_len, "next_seq_len", batch_size); + RTP_LLM_CHECK_WITH_INFO(hidden_idx.defined() && hidden_idx.is_cuda(), "hidden_idx must be CUDA"); + RTP_LLM_CHECK_WITH_INFO(hidden_idx.scalar_type() == torch::kInt64, "hidden_idx must be int64"); + RTP_LLM_CHECK_WITH_INFO(hidden_idx.is_contiguous(), "hidden_idx must be contiguous"); + RTP_LLM_CHECK_WITH_INFO( + hidden_idx.numel() >= batch_size, "hidden_idx numel %ld < batch_size %ld", hidden_idx.numel(), batch_size); + + constexpr int block_size = 256; + const int grid_size = static_cast((batch_size + block_size - 1) / block_size); + mtpDispatchStatePrepareKernel<<>>(accept_len.data_ptr(), + prev_seq_len.data_ptr(), + next_seq_len.data_ptr(), + hidden_idx.data_ptr(), + static_cast(batch_size)); +} + +// REBASE CONFLICT CONTEXT(518707c73): source branch added this fused +// 
shift/append launcher to eliminate sync-heavy CPU token manipulation. Keep it
+// with the new base dispatch-state prepare launcher above.
+//
+// Host launcher for mtpPrefillShiftAppendKernel. Launches one thread per element
+// of combo_tokens_in on `stream`; per the kernel, every batch's tokens are shifted
+// left by one and the batch's last slot receives element (token_stride - 1) of that
+// batch's row in new_all_token_ids.
+//
+//   combo_tokens_in    packed int32 tokens of all batches
+//   input_lengths      int32 token count per batch; its numel is the batch size
+//   batch_offsets      int32; batch_offsets[b] is the exclusive end offset of batch b
+//   new_all_token_ids  int32, at least batch_size * token_stride elements
+//   combo_tokens_out   int32 output, at least combo_tokens_in.numel() elements
+//   token_stride       row length of new_all_token_ids; must be positive
+//
+// Returns without launching when input_lengths or combo_tokens_in is empty.
+// NOTE(review): the kernel reads combo_tokens_in[i + 1] in one thread while another
+// thread writes combo_tokens_out[i + 1], and declares both pointers __restrict__, so
+// overlapping input/output buffers race. This launcher does not check for overlap.
+void invokeMtpPrefillShiftAppend(const torch::Tensor& combo_tokens_in,
+                                 const torch::Tensor& input_lengths,
+                                 const torch::Tensor& batch_offsets,
+                                 const torch::Tensor& new_all_token_ids,
+                                 torch::Tensor&       combo_tokens_out,
+                                 int32_t              token_stride,
+                                 cudaStream_t         stream) {
+    const int64_t batch_size = input_lengths.numel();
+    if (batch_size <= 0) {
+        return;
+    }
+    const int64_t total_tokens = combo_tokens_in.numel();
+    if (total_tokens <= 0) {
+        return;
+    }
+    // The kernel indexes new_all_token_ids[b * token_stride + token_stride - 1]; a
+    // non-positive stride would make that index negative, and the numel check below
+    // would pass trivially, so reject it up front.
+    RTP_LLM_CHECK_WITH_INFO(token_stride > 0, "token_stride must be positive (got %d)", token_stride);
+    checkCudaI32Vector(combo_tokens_in, "combo_tokens_in", total_tokens);
+    checkCudaI32Vector(combo_tokens_out, "combo_tokens_out", total_tokens);
+    checkCudaI32Vector(input_lengths, "input_lengths", batch_size);
+    checkCudaI32Vector(batch_offsets, "batch_offsets", batch_size);
+    RTP_LLM_CHECK_WITH_INFO(new_all_token_ids.defined() && new_all_token_ids.is_cuda(),
+                            "new_all_token_ids must be CUDA");
+    RTP_LLM_CHECK_WITH_INFO(new_all_token_ids.scalar_type() == torch::kInt32,
+                            "new_all_token_ids must be int32 (got %s)",
+                            c10::toString(new_all_token_ids.scalar_type()));
+    RTP_LLM_CHECK_WITH_INFO(new_all_token_ids.is_contiguous(), "new_all_token_ids must be contiguous");
+    RTP_LLM_CHECK_WITH_INFO(new_all_token_ids.numel() >= batch_size * token_stride,
+                            "new_all_token_ids numel %ld < batch_size %ld * token_stride %d",
+                            new_all_token_ids.numel(),
+                            batch_size,
+                            token_stride);
+
+    // One thread per packed token; the kernel itself guards global_idx >= total_tokens.
+    constexpr int block_size = 256;
+    const int     grid_size  = static_cast<int>((total_tokens + block_size - 1) / block_size);
+    mtpPrefillShiftAppendKernel<<<grid_size, block_size, 0, stream>>>(combo_tokens_in.data_ptr<int32_t>(),
+                                                                      input_lengths.data_ptr<int32_t>(),
+                                                                      batch_offsets.data_ptr<int32_t>(),
+                                                                      new_all_token_ids.data_ptr<int32_t>(),
+                                                                      combo_tokens_out.data_ptr<int32_t>(),
+                                                                      token_stride,
+                                                                      static_cast<int32_t>(batch_size),
+                                                                      static_cast<int32_t>(total_tokens));
+}
+
+}  // namespace rtp_llm
diff --git a/rtp_llm/models_py/bindings/cuda/kernels/mtp_target_verify_prepare.h
b/rtp_llm/models_py/bindings/cuda/kernels/mtp_target_verify_prepare.h new file mode 100644 index 0000000000..0b66be8efd --- /dev/null +++ b/rtp_llm/models_py/bindings/cuda/kernels/mtp_target_verify_prepare.h @@ -0,0 +1,58 @@ +#pragma once + +#include +#include +#include + +namespace rtp_llm { + +void invokeMtpTargetVerifyPrepare(const torch::Tensor& sequence_lengths, + torch::Tensor& input_lengths, + torch::Tensor& prefix_lengths, + torch::Tensor& sequence_lengths_plus_1, + torch::Tensor& lm_output_indexes, + int32_t tokens_per_batch, + cudaStream_t stream); + +void invokeMtpSpecDecodeMetadataPrepare(torch::Tensor& input_lengths, + torch::Tensor& lm_output_indexes, + int32_t tokens_per_batch, + cudaStream_t stream); + +void invokeMtpSpecDecodeTokensMetadataPrepare(const std::vector& token_columns, + torch::Tensor& spec_tokens, + torch::Tensor& input_lengths, + torch::Tensor& lm_output_indexes, + int32_t tokens_per_batch, + cudaStream_t stream); + +// Fused kernel for dispatchDecodeAsync per-stream state publishing. +// Computes: next_seq_len[i] = prev_seq_len[i] + accept_len[i] (int32) +// hidden_idx[i] = accept_len[i] - 1 (int64) +// All inputs/outputs must be contiguous CUDA tensors with numel >= batch_size. +void invokeMtpDispatchStatePrepare(const torch::Tensor& accept_len, + const torch::Tensor& prev_seq_len, + torch::Tensor& next_seq_len, + torch::Tensor& hidden_idx, + int64_t batch_size, + cudaStream_t stream); + +// REBASE CONFLICT CONTEXT(518707c73): keep new base dispatch-state publishing +// kernel and add source branch prefill shift/append kernel to avoid sync-heavy +// CPU token manipulation. +// For each batch b with input_lengths_d[b] tokens packed at offset cumsum(input_lengths_d)[b-1] +// in combo_tokens_in: +// * shift combo_tokens_in[offset .. offset+input_length-1] left by 1 (drop first token) +// * write new_all_token_ids[b, token_stride-1] at combo_tokens_out[offset+input_length-1] +// All inputs/outputs are int32 CUDA tensors. 
combo_tokens_out must NOT overlap combo_tokens_in: +// the thread for position i reads in[i+1] while the thread for position i+1 writes out[i+1], so an +// in-place shift is a data race (the kernel also declares both pointers __restrict__). +void invokeMtpPrefillShiftAppend(const torch::Tensor& combo_tokens_in, + const torch::Tensor& input_lengths, + const torch::Tensor& batch_offsets, + const torch::Tensor& new_all_token_ids, + torch::Tensor& combo_tokens_out, + int32_t token_stride, + cudaStream_t stream); + +} // namespace rtp_llm