
Commit 864914e

Initial disagg support changes
Signed-off-by: Iman Tabrizian <[email protected]>

Fixes for block size
Signed-off-by: Iman Tabrizian <[email protected]>

don't use unravel index
Signed-off-by: Iman Tabrizian <[email protected]>

Review commit
Signed-off-by: Iman Tabrizian <[email protected]>

minor fix
Signed-off-by: Iman Tabrizian <[email protected]>

Fix compile errors after rebase
Signed-off-by: Iman Tabrizian <[email protected]>

Bug fixes
Signed-off-by: Iman Tabrizian <[email protected]>

Remove print
Signed-off-by: Iman Tabrizian <[email protected]>

1 parent f0dc746 commit 864914e

13 files changed, +790 −227 lines


cpp/include/tensorrt_llm/batch_manager/kvCacheManager.h

Lines changed: 49 additions & 0 deletions
@@ -595,6 +595,21 @@ class WindowBlockManager

     ~WindowBlockManager();

+    [[nodiscard]] bool isEnableIndexerKCache() const
+    {
+        return mEnableIndexerKCache;
+    }
+
+    [[nodiscard]] SizeType32 getIndexerKCacheQuantBlockSize() const
+    {
+        return mIndexerKCacheQuantBlockSize;
+    }
+
+    [[nodiscard]] SizeType32 getIndexerKCacheIndexHeadDim() const
+    {
+        return mIndexerKCacheIndexHeadDim;
+    }
+
     void allocatePools(bool useUvm);

     void releasePools();

@@ -1014,6 +1029,21 @@ class BlockManager
         std::optional<kvc::BaseAgentConfig> agentConfig = std::nullopt, bool enableIndexerKCache = false,
         SizeType32 indexerKCacheQuantBlockSize = 128, SizeType32 indexerKCacheIndexHeadDim = 0);

+    [[nodiscard]] bool isEnableIndexerKCache() const
+    {
+        return mWindowBlockManagers.begin()->second.isEnableIndexerKCache();
+    }
+
+    [[nodiscard]] SizeType32 getIndexerKCacheQuantBlockSize() const
+    {
+        return mWindowBlockManagers.begin()->second.getIndexerKCacheQuantBlockSize();
+    }
+
+    [[nodiscard]] SizeType32 getIndexerKCacheIndexHeadDim() const
+    {
+        return mWindowBlockManagers.begin()->second.getIndexerKCacheIndexHeadDim();
+    }
+
     BlockManager(BlockManager const&) = delete;
     BlockManager& operator=(BlockManager const&) = delete;

@@ -1485,6 +1515,10 @@ class BaseKVCacheManager

     [[nodiscard]] virtual bool isEnableBlockReuse() const = 0;

+    [[nodiscard]] virtual bool isEnableIndexerKCache() const = 0;
+    [[nodiscard]] virtual SizeType32 getIndexerKCacheIndexHeadDim() const = 0;
+    [[nodiscard]] virtual SizeType32 getIndexerKCacheQuantBlockSize() const = 0;
+
     // void removeToken(SizeType32 seqSlotIdx);
     virtual void rewindKVCache(LlmRequest::RequestIdType requestId, SizeType32 rewindLengths) = 0;

@@ -1818,6 +1852,21 @@ class KVCacheManager : public BaseKVCacheManager
         return mEnableBlockReuse;
     }

+    [[nodiscard]] bool isEnableIndexerKCache() const override
+    {
+        return mBlockManager.isEnableIndexerKCache();
+    }
+
+    [[nodiscard]] SizeType32 getIndexerKCacheIndexHeadDim() const override
+    {
+        return mBlockManager.getIndexerKCacheIndexHeadDim();
+    }
+
+    [[nodiscard]] SizeType32 getIndexerKCacheQuantBlockSize() const override
+    {
+        return mBlockManager.getIndexerKCacheQuantBlockSize();
+    }
+
     void removeToken(LlmRequest::RequestIdType requestId);
     void rewindKVCache(LlmRequest::RequestIdType requestId, SizeType32 rewindLengths) override;
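The new accessors simply walk down the ownership chain: KVCacheManager forwards to BlockManager, which reads the values off its first WindowBlockManager (so the indexer settings are evidently uniform across attention windows). A minimal caller-side sketch, assuming a fully constructed BaseKVCacheManager reference named cacheManager (hypothetical usage, not part of this commit):

    // Query the indexer K-cache configuration, e.g. before sizing transfer buffers.
    if (cacheManager.isEnableIndexerKCache())
    {
        auto const quantBlockSize = cacheManager.getIndexerKCacheQuantBlockSize(); // defaults to 128
        auto const indexHeadDim = cacheManager.getIndexerKCacheIndexHeadDim();
        TLLM_LOG_DEBUG("indexer K cache: quantBlockSize=%d indexHeadDim=%d", quantBlockSize, indexHeadDim);
    }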

cpp/include/tensorrt_llm/batch_manager/kvCacheUtils.h

Lines changed: 21 additions & 7 deletions
@@ -73,7 +73,8 @@ class BlockRange
         BaseKVCacheManager& cacheManager, BlockKey const& lastBlockKey, int32_t indexFromEnd)
     {

-        auto poolNum = cacheManager.getNumPools();
+        auto poolNum = cacheManager.getBlockManager().getNumPools(
+            /*includeBlockScalePools=*/false, /*includeIndexerKCachePools=*/false);
         TLLM_CHECK_WITH_INFO(poolNum == 1, "Reuse tree is not supported for multiple pools or variable window size");

         auto windowSize = cacheManager.getBlockManager().getWindowSizesMetadata().begin()->first;

@@ -136,13 +137,21 @@
         return blockHashesPerWindow;
     }

-    BlockRangeForWindow getBlockRangeForWindow(SizeType32 windowSize) const
+    BlockRangeForWindow getBlockRangeForWindow(SizeType32 windowSize, bool useIndexerKCache = false) const
     {
         TLLM_CHECK_WITH_INFO(
             mPoolsPerWindow.find(windowSize) != mPoolsPerWindow.end(), "Window size %d not found", windowSize);
         auto pool = mPoolsPerWindow.at(windowSize).front();
         auto blockIds = mBlockIdsPerWindow.at(windowSize);
-        return BlockRangeForWindow(mManager, windowSize, std::move(blockIds), std::move(pool));
+        if (useIndexerKCache)
+        {
+            TLLM_CHECK(mIndexerKCachePool);
+            return BlockRangeForWindow(mManager, windowSize, std::move(blockIds), std::move(mIndexerKCachePool));
+        }
+        else
+        {
+            return BlockRangeForWindow(mManager, windowSize, std::move(blockIds), std::move(pool));
+        }
     }

     std::vector<SizeType32> getWindowSizes() const

@@ -167,9 +176,8 @@
         , mRequestId(requestId)
         , mBlockIdsPerWindow(std::move(blockIdsPerWindow))
     {
-
-        // cacheManager.getBlockManager.getPrimaryPool(0);
-        auto poolNum = mManager->getNumPools();
+        auto poolNum = mManager->getBlockManager().getNumPools(
+            /*includeBlockScalePools=*/false, /*includeIndexerKCachePools=*/false);
         for (SizeType32 poolIdx = 0; poolIdx < poolNum; ++poolIdx)
         {
             auto windowSize = cacheManager.getBlockManager().getPoolWindowSize(poolIdx);

@@ -181,21 +189,27 @@
         : mManager(&cacheManager)
         , mRequestId(requestId)
     {
-        auto poolNum = mManager->getNumPools();
+        auto poolNum = mManager->getBlockManager().getNumPools(
+            /*includeBlockScalePools=*/false, /*includeIndexerKCachePools=*/false);
         for (SizeType32 poolIdx = 0; poolIdx < poolNum; ++poolIdx)
         {
             auto windowSize = cacheManager.getBlockManager().getPoolWindowSize(poolIdx);
             mPoolsPerWindow[windowSize].push_back(cacheManager.getBlockManager().getPrimaryPool(poolIdx));
             mBlockIdsPerWindow[windowSize]
                 = cacheManager.getSequence(mRequestId).getCacheBlockIds(windowSize).at(kFIRST_AND_ONLY_BEAM);
         }
+        if (cacheManager.isEnableIndexerKCache())
+        {
+            mIndexerKCachePool = cacheManager.getIndexerKCachePool();
+        }
     }

 private:
     BaseKVCacheManager const* mManager;
     LlmRequest::RequestIdType const mRequestId;
     std::unordered_map<SizeType32, std::vector<SizeType32>> mBlockIdsPerWindow;
     std::unordered_map<SizeType32, std::vector<runtime::ITensor::SharedPtr>> mPoolsPerWindow;
+    runtime::ITensor::SharedPtr mIndexerKCachePool;

     static constexpr SizeType32 kFIRST_AND_ONLY_BEAM = 0;
     static constexpr SizeType32 kFIRST_POOL_INDEX = 0;
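The new useIndexerKCache flag lets one set of block ids be walked against either backing store: the per-window primary KV pool or the single indexer K-cache pool captured at construction time. A usage sketch, assuming blockRange is a BlockRange built for a request on a manager with the indexer K cache enabled (names hypothetical):

    // Walk the same blocks twice for one window: once per backing pool.
    auto const windowSize = blockRange.getWindowSizes().front();
    auto kvRange = blockRange.getBlockRangeForWindow(windowSize);
    auto indexerRange = blockRange.getBlockRangeForWindow(windowSize, /*useIndexerKCache=*/true);

Note that std::move(mIndexerKCachePool) inside the const method degrades to a copy of the shared pointer, so the member stays valid across repeated calls.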

cpp/include/tensorrt_llm/executor/dataTransceiverState.h

Lines changed: 36 additions & 3 deletions
@@ -50,7 +50,8 @@ class CacheState final

     CacheState(ModelConfig modelConfig, runtime::WorldConfig const& worldConfig,
         std::vector<SizeType32> const& attentionLayerNumPerPP, nvinfer1::DataType dataType,
-        AttentionType attentionType = AttentionType::kDEFAULT, int kvFactor = 2, bool enableBlockReuse = false)
+        AttentionType attentionType = AttentionType::kDEFAULT, int kvFactor = 2, bool enableBlockReuse = false,
+        bool hasIndexerKCache = false, SizeType32 indexerDimPerHead = 0, SizeType32 indexerKCacheQuantBlockSize = 128)
         : mModelConfig(std::move(modelConfig))
         , mParallelConfig{worldConfig.getTensorParallelism(), worldConfig.getPipelineParallelism(),
               worldConfig.getContextParallelism(), worldConfig.enableAttentionDP(), worldConfig.getTensorParallelRank(),

@@ -59,34 +60,45 @@
         , mAttentionConfig(attentionType, kvFactor)
     {
         mEnableBlockReuse = enableBlockReuse;
+        mHasIndexerKCache = hasIndexerKCache;
+        mIndexerDimPerHead = indexerDimPerHead;
+        mIndexerKCacheQuantBlockSize = indexerKCacheQuantBlockSize;
     }

     CacheState(std::vector<SizeType32> nbKvHeadPerLayer, SizeType32 sizePerHead, SizeType32 tokensPerBlock,
         SizeType32 tensorParallelism, SizeType32 pipelineParallelism, SizeType32 contextParallelism,
         std::vector<SizeType32> const& attentionLayerNumPerPP, nvinfer1::DataType dataType,
         AttentionType attentionType = AttentionType::kDEFAULT, int kvFactor = 2, bool enableAttentionDP = false,
-        int DPrank = 0, int DPsize = 0, bool enableBlockReuse = false)
+        int DPrank = 0, int DPsize = 0, bool enableBlockReuse = false, bool hasIndexerKCache = false,
+        SizeType32 indexerDimPerHead = 0, SizeType32 indexerKCacheQuantBlockSize = 128)
         : mModelConfig{std::move(nbKvHeadPerLayer), sizePerHead, tokensPerBlock}
         , mParallelConfig{tensorParallelism, pipelineParallelism, contextParallelism, enableAttentionDP, DPrank, DPsize,
               attentionLayerNumPerPP}
         , mDataType{dataType}
         , mAttentionConfig(attentionType, kvFactor)
     {
         mEnableBlockReuse = enableBlockReuse;
+        mHasIndexerKCache = hasIndexerKCache;
+        mIndexerDimPerHead = indexerDimPerHead;
+        mIndexerKCacheQuantBlockSize = indexerKCacheQuantBlockSize;
     }

     CacheState(SizeType32 nbAttentionLayers, SizeType32 nbKvHeads, SizeType32 sizePerHead, SizeType32 tokensPerBlock,
         SizeType32 tensorParallelism, SizeType32 pipelineParallelism, SizeType32 contextParallelism,
         std::vector<SizeType32> const& attentionLayerNumPerPP, nvinfer1::DataType dataType,
         AttentionType attentionType = AttentionType::kDEFAULT, int kvFactor = 2, bool enableAttentionDP = false,
-        int DPrank = 0, int DPsize = 0, bool enableBlockReuse = false)
+        int DPrank = 0, int DPsize = 0, bool enableBlockReuse = false, bool hasIndexerKCache = false,
+        SizeType32 indexerDimPerHead = 0, SizeType32 indexerKCacheQuantBlockSize = 128)
         : mModelConfig{std::vector(nbAttentionLayers, nbKvHeads), sizePerHead, tokensPerBlock}
         , mParallelConfig{tensorParallelism, pipelineParallelism, contextParallelism, enableAttentionDP, DPrank, DPsize,
               attentionLayerNumPerPP}
         , mDataType{dataType}
         , mAttentionConfig(attentionType, kvFactor)
     {
         mEnableBlockReuse = enableBlockReuse;
+        mHasIndexerKCache = hasIndexerKCache;
+        mIndexerDimPerHead = indexerDimPerHead;
+        mIndexerKCacheQuantBlockSize = indexerKCacheQuantBlockSize;
     }

     [[nodiscard]] bool operator==(kv_cache::CacheState const& other) const noexcept

@@ -174,6 +186,21 @@
         return mEnableBlockReuse;
     }

+    [[nodiscard]] bool getHasIndexerKCache() const
+    {
+        return mHasIndexerKCache;
+    }
+
+    [[nodiscard]] SizeType32 getIndexerDimPerHead() const
+    {
+        return mIndexerDimPerHead;
+    }
+
+    [[nodiscard]] SizeType32 getIndexerKCacheQuantBlockSize() const
+    {
+        return mIndexerKCacheQuantBlockSize;
+    }
+
     [[nodiscard]] std::string toString() const
     {
         std::stringstream sstring;

@@ -194,6 +221,9 @@
         sstring << "dpRank:" << mParallelConfig.mDPrank << "\n";
         sstring << "dpSize:" << mParallelConfig.mDPsize << "\n";
         sstring << "enableBlockReuse:" << mEnableBlockReuse << "\n";
+        sstring << "hasIndexerKCache:" << mHasIndexerKCache << "\n";
+        sstring << "indexerDimPerHead:" << mIndexerDimPerHead << "\n";
+        sstring << "indexerKCacheQuantBlockSize:" << mIndexerKCacheQuantBlockSize << "\n";
         return sstring.str();
     }

@@ -204,6 +234,9 @@
     nvinfer1::DataType mDataType;
     AttentionConfig mAttentionConfig;
     bool mEnableBlockReuse{false};
+    bool mHasIndexerKCache{false};
+    SizeType32 mIndexerDimPerHead{0};
+    SizeType32 mIndexerKCacheQuantBlockSize{128};
 };

 struct MpiState
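The three indexer fields now ride along in CacheState, the descriptor disaggregated ranks exchange to describe their cache layout, and toString() reports them for debugging. A construction sketch using the third overload; every numeric value is illustrative only, not taken from the commit:

    // Hypothetical: advertise an indexer K cache alongside a 32-layer model.
    using tensorrt_llm::executor::kv_cache::CacheState;
    CacheState state(
        /*nbAttentionLayers=*/32, /*nbKvHeads=*/8, /*sizePerHead=*/128, /*tokensPerBlock=*/64,
        /*tensorParallelism=*/1, /*pipelineParallelism=*/1, /*contextParallelism=*/1,
        /*attentionLayerNumPerPP=*/{32}, nvinfer1::DataType::kHALF,
        CacheState::AttentionType::kDEFAULT, /*kvFactor=*/2, /*enableAttentionDP=*/false,
        /*DPrank=*/0, /*DPsize=*/0, /*enableBlockReuse=*/false,
        /*hasIndexerKCache=*/true, /*indexerDimPerHead=*/128, /*indexerKCacheQuantBlockSize=*/128);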

cpp/tensorrt_llm/batch_manager/cacheFormatter.cpp

Lines changed: 17 additions & 6 deletions
@@ -45,7 +45,8 @@ namespace tensorrt_llm::batch_manager::kv_cache_manager
 BlockRange getBlockRangeForSending(
     BaseKVCacheManager* cacheManager, LlmRequest const& llmRequest, BlockKey const& lastBlockKey, int32_t indexFromEnd)
 {
-    auto poolNum = cacheManager->getBlockManager().getNumPools();
+    auto poolNum = cacheManager->getBlockManager().getNumPools(
+        /*includeBlockScalePools=*/false, /*includeIndexerKCachePools=*/false);
     if (poolNum > 1 || !cacheManager->isEnableBlockReuse() || lastBlockKey.uniqueTokens.size() == 0)
     {
         // disable reuse path, and vwsa don't support reuse.

@@ -87,7 +88,8 @@ BlockRange getBlockRangeForSending(
 BlockRange getBlockRangeForReceiving(
     BaseKVCacheManager* cacheManager, LlmRequest const& llmRequest, bool srcEnableBlockReuse)
 {
-    auto poolNum = cacheManager->getBlockManager().getNumPools();
+    auto poolNum = cacheManager->getBlockManager().getNumPools(
+        /*includeBlockScalePools=*/false, /*includeIndexerKCachePools=*/false);
     if (poolNum == 1 && srcEnableBlockReuse)
     {
         // Build from all block ids, then slice off the reused blocks so we only transfer newly allocated ones.

@@ -170,7 +172,8 @@ void checkAlternateWindow(BaseKVCacheManager* cacheManager, BaseCacheFormatter::
     // if gen PP and context PP are different, cache formatter only support alternative window like gpt-oss.
     // which is one layer is WSA, and another layer is Full attention.

-    auto numPools = cacheManager->getBlockManager().getNumPools();
+    auto numPools = cacheManager->getBlockManager().getNumPools(
+        /*includeBlockScalePools=*/false, /*includeIndexerKCachePools=*/false);
     auto layerNum = cacheManager->getBlockManager().getNumLayers();

     auto selfPPNum = selfConfig.getParallelConfig().mPipelineParallelism;

@@ -247,7 +250,8 @@ void CacheFormatter::format(tensorrt_llm::batch_manager::TransferSession& sessio
     auto& blockManager = mCacheManager->getBlockManager();
     auto const& lastBlockKey = session.getLastBlockKey();
     auto blockRange = getBlockRangeForSending(mCacheManager, llmRequest, lastBlockKey, indexFromEnd);
-    auto const numPools = blockManager.getNumPools();
+    auto const numPools
+        = blockManager.getNumPools(/*includeBlockScalePools=*/false, /*includeIndexerKCachePools=*/false);
     // TODO(oargov): are we sure the other side has the same number of pools? this might not hold for pp_size>1...

     bool layerWise = common::getEnvDisaggLayerwise() && numPools == 1;

@@ -555,7 +559,8 @@ void CacheFormatter::unformat(tensorrt_llm::batch_manager::TransferSession& sess
     TLLM_LOG_DEBUG("pickUpConnections size: %d connections size: %d", pickUpConnections.size(), connections.size());
     std::vector<runtime::ITensor::SharedPtr> recvBufferTmps;
     std::map<SizeType32, std::vector<runtime::ITensor::SharedPtr>> outputBuffersPerWindow;
-    auto const numPools = mCacheManager->getBlockManager().getNumPools();
+    auto const numPools = mCacheManager->getBlockManager().getNumPools(
+        /*includeBlockScalePools=*/false, /*includeIndexerKCachePools=*/false);
     // TODO(oargov): are we sure the other side has the same number of pools? this might not hold for pp_size>1...
     size_t blockNum = 0;
     size_t cacheBlockSizeSum = 0;

@@ -969,7 +974,13 @@ std::unique_ptr<BaseCacheFormatter> createCacheFormatter(
 {
     if (isMLA)
     {
-        return std::make_unique<MLACacheFormatter>(cacheManager, cacheTransBufferManager);
+        std::vector<CacheTransBufferManager*> cacheTransBufferManagers = {cacheTransBufferManager};
+        auto maxNumTokens = cacheTransBufferManager->getMaxNumTokens();
+        if (cacheManager->isEnableIndexerKCache())
+        {
+            cacheTransBufferManagers.push_back(new CacheTransBufferManager(cacheManager, maxNumTokens, true));
+        }
+        return std::make_unique<MLACacheFormatter>(cacheManager, cacheTransBufferManagers);
     }
     return std::make_unique<CacheFormatter>(cacheManager, cacheTransBufferManager);
 }
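Design note: for MLA models the formatter now takes a vector of staging-buffer managers, one per pool family it must move; when the indexer K cache is enabled, a second CacheTransBufferManager is allocated with the same maxNumTokens budget (via the new getMaxNumTokens() accessor) but bound to the indexer pool. The second manager is created with a raw new, and whether MLACacheFormatter assumes ownership is not visible in this hunk. A hypothetical direct construction mirroring the new signature (both manager pointers are assumed to outlive the formatter):

    // Hypothetical: pair each pool family with its own staging manager.
    std::vector<CacheTransBufferManager*> managers{kvBufferManager, indexerBufferManager};
    auto mlaFormatter = std::make_unique<MLACacheFormatter>(cacheManager, managers);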

cpp/tensorrt_llm/batch_manager/cacheTransBuffer.cpp

Lines changed: 11 additions & 3 deletions
@@ -189,14 +189,22 @@ bool FabricMemory::supportFbaricMemory()
 }

 CacheTransBufferManager::CacheTransBufferManager(
-    KVCacheManager::BaseKVCacheManager* cacheManager, std::optional<size_t> maxNumTokens)
+    KVCacheManager::BaseKVCacheManager* cacheManager, std::optional<size_t> maxNumTokens, bool transferIndexerKCache)
     : mCacheManager{cacheManager}
     , mBufferManager{std::make_shared<runtime::CudaStream>()}
+    , mTransferIndexerKCache{transferIndexerKCache}
+    , mMaxNumTokens{maxNumTokens}
 {
-
     // TODO: FP4 dataSize
     TLLM_CHECK(mCacheManager);
-    mDataType = mCacheManager->getPrimaryPool(0)->getDataType();
+    if (transferIndexerKCache)
+    {
+        mDataType = mCacheManager->getIndexerKCachePool()->getDataType();
+    }
+    else
+    {
+        mDataType = mCacheManager->getPrimaryPool(0)->getDataType();
+    }

     auto tokensPerBlock = mCacheManager->getBlockManager().getTokensPerBlock();
     size_t bufferSizeFromMaxNumToken = 0;
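The constructor now derives its staging data type from the pool it will actually transfer, so an indexer-focused instance can stage with a different element type than the primary KV pools. A minimal sketch constructing one manager per pool kind, mirroring what createCacheFormatter does above (variable names hypothetical):

    // One staging manager for the primary KV pools, one for the indexer K cache.
    CacheTransBufferManager kvStaging(cacheManager, maxNumTokens);
    CacheTransBufferManager indexerStaging(cacheManager, maxNumTokens, /*transferIndexerKCache=*/true);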

cpp/tensorrt_llm/batch_manager/cacheTransBuffer.h

Lines changed: 9 additions & 2 deletions
@@ -57,8 +57,8 @@ class FabricMemory
 class CacheTransBufferManager
 {
 public:
-    CacheTransBufferManager(
-        KVCacheManager::BaseKVCacheManager* cacheManager, std::optional<size_t> maxNumTokens = std::nullopt);
+    CacheTransBufferManager(KVCacheManager::BaseKVCacheManager* cacheManager,
+        std::optional<size_t> maxNumTokens = std::nullopt, bool transferIndexerKCache = false);

     static size_t preAllocBufferSize(std::map<SizeType32, SizeType32> const& cacheSizeBytesPerTokenPerWindow,
         SizeType32 tokensPerBlock,

@@ -82,6 +82,11 @@
     size_t getRecvBufferCount();
     size_t getSendBufferCount();

+    std::optional<size_t> getMaxNumTokens()
+    {
+        return mMaxNumTokens;
+    }
+
 private:
     struct ConcurrenceResource
     {

@@ -114,6 +119,8 @@
     KVCacheManager::BaseKVCacheManager* mCacheManager;
     runtime::BufferManager mBufferManager;
     std::vector<std::unique_ptr<FabricMemory>> mFabricMemory;
+    bool mTransferIndexerKCache;
+    std::optional<size_t> mMaxNumTokens;
 };

 } // namespace tensorrt_llm::batch_manager::kv_cache_manager

cpp/tensorrt_llm/batch_manager/cacheTransceiver.cpp

Lines changed: 3 additions & 1 deletion
@@ -137,7 +137,9 @@ CacheTransceiver::CacheTransceiver(kv_cache_manager::BaseKVCacheManager* cacheMa
         kvFactor = 1;
     }
     mCacheState = std::make_unique<executor::kv_cache::CacheState>(cacheStateModelCfg, worldConfig,
-        attentionLayerNumPerPP, dataType, attentionType, kvFactor, cacheManager->isEnableBlockReuse());
+        attentionLayerNumPerPP, dataType, attentionType, kvFactor, cacheManager->isEnableBlockReuse(),
+        cacheManager->isEnableIndexerKCache(), cacheManager->getIndexerKCacheIndexHeadDim(),
+        cacheManager->getIndexerKCacheQuantBlockSize());

     if (mCacheState->getParallelConfig().mEnableAttentionDP)
     {
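With this change the transceiver advertises the local indexer configuration in the CacheState it exchanges with peers. A receiving rank could then guard against configuration skew along these lines (hypothetical helper, not part of this commit):

    // Hypothetical: verify self and peer agree on the indexer K-cache layout.
    bool indexerConfigMatches(
        executor::kv_cache::CacheState const& self, executor::kv_cache::CacheState const& peer)
    {
        return self.getHasIndexerKCache() == peer.getHasIndexerKCache()
            && self.getIndexerDimPerHead() == peer.getIndexerDimPerHead()
            && self.getIndexerKCacheQuantBlockSize() == peer.getIndexerKCacheQuantBlockSize();
    }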
