NVIDIA
diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS
Lines changed: 1 addition & 0 deletions
diff --git a/.gitignore b/.gitignore
Lines changed: 1 addition & 0 deletions
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
Lines changed: 1 addition & 1 deletion
diff --git a/ATTRIBUTIONS-CPP-aarch64.md b/ATTRIBUTIONS-CPP-aarch64.md
Lines changed: 18 additions & 0 deletions
diff --git a/ATTRIBUTIONS-CPP-x86_64.md b/ATTRIBUTIONS-CPP-x86_64.md
Lines changed: 18 additions & 0 deletions
diff --git a/cpp/include/tensorrt_llm/batch_manager/cacheTransceiver.h b/cpp/include/tensorrt_llm/batch_manager/cacheTransceiver.h
Lines changed: 2 additions & 1 deletion
diff --git a/cpp/include/tensorrt_llm/batch_manager/kvCacheManager.h b/cpp/include/tensorrt_llm/batch_manager/kvCacheManager.h
Lines changed: 65 additions & 0 deletions
diff --git a/cpp/include/tensorrt_llm/batch_manager/kvCacheTransferManager.h b/cpp/include/tensorrt_llm/batch_manager/kvCacheTransferManager.h
Lines changed: 13 additions & 3 deletions
diff --git a/cpp/include/tensorrt_llm/batch_manager/kvCacheUtils.h b/cpp/include/tensorrt_llm/batch_manager/kvCacheUtils.h
Lines changed: 21 additions & 7 deletions
@@ -6,6 +6,7 @@
 /jenkins @NVIDIA/trt-llm-ci-infra-devs @NVIDIA/trt-llm-infra-devs
 ### Setup
 /docker @NVIDIA/trt-llm-setup-infra-devs @NVIDIA/trt-llm-infra-devs
+/.pre-commit-config.yaml @NVIDIA/trt-llm-setup-infra-devs @NVIDIA/trt-llm-infra-devs
 ### Github workflows
 /.github @NVIDIA/trt-llm-gh-workflows-infra-devs @NVIDIA/trt-llm-infra-devs
 /.coderabbit.yaml @NVIDIA/trt-llm-gh-workflows-infra-devs @NVIDIA/trt-llm-infra-devs
 
@@ -50,6 +50,7 @@ tensorrt_llm/pg_utils_bindings.*.so
 tensorrt_llm/flash_mla/
 tensorrt_llm/flash_mla_cpp_tllm.*.so
 tensorrt_llm/flash_mla_cpp_tllm.pyi
+tensorrt_llm/scripts
 *docs/cpp_docs*
 *docs/source/_cpp_gen*
 docs/source/**/*.rst
 
@@ -1389,7 +1389,7 @@ repos:
     -   id: yapf
         files: *common_files
 -   repo: https://github.com/pre-commit/pre-commit-hooks
-    rev: v4.1.0
+    rev: v6.0.0
     hooks:
     -   id: check-added-large-files
         exclude: |
 
@@ -14889,6 +14889,24 @@ Chen, Tianqi
 
 ```
 
+## Mooncake
+
+- **Repository URL**: https://github.com/kvcache-ai/Mooncake
+- **License URL**: https://github.com/kvcache-ai/Mooncake/blob/main/LICENSE-APACHE
+- **License name**: Apache 2.0
+
+### Authors
+
+© Copyright 2025, Mooncake Team.
+Copyright (c) Meta Platforms, Inc. and affiliates.
+Copyright 2024 KVCache.AI
+Ruoyu Qin
+Zheming Li
+Weiran He
+Mingxing Zhang
+Yongwei Wu
+Weimin Zheng
+Xinran Xu
 ## flashinfer
 
 ### License Text
 
@@ -14697,6 +14697,24 @@ Chen, Tianqi
 
 ```
 
+## Mooncake
+
+- **Repository URL**: https://github.com/kvcache-ai/Mooncake
+- **License URL**: https://github.com/kvcache-ai/Mooncake/blob/main/LICENSE-APACHE
+- **License name**: Apache 2.0
+
+### Authors
+
+© Copyright 2025, Mooncake Team.
+Copyright (c) Meta Platforms, Inc. and affiliates.
+Copyright 2024 KVCache.AI
+Ruoyu Qin
+Zheming Li
+Weiran He
+Mingxing Zhang
+Yongwei Wu
+Weimin Zheng
+Xinran Xu
 ## flashinfer
 
 ### License Text
 
@@ -269,7 +269,8 @@ class CacheTransceiver : public BaseCacheTransceiver
     std::unique_ptr<executor::kv_cache::CacheState> mCacheState;
     std::unique_ptr<executor::kv_cache::ConnectionManager> mManager;
     std::optional<executor::CacheTransceiverConfig> mCacheTransceiverConfig;
-    std::unique_ptr<kv_cache_manager::CacheTransBufferManager> mCacheTransBufferManager;
+    std::vector<std::unique_ptr<kv_cache_manager::CacheTransBufferManager>> mCacheTransBufferManagers;
+    std::vector<kv_cache_manager::CacheTransBufferManager*> mCacheTransBufferManagerPtrs;
     // library handle to the communicator related features,
     // this is used to defer dependency resolution until needed.
     static std::mutex mDllMutex;
 
@@ -595,6 +595,21 @@ class WindowBlockManager
 
     ~WindowBlockManager();
 
+    [[nodiscard]] bool isEnableIndexerKCache() const
+    {
+        return mEnableIndexerKCache;
+    }
+
+    [[nodiscard]] SizeType32 getIndexerKCacheQuantBlockSize() const
+    {
+        return mIndexerKCacheQuantBlockSize;
+    }
+
+    [[nodiscard]] SizeType32 getIndexerKCacheIndexHeadDim() const
+    {
+        return mIndexerKCacheIndexHeadDim;
+    }
+
     void allocatePools(bool useUvm);
 
     void releasePools();
@@ -809,6 +824,9 @@ class WindowBlockManager
         return mBufferManager;
     }
 
+    //! \brief Sync internal streams used by transfer manager with buffer manager stream
+    void syncTransferManagerWithBufferManager();
+
     //! \brief Perform per-request bookkeeping
     void refreshBlocks();
 
@@ -1021,6 +1039,21 @@ class BlockManager
         std::optional<kvc::BaseAgentConfig> agentConfig = std::nullopt, bool enableIndexerKCache = false,
         SizeType32 indexerKCacheQuantBlockSize = 128, SizeType32 indexerKCacheIndexHeadDim = 0);
 
+    [[nodiscard]] bool isEnableIndexerKCache() const
+    {
+        return mIsEnableIndexerKCache;
+    }
+
+    [[nodiscard]] SizeType32 getIndexerKCacheQuantBlockSize() const
+    {
+        return mIndexerKCacheQuantBlockSize;
+    }
+
+    [[nodiscard]] SizeType32 getIndexerKCacheIndexHeadDim() const
+    {
+        return mIndexerKCacheIndexHeadDim;
+    }
+
     BlockManager(BlockManager const&) = delete;
     BlockManager& operator=(BlockManager const&) = delete;
 
@@ -1283,6 +1316,9 @@ class BlockManager
     //! \brief Store newest block for reuse
     void storeNewBlock(GenerationRequest& sequence, OptionalRef<LlmRequest const> llmRequest);
 
+    //! \brief Sync internal streams used by transfer manager with buffer manager stream
+    void syncTransferManagerWithBufferManager();
+
     //! \brief Perform per-request bookkeeping
     void refreshBlocks();
 
@@ -1398,6 +1434,10 @@ class BlockManager
     std::vector<SizeType32> mAbsolutePoolToRelativePoolIndex;
     // Record what sequences are currently managed by the block manager
     std::set<LlmRequest::RequestIdType> mManagedSequences;
+
+    bool mIsEnableIndexerKCache{false};
+    SizeType32 mIndexerKCacheQuantBlockSize{0};
+    SizeType32 mIndexerKCacheIndexHeadDim{0};
 };
 
 struct OffsetTableDimensions
@@ -1500,6 +1540,10 @@ class BaseKVCacheManager
 
     [[nodiscard]] virtual bool isEnableBlockReuse() const = 0;
 
+    [[nodiscard]] virtual bool isEnableIndexerKCache() const = 0;
+    [[nodiscard]] virtual SizeType32 getIndexerKCacheIndexHeadDim() const = 0;
+    [[nodiscard]] virtual SizeType32 getIndexerKCacheQuantBlockSize() const = 0;
+
     // void removeToken(SizeType32 seqSlotIdx);
     virtual void rewindKVCache(LlmRequest::RequestIdType requestId, SizeType32 rewindLengths) = 0;
 
@@ -1546,6 +1590,7 @@ class BaseKVCacheManager
     [[nodiscard]] virtual runtime::ITensor::SharedPtr getIndexerKCachePool() const = 0;
     [[nodiscard]] virtual SizeType32 getPoolLayerIdx(SizeType32 layer_idx) const = 0;
 
+    virtual void syncTransferManagerWithBufferManager() = 0;
     virtual void refreshBlocks() = 0;
     virtual void flushIterationEvents() = 0;
     virtual void resetReuseState() = 0;
@@ -1834,6 +1879,21 @@ class KVCacheManager : public BaseKVCacheManager
         return mEnableBlockReuse;
     }
 
+    [[nodiscard]] bool isEnableIndexerKCache() const override
+    {
+        return mBlockManager.isEnableIndexerKCache();
+    }
+
+    [[nodiscard]] SizeType32 getIndexerKCacheIndexHeadDim() const override
+    {
+        return mBlockManager.getIndexerKCacheIndexHeadDim();
+    }
+
+    [[nodiscard]] SizeType32 getIndexerKCacheQuantBlockSize() const override
+    {
+        return mBlockManager.getIndexerKCacheQuantBlockSize();
+    }
+
     void removeToken(LlmRequest::RequestIdType requestId);
     void rewindKVCache(LlmRequest::RequestIdType requestId, SizeType32 rewindLengths) override;
 
@@ -1912,6 +1972,11 @@ class KVCacheManager : public BaseKVCacheManager
         return mBlockManager.getPoolLayerIdx(layer_idx);
     }
 
+    void syncTransferManagerWithBufferManager() override
+    {
+        mBlockManager.syncTransferManagerWithBufferManager();
+    }
+
     //! \brief Perform per-iteration bookkeeping
     void refreshBlocks() override
     {
 
@@ -46,7 +46,15 @@ class KVCacheTransferManager
         int numTokensToCopy = 0, executor::KvCacheTransferMode mode = executor::KvCacheTransferMode::DRAM,
         std::string const& directory = "");
 
-    //! \brief Synchronize the offload/onboard streams with the bufferManager stream.
+    //! \brief Synchronize internal streams with the bufferManager stream.
+    //! \details The buffer manager uses the same stream as the prefill and decode kernels. This method ensures that the
+    //! internal kernels used for offloading and onboarding will wait for prefill and decode kernels before performing
+    //! any block copies. This method must be called before the first call to KVCacheManager::addSequence in every step.
+    void syncWithBufferManager();
+
+    //! \brief Synchronize the bufferManager stream with internal streams. This method ensures that prefill and decode
+    //! kernels for the next step will wait for offloading and onboarding work that has already been scheduled. This
+    //! method must be called after the last call to KVCacheManager::addSequence in every step.
     void syncTransfers();
 
 private:
@@ -75,8 +83,10 @@ class KVCacheTransferManager
     runtime::BufferManager mOnboardManager;
     runtime::BufferManager mOffloadManager;
 
-    // Track the block ids offloaded in this iteration.
-    std::unordered_map<int32_t, tr::CudaEvent> mPendingOffloads;
+    // Track reads and writes for blocks. Note that it is the memory pool index that
+    // identifies the raw memory blocks involved in I/O, not the block Id.
+    std::unordered_map<kernels::KVCacheIndex::UnderlyingType, tr::CudaEvent> mPendingReads;
+    std::unordered_map<kernels::KVCacheIndex::UnderlyingType, tr::CudaEvent> mPendingWrites;
     // Reference to parent loopback agent
     std::shared_ptr<kvc::BaseLoopbackAgent> mLoopbackAgent;
     int mDeviceId;
 
@@ -73,7 +73,8 @@ class BlockRange
         BaseKVCacheManager& cacheManager, BlockKey const& lastBlockKey, int32_t indexFromEnd)
     {
 
-        auto poolNum = cacheManager.getNumPools();
+        auto poolNum = cacheManager.getBlockManager().getNumPools(
+            /*includeBlockScalePools=*/false, /*includeIndexerKCachePools=*/false);
         TLLM_CHECK_WITH_INFO(poolNum == 1, "Reuse tree is not supported for multiple pools or variable window size");
 
         auto windowSize = cacheManager.getBlockManager().getWindowSizesMetadata().begin()->first;
@@ -136,13 +137,21 @@ class BlockRange
         return blockHashesPerWindow;
     }
 
-    BlockRangeForWindow getBlockRangeForWindow(SizeType32 windowSize) const
+    BlockRangeForWindow getBlockRangeForWindow(SizeType32 windowSize, bool useIndexerKCache = false) const
     {
         TLLM_CHECK_WITH_INFO(
             mPoolsPerWindow.find(windowSize) != mPoolsPerWindow.end(), "Window size %d not found", windowSize);
         auto pool = mPoolsPerWindow.at(windowSize).front();
         auto blockIds = mBlockIdsPerWindow.at(windowSize);
-        return BlockRangeForWindow(mManager, windowSize, std::move(blockIds), std::move(pool));
+        if (useIndexerKCache)
+        {
+            TLLM_CHECK(mIndexerKCachePool);
+            return BlockRangeForWindow(mManager, windowSize, std::move(blockIds), mIndexerKCachePool);
+        }
+        else
+        {
+            return BlockRangeForWindow(mManager, windowSize, std::move(blockIds), std::move(pool));
+        }
     }
 
     std::vector<SizeType32> getWindowSizes() const
@@ -167,9 +176,8 @@ class BlockRange
         , mRequestId(requestId)
         , mBlockIdsPerWindow(std::move(blockIdsPerWindow))
     {
-
-        // cacheManager.getBlockManager.getPrimaryPool(0);
-        auto poolNum = mManager->getNumPools();
+        auto poolNum = mManager->getBlockManager().getNumPools(
+            /*includeBlockScalePools=*/false, /*includeIndexerKCachePools=*/false);
         for (SizeType32 poolIdx = 0; poolIdx < poolNum; ++poolIdx)
         {
             auto windowSize = cacheManager.getBlockManager().getPoolWindowSize(poolIdx);
@@ -181,21 +189,27 @@ class BlockRange
         : mManager(&cacheManager)
         , mRequestId(requestId)
     {
-        auto poolNum = mManager->getNumPools();
+        auto poolNum = mManager->getBlockManager().getNumPools(
+            /*includeBlockScalePools=*/false, /*includeIndexerKCachePools=*/false);
         for (SizeType32 poolIdx = 0; poolIdx < poolNum; ++poolIdx)
         {
             auto windowSize = cacheManager.getBlockManager().getPoolWindowSize(poolIdx);
             mPoolsPerWindow[windowSize].push_back(cacheManager.getBlockManager().getPrimaryPool(poolIdx));
             mBlockIdsPerWindow[windowSize]
                 = cacheManager.getSequence(mRequestId).getCacheBlockIds(windowSize).at(kFIRST_AND_ONLY_BEAM);
         }
+        if (cacheManager.isEnableIndexerKCache())
+        {
+            mIndexerKCachePool = cacheManager.getIndexerKCachePool();
+        }
     }
 
 private:
     BaseKVCacheManager const* mManager;
     LlmRequest::RequestIdType const mRequestId;
     std::unordered_map<SizeType32, std::vector<SizeType32>> mBlockIdsPerWindow;
     std::unordered_map<SizeType32, std::vector<runtime::ITensor::SharedPtr>> mPoolsPerWindow;
+    runtime::ITensor::SharedPtr mIndexerKCachePool;
 
     static constexpr SizeType32 kFIRST_AND_ONLY_BEAM = 0;
     static constexpr SizeType32 kFIRST_POOL_INDEX = 0;