Skip to content

Commit d59e2cb

Browse files
committed
Merge remote-tracking branch 'thor/user/tjohnsen/fix_5627710' into try-thor-fix-transfer-manager
2 parents a179d7f + ba32c3b commit d59e2cb

File tree

2,447 files changed

+18134
-10066
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

2,447 files changed

+18134
-10066
lines changed

.github/CODEOWNERS

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
/jenkins @NVIDIA/trt-llm-ci-infra-devs @NVIDIA/trt-llm-infra-devs
77
### Setup
88
/docker @NVIDIA/trt-llm-setup-infra-devs @NVIDIA/trt-llm-infra-devs
9+
/.pre-commit-config.yaml @NVIDIA/trt-llm-setup-infra-devs @NVIDIA/trt-llm-infra-devs
910
### Github workflows
1011
/.github @NVIDIA/trt-llm-gh-workflows-infra-devs @NVIDIA/trt-llm-infra-devs
1112
/.coderabbit.yaml @NVIDIA/trt-llm-gh-workflows-infra-devs @NVIDIA/trt-llm-infra-devs

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,7 @@ tensorrt_llm/pg_utils_bindings.*.so
5050
tensorrt_llm/flash_mla/
5151
tensorrt_llm/flash_mla_cpp_tllm.*.so
5252
tensorrt_llm/flash_mla_cpp_tllm.pyi
53+
tensorrt_llm/scripts
5354
*docs/cpp_docs*
5455
*docs/source/_cpp_gen*
5556
docs/source/**/*.rst

.pre-commit-config.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1389,7 +1389,7 @@ repos:
13891389
- id: yapf
13901390
files: *common_files
13911391
- repo: https://github.com/pre-commit/pre-commit-hooks
1392-
rev: v4.1.0
1392+
rev: v6.0.0
13931393
hooks:
13941394
- id: check-added-large-files
13951395
exclude: |

ATTRIBUTIONS-CPP-aarch64.md

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14889,6 +14889,24 @@ Chen, Tianqi
1488914889

1489014890
```
1489114891

14892+
## Mooncake
14893+
14894+
- **Repository URL**: https://github.com/kvcache-ai/Mooncake
14895+
- **License URL**: https://github.com/kvcache-ai/Mooncake/blob/main/LICENSE-APACHE
14896+
- **License name**: Apache 2.0
14897+
14898+
### Authors
14899+
14900+
© Copyright 2025, Mooncake Team.
14901+
Copyright (c) Meta Platforms, Inc. and affiliates.
14902+
Copyright 2024 KVCache.AI
14903+
Ruoyu Qin
14904+
Zheming Li
14905+
Weiran He
14906+
Mingxing Zhang
14907+
Yongwei Wu
14908+
Weimin Zheng
14909+
Xinran Xu
1489214910
## flashinfer
1489314911

1489414912
### License Text

ATTRIBUTIONS-CPP-x86_64.md

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14697,6 +14697,24 @@ Chen, Tianqi
1469714697

1469814698
```
1469914699

14700+
## Mooncake
14701+
14702+
- **Repository URL**: https://github.com/kvcache-ai/Mooncake
14703+
- **License URL**: https://github.com/kvcache-ai/Mooncake/blob/main/LICENSE-APACHE
14704+
- **License name**: Apache 2.0
14705+
14706+
### Authors
14707+
14708+
© Copyright 2025, Mooncake Team.
14709+
Copyright (c) Meta Platforms, Inc. and affiliates.
14710+
Copyright 2024 KVCache.AI
14711+
Ruoyu Qin
14712+
Zheming Li
14713+
Weiran He
14714+
Mingxing Zhang
14715+
Yongwei Wu
14716+
Weimin Zheng
14717+
Xinran Xu
1470014718
## flashinfer
1470114719

1470214720
### License Text

cpp/include/tensorrt_llm/batch_manager/cacheTransceiver.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -269,7 +269,8 @@ class CacheTransceiver : public BaseCacheTransceiver
269269
std::unique_ptr<executor::kv_cache::CacheState> mCacheState;
270270
std::unique_ptr<executor::kv_cache::ConnectionManager> mManager;
271271
std::optional<executor::CacheTransceiverConfig> mCacheTransceiverConfig;
272-
std::unique_ptr<kv_cache_manager::CacheTransBufferManager> mCacheTransBufferManager;
272+
std::vector<std::unique_ptr<kv_cache_manager::CacheTransBufferManager>> mCacheTransBufferManagers;
273+
std::vector<kv_cache_manager::CacheTransBufferManager*> mCacheTransBufferManagerPtrs;
273274
// library handle to the communicator related features,
274275
// this is used to defer dependency resolution until needed.
275276
static std::mutex mDllMutex;

cpp/include/tensorrt_llm/batch_manager/kvCacheManager.h

Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -595,6 +595,21 @@ class WindowBlockManager
595595

596596
~WindowBlockManager();
597597

598+
[[nodiscard]] bool isEnableIndexerKCache() const
599+
{
600+
return mEnableIndexerKCache;
601+
}
602+
603+
[[nodiscard]] SizeType32 getIndexerKCacheQuantBlockSize() const
604+
{
605+
return mIndexerKCacheQuantBlockSize;
606+
}
607+
608+
[[nodiscard]] SizeType32 getIndexerKCacheIndexHeadDim() const
609+
{
610+
return mIndexerKCacheIndexHeadDim;
611+
}
612+
598613
void allocatePools(bool useUvm);
599614

600615
void releasePools();
@@ -809,6 +824,9 @@ class WindowBlockManager
809824
return mBufferManager;
810825
}
811826

827+
//! \brief Sync internal streams used by transfer manager with buffer manager stream
828+
void syncTransferManagerWithBufferManager();
829+
812830
//! \brief Perform per-request bookkeeping
813831
void refreshBlocks();
814832

@@ -1021,6 +1039,21 @@ class BlockManager
10211039
std::optional<kvc::BaseAgentConfig> agentConfig = std::nullopt, bool enableIndexerKCache = false,
10221040
SizeType32 indexerKCacheQuantBlockSize = 128, SizeType32 indexerKCacheIndexHeadDim = 0);
10231041

1042+
[[nodiscard]] bool isEnableIndexerKCache() const
1043+
{
1044+
return mIsEnableIndexerKCache;
1045+
}
1046+
1047+
[[nodiscard]] SizeType32 getIndexerKCacheQuantBlockSize() const
1048+
{
1049+
return mIndexerKCacheQuantBlockSize;
1050+
}
1051+
1052+
[[nodiscard]] SizeType32 getIndexerKCacheIndexHeadDim() const
1053+
{
1054+
return mIndexerKCacheIndexHeadDim;
1055+
}
1056+
10241057
BlockManager(BlockManager const&) = delete;
10251058
BlockManager& operator=(BlockManager const&) = delete;
10261059

@@ -1283,6 +1316,9 @@ class BlockManager
12831316
//! \brief Store newest block for reuse
12841317
void storeNewBlock(GenerationRequest& sequence, OptionalRef<LlmRequest const> llmRequest);
12851318

1319+
//! \brief Sync internal streams used by transfer manager with buffer manager stream
1320+
void syncTransferManagerWithBufferManager();
1321+
12861322
//! \brief Perform per-request bookkeeping
12871323
void refreshBlocks();
12881324

@@ -1398,6 +1434,10 @@ class BlockManager
13981434
std::vector<SizeType32> mAbsolutePoolToRelativePoolIndex;
13991435
// Record what sequences are currently managed by the block manager
14001436
std::set<LlmRequest::RequestIdType> mManagedSequences;
1437+
1438+
bool mIsEnableIndexerKCache{false};
1439+
SizeType32 mIndexerKCacheQuantBlockSize{0};
1440+
SizeType32 mIndexerKCacheIndexHeadDim{0};
14011441
};
14021442

14031443
struct OffsetTableDimensions
@@ -1500,6 +1540,10 @@ class BaseKVCacheManager
15001540

15011541
[[nodiscard]] virtual bool isEnableBlockReuse() const = 0;
15021542

1543+
[[nodiscard]] virtual bool isEnableIndexerKCache() const = 0;
1544+
[[nodiscard]] virtual SizeType32 getIndexerKCacheIndexHeadDim() const = 0;
1545+
[[nodiscard]] virtual SizeType32 getIndexerKCacheQuantBlockSize() const = 0;
1546+
15031547
// void removeToken(SizeType32 seqSlotIdx);
15041548
virtual void rewindKVCache(LlmRequest::RequestIdType requestId, SizeType32 rewindLengths) = 0;
15051549

@@ -1546,6 +1590,7 @@ class BaseKVCacheManager
15461590
[[nodiscard]] virtual runtime::ITensor::SharedPtr getIndexerKCachePool() const = 0;
15471591
[[nodiscard]] virtual SizeType32 getPoolLayerIdx(SizeType32 layer_idx) const = 0;
15481592

1593+
virtual void syncTransferManagerWithBufferManager() = 0;
15491594
virtual void refreshBlocks() = 0;
15501595
virtual void flushIterationEvents() = 0;
15511596
virtual void resetReuseState() = 0;
@@ -1834,6 +1879,21 @@ class KVCacheManager : public BaseKVCacheManager
18341879
return mEnableBlockReuse;
18351880
}
18361881

1882+
[[nodiscard]] bool isEnableIndexerKCache() const override
1883+
{
1884+
return mBlockManager.isEnableIndexerKCache();
1885+
}
1886+
1887+
[[nodiscard]] SizeType32 getIndexerKCacheIndexHeadDim() const override
1888+
{
1889+
return mBlockManager.getIndexerKCacheIndexHeadDim();
1890+
}
1891+
1892+
[[nodiscard]] SizeType32 getIndexerKCacheQuantBlockSize() const override
1893+
{
1894+
return mBlockManager.getIndexerKCacheQuantBlockSize();
1895+
}
1896+
18371897
void removeToken(LlmRequest::RequestIdType requestId);
18381898
void rewindKVCache(LlmRequest::RequestIdType requestId, SizeType32 rewindLengths) override;
18391899

@@ -1912,6 +1972,11 @@ class KVCacheManager : public BaseKVCacheManager
19121972
return mBlockManager.getPoolLayerIdx(layer_idx);
19131973
}
19141974

1975+
void syncTransferManagerWithBufferManager() override
1976+
{
1977+
mBlockManager.syncTransferManagerWithBufferManager();
1978+
}
1979+
19151980
//! \brief Perform per-iteration bookkeeping
19161981
void refreshBlocks() override
19171982
{

cpp/include/tensorrt_llm/batch_manager/kvCacheTransferManager.h

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,15 @@ class KVCacheTransferManager
4646
int numTokensToCopy = 0, executor::KvCacheTransferMode mode = executor::KvCacheTransferMode::DRAM,
4747
std::string const& directory = "");
4848

49-
//! \brief Synchronize the offload/onboard streams with the bufferManager stream.
49+
//! \brief Synchronize internal streams with bufferManager stream.
50+
//! \details The buffer manager uses the same stream as the prefill and decode kernels. This method ensures that the
51+
//! internal kernels used for offloading and onboarding will wait for prefill and decode kernels before performing
52+
//! any block copies. This method must be called before the first call to KVCacheManager::addSequence in every step.
53+
void syncWithBufferManager();
54+
55+
//! \brief Synchronize bufferManager stream with internal streams. This method ensures that prefill and decode
56+
//! kernels for the next step will wait for offloading and onboarding work that has already been scheduled. This method
57+
//! must be called after the last call to KVCacheManager::addSequence in every step.
5058
void syncTransfers();
5159

5260
private:
@@ -75,8 +83,10 @@ class KVCacheTransferManager
7583
runtime::BufferManager mOnboardManager;
7684
runtime::BufferManager mOffloadManager;
7785

78-
// Track the block ids offloaded in this iteration.
79-
std::unordered_map<int32_t, tr::CudaEvent> mPendingOffloads;
86+
// Track reads and writes for blocks. Note that it is the memory pool index that
87+
// identifies the raw memory blocks involved in I/O, not the block Id.
88+
std::unordered_map<kernels::KVCacheIndex::UnderlyingType, tr::CudaEvent> mPendingReads;
89+
std::unordered_map<kernels::KVCacheIndex::UnderlyingType, tr::CudaEvent> mPendingWrites;
8090
// Reference to parent loopback agent
8191
std::shared_ptr<kvc::BaseLoopbackAgent> mLoopbackAgent;
8292
int mDeviceId;

cpp/include/tensorrt_llm/batch_manager/kvCacheUtils.h

Lines changed: 21 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -73,7 +73,8 @@ class BlockRange
7373
BaseKVCacheManager& cacheManager, BlockKey const& lastBlockKey, int32_t indexFromEnd)
7474
{
7575

76-
auto poolNum = cacheManager.getNumPools();
76+
auto poolNum = cacheManager.getBlockManager().getNumPools(
77+
/*includeBlockScalePools=*/false, /*includeIndexerKCachePools=*/false);
7778
TLLM_CHECK_WITH_INFO(poolNum == 1, "Reuse tree is not supported for multiple pools or variable window size");
7879

7980
auto windowSize = cacheManager.getBlockManager().getWindowSizesMetadata().begin()->first;
@@ -136,13 +137,21 @@ class BlockRange
136137
return blockHashesPerWindow;
137138
}
138139

139-
BlockRangeForWindow getBlockRangeForWindow(SizeType32 windowSize) const
140+
BlockRangeForWindow getBlockRangeForWindow(SizeType32 windowSize, bool useIndexerKCache = false) const
140141
{
141142
TLLM_CHECK_WITH_INFO(
142143
mPoolsPerWindow.find(windowSize) != mPoolsPerWindow.end(), "Window size %d not found", windowSize);
143144
auto pool = mPoolsPerWindow.at(windowSize).front();
144145
auto blockIds = mBlockIdsPerWindow.at(windowSize);
145-
return BlockRangeForWindow(mManager, windowSize, std::move(blockIds), std::move(pool));
146+
if (useIndexerKCache)
147+
{
148+
TLLM_CHECK(mIndexerKCachePool);
149+
return BlockRangeForWindow(mManager, windowSize, std::move(blockIds), mIndexerKCachePool);
150+
}
151+
else
152+
{
153+
return BlockRangeForWindow(mManager, windowSize, std::move(blockIds), std::move(pool));
154+
}
146155
}
147156

148157
std::vector<SizeType32> getWindowSizes() const
@@ -167,9 +176,8 @@ class BlockRange
167176
, mRequestId(requestId)
168177
, mBlockIdsPerWindow(std::move(blockIdsPerWindow))
169178
{
170-
171-
// cacheManager.getBlockManager.getPrimaryPool(0);
172-
auto poolNum = mManager->getNumPools();
179+
auto poolNum = mManager->getBlockManager().getNumPools(
180+
/*includeBlockScalePools=*/false, /*includeIndexerKCachePools=*/false);
173181
for (SizeType32 poolIdx = 0; poolIdx < poolNum; ++poolIdx)
174182
{
175183
auto windowSize = cacheManager.getBlockManager().getPoolWindowSize(poolIdx);
@@ -181,21 +189,27 @@ class BlockRange
181189
: mManager(&cacheManager)
182190
, mRequestId(requestId)
183191
{
184-
auto poolNum = mManager->getNumPools();
192+
auto poolNum = mManager->getBlockManager().getNumPools(
193+
/*includeBlockScalePools=*/false, /*includeIndexerKCachePools=*/false);
185194
for (SizeType32 poolIdx = 0; poolIdx < poolNum; ++poolIdx)
186195
{
187196
auto windowSize = cacheManager.getBlockManager().getPoolWindowSize(poolIdx);
188197
mPoolsPerWindow[windowSize].push_back(cacheManager.getBlockManager().getPrimaryPool(poolIdx));
189198
mBlockIdsPerWindow[windowSize]
190199
= cacheManager.getSequence(mRequestId).getCacheBlockIds(windowSize).at(kFIRST_AND_ONLY_BEAM);
191200
}
201+
if (cacheManager.isEnableIndexerKCache())
202+
{
203+
mIndexerKCachePool = cacheManager.getIndexerKCachePool();
204+
}
192205
}
193206

194207
private:
195208
BaseKVCacheManager const* mManager;
196209
LlmRequest::RequestIdType const mRequestId;
197210
std::unordered_map<SizeType32, std::vector<SizeType32>> mBlockIdsPerWindow;
198211
std::unordered_map<SizeType32, std::vector<runtime::ITensor::SharedPtr>> mPoolsPerWindow;
212+
runtime::ITensor::SharedPtr mIndexerKCachePool;
199213

200214
static constexpr SizeType32 kFIRST_AND_ONLY_BEAM = 0;
201215
static constexpr SizeType32 kFIRST_POOL_INDEX = 0;

0 commit comments

Comments
 (0)