Fix failed tests

Tabrizian · Tabrizian · commit a810c12ea5c5 · 2025-11-10T22:56:18.000Z
Signed-off-by: Iman Tabrizian <10105175+tabrizian@users.noreply.github.com>
diff --git a/cpp/tensorrt_llm/batch_manager/cacheFormatter.cpp b/cpp/tensorrt_llm/batch_manager/cacheFormatter.cpp
@@ -47,6 +47,9 @@ BlockRange getBlockRangeForSending(BaseKVCacheManager* cacheManager, LlmRequest
 {
     auto poolNum = cacheManager->getBlockManager().getNumPools(
         /*includeBlockScalePools=*/false, /*includeIndexerKCachePools=*/false);
+
+    // Note: When recv side has CP, the requested seqLen is lesser than seqLen on the sender side as seqLen is
+    // distributed among CP ranks. So, we transfer all blocks from send side.
     if (poolNum > 1 || !cacheManager->isEnableBlockReuse() || lastBlockKey.uniqueTokens.size() == 0 || recvSideHasCP)
     {
         // disable reuse path, and vwsa don't support reuse.
diff --git a/cpp/tests/unit_tests/multi_gpu/cacheTransceiverTest.cpp b/cpp/tests/unit_tests/multi_gpu/cacheTransceiverTest.cpp
@@ -1022,7 +1022,8 @@ class AsymmetricalCacheTest : public ::testing::TestWithParam<AsymmetricTestPara
         }
         int kvFactor = mCacheState->getAttentionConfig().mKvFactor;
         int tokensPerBlock = mCacheState->getModelConfig().mTokensPerBlock;
-        int startTokenId = (blockId * mCpSize + mCpRank) * tokensPerBlock;
+        // We don't account for CP here because contextCP is always 1 currently.
+        int startTokenId = blockId * tokensPerBlock;
         int sizePerHead;
         if (isIndexerKCache)
         {
@@ -1117,7 +1118,8 @@ class AsymmetricalCacheTest : public ::testing::TestWithParam<AsymmetricTestPara
         }
         int kvFactor = mCacheState->getAttentionConfig().mKvFactor;
         int tokensPerBlock = mCacheState->getModelConfig().mTokensPerBlock;
-        int startTokenId = (blockId * mCpSize + mCpRank) * tokensPerBlock;
+        // We don't account for CP here because contextCP is always 1 currently.
+        int startTokenId = blockId * tokensPerBlock;
         int sizePerHead;
         if (isIndexerKCache)
         {
diff --git a/tests/integration/defs/accuracy/test_disaggregated_serving.py b/tests/integration/defs/accuracy/test_disaggregated_serving.py
@@ -45,7 +45,7 @@ def result(self):
 DuckLLM = namedtuple('DuckLLM', ['args', 'tokenizer', 'generate_async'])
 
 DEFAULT_TEST_TIMEOUT = 1800
-DEFAULT_SERVER_WAITING_TIMEOUT = 1200
+DEFAULT_SERVER_WAITING_TIMEOUT = 3600
 
 
 class MyThreadPoolExecutor(ThreadPoolExecutor):

Original file line number	Diff line number	Diff line change
`@@ -47,6 +47,9 @@ BlockRange getBlockRangeForSending(BaseKVCacheManager* cacheManager, LlmRequest`
`47`	`47`	`{`
`48`	`48`	`auto poolNum = cacheManager->getBlockManager().getNumPools(`
`49`	`49`	`/*includeBlockScalePools=*/false, /*includeIndexerKCachePools=*/false);`
	`50`	`+`
	`51`	`+ // Note: When recv side has CP, the requested seqLen is lesser than seqLen on the sender side as seqLen is`
	`52`	`+ // distributed among CP ranks. So, we transfer all blocks from send side.`
`50`	`53`	`if (poolNum > 1 || !cacheManager->isEnableBlockReuse() || lastBlockKey.uniqueTokens.size() == 0 || recvSideHasCP)`
`51`	`54`	`{`
`52`	`55`	`// disable reuse path, and vwsa don't support reuse.`
Original file line number	Diff line number	Diff line change
`@@ -1022,7 +1022,8 @@ class AsymmetricalCacheTest : public ::testing::TestWithParam<AsymmetricTestPara`
`1022`	`1022`	`}`
`1023`	`1023`	`int kvFactor = mCacheState->getAttentionConfig().mKvFactor;`
`1024`	`1024`	`int tokensPerBlock = mCacheState->getModelConfig().mTokensPerBlock;`
`1025`		`- int startTokenId = (blockId * mCpSize + mCpRank) * tokensPerBlock;`
	`1025`	`+ // We don't account for CP here because contextCP is always 1 currently.`
	`1026`	`+ int startTokenId = blockId * tokensPerBlock;`
`1026`	`1027`	`int sizePerHead;`
`1027`	`1028`	`if (isIndexerKCache)`
`1028`	`1029`	`{`
`@@ -1117,7 +1118,8 @@ class AsymmetricalCacheTest : public ::testing::TestWithParam<AsymmetricTestPara`
`1117`	`1118`	`}`
`1118`	`1119`	`int kvFactor = mCacheState->getAttentionConfig().mKvFactor;`
`1119`	`1120`	`int tokensPerBlock = mCacheState->getModelConfig().mTokensPerBlock;`
`1120`		`- int startTokenId = (blockId * mCpSize + mCpRank) * tokensPerBlock;`
	`1121`	`+ // We don't account for CP here because contextCP is always 1 currently.`
	`1122`	`+ int startTokenId = blockId * tokensPerBlock;`
`1121`	`1123`	`int sizePerHead;`
`1122`	`1124`	`if (isIndexerKCache)`
`1123`	`1125`	`{`