
Commit b33047c

Merge branch 'main' into lkomali/replace-gap-with-aiperf
2 parents: 12deeaa + 7ab02ad

File tree: 89 files changed (+7133 / -591 lines)


cpp/include/tensorrt_llm/executor/executor.h
Lines changed: 8 additions & 2 deletions

@@ -1465,16 +1465,19 @@ class CacheTransceiverConfig
         NIXL = 3
     };
     explicit CacheTransceiverConfig(std::optional<BackendType> backendType = std::nullopt,
-        std::optional<size_t> maxNumTokens = std::nullopt, std::optional<int> kvTransferTimeoutMs = std::nullopt);
+        std::optional<size_t> maxNumTokens = std::nullopt, std::optional<int> kvTransferTimeoutMs = std::nullopt,
+        std::optional<int> kvTransferSenderFutureTimeoutMs = std::nullopt);

     bool operator==(CacheTransceiverConfig const& other) const;
     void setBackendType(std::optional<BackendType> backendType);
     void setMaxTokensInBuffer(std::optional<size_t> maxTokensInBuffer);
     void setKvTransferTimeoutMs(std::optional<int> kvTransferTimeoutMs);
+    void setKvTransferSenderFutureTimeoutMs(std::optional<int> kvTransferSenderFutureTimeoutMs);

-    [[nodiscard]] std::optional<int> getKvTransferTimeoutMs() const;
     [[nodiscard]] std::optional<size_t> getMaxTokensInBuffer() const;
     [[nodiscard]] std::optional<BackendType> getBackendType() const;
+    [[nodiscard]] std::optional<int> getKvTransferTimeoutMs() const;
+    [[nodiscard]] std::optional<int> getKvTransferSenderFutureTimeoutMs() const;

private:
    std::optional<BackendType> mBackendType;
@@ -1483,6 +1486,9 @@ class CacheTransceiverConfig
    /// transfer may be degraded.
    std::optional<size_t> mMaxTokensInBuffer;
    std::optional<int> mKvTransferTimeoutMs;
+   // @brief Timeout in milliseconds to wait for the sender future to be ready when scheduled batch size is 0. This
+   // allows the request to be eventually cancelled by the user or because of kv_transfer_timeout_ms
+   std::optional<int> mKvTransferSenderFutureTimeoutMs;
};

/// @brief Configuration class for the model executor
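
For orientation, here is a minimal sketch of how the extended configuration could be constructed and queried, based only on the constructor and accessors declared above; the include path, namespace alias, and the chosen timeout values are illustrative assumptions, not part of the commit:

#include "tensorrt_llm/executor/executor.h" // assumed include path for the header shown above

#include <cassert>
#include <optional>

namespace texec = tensorrt_llm::executor;

void configureCacheTransceiver()
{
    // Construct with a 5 s KV-transfer timeout and a 100 ms sender-future timeout.
    texec::CacheTransceiverConfig config(texec::CacheTransceiverConfig::BackendType::NIXL,
        /*maxNumTokens=*/std::nullopt, /*kvTransferTimeoutMs=*/5000,
        /*kvTransferSenderFutureTimeoutMs=*/100);

    // The new field can also be set after construction; per the setter added in
    // cacheTransceiverConfig.cpp below, non-positive values are rejected.
    config.setKvTransferSenderFutureTimeoutMs(250);
    assert(config.getKvTransferSenderFutureTimeoutMs().value() == 250);
}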

cpp/tensorrt_llm/CMakeLists.txt
Lines changed: 1 addition & 0 deletions

@@ -189,6 +189,7 @@ set(TRTLLM_LINK_LIBS
     fb_gemm_src
     gemm_swiglu_sm90_src
     cutlass_src
+    cute_dsl_src
     layers_src
     runtime_src
     testing_src

cpp/tensorrt_llm/batch_manager/cacheTransceiver.cpp
Lines changed: 30 additions & 3 deletions

@@ -419,6 +419,13 @@ void updateKVCacheTransferBW(std::shared_ptr<CacheTransceiverComm> const& mComm,
 void CacheTransceiver::checkContextTransferStatus(std::optional<int> const& atLeastRequestNum)
 {
     bool blockAll = !atLeastRequestNum.has_value();
+    std::optional<int> senderFutureTimeoutMs = std::nullopt;
+    // If blockAll is true, we want to block and not use a timeout
+    if (!blockAll && mCacheTransceiverConfig.has_value())
+    {
+        senderFutureTimeoutMs = mCacheTransceiverConfig->getKvTransferSenderFutureTimeoutMs();
+    }
+
     auto syncComm = mCacheState->getParallelConfig().mEnableAttentionDP ? mGroupTPInDPComm : mGroupTensorParaComm;
     std::vector<LlmRequest::RequestIdType> contextCompleteRequestIds;
     for (auto&& [request, future] : mSenderFutures)
@@ -476,16 +483,36 @@ void CacheTransceiver::checkContextTransferStatus(std::optional<int> const& atLeastRequestNum)
         {
             try
             {
-                future.get();
-                request->setState(LlmRequestState::kDISAGG_CONTEXT_COMPLETE);
+                // Wait for up to a specified timeout
+                auto status = future.wait_for(std::chrono::milliseconds(senderFutureTimeoutMs.value_or(0)));
+                if (status == std::future_status::ready || !senderFutureTimeoutMs.has_value())
+                {
+                    future.get();
+                    request->setState(LlmRequestState::kDISAGG_CONTEXT_COMPLETE);
+                    it = mSenderFutures.erase(it);
+                }
+                else if (status == std::future_status::timeout)
+                {
+                    TLLM_LOG_WARNING("Timed out waiting for context transfer for request %ld after %d milliseconds.",
+                        request->mRequestId, senderFutureTimeoutMs.value());
+                    ++it;
+                }
+                else
+                {
+                    TLLM_LOG_ERROR(
+                        "Future returned unexpected status for request %ld. Marking as error", request->mRequestId);
+
+                    request->setState(LlmRequestState::kDISAGG_TRANS_ERROR);
+                    it = mSenderFutures.erase(it);
+                }
             }
             catch (std::exception const& e)
             {
                 TLLM_LOG_ERROR(
                     "Error occurred during context transfer for request %ld: %s", request->mRequestId, e.what());
                 request->setState(LlmRequestState::kDISAGG_TRANS_ERROR);
+                it = mSenderFutures.erase(it);
             }
-            it = mSenderFutures.erase(it);
         }
         else
         {
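
The change above replaces an unconditional blocking future.get() with a bounded wait, so a sender whose transfer has not completed no longer stalls the whole status check. Below is a standalone sketch of the same control flow using plain std::future; the TensorRT-LLM request, state, and logging types are replaced with illustrative stand-ins:

#include <chrono>
#include <future>
#include <iostream>
#include <map>
#include <optional>

using RequestId = long;

// Stand-in for mSenderFutures: one pending transfer per request.
std::map<RequestId, std::future<void>> senderFutures;

void checkTransfers(std::optional<int> senderFutureTimeoutMs)
{
    for (auto it = senderFutures.begin(); it != senderFutures.end();)
    {
        auto& future = it->second;
        // Bounded wait; value_or(0) makes the no-timeout case a non-blocking poll.
        auto status = future.wait_for(std::chrono::milliseconds(senderFutureTimeoutMs.value_or(0)));
        if (status == std::future_status::ready || !senderFutureTimeoutMs.has_value())
        {
            // No configured timeout falls back to the old blocking behaviour via get().
            future.get();                 // rethrows if the transfer failed
            it = senderFutures.erase(it); // transfer complete: drop the entry
        }
        else if (status == std::future_status::timeout)
        {
            std::cerr << "request " << it->first << " still transferring, will retry\n";
            ++it;                         // keep the entry and revisit on the next call
        }
        else // std::future_status::deferred or anything unexpected
        {
            it = senderFutures.erase(it); // treat as an error, as the real code does
        }
    }
}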

cpp/tensorrt_llm/executor/cacheTransceiverConfig.cpp
Lines changed: 17 additions & 2 deletions

@@ -21,11 +21,13 @@
 namespace tensorrt_llm::executor
 {

-CacheTransceiverConfig::CacheTransceiverConfig(
-    std::optional<BackendType> backendType, std::optional<size_t> maxNumTokens, std::optional<int> kvTransferTimeoutMs)
+CacheTransceiverConfig::CacheTransceiverConfig(std::optional<BackendType> backendType,
+    std::optional<size_t> maxNumTokens, std::optional<int> kvTransferTimeoutMs,
+    std::optional<int> kvTransferSenderFutureTimeoutMs)
     : mBackendType(backendType)
     , mMaxTokensInBuffer(maxNumTokens)
     , mKvTransferTimeoutMs(kvTransferTimeoutMs)
+    , mKvTransferSenderFutureTimeoutMs(kvTransferSenderFutureTimeoutMs)
 {
 }

@@ -54,6 +56,15 @@ void CacheTransceiverConfig::setKvTransferTimeoutMs(std::optional<int> kvTransferTimeoutMs)
     mKvTransferTimeoutMs = kvTransferTimeoutMs;
 }

+void CacheTransceiverConfig::setKvTransferSenderFutureTimeoutMs(std::optional<int> kvTransferSenderFutureTimeoutMs)
+{
+    if (kvTransferSenderFutureTimeoutMs.has_value() && kvTransferSenderFutureTimeoutMs.value() <= 0)
+    {
+        TLLM_THROW("kvTransferSenderFutureTimeoutMs must be positive");
+    }
+    mKvTransferSenderFutureTimeoutMs = kvTransferSenderFutureTimeoutMs;
+}
+
 std::optional<CacheTransceiverConfig::BackendType> CacheTransceiverConfig::getBackendType() const
 {
     return mBackendType;
@@ -69,4 +80,8 @@ std::optional<int> CacheTransceiverConfig::getKvTransferTimeoutMs() const
     return mKvTransferTimeoutMs;
 }

+std::optional<int> CacheTransceiverConfig::getKvTransferSenderFutureTimeoutMs() const
+{
+    return mKvTransferSenderFutureTimeoutMs;
+}
 } // namespace tensorrt_llm::executor

cpp/tensorrt_llm/executor/serialization.cpp
Lines changed: 7 additions & 1 deletion

@@ -1290,20 +1290,26 @@ CacheTransceiverConfig Serialization::deserializeCacheTransceiverConfig(std::istream& is)
 {
     auto backendType = su::deserialize<std::optional<CacheTransceiverConfig::BackendType>>(is);
     auto maxTokensInBuffer = su::deserialize<std::optional<size_t>>(is);
-    return CacheTransceiverConfig{backendType, maxTokensInBuffer};
+    auto kvTransferTimeoutMs = su::deserialize<std::optional<int>>(is);
+    auto kvTransferSenderFutureTimeoutMs = su::deserialize<std::optional<int>>(is);
+    return CacheTransceiverConfig{backendType, maxTokensInBuffer, kvTransferTimeoutMs, kvTransferSenderFutureTimeoutMs};
 }

 void Serialization::serialize(CacheTransceiverConfig const& cacheTransceiverConfig, std::ostream& os)
 {
     su::serialize(cacheTransceiverConfig.getBackendType(), os);
     su::serialize(cacheTransceiverConfig.getMaxTokensInBuffer(), os);
+    su::serialize(cacheTransceiverConfig.getKvTransferTimeoutMs(), os);
+    su::serialize(cacheTransceiverConfig.getKvTransferSenderFutureTimeoutMs(), os);
 }

 size_t Serialization::serializedSize(CacheTransceiverConfig const& cacheTransceiverConfig)
 {
     size_t totalSize = 0;
     totalSize += su::serializedSize(cacheTransceiverConfig.getBackendType());
     totalSize += su::serializedSize(cacheTransceiverConfig.getMaxTokensInBuffer());
+    totalSize += su::serializedSize(cacheTransceiverConfig.getKvTransferTimeoutMs());
+    totalSize += su::serializedSize(cacheTransceiverConfig.getKvTransferSenderFutureTimeoutMs());
     return totalSize;
 }
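
The two timeout fields are appended in the same order in serialize, deserializeCacheTransceiverConfig, and serializedSize. The su:: helpers themselves are not part of this diff; the sketch below only illustrates the ordering discipline such code relies on, using a plain stream round-trip of std::optional<int>, and is not the actual implementation:

#include <iostream>
#include <optional>
#include <sstream>

// Write a presence flag, then the value if present.
void writeOptionalInt(std::optional<int> const& value, std::ostream& os)
{
    bool const hasValue = value.has_value();
    os.write(reinterpret_cast<char const*>(&hasValue), sizeof(hasValue));
    if (hasValue)
    {
        os.write(reinterpret_cast<char const*>(&value.value()), sizeof(int));
    }
}

// Read fields back in exactly the order they were written.
std::optional<int> readOptionalInt(std::istream& is)
{
    bool hasValue = false;
    is.read(reinterpret_cast<char*>(&hasValue), sizeof(hasValue));
    if (!hasValue)
    {
        return std::nullopt;
    }
    int value = 0;
    is.read(reinterpret_cast<char*>(&value), sizeof(int));
    return value;
}

int main()
{
    std::stringstream buffer;
    writeOptionalInt(5000, buffer); // kvTransferTimeoutMs
    writeOptionalInt(100, buffer);  // kvTransferSenderFutureTimeoutMs
    auto kvTransferTimeoutMs = readOptionalInt(buffer);
    auto kvTransferSenderFutureTimeoutMs = readOptionalInt(buffer);
    std::cout << kvTransferTimeoutMs.value() << " " << kvTransferSenderFutureTimeoutMs.value() << "\n";
    return 0;
}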

cpp/tensorrt_llm/kernels/CMakeLists.txt
Lines changed: 3 additions & 0 deletions

@@ -22,6 +22,8 @@ file(GLOB_RECURSE SRC_CU *.cu)
 # selectiveScan trtllmGenKernels folder
 list(FILTER SRC_CPP EXCLUDE REGEX "cutlass_kernels/.*")
 list(FILTER SRC_CU EXCLUDE REGEX "cutlass_kernels/.*")
+list(FILTER SRC_CPP EXCLUDE REGEX "cuteDslKernels/.*")
+list(FILTER SRC_CU EXCLUDE REGEX "cuteDslKernels/.*")
 list(FILTER SRC_CPP EXCLUDE REGEX "flashMLA/.*")
 list(FILTER SRC_CU EXCLUDE REGEX "flashMLA/.*")
 list(FILTER SRC_CPP EXCLUDE REGEX "contextFusedMultiHeadAttention/.*")
@@ -75,6 +77,7 @@ target_include_directories(
 add_cuda_architectures(kernels_src 89)

 add_subdirectory(cutlass_kernels)
+add_subdirectory(cuteDslKernels)
 add_subdirectory(flashMLA)
 add_subdirectory(contextFusedMultiHeadAttention)
 add_subdirectory(decoderMaskedMultiheadAttention)

cpp/tensorrt_llm/kernels/IndexerTopK.h
Lines changed: 5 additions & 5 deletions

@@ -24,12 +24,12 @@

 namespace tensorrt_llm::kernels
 {
-void invokeIndexerTopKDecode(float const* logits, int const* seqLens, int* outIndices, float* auxLogits,
-    int* auxIndices, int const splitWorkThreshold, int const numRows, int const numColumns, int const stride0,
-    int const stride1, int const next_n, int const index_topk = 2048, cudaStream_t const stream = 0);
+void invokeIndexerTopKDecode(float const* logits, int const* seqLens, int* indices, float* outLogitsAux,
+    int* outIndicesAux, int const splitWorkThreshold, int const numRows, int const numColumns, int const stride0,
+    int const stride1, int const next_n, int const topK = 2048, cudaStream_t const stream = 0);

-void invokeIndexerTopKPrefill(float const* logits, int const* rowStarts, int const* rowEnds, int* outIndices,
-    int const numRows, int const numColumns, int const stride0, int const stride1, int const index_topk = 2048,
+void invokeIndexerTopKPrefill(float const* logits, int const* rowStarts, int const* rowEnds, int* indices,
+    int const numRows, int const numColumns, int const stride0, int const stride1, int const topK = 2048,
     cudaStream_t const stream = 0);

} // namespace tensorrt_llm::kernels

cpp/tensorrt_llm/kernels/communicationKernels/moeAlltoAllKernels.cu
Lines changed: 6 additions & 0 deletions

@@ -51,6 +51,12 @@ namespace tensorrt_llm::kernels::mnnvl_throughput
         __VA_ARGS__;            \
         break;                  \
     }                           \
+    case 6:                     \
+    {                           \
+        constexpr int TOP_K = 6; \
+        __VA_ARGS__;            \
+        break;                  \
+    }                           \
     case 4:                     \
     {                           \
         constexpr int TOP_K = 4; \
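
The macro being edited maps a runtime top-k value onto a compile-time constant by switching over the supported values, so each case can expand the kernel body with a constexpr TOP_K in scope; the new case simply adds 6 to that set. A reduced, self-contained sketch of the dispatch pattern follows; the macro name, the case for 8, and the placeholder function are illustrative and not the real kernel code:

#include <cstdio>
#include <stdexcept>

// Placeholder for code that needs TOP_K as a compile-time constant,
// e.g. to size per-thread arrays inside a CUDA kernel.
template <int TOP_K>
void runAlltoAllDispatch()
{
    std::printf("dispatching with TOP_K = %d\n", TOP_K);
}

// Simplified analogue of the dispatch macro: every supported top-k value
// gets a case that introduces a constexpr TOP_K before expanding the body.
#define DISPATCH_TOP_K(topK, ...)                                   \
    switch (topK)                                                   \
    {                                                               \
    case 8:                                                         \
    {                                                               \
        constexpr int TOP_K = 8;                                    \
        __VA_ARGS__;                                                \
        break;                                                      \
    }                                                               \
    case 6: /* newly supported value */                             \
    {                                                               \
        constexpr int TOP_K = 6;                                    \
        __VA_ARGS__;                                                \
        break;                                                      \
    }                                                               \
    case 4:                                                         \
    {                                                               \
        constexpr int TOP_K = 4;                                    \
        __VA_ARGS__;                                                \
        break;                                                      \
    }                                                               \
    default:                                                        \
        throw std::invalid_argument("unsupported top-k value");     \
    }

void dispatch(int topK)
{
    DISPATCH_TOP_K(topK, runAlltoAllDispatch<TOP_K>());
}
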
cpp/tensorrt_llm/kernels/cuteDslKernels/CMakeLists.txt
Lines changed: 23 additions & 0 deletions

@@ -0,0 +1,23 @@
+#
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES.
+# All rights reserved. SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may not
+# use this file except in compliance with the License. You may obtain a copy of
+# the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations under
+# the License.
+#
+
+file(GLOB_RECURSE SRC_CPP *.cpp)
+file(GLOB_RECURSE SRC_CU *.cu)
+
+add_library(cute_dsl_src OBJECT ${SRC_CPP} ${SRC_CU})
+set_property(TARGET cute_dsl_src PROPERTY POSITION_INDEPENDENT_CODE ON)
+set_property(TARGET cute_dsl_src PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)