@@ -66,6 +66,7 @@ struct MlaRopeGenArgs
     float const* kv_scale_quant_orig_ptr;
     float host_bmm1_scale;
     int32_t const* helix_position_offsets_ptr;
+    bool const* helix_is_inactive_rank_ptr;
 };

 template <typename T, typename KVCacheBuffer>
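Note on the new member: `MlaRopeGenArgs` is filled by positional brace initialization at the call site (last hunk below), so `helix_is_inactive_rank_ptr` has to remain the last declared field. A minimal sketch, showing only the fields visible in this diff, of how default member initializers could keep any call site that omits the helix pointers well-defined; the `nullptr`/`1.0f` defaults are a suggested hardening, not part of this change:

```cpp
// Sketch only: the struct's other members are omitted, and the defaults below
// are an assumption layered on top of this PR, not code from it.
struct MlaRopeGenArgs
{
    float const* kv_scale_quant_orig_ptr = nullptr;
    float host_bmm1_scale = 1.0f;
    int32_t const* helix_position_offsets_ptr = nullptr; // optional; nullptr when unused
    bool const* helix_is_inactive_rank_ptr = nullptr;    // optional; nullptr when unused
};
```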
@@ -105,6 +106,7 @@ void invokeMLARopeGenerationHelper(T const* latent_cache_ptr, T* q_pe_ptr, T* fu
     mla_params.dequant_scale_kv = args.kv_scale_quant_orig_ptr;
     mla_params.host_bmm1_scale = args.host_bmm1_scale;
     mla_params.helix_position_offsets = args.helix_position_offsets_ptr;
+    mla_params.helix_is_inactive_rank = args.helix_is_inactive_rank_ptr;

     tk::invokeMLARopeGeneration<T>(mla_params, kv_cache_buffer, stream);
 }
@@ -134,7 +136,7 @@ void MLARopeGeneration(torch::Tensor fused_q, // [tokens, num_heads, (nope_dim +
         head_size == kv_lora_rank + qk_rope_head_dim, "head_size must = kv_lora_rank + qk_rope_head_dim");
     TLLM_CHECK_WITH_INFO(num_kv_heads == 1, "num_kv_heads must = 1");
     TORCH_CHECK(
-        mla_tensor_params.size() == 1, "Expecting 1 tensor for custom MLA tensor params: helix_position_offsets.");
+        mla_tensor_params.size() == 2, "Expecting 2 tensors for custom MLA tensor params: helix_position_offsets and helix_is_inactive_rank.");

     auto stream = at::cuda::getCurrentCUDAStream(fused_q.get_device());
     auto const kv_cache_quant_mode = tc::QuantMode(uint32_t(quant_mode));
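The stricter `TORCH_CHECK` makes the two-slot layout part of the op's contract. A caller-side sketch, assuming the binding passes `mla_tensor_params` as a list of optional tensors (which the `has_value()`/`data_ptr` accesses below suggest); `makeMlaTensorParams` is a hypothetical helper, not part of this PR:

```cpp
#include <optional>
#include <utility>
#include <vector>
#include <torch/torch.h>

// Both slots must always be present to satisfy the size()==2 check, but either
// may be std::nullopt; order is fixed (0: offsets, 1: inactive-rank flags).
std::vector<std::optional<torch::Tensor>> makeMlaTensorParams(
    std::optional<torch::Tensor> helix_position_offsets, // int32 per-token offsets, or nullopt
    std::optional<torch::Tensor> helix_is_inactive_rank) // bool flags, or nullopt
{
    return {std::move(helix_position_offsets), std::move(helix_is_inactive_rank)};
}
```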
@@ -153,6 +155,7 @@ void MLARopeGeneration(torch::Tensor fused_q, // [tokens, num_heads, (nope_dim +
     int32_t const num_gen_tokens = num_tokens;
     int32_t const seq_offset = num_contexts;
     auto& mla_helix_position_offsets = mla_tensor_params[0];
+    auto& mla_helix_is_inactive_rank = mla_tensor_params[1];
     int32_t const layer_num = host_kv_cache_pool_mapping.value().size(0);

     tk::MlaMetaParams mla_meta_params = {static_cast<int>(q_lora_rank), static_cast<int>(kv_lora_rank),
@@ -161,6 +164,8 @@ void MLARopeGeneration(torch::Tensor fused_q, // [tokens, num_heads, (nope_dim +

     int32_t const* helix_position_offsets_ptr
         = mla_helix_position_offsets.has_value() ? mla_helix_position_offsets->data_ptr<int32_t>() : nullptr;
+    bool const* helix_is_inactive_rank_ptr
+        = mla_helix_is_inactive_rank.has_value() ? mla_helix_is_inactive_rank->data_ptr<bool>() : nullptr;

     int* cu_q_seqlens_ptr = reinterpret_cast<int*>(cu_q_seqlens.data_ptr());
     int* cu_kv_seqlens_ptr = reinterpret_cast<int*>(cu_kv_seqlens.data_ptr());
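The optional-tensor-to-raw-pointer dance now appears twice; a hypothetical refactor (not in this PR) that factors it into one helper, assuming the list elements are `std::optional<torch::Tensor>`:

```cpp
#include <optional>
#include <torch/torch.h>

// Returns the typed data pointer if the tensor is present, else nullptr, which
// is exactly the convention the kernel arguments use for optional inputs.
template <typename T>
T const* optionalDataPtrOrNull(std::optional<torch::Tensor> const& t)
{
    return t.has_value() ? t->data_ptr<T>() : nullptr;
}

// Equivalent to the two ternaries above:
//   auto* helix_position_offsets_ptr = optionalDataPtrOrNull<int32_t>(mla_helix_position_offsets);
//   auto* helix_is_inactive_rank_ptr = optionalDataPtrOrNull<bool>(mla_helix_is_inactive_rank);
```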
@@ -274,7 +279,7 @@ void MLARopeGeneration(torch::Tensor fused_q, // [tokens, num_heads, (nope_dim +
         static_cast<int32_t>(num_heads), mla_meta_params, sequence_lengths_ptr, max_context_q_len,
         block_ids_per_seq_ptr, cache_type, cu_q_seqlens_ptr, cu_kv_seqlens_ptr, fmha_tile_counter_ptr,
         mla_bmm1_scale_ptr, mla_bmm2_scale_ptr, quant_q_buffer_ptr, quant_scale_o_ptr, kv_scale_orig_quant_ptr,
-        kv_scale_quant_orig_ptr, host_bmm1_scale, helix_position_offsets_ptr};
+        kv_scale_quant_orig_ptr, host_bmm1_scale, helix_position_offsets_ptr, helix_is_inactive_rank_ptr};

     auto const input_dtype = fused_q.scalar_type();
     if (input_dtype == torch::kFloat16)
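Positional brace initialization fills `MlaRopeGenArgs` members in declaration order, which is why the new pointer is appended last both in the struct and in this initializer. A hypothetical alternative for making such long positional lists less error-prone (C++20 designated initializers; only the members visible in this diff are shown, and a real call would still need to set the omitted ones, since skipped members are value-initialized):

```cpp
// Hypothetical C++20 rewrite: designators must follow declaration order, so a
// misplaced argument becomes a compile error instead of a silently wrong kernel.
MlaRopeGenArgs args{
    .kv_scale_quant_orig_ptr = kv_scale_quant_orig_ptr,
    .host_bmm1_scale = host_bmm1_scale,
    .helix_position_offsets_ptr = helix_position_offsets_ptr,
    .helix_is_inactive_rank_ptr = helix_is_inactive_rank_ptr,
};
```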