Commit 2b1c9dd

formatting

1 parent b096b89 commit 2b1c9dd

12 files changed: +71 -56 lines changed

cpp/tensorrt_llm/kernels/mlaKernels.cu
Lines changed: 2 additions & 0 deletions

@@ -425,6 +425,7 @@ __global__ void applyMLARopeAndAssignQKVKernelGeneration(T* qkv_output, T* q_pe,

     if (valid_token)
     {
+
         auto const position_id
             = (helix_position_offsets != nullptr ? helix_position_offsets[global_token_idx]
                                                  : kv_cache_lengths[batch_idx] - seq_len + local_token_idx);
@@ -463,6 +464,7 @@ __global__ void applyMLARopeAndAssignQKVKernelGeneration(T* qkv_output, T* q_pe,
     if (head_idx == head_num && (helix_is_inactive_rank == nullptr || !helix_is_inactive_rank[batch_idx]))
     {
         auto const token_kv_idx = kv_cache_lengths[batch_idx] - seq_len + local_token_idx;
+
         {
             auto kDst = reinterpret_cast<T*>(kv_cache.getKBlockPtr(batch_idx, token_kv_idx));
             auto inBlockIdx = kv_cache.getKVLocalIdx(

cpp/tensorrt_llm/kernels/mlaKernels.h
Lines changed: 1 addition & 0 deletions

@@ -107,6 +107,7 @@ struct MlaParams

     // for Helix parallelism: the rotary position offsets [b]
     int32_t const* helix_position_offsets{nullptr};
+
     // for Helix parallelism: whether the current rank is inactive, shape [b]
     // (the current query tokens are not appended to this rank's KV cache)
     bool const* helix_is_inactive_rank{nullptr};

cpp/tensorrt_llm/thop/dsv3RopeOp.cpp
Lines changed: 2 additions & 2 deletions

@@ -135,8 +135,8 @@ void MLARopeGeneration(torch::Tensor fused_q, // [tokens, num_heads, (nope_dim +
     TLLM_CHECK_WITH_INFO(
         head_size == kv_lora_rank + qk_rope_head_dim, "head_size must = kv_lora_rank + qk_rope_head_dim");
     TLLM_CHECK_WITH_INFO(num_kv_heads == 1, "num_kv_heads must = 1");
-    TORCH_CHECK(
-        mla_tensor_params.size() == 2, "Expecting 2 tensors for custom MLA tensor params: helix_position_offsets and helix_is_inactive_rank.");
+    TORCH_CHECK(mla_tensor_params.size() == 2,
+        "Expecting 2 tensors for custom MLA tensor params: helix_position_offsets and helix_is_inactive_rank.");

     auto stream = at::cuda::getCurrentCUDAStream(fused_q.get_device());
     auto const kv_cache_quant_mode = tc::QuantMode(uint32_t(quant_mode));

tensorrt_llm/_torch/attention_backend/trtllm.py
Lines changed: 13 additions & 9 deletions

@@ -296,7 +296,8 @@ def plan(
         self.sparse_mla_topk = sparse_mla_topk
         self.helix_position_offsets = helix_position_offsets
         self.helix_is_inactive_rank = helix_is_inactive_rank
-        if self.helix_is_inactive_rank is not None and not isinstance(self.helix_is_inactive_rank, torch.Tensor):
+        if self.helix_is_inactive_rank is not None and not isinstance(
+                self.helix_is_inactive_rank, torch.Tensor):
             self.helix_is_inactive_rank = torch.tensor(
                 self.helix_is_inactive_rank, dtype=torch.bool, pin_memory=True)

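For context on the pin_memory=True conversion above: keeping the per-request flags in pinned (page-locked) host memory is what lets a later host-to-device copy run asynchronously. A minimal standalone sketch of that pattern follows; the helper name and example values are hypothetical, not part of this commit.

import torch

def to_pinned_bool_tensor(flags):
    # Convert a Python list of per-request flags into a bool tensor, pinned
    # when CUDA is available so a later .to('cuda', non_blocking=True) copy
    # can overlap with other work on the stream. Illustrative sketch only.
    if flags is None or isinstance(flags, torch.Tensor):
        return flags
    return torch.tensor(flags, dtype=torch.bool,
                        pin_memory=torch.cuda.is_available())

# Hypothetical usage with three requests, the second on an inactive rank:
helix_is_inactive_rank = to_pinned_bool_tensor([False, True, False])
if torch.cuda.is_available():
    on_gpu = helix_is_inactive_rank.to("cuda", non_blocking=True)
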
@@ -478,7 +479,9 @@ def run(
         spec_decoding_tensor_params.append(self.spec_decoding_bl_tree_mask)
         spec_decoding_tensor_params.append(
             self.spec_bl_tree_first_sparse_mask_offset_kv)
-        mla_tensor_params = [self.helix_position_offsets, self.helix_is_inactive_rank]
+        mla_tensor_params = [
+            self.helix_position_offsets, self.helix_is_inactive_rank
+        ]

         thop.attention(
             q,
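The two-element list built above mirrors the TORCH_CHECK added in cpp/tensorrt_llm/thop/dsv3RopeOp.cpp earlier in this commit: the op expects exactly helix_position_offsets followed by helix_is_inactive_rank, in that order. The small sketch below only illustrates that convention; the placeholder shapes and the assumption that entries may be None outside Helix runs are illustrative, not verified against the op.

import torch

batch_size = 4  # hypothetical
helix_position_offsets = torch.zeros(batch_size, dtype=torch.int32)  # shape [b]
helix_is_inactive_rank = torch.zeros(batch_size, dtype=torch.bool)   # shape [b]

# Index 0 -> position offsets, index 1 -> inactive-rank flags.
mla_tensor_params = [helix_position_offsets, helix_is_inactive_rank]
assert len(mla_tensor_params) == 2
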
@@ -855,10 +858,8 @@ def prepare(self) -> None:
         if self.helix_is_inactive_rank is not None and len(
                 self.helix_is_inactive_rank):
             # If helix is inactive, attend to the previously cached tokens only.
-            # This gets further complicated with multiple requests as each request might
-            # have a different active helix rank.
             assert cached_token_lens is not None, "cached_token_lens should be set for helix"
-            kv_lens = cached_token_lens
+            kv_lens = cached_token_lens.clone()
             helix_is_inactive_rank_cpu = torch.tensor(
                 self.helix_is_inactive_rank,
                 dtype=torch.bool,
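The switch from kv_lens = cached_token_lens to .clone() above matters because kv_lens is subsequently adjusted per request while cached_token_lens is shared metadata; without the copy, those adjustments would leak back into the shared tensor. A minimal illustration with hypothetical lengths and a placeholder adjustment:

import torch

# Hypothetical per-request metadata (not the commit's values).
cached_token_lens = torch.tensor([128, 256, 64], dtype=torch.int32)
helix_is_inactive_rank = torch.tensor([False, True, False])

kv_lens = cached_token_lens.clone()   # independent buffer, safe to edit in place
kv_lens[helix_is_inactive_rank] -= 1  # placeholder for the per-rank adjustment

assert kv_lens.data_ptr() != cached_token_lens.data_ptr()
assert cached_token_lens[1].item() == 256  # shared metadata left untouched
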
@@ -1768,15 +1769,18 @@ def mla_rope_generation(
         assert self.is_mla_enable and self.mla_params is not None
         assert metadata.kv_cache_manager is not None
         sink_token_length = 0
-
+
         # Ensure helix_is_inactive_rank is on the same device as other tensors
         if helix_is_inactive_rank is not None:
             if isinstance(helix_is_inactive_rank, list):
                 helix_is_inactive_rank = torch.tensor(
-                    helix_is_inactive_rank, dtype=torch.bool, device=helix_position_offsets.device)
+                    helix_is_inactive_rank,
+                    dtype=torch.bool,
+                    device=helix_position_offsets.device)
             elif helix_is_inactive_rank.device.type != 'cuda':
-                helix_is_inactive_rank = helix_is_inactive_rank.to(helix_position_offsets.device)
-
+                helix_is_inactive_rank = helix_is_inactive_rank.to(
+                    helix_position_offsets.device)
+
         mla_tensor_params = [helix_position_offsets, helix_is_inactive_rank]

         torch.ops.trtllm.mla_rope_generation(

tensorrt_llm/_torch/distributed/communicator.py
Lines changed: 6 additions & 3 deletions

@@ -1,10 +1,10 @@
+import copy
 import math
 import pickle  # nosec B403
 from abc import ABC, abstractmethod
 from functools import wraps
 from typing import Optional

-import copy
 import numpy as np
 import torch
 import torch.distributed as dist
@@ -346,7 +346,8 @@ def __init__(self, mapping: Mapping):
         # Repurpose CP ranks to TP for Helix so that the right comms are created.
         mapping_with_helix = None
         if self.mapping.cp_size > 1:
-            print(f"[MPIDist::__init__] Repurposing CP ranks to TP for Helix.")
+            logger.info(
+                f"[MPIDist::__init__] Repurposing CP ranks to TP for Helix.")
             mapping_with_helix = copy.deepcopy(self.mapping)
             mapping_without_helix = Mapping(
                 world_size=self.mapping.world_size,
@@ -364,7 +365,9 @@ def __init__(self, mapping: Mapping):

         # Restore the original mapping.
         if mapping_with_helix is not None:
-            print(f"[MPIDist::__init__] Restoring original mapping.")
+            logger.info(
+                f"[MPIDist::__init__] Restoring original mapping undoing Helix manipulation."
+            )
             self.mapping = mapping_with_helix

     def broadcast(self, obj, root=0, chunk_size: int = 4 * 1024 * 1024):
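Both __init__ hunks above follow a save-modify-restore pattern: keep an untouched deepcopy of the mapping, temporarily fold the CP ranks into TP so the communicators are created against the flattened layout, then put the original mapping back. A standalone sketch of that pattern, with a plain dict standing in for Mapping and illustrative field names (not the MPIDist implementation):

import copy

class HelixCommsSketch:
    # Minimal stand-in for the save/modify/restore pattern used in
    # MPIDist.__init__ above; a plain dict replaces Mapping.

    def __init__(self, mapping):
        self.mapping = mapping
        saved = None
        if self.mapping.get("cp_size", 1) > 1:
            # Keep an untouched copy, then fold CP ranks into TP so the
            # communicators are created against the flattened layout.
            saved = copy.deepcopy(self.mapping)
            self.mapping = {
                **self.mapping,
                "tp_size": self.mapping["tp_size"] * self.mapping["cp_size"],
                "cp_size": 1,
            }
        self._build_comms(self.mapping)
        if saved is not None:
            # Restore the original mapping once the comms exist.
            self.mapping = saved

    def _build_comms(self, mapping):
        # Placeholder for communicator creation.
        pass

comm = HelixCommsSketch({"tp_size": 2, "cp_size": 4})
assert comm.mapping["cp_size"] == 4  # original mapping restored
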

tensorrt_llm/_torch/models/modeling_deepseekv3.py
Lines changed: 0 additions & 2 deletions

@@ -545,8 +545,6 @@ def __init__(
             config=model_config,
             aux_stream=aux_stream,
             mapping_with_cp=mapping_with_cp)
-        # @B: Does this layer need to know about mapping_with_cp?
-        # Likely no because no use of mapping.
         self.kv_a_proj_with_mqa = DeepseekV3Linear(
             config.hidden_size,
             self.kv_lora_rank + self.qk_rope_head_dim +

tensorrt_llm/_torch/modules/attention.py
Lines changed: 30 additions & 30 deletions

@@ -750,7 +750,8 @@ def __init__(
         # tensor parallel
         config = config or ModelConfig()
        if mapping_with_cp is not None:
-            print("[MLA::__init__] OVERRIDING MAPPING WITH CP DETECTED.")
+            logger.warning(
+                "[MLA::__init__] Overriding mapping with CP detected.")
             self.mapping = mapping_with_cp
         else:
             self.mapping = config.mapping
@@ -762,7 +763,8 @@ def __init__(
         if self.mapping.has_cp_ulysses():
             raise NotImplementedError("MLA doesn't support CP Ulyssees yet")
         if self.mapping.cp_size > 1:
-            assert self.mapping.cp_config['cp_type'] == CpType.HELIX, f"CP type must be HELIX for MLA, but got {self.mapping.cp_config['cp_type']}."
+            assert self.mapping.cp_config[
+                'cp_type'] == CpType.HELIX, f"CP type must be HELIX for MLA, but got {self.mapping.cp_config['cp_type']}."

         mapping = Mapping(
             world_size=tp_size * pp_size * cp_size,
@@ -1727,20 +1729,19 @@ def forward_absorption_generation(
             maybe_execute_in_parallel(
                 lambda: torch.ops.trtllm.bmm_out(
                     q_nope_t, self.k_b_proj_trans.transpose(1, 2), q_nope_out),
-                lambda: self.mqa.mla_rope_generation(fused_q,
-                                                     q_pe,
-                                                     latent_cache,
-                                                     attn_metadata,
-                                                     cu_q_seqlens,
-                                                     cu_kv_seqlens,
-                                                     fmha_scheduler_counter,
-                                                     mla_bmm1_scale,
-                                                     mla_bmm2_scale,
-                                                     quant_q_buffer,
-                                                     helix_position_offsets=
-                                                     helix_position_offsets,
-                                                     helix_is_inactive_rank=
-                                                     helix_is_inactive_rank),
+                lambda: self.mqa.mla_rope_generation(
+                    fused_q,
+                    q_pe,
+                    latent_cache,
+                    attn_metadata,
+                    cu_q_seqlens,
+                    cu_kv_seqlens,
+                    fmha_scheduler_counter,
+                    mla_bmm1_scale,
+                    mla_bmm2_scale,
+                    quant_q_buffer,
+                    helix_position_offsets=helix_position_offsets,
+                    helix_is_inactive_rank=helix_is_inactive_rank),
                 self.ln_events[0],
                 self.ln_events[1],
                 rope_stream,
@@ -1758,20 +1759,19 @@ def forward_absorption_generation(
                     q_nope_out,
                     self.k_b_proj_trans_dequant,
                 ),
-                lambda: self.mqa.mla_rope_generation(fused_q,
-                                                     q_pe,
-                                                     latent_cache,
-                                                     attn_metadata,
-                                                     cu_q_seqlens,
-                                                     cu_kv_seqlens,
-                                                     fmha_scheduler_counter,
-                                                     mla_bmm1_scale,
-                                                     mla_bmm2_scale,
-                                                     quant_q_buffer,
-                                                     helix_position_offsets=
-                                                     helix_position_offsets,
-                                                     helix_is_inactive_rank=
-                                                     helix_is_inactive_rank),
+                lambda: self.mqa.mla_rope_generation(
+                    fused_q,
+                    q_pe,
+                    latent_cache,
+                    attn_metadata,
+                    cu_q_seqlens,
+                    cu_kv_seqlens,
+                    fmha_scheduler_counter,
+                    mla_bmm1_scale,
+                    mla_bmm2_scale,
+                    quant_q_buffer,
+                    helix_position_offsets=helix_position_offsets,
+                    helix_is_inactive_rank=helix_is_inactive_rank),
                 self.ln_events[0],
                 self.ln_events[1],
                 rope_stream,
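In both reformatted call sites the RoPE work is wrapped in a lambda so maybe_execute_in_parallel can launch it alongside the BMM, synchronized through the two ln_events and the rope_stream. The helper's exact signature is internal to TensorRT-LLM, so the sketch below only illustrates the general lambda-on-a-side-stream pattern with standard PyTorch stream and event APIs; run_in_parallel and the example tensors are hypothetical.

import torch

def run_in_parallel(fn_main, fn_side, side_stream, done_event):
    # Hedged sketch of the lambda-on-a-side-stream pattern; the real
    # maybe_execute_in_parallel helper's signature and event handling may differ.
    current = torch.cuda.current_stream()
    side_stream.wait_stream(current)      # side work starts after queued work
    with torch.cuda.stream(side_stream):
        fn_side()                         # e.g. the mla_rope_generation lambda
        done_event.record(side_stream)
    fn_main()                             # e.g. the bmm_out lambda
    current.wait_event(done_event)        # rejoin before consuming side results

if torch.cuda.is_available():
    stream = torch.cuda.Stream()
    event = torch.cuda.Event()
    a = torch.randn(64, 64, device="cuda")
    b = torch.randn(64, 64, device="cuda")
    run_in_parallel(lambda: a.matmul(a), lambda: b.mul_(2.0), stream, event)
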

tensorrt_llm/_torch/pyexecutor/executor_request_queue.py
Lines changed: 1 addition & 2 deletions

@@ -710,8 +710,7 @@ def _merge_requests(
         elif cp_type == CpType.HELIX:
             return self._merge_helix_requests(
                 new_requests,
-                tokens_per_block=32)
-                # tokens_per_block=cp_config['tokens_per_block'])
+                tokens_per_block=cp_config['tokens_per_block'])
         else:
             raise NotImplementedError(
                 f'Unsupported cp type {cp_type.name}.')
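This hunk replaces the hard-coded tokens_per_block=32 with the value carried in cp_config, which the serve.py change later in this commit populates from the KV-cache configuration when cp_type is HELIX. A hedged sketch of that flow; the KvCacheConfig stand-in and build_cp_config helper below are illustrative only.

from dataclasses import dataclass

@dataclass
class KvCacheConfig:
    # Stand-in for the real KV-cache config; only the field used here.
    tokens_per_block: int = 32

def build_cp_config(cp_type: str, kv_cache_config: KvCacheConfig) -> dict:
    # Mirrors the serve.py hunk below: forward the block size for HELIX so
    # the executor no longer needs a hard-coded value.
    cp_config = {"cp_type": cp_type}
    if cp_type == "HELIX":
        cp_config["tokens_per_block"] = kv_cache_config.tokens_per_block
    return cp_config

cp_config = build_cp_config("HELIX", KvCacheConfig(tokens_per_block=64))
assert cp_config["tokens_per_block"] == 64
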

tensorrt_llm/_torch/pyexecutor/model_engine.py
Lines changed: 5 additions & 2 deletions

@@ -9,7 +9,6 @@
 from abc import ABC, abstractmethod
 from contextlib import contextmanager
 from typing import Any, Callable, Dict, List, Optional, Tuple
-from .llm_request import LlmRequest

 import torch
 import torch._dynamo.config
@@ -566,6 +565,9 @@ def warmup(self, resource_manager: ResourceManager) -> None:
         cp_type = self.mapping.cp_config.get('cp_type', None)
         if cp_type is not None:
             if cp_type in [CpType.ULYSSES, CpType.STAR]:
+                logger.info(
+                    "[ModelEngine::warmup] Skipping warmup for cp_type: ",
+                    cp_type.name)
                 return

         self._run_torch_compile_warmup(resource_manager)
@@ -1620,7 +1622,8 @@ def _prepare_tp_inputs(
             request.cached_tokens = num_cached_tokens_per_seq[-1]
             prompt_lengths.append(request.py_prompt_len)
             if self.mapping.has_cp_helix():
-                helix_is_inactive_rank.append(request.py_helix_is_inactive_rank)
+                helix_is_inactive_rank.append(
+                    request.py_helix_is_inactive_rank)
             draft_lens.append(0)
             sequence_lengths.append(1)
             num_accepted_draft_tokens.append(0)

tensorrt_llm/commands/serve.py
Lines changed: 5 additions & 3 deletions

@@ -2,7 +2,6 @@
 import gc
 import json
 import os
-import gc
 import signal  # Added import
 import subprocess  # nosec B404
 import sys
@@ -131,6 +130,9 @@ def get_llm_args(
     except KeyError:
         raise ValueError(f"Invalid cp_type: {cp_config['cp_type']}. " \
                          f"Must be one of: {', '.join([t.name for t in CpType])}")
+    if cp_config["cp_type"] == CpType.HELIX:
+        cp_config['tokens_per_block'] = kv_cache_config.tokens_per_block
+
     llm_args = {
         "model": model,
         "scheduler_config": scheduler_config,
@@ -386,8 +388,8 @@ def convert(self, value: Any, param: Optional["click.Parameter"],
 def serve(
     model: str, tokenizer: Optional[str], host: str, port: int,
     log_level: str, backend: str, max_beam_width: int, max_batch_size: int,
-    max_num_tokens: int, max_seq_len: int, tp_size: int, pp_size: int, cp_size: int,
-    ep_size: Optional[int], cluster_size: Optional[int],
+    max_num_tokens: int, max_seq_len: int, tp_size: int, pp_size: int,
+    cp_size: int, ep_size: Optional[int], cluster_size: Optional[int],
     gpus_per_node: Optional[int], kv_cache_free_gpu_memory_fraction: float,
     num_postprocess_workers: int, trust_remote_code: bool,
     extra_llm_api_options: Optional[str], reasoning_parser: Optional[str],
