
Commit 6d9b911

TRTLLM-7731 KV cache transmission in disagg with CP on gen side
Signed-off-by: Balaram Buddharaju <[email protected]>

Squashed commit history (Signed-off-by trailers, where present, were by Matthias Jouanneaux <[email protected]> unless noted otherwise):

- add ds-lite tllm-gen based disagg test
- initial support for helix parallelism
- fixed mapping tests, added working MLA module test, added disagg test for helix (WIP)
- Helix MLA module test: added more scenarios, removed unnecessary code
- MLA Helix test: restricting number of tests, better output
- test MLA helix: remove OOM test scenario
- test MLA helix: fix scenario max position embeddings
- test Helix MLA: try to fix NaNs
- added all-to-all impl
- fix thop lib
- fix alltoall
- attention MLA: remove kv heads (unused), improve heads naming, fix tests
- test Helix MLA: minor fixes
- test Helix MLA: disable numeric test
- test Helix MLA: add TODOs to MLA module
- test Helix MLA: fix MLA module
- debugging (repeated across a run of consecutive commits)
- fully working MLA test
- attempt to make latent cache work
- debugging numerical issue (repeated across consecutive commits)
- adding additional test for further numerical debugging
- fixing tests & correction
- remove debug output from tests
- fix tests
- further debugging with multiple sequences (repeated across consecutive commits)
- fixed multiple sequences tests
- automated review comments
- debugging of latent cache (repeated across consecutive commits)
- further debugging of pe values
- further debugging of latent cache
- fixed latent cache, remove flaky test
- better reporting (repeated across consecutive commits)
- finalized test scenarios
- better perf measurements, added graph support
- added helix post process kernel
- added unit test, minor fix for helix kernel
- fixing helix kernels
- better tests, minor fixes (repeated across consecutive commits)
- debugging helix test (repeated across consecutive commits)
- fixed helix post process kernel: main kernel had perf issue/flaw
- fixed helix post process test
- added helix full layer test
- fix full layer helix test/bench
- added correct mapping to ds helix
- further improvements for fp8 init
- debugging quantization config
- better debug output
- fixes for fp8
- fix fp8 runs
- attempt to fix fp8 context
- fix context phase: just randomly gen kv cache values; fix scenario sizes
- fix tp size config in helix layer test
- minor changes for test
- get trtllm-serve working with BF16 for gen with cp (v_b_proj weight loading needs to be revisited):
  $ CUDA_VISIBLE_DEVICES=0,1 trtllm-serve /home/scratch.trt_llm_data/llm-models/DeepSeek-V3-Lite/bf16/ --host localhost --port 8002 --cp_size 2 --extra_llm_api_options ./gen_extra-llm-api-config.yaml
  end-to-end test in disagg works:
  $ pytest tests/integration/defs/disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_bf16_tllm_gen_helix -s -v
- Switch to contiguous block dist among CP rank
- save changes to _merge_requests()
- undo changes to prepare_inputs()
- Raise exception for blocks fewer than num_cp_ranks
- save intermediate changes
- attempt to fix attention tests
- save changes for minimal test
- save minor dev comments
- added helix inactive rank option to MLA kernels
- pass the right seq_lens_kv; test with seqlen 64 works:
  $ pytest tests/unittest/_torch/modules/test_mla_helix_expt.py -s -v
- is_inactive_helix at request level
- cp_allgather for position_id
- helix: make inactive rank a bool tensor
- undo mapping changes to modeling_deepseek
- Failed attempt to replace model_config.mapping
- fill in helix_is_inactive for each request
- update position_id logic
- better way to package mapping - repurpose comms creation too
- save disagg gen-only benchmark test
- prep for integration test
- improvements to position_id, num_cached_tokens_per_seq and tokens_per_block
- changes to save blocks at prefill
- changes to save blocks at decode
- add changes to read KV from disk
- updates to save and read KV blocks for all layers
- over-allocate at prefill to get cache transmission right
- prune saved KV cache files
- updates to avoid over-allocation on gen side in disagg
- Revert "over-allocate at prefill to get cache transmission right" (reverts commit af7d000)
- save disagg configs for DSV3 - currently goes OOM
- verifying tests on 8 GPUs
- helix: added (working) DS R1 8-GPU integration test
- helix: added large prompt + ds lite config using large prompt
- save intermediate changes for fixes
- fix debug printing
- Mention cache_transceiver_config.max_tokens_in_buffer for disagg servers
- save initial changes to benchmarking script
- added mjoux specific submit script, tighter timeouts, better defaults
- helix slurm: increase timeouts slightly, use deepgemm moe backend for smaller models
- helix slurm: add dataset caching path
- fix padding when input_len is divisible by tokens_per_block
- save changes to test varying prompt len
- fix_kvcache_split (Signed-off-by: Chuang Zhu <[email protected]>)
- avoid fabric memory and print send and recv sizes
- auto-determine transceiver size
- remove verbose print output
- attempt to fix DS R1 run
- helix slurm: fix parameters for DS R1 up to 256K tokens
- minor updates to reduce memory footprint and bring back warmup
- enable cudagraph and add some debug prints
- ugly hack to get results with 512k
- updates to benchmark 1M seqlen
- updates to benchmark 2M seqlen
- updates for passing down moe properly
- minor changes to get nsys profiles
- test helix layer: support for slurm call, support for fp4
- test helix layer: added sbatch script
- add minimal cache transmission test for 1M seqlen
- minor bug fix
- changes to benchmark 4M seqlen
- skip launch/wait of context servers when TRTLLM_DISAGG_BENCHMARK_GEN_ONLY=1
- remove hacks; skip profiling; gpu_mem_frac
- test helix layer: fix nvfp4 config to fit high perf mode
- helix single layer: improved timing, added arg parsing, added output parsing
- helix single layer: add dense option
- helix slurm: fix gen_only config, support EP config, add submit script for multiple configs, remove build_wheel by default for array benchmarking
- helix slurm: added parse script for results
- helix single layer: fixed test, added config submit script, improved parsing
- helix single layer: fix segment for sbatch script
- helix: fixed TP-only runs (removed hack to make higher seq len work), improved sbatch scripts
- helix: fix high node count runs, move back to e2e mode, improve parse script
- longer prompt for DSV3 Lite & DSR1 FP4 integration tests:
  disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_bf16_tllm_gen_helix
  disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_tllm_gen_helix
  disaggregated/test_disaggregated.py::test_disaggregated_deepseek_r1_fp4_tllm_gen_helix
- helix: added initial README for testing/benchmarking
- helix slurm: remove references to internal clusters
- minor updates to README
- minor updates
- helix: improve transpose/split for alltoall
- Revert "helix: improve transpose/split for alltoall" (reverts commit c8b24b9)
- helix: improve alltoall perf
- [https://nvbugs/5495789][feat] Optionally disable server GC and worker GC (#7995) (Signed-off-by: Tailing Yuan <[email protected]>)
- save changes for custom logging
- redo cherry-pick of attention.py
- save more changes for build and pipe-cleaning
- save more changes
- clean up - 1
- clean up - 2
- reuse mla_tensor_params instead of using helix_tensor_params
- undo all_tp_rank_num_tokens
- update test_disaggregated.py
- updates to dsv3RopeOp
- more cleanup
- save fp8 disagg test
- [https://nvbugs/5637012][fix] Fix helix unit tests (Signed-off-by: Balaram Buddharaju <[email protected]>)
- minor updates to attention.py
- updates to test - seqlen 64 works
- get integration test working
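The commit message above mentions filling in `helix_is_inactive` per request and switching to a contiguous KV block distribution among CP ranks. The sketch below illustrates one way such a flag could be derived on the generation side; it is not code from this commit, and all names in it are hypothetical.

```python
# Hypothetical sketch (not from this commit): with a contiguous split of a
# request's KV blocks across CP ranks, only the rank holding the tail of the
# sequence appends the newly generated token's KV and is "active" for it.
def helix_is_inactive(cp_rank: int, cp_size: int, num_blocks: int) -> bool:
    """True if `cp_rank` should NOT append KV for the next decoded token."""
    blocks_per_rank = -(-num_blocks // cp_size)  # ceil division
    owner_of_last_block = min((num_blocks - 1) // blocks_per_rank, cp_size - 1)
    return cp_rank != owner_of_last_block

# 10 blocks over cp_size=2: rank 0 owns blocks 0-4, rank 1 owns 5-9 (the tail),
# so rank 0 is inactive and rank 1 is active for the next token.
assert helix_is_inactive(cp_rank=0, cp_size=2, num_blocks=10)
assert not helix_is_inactive(cp_rank=1, cp_size=2, num_blocks=10)
```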
Parent: 268ea9b · Commit: 6d9b911

22 files changed: +383 additions, -60 deletions

cpp/tensorrt_llm/kernels/mlaKernels.cu

Lines changed: 5 additions & 6 deletions
@@ -351,7 +351,8 @@ __global__ void applyMLARopeAndAssignQKVKernelGeneration(T* qkv_output, T* q_pe,
     int* seqQOffset, uint32_t* fmha_tile_counter, int32_t const* kv_cache_lengths, int* seqKVOffsets, int q_pe_ld,
     int q_pe_stride, KvCacheDataType cache_type, float* bmm1_scale, float* bmm2_scale, float const* quant_scale_o,
     float const* quant_scale_q, float const* quant_scale_kv, float const* dequant_scale_q,
-    float const* dequant_scale_kv, float host_bmm1_scale, int32_t const* helix_position_offsets)
+    float const* dequant_scale_kv, float host_bmm1_scale, int32_t const* helix_position_offsets,
+    bool const* helix_is_inactive_rank)
 {

     // Constants.
@@ -424,7 +425,6 @@ __global__ void applyMLARopeAndAssignQKVKernelGeneration(T* qkv_output, T* q_pe,

     if (valid_token)
     {
-
         auto const position_id
             = (helix_position_offsets != nullptr ? helix_position_offsets[global_token_idx]
                                                  : kv_cache_lengths[batch_idx] - seq_len + local_token_idx);
@@ -460,10 +460,9 @@ __global__ void applyMLARopeAndAssignQKVKernelGeneration(T* qkv_output, T* q_pe,

     if (valid_token)
     {
-        if (head_idx == head_num)
+        if (head_idx == head_num && (helix_is_inactive_rank == nullptr || !helix_is_inactive_rank[batch_idx]))
         {
             auto const token_kv_idx = kv_cache_lengths[batch_idx] - seq_len + local_token_idx;
-
             {
                 auto kDst = reinterpret_cast<T*>(kv_cache.getKBlockPtr(batch_idx, token_kv_idx));
                 auto inBlockIdx = kv_cache.getKVLocalIdx(
@@ -514,7 +513,7 @@ __global__ void applyMLARopeAndAssignQKVKernelGeneration(T* qkv_output, T* q_pe,
     auto local_token_idx = global_token_idx % seq_len;
     bool valid_token = global_token_idx < total_s_len;

-    if (valid_token)
+    if (valid_token && (helix_is_inactive_rank == nullptr || !helix_is_inactive_rank[batch_idx]))
     {
         if (head_dim_vec_idx == 0)
         {
@@ -1047,7 +1046,7 @@ void invokeMLARopeGeneration(MlaParams<T>& params, KVCacheBuffer kv_cache_buffer
         params.seqQOffset, params.fmha_tile_counter, params.cache_seq_lens, params.cu_kv_seqlens, params.q_pe_ld,
         params.q_pe_stride, params.cache_type, params.bmm1_scale, params.bmm2_scale, params.quant_scale_o,
         params.quant_scale_q, params.quant_scale_kv, params.dequant_scale_q, params.dequant_scale_kv,
-        params.host_bmm1_scale, params.helix_position_offsets);
+        params.host_bmm1_scale, params.helix_position_offsets, params.helix_is_inactive_rank);
 }

 template <typename T, typename TCache>
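The kernel change above gates the KV-cache append (and the follow-on writes) on `helix_is_inactive_rank[batch_idx]`. The following is a minimal Python sketch of the assumed semantics, not the kernel's API: an inactive rank leaves its cache untouched and attends only to previously cached tokens.

```python
import torch

def maybe_append_kv(cached_k: torch.Tensor, cached_v: torch.Tensor,
                    new_k: torch.Tensor, new_v: torch.Tensor,
                    is_inactive_rank: bool):
    """Illustrative gating only: mirrors the nullptr / !flag check in the kernel."""
    if is_inactive_rank:
        # Inactive helix rank: do not append; attention sees cached tokens only.
        return cached_k, cached_v
    # Active rank (or helix disabled): append the query token's k/v as usual.
    return torch.cat([cached_k, new_k]), torch.cat([cached_v, new_v])
```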

cpp/tensorrt_llm/kernels/mlaKernels.h

Lines changed: 3 additions & 0 deletions
@@ -107,6 +107,9 @@ struct MlaParams

     // for Helix parallelism: the rotary position offsets [b]
     int32_t const* helix_position_offsets{nullptr};
+    // for Helix parallelism: whether the current rank is inactive, shape [b]
+    // (the current query tokens are not appended to this rank's KV cache)
+    bool const* helix_is_inactive_rank{nullptr};
 };

 template <typename T, typename KVCacheBuffer>

cpp/tensorrt_llm/thop/attentionOp.cpp

Lines changed: 7 additions & 2 deletions
@@ -181,8 +181,8 @@ class Runner : public RunnerBase
     [[maybe_unused]] MlaParams<T> mla_params;
     if (op.isMLAEnabled())
     {
-        TORCH_CHECK(mla_tensor_params.size() == 1,
-            "Expecting 1 tensor for custom MLA tensor params: helix_position_offsets.");
+        TORCH_CHECK(mla_tensor_params.size() == 2,
+            "Expecting 2 tensors for custom MLA tensor params: helix_position_offsets and helix_is_inactive_rank.");
         if (is_context && op.mUseSparseAttention)
         {
             if (latent_cache.has_value())
@@ -227,10 +227,15 @@ class Runner : public RunnerBase

             // For generation, helix position is in ropeOp
             auto& mla_helix_position_offsets = mla_tensor_params[0];
+            auto& mla_helix_is_inactive_rank = mla_tensor_params[1];
             if (mla_helix_position_offsets.has_value())
             {
                 mla_params.helix_position_offsets = mla_helix_position_offsets->data_ptr<int32_t>();
             }
+            if (mla_helix_is_inactive_rank.has_value())
+            {
+                mla_params.helix_is_inactive_rank = mla_helix_is_inactive_rank->data_ptr<bool>();
+            }
         }
         else
         {
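With this check, any caller of the attention op must now pass a two-entry `mla_tensor_params` list; either entry may be absent, in which case the corresponding pointer stays `nullptr`. A small illustrative sketch of the new calling contract (tensor shapes and values are placeholders):

```python
import torch

# Placeholder tensors for illustration only.
helix_position_offsets = torch.zeros(8, dtype=torch.int32)   # shape [num_tokens]
helix_is_inactive_rank = torch.zeros(2, dtype=torch.bool)    # shape [batch_size]

mla_tensor_params = [helix_position_offsets, helix_is_inactive_rank]  # new 2-entry contract
# mla_tensor_params = [helix_position_offsets]  # old 1-entry contract, now rejected by the TORCH_CHECK
```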

cpp/tensorrt_llm/thop/dsv3RopeOp.cpp

Lines changed: 7 additions & 2 deletions
@@ -66,6 +66,7 @@ struct MlaRopeGenArgs
     float const* kv_scale_quant_orig_ptr;
     float host_bmm1_scale;
     int32_t const* helix_position_offsets_ptr;
+    bool const* helix_is_inactive_rank_ptr;
 };

 template <typename T, typename KVCacheBuffer>
@@ -105,6 +106,7 @@ void invokeMLARopeGenerationHelper(T const* latent_cache_ptr, T* q_pe_ptr, T* fu
     mla_params.dequant_scale_kv = args.kv_scale_quant_orig_ptr;
     mla_params.host_bmm1_scale = args.host_bmm1_scale;
     mla_params.helix_position_offsets = args.helix_position_offsets_ptr;
+    mla_params.helix_is_inactive_rank = args.helix_is_inactive_rank_ptr;

     tk::invokeMLARopeGeneration<T>(mla_params, kv_cache_buffer, stream);
 }
@@ -134,7 +136,7 @@ void MLARopeGeneration(torch::Tensor fused_q, // [tokens, num_heads, (nope_dim +
         head_size == kv_lora_rank + qk_rope_head_dim, "head_size must = kv_lora_rank + qk_rope_head_dim");
     TLLM_CHECK_WITH_INFO(num_kv_heads == 1, "num_kv_heads must = 1");
     TORCH_CHECK(
-        mla_tensor_params.size() == 1, "Expecting 1 tensor for custom MLA tensor params: helix_position_offsets.");
+        mla_tensor_params.size() == 2, "Expecting 2 tensors for custom MLA tensor params: helix_position_offsets and helix_is_inactive_rank.");

     auto stream = at::cuda::getCurrentCUDAStream(fused_q.get_device());
     auto const kv_cache_quant_mode = tc::QuantMode(uint32_t(quant_mode));
@@ -153,6 +155,7 @@ void MLARopeGeneration(torch::Tensor fused_q, // [tokens, num_heads, (nope_dim +
     int32_t const num_gen_tokens = num_tokens;
     int32_t const seq_offset = num_contexts;
     auto& mla_helix_position_offsets = mla_tensor_params[0];
+    auto& mla_helix_is_inactive_rank = mla_tensor_params[1];
     int32_t const layer_num = host_kv_cache_pool_mapping.value().size(0);

     tk::MlaMetaParams mla_meta_params = {static_cast<int>(q_lora_rank), static_cast<int>(kv_lora_rank),
@@ -161,6 +164,8 @@ void MLARopeGeneration(torch::Tensor fused_q, // [tokens, num_heads, (nope_dim +

     int32_t const* helix_position_offsets_ptr
         = mla_helix_position_offsets.has_value() ? mla_helix_position_offsets->data_ptr<int32_t>() : nullptr;
+    bool const* helix_is_inactive_rank_ptr
+        = mla_helix_is_inactive_rank.has_value() ? mla_helix_is_inactive_rank->data_ptr<bool>() : nullptr;

     int* cu_q_seqlens_ptr = reinterpret_cast<int*>(cu_q_seqlens.data_ptr());
     int* cu_kv_seqlens_ptr = reinterpret_cast<int*>(cu_kv_seqlens.data_ptr());
@@ -274,7 +279,7 @@ void MLARopeGeneration(torch::Tensor fused_q, // [tokens, num_heads, (nope_dim +
         static_cast<int32_t>(num_heads), mla_meta_params, sequence_lengths_ptr, max_context_q_len,
         block_ids_per_seq_ptr, cache_type, cu_q_seqlens_ptr, cu_kv_seqlens_ptr, fmha_tile_counter_ptr,
         mla_bmm1_scale_ptr, mla_bmm2_scale_ptr, quant_q_buffer_ptr, quant_scale_o_ptr, kv_scale_orig_quant_ptr,
-        kv_scale_quant_orig_ptr, host_bmm1_scale, helix_position_offsets_ptr};
+        kv_scale_quant_orig_ptr, host_bmm1_scale, helix_position_offsets_ptr, helix_is_inactive_rank_ptr};

     auto const input_dtype = fused_q.scalar_type();
     if (input_dtype == torch::kFloat16)
Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
+["Global warming is the long term rise in Earth temperature caused by greenhouse gases from human activity, burning fossil fuels, and deforestation. It leads to melting ice, rising seas, and extreme weather that threaten ecosystems, wildlife, and people. Urgent global action is "]

examples/llm-api/quickstart_advanced.py

Lines changed: 2 additions & 0 deletions
@@ -70,6 +70,7 @@ def add_llm_args(parser):
                         choices=["auto", "TorchSampler", "TRTLLMSampler"])
     parser.add_argument('--tp_size', type=int, default=1)
     parser.add_argument('--pp_size', type=int, default=1)
+    parser.add_argument('--cp_size', type=int, default=1)
     parser.add_argument('--moe_ep_size', type=int, default=-1)
     parser.add_argument('--moe_tp_size', type=int, default=-1)
     parser.add_argument('--moe_cluster_size', type=int, default=-1)
@@ -259,6 +260,7 @@ def setup_llm(args, **kwargs):
         attention_dp_config=attention_dp_config,
         tensor_parallel_size=args.tp_size,
         pipeline_parallel_size=args.pp_size,
+        context_parallel_size=args.cp_size,
         moe_expert_parallel_size=args.moe_ep_size,
         moe_tensor_parallel_size=args.moe_tp_size,
         moe_cluster_parallel_size=args.moe_cluster_size,
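The new `--cp_size` flag is forwarded as `context_parallel_size` in `setup_llm` above. A hedged usage sketch against the public `tensorrt_llm.LLM` API; the checkpoint path is a placeholder and the exact constructor behavior for CP is an assumption based on this diff:

```python
from tensorrt_llm import LLM

# Roughly what quickstart_advanced.py does with "--tp_size 1 --cp_size 2":
llm = LLM(
    model="/path/to/DeepSeek-V3-Lite/bf16",  # placeholder checkpoint path
    tensor_parallel_size=1,
    pipeline_parallel_size=1,
    context_parallel_size=2,                 # new: CP on the generation side
)
```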

tensorrt_llm/_torch/attention_backend/trtllm.py

Lines changed: 48 additions & 7 deletions
@@ -187,7 +187,6 @@ def plan(
         q_pe: Optional[torch.Tensor] = None,
         mrope_config: Optional[dict] = None,
         softmax_stats_tensor: Optional[torch.Tensor] = None,
-        helix_position_offsets: Optional[torch.Tensor] = None,
         is_spec_decoding_enabled: bool = False,
         use_spec_decoding: bool = False,
         is_spec_dec_tree: bool = False,
@@ -205,6 +204,8 @@ def plan(
         sparse_attn_offsets: Optional[torch.Tensor] = None,
         sparse_attn_indices_block_size: int = 1,
         sparse_mla_topk: int = 0,
+        helix_position_offsets: Optional[torch.Tensor] = None,
+        helix_is_inactive_rank: Optional[torch.Tensor] = None,
         **kwargs,
     ):
         """
@@ -241,7 +242,6 @@ def plan(
             use_paged_context_fmha (bool): Sets the mPagedContextFMHA attribute in the op runner.
             mrope_config (dict): The dictionary containing the mRope configuration.
             softmax_stats_tensor (torch.Tensor): The tensor to store the softmax statistics (max/sum)
-            helix_position_offsets (torch.Tensor): The tensor to store the helix position offsets, with shape (num_tokens) on GPU.
             attention_sinks (torch.Tensor): The attention sinks (additional value in the denominator of the softmax) with shape of (num_heads_q) on GPU.
             chunked_prefill_buffer_batch_size (int): used for malloc buffer for k and v in fp8 context mla. the max input kv length is not max_num_tokens in this case. It is chunked_prefill_buffer_batch_size * max_num_tokens.
             sparse_kv_indices (torch.Tensor): The sparse indices for the KV cache, with shape of (num_heads_kv, num_sparse_tokens) on GPU.
@@ -250,6 +250,8 @@ def plan(
             sparse_attn_offsets (torch.Tensor): The batch offsets for the sparse attention indices, with shape of (num_generations + 1) on GPU.
             sparse_attn_indices_block_size (int): The granularity of the sparse attention indices, used by block sparse attention.
             sparse_mla_topk (int): The topk for the sparse MLA, used by DSA attention.
+            helix_position_offsets (torch.Tensor): The tensor to store the helix position offsets, with shape (num_tokens) on GPU.
+            helix_is_inactive_rank (torch.Tensor): For Helix: whether the current rank is inactive, with shape (batch_size) on GPU.
         """
         self.layer_idx = layer_idx
         self.tokens_per_block = tokens_per_block
@@ -285,14 +287,19 @@ def plan(
             'mrope_position_deltas') if mrope_config is not None else None
         self.block_ids_per_seq = block_ids_per_seq
         self.softmax_stats_tensor = softmax_stats_tensor
-        self.helix_position_offsets = helix_position_offsets
         self.attention_sinks = attention_sinks
         self.sparse_kv_indices = sparse_kv_indices
         self.sparse_kv_offsets = sparse_kv_offsets
         self.sparse_attn_indices = sparse_attn_indices
         self.sparse_attn_offsets = sparse_attn_offsets
         self.sparse_attn_indices_block_size = sparse_attn_indices_block_size
         self.sparse_mla_topk = sparse_mla_topk
+        self.helix_position_offsets = helix_position_offsets
+        self.helix_is_inactive_rank = helix_is_inactive_rank
+        if self.helix_is_inactive_rank is not None and not isinstance(self.helix_is_inactive_rank, torch.Tensor):
+            self.helix_is_inactive_rank = torch.tensor(
+                self.helix_is_inactive_rank, dtype=torch.bool, pin_memory=True)
+
         if max_sequence_length > self.rope_params.max_positions:
             self.rope_params.max_positions = max_sequence_length
             self.rotary_inv_freq, self.rotary_cos_sin = self.rope_params.create_rope_const_params(
@@ -471,7 +478,7 @@ def run(
             spec_decoding_tensor_params.append(self.spec_decoding_bl_tree_mask)
             spec_decoding_tensor_params.append(
                 self.spec_bl_tree_first_sparse_mask_offset_kv)
-        mla_tensor_params = [self.helix_position_offsets]
+        mla_tensor_params = [self.helix_position_offsets, self.helix_is_inactive_rank]

         thop.attention(
             q,
@@ -630,6 +637,13 @@ class TrtllmAttentionMetadata(AttentionMetadata):
     spec_decoding_bl_tree_mask: Optional[torch.Tensor] = None
     spec_bl_tree_first_sparse_mask_offset_kv: Optional[torch.Tensor] = None

+    # Whether the current rank is inactive for helix parallelism.
+    # In helix parallelism, only the active rank appends KV cache for the query token
+    # and attends to the previously cached tokens as well as the query token. Inactive
+    # ranks do not append KV cache for the query token and attend to the previously
+    # cached tokens only.
+    helix_is_inactive_rank: Optional[torch.Tensor] = None
+
     @property
     def max_seq_len(self) -> int:
         """
@@ -838,7 +852,23 @@ def prepare(self) -> None:
         if self.enable_flash_mla:
             self.prepare_flash_mla()
         # number of tokens needed in the kv cache for each sequence after the next pass
-        kv_lens = cached_token_lens + self.seq_lens_kv if cached_token_lens is not None else self.seq_lens_kv
+        if self.helix_is_inactive_rank is not None and len(
+                self.helix_is_inactive_rank):
+            # If helix is inactive, attend to the previously cached tokens only.
+            # This gets further complicated with multiple requests as each request might
+            # have a different active helix rank.
+            assert cached_token_lens is not None, "cached_token_lens should be set for helix"
+            kv_lens = cached_token_lens
+            helix_is_inactive_rank_cpu = torch.tensor(
+                self.helix_is_inactive_rank,
+                dtype=torch.bool,
+                device='cpu',
+            )
+            active_rank = ~helix_is_inactive_rank_cpu
+            kv_lens[active_rank] += self.seq_lens_kv[active_rank]
+        else:
+            kv_lens = cached_token_lens + self.seq_lens_kv if cached_token_lens is not None else self.seq_lens_kv

         # self.kv_lens is the valid kv cache length, while the self.kv_lens_cuda is
         # the sequence length including the cached tokens and the input tokens.
         self.kv_lens[:self.num_seqs].copy_(
@@ -1435,7 +1465,6 @@ def forward(
             q_pe=q_pe,
             mrope_config=mrope_config,
             softmax_stats_tensor=softmax_stats_tensor,
-            helix_position_offsets=helix_position_offsets,
             is_spec_decoding_enabled=metadata.is_spec_decoding_enabled,
             use_spec_decoding=metadata.use_spec_decoding,
             is_spec_dec_tree=metadata.is_spec_dec_tree,
@@ -1458,6 +1487,8 @@ def forward(
             sparse_attn_indices_block_size=sparse_attn_indices_block_size,
             sparse_mla_topk=metadata.sparse_mla_topk if hasattr(
                 metadata, 'sparse_mla_topk') else 0,
+            helix_position_offsets=helix_position_offsets,
+            helix_is_inactive_rank=metadata.helix_is_inactive_rank,
         )
         out_dtype = None
         if out_scale is not None:
@@ -1717,6 +1748,7 @@ def mla_rope_generation(
         mla_bmm2_scale: torch.Tensor,
         quant_q_buffer: torch.Tensor,
         helix_position_offsets: Optional[torch.Tensor] = None,
+        helix_is_inactive_rank: Optional[torch.Tensor] = None,
         out_scale: Optional[torch.Tensor] = None,
     ) -> None:
         """
@@ -1736,7 +1768,16 @@ def mla_rope_generation(
         assert self.is_mla_enable and self.mla_params is not None
         assert metadata.kv_cache_manager is not None
         sink_token_length = 0
-        mla_tensor_params = [helix_position_offsets]
+
+        # Ensure helix_is_inactive_rank is on the same device as other tensors
+        if helix_is_inactive_rank is not None:
+            if isinstance(helix_is_inactive_rank, list):
+                helix_is_inactive_rank = torch.tensor(
+                    helix_is_inactive_rank, dtype=torch.bool, device=helix_position_offsets.device)
+            elif helix_is_inactive_rank.device.type != 'cuda':
+                helix_is_inactive_rank = helix_is_inactive_rank.to(helix_position_offsets.device)
+
+        mla_tensor_params = [helix_position_offsets, helix_is_inactive_rank]

         torch.ops.trtllm.mla_rope_generation(
             fused_q,
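The `prepare()` change above computes per-request KV lengths differently under helix: inactive requests keep their cache-only length, while active requests also count the incoming query tokens. A self-contained numeric example of that logic (the values are made up):

```python
import torch

cached_token_lens = torch.tensor([96, 96])             # tokens already cached per request
seq_lens_kv = torch.tensor([1, 1])                      # new query tokens this step
helix_is_inactive_rank = torch.tensor([True, False])    # request 0: this rank is inactive

kv_lens = cached_token_lens.clone()
active = ~helix_is_inactive_rank
kv_lens[active] += seq_lens_kv[active]

print(kv_lens.tolist())  # [96, 97] -> only the active request grows its KV length
```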

tensorrt_llm/_torch/distributed/communicator.py

Lines changed: 23 additions & 1 deletion
@@ -4,6 +4,7 @@
 from functools import wraps
 from typing import Optional

+import copy
 import numpy as np
 import torch
 import torch.distributed as dist
@@ -341,9 +342,30 @@ class MPIDist(Distributed):

     def __init__(self, mapping: Mapping):
         super().__init__(mapping)
+        self.create_cp_comm()
+        # Repurpose CP ranks to TP for Helix so that the right comms are created.
+        mapping_with_helix = None
+        if self.mapping.cp_size > 1:
+            print(f"[MPIDist::__init__] Repurposing CP ranks to TP for Helix.")
+            mapping_with_helix = copy.deepcopy(self.mapping)
+            mapping_without_helix = Mapping(
+                world_size=self.mapping.world_size,
+                rank=self.mapping.rank,
+                gpus_per_node=self.mapping.gpus_per_node,
+                cp_size=1,
+                cp_config={},
+                tp_size=self.mapping.tp_size * self.mapping.cp_size,
+                pp_size=self.mapping.pp_size,
+                moe_ep_size=self.mapping.moe_ep_size,
+                enable_attention_dp=self.mapping.enable_attention_dp)
+            self.mapping = mapping_without_helix
         self.create_tp_comm()
         self.create_pp_comm()
-        self.create_cp_comm()
+
+        # Restore the original mapping.
+        if mapping_with_helix is not None:
+            print(f"[MPIDist::__init__] Restoring original mapping.")
+            self.mapping = mapping_with_helix

     def broadcast(self, obj, root=0, chunk_size: int = 4 * 1024 * 1024):
         comm = mpi_comm()
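The `MPIDist.__init__` change temporarily replaces the mapping so that CP ranks are folded into a widened TP group while the TP/PP communicators are created, then restores the original Helix mapping. The same pattern expressed as a context manager, as a sketch only; the `Mapping` import path and constructor arguments follow the diff above but are otherwise assumptions:

```python
from contextlib import contextmanager

from tensorrt_llm.mapping import Mapping  # assumed import path

@contextmanager
def cp_folded_into_tp(dist):
    """Temporarily present CP ranks as extra TP ranks while creating comms."""
    original = dist.mapping
    if original.cp_size > 1:
        dist.mapping = Mapping(
            world_size=original.world_size,
            rank=original.rank,
            gpus_per_node=original.gpus_per_node,
            cp_size=1,
            cp_config={},
            tp_size=original.tp_size * original.cp_size,  # CP ranks join the TP group
            pp_size=original.pp_size,
            moe_ep_size=original.moe_ep_size,
            enable_attention_dp=original.enable_attention_dp)
    try:
        yield dist.mapping
    finally:
        dist.mapping = original  # restore the Helix mapping

# Conceptual usage inside __init__:
#     with cp_folded_into_tp(self):
#         self.create_tp_comm()
#         self.create_pp_comm()
```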
