
Commit e0aed01

fix XQA issue
Signed-off-by: Yue Weng <[email protected]>
1 parent 7146339 commit e0aed01

File tree

9 files changed: +106 -91 lines changed


cpp/tensorrt_llm/thop/attentionOp.cpp

Lines changed: 3 additions & 1 deletion
@@ -24,6 +24,7 @@
 #include "tensorrt_llm/runtime/utils/debugUtils.h"
 #include "tensorrt_llm/thop/attentionOp.h"
 #include "tensorrt_llm/thop/thUtils.h"
+#include <assert.h>
 #include <cstdint>
 #include <functional>
 #include <torch/extension.h>
@@ -466,7 +467,8 @@ class Runner : public RunnerBase
             = spec_decoding_tensor_params[1].value().data_ptr<int32_t>();
         enqueue_params.spec_decoding_packed_mask = spec_decoding_tensor_params[2].value().data_ptr<int32_t>();
         enqueue_params.spec_decoding_is_generation_length_variable = true;
-        enqueue_params.spec_decoding_max_generation_length = input_seq_length + 1;
+        assert(spec_decoding_tensor_params[1].value().dim() == 2); // [batch_size, max_draft_len + 1]
+        enqueue_params.spec_decoding_max_generation_length = spec_decoding_tensor_params[1].value().sizes()[1];
     }

     // Current mlaGeneration will using fmha to do attention, so we don't go into enqueueGeneration
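
Note: the replaced line previously derived the kernel's max generation length from input_seq_length + 1; it is now taken from the second dimension of the speculative-decoding tensor at index 1, which is laid out as [batch_size, max_draft_len + 1]. A minimal Python-side sketch of that relationship, with a made-up helper name, purely for illustration:

import torch

# Hypothetical illustration: the tensor handed to the op has shape
# [batch_size, max_draft_len + 1], so its second dimension already encodes the
# maximum number of tokens queried per request in one generation step.
def max_generation_length(position_offsets: torch.Tensor) -> int:
    assert position_offsets.dim() == 2  # [batch_size, max_draft_len + 1]
    return position_offsets.size(1)     # mirrors sizes()[1] in the C++ change

batch_size, max_draft_len = 4, 2
position_offsets = torch.arange(max_draft_len + 1,
                                dtype=torch.int32).repeat(batch_size, 1)
print(max_generation_length(position_offsets))  # 3, i.e. max_draft_len + 1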

tensorrt_llm/_torch/attention_backend/interface.py

Lines changed: 1 addition & 0 deletions
@@ -335,6 +335,7 @@ def restore_from_spec_dec(self) -> None:

     def update_spec_dec_param(
         self,
+        batch_size,
         is_spec_decoding_enabled,
         spec_metadata,
         spec_tree_manager,

tensorrt_llm/_torch/attention_backend/trtllm.py

Lines changed: 38 additions & 29 deletions
@@ -1055,14 +1055,14 @@ def prepare_context_mla_with_cached_kv(self,

     def update_spec_dec_param(
         self,
+        batch_size,
         is_spec_decoding_enabled,
         spec_metadata,
         spec_tree_manager,
         max_draft_len,
         max_total_draft_tokens,
         spec_decoding_tensor: Optional['SpecDecodingTensor'] = None,
     ):
-
         if spec_decoding_tensor is not None:
             spec_decoding_tensor.position_offsets
             spec_decoding_tensor.packed_mask
@@ -1086,6 +1086,8 @@ def update_spec_dec_param(

         # Parameters can be fixed and not changed during runtime if the
         if self.is_spec_decoding_enabled:
+            # These buffers are accessed more like removing input padding,
+            # rather than using max_total_draft_tokens + 1 as the offset between different requests.
             self.spec_decoding_position_offsets = torch.empty(
                 [self.max_num_requests, max_total_draft_tokens + 1],
                 dtype=torch.int,
@@ -1121,47 +1123,54 @@ def update_spec_dec_param(
                     assert False, "Dynamic tree is not supported yet"
                 # If is the static tree
                 else:
-                    self.spec_decoding_position_offsets[
-                        :,
-                    ].copy_(spec_tree_manager.spec_dec_position_offsets[0, :],
-                            non_blocking=True)
-                    self.spec_decoding_packed_mask[:, :, :].copy_(
+                    self.spec_decoding_position_offsets[:batch_size, :].copy_(
+                        spec_tree_manager.spec_dec_position_offsets[0, :],
+                        non_blocking=True)
+                    self.spec_decoding_packed_mask[:batch_size, :, :].copy_(
                         spec_tree_manager.spec_dec_packed_mask[0, :, :],
                         non_blocking=True)
-                    self.spec_decoding_generation_lengths[:].fill_(
+                    self.spec_decoding_generation_lengths[:batch_size].fill_(
                         spec_tree_manager.max_total_draft_tokens + 1)
             else:
                 # Prepare for the linear-tree.
                 # Populate the mask that won't change during inference phase.
                 self.generate_spec_decoding_position_offsets(
-                    max_total_draft_tokens=max_total_draft_tokens)
+                    batch_size=batch_size, max_draft_len=max_draft_len)
                 self.generate_spec_decoding_packed_mask(
-                    max_total_draft_tokens=max_total_draft_tokens)
+                    batch_size=batch_size, max_draft_len=max_draft_len)
                 self.generate_spec_decoding_generation_length(
-                    max_total_draft_tokens=max_total_draft_tokens)
+                    batch_size=batch_size, max_draft_len=max_draft_len)

-    def generate_spec_decoding_position_offsets(self, max_total_draft_tokens):
-        position_offset = torch.arange(max_total_draft_tokens + 1,
+    def generate_spec_decoding_position_offsets(self, batch_size,
+                                                max_draft_len):
+        position_offset = torch.arange(max_draft_len + 1,
                                        dtype=torch.int,
                                        device='cpu',
-                                       pin_memory=True)
-
+                                       pin_memory=True).repeat(batch_size)
+        #
        # fill all the batches with same position offset
-        self.spec_decoding_position_offsets.copy_(position_offset,
-                                                  non_blocking=True)
-
-    def generate_spec_decoding_packed_mask(self, max_total_draft_tokens):
-        # TODO: fix this limitation
-        assert max_total_draft_tokens < 32, "max_total_draft_tokens should be less than 32, will be fixed later"
-        dummy_idx = torch.arange(max_total_draft_tokens + 1)
-        spec_decoding_packed_mask = torch.pow(2, dummy_idx + 1) - 1
-        self.spec_decoding_packed_mask[:, :, 0].copy_(spec_decoding_packed_mask,
-                                                      non_blocking=True)
-
-    def generate_spec_decoding_generation_length(self, max_total_draft_tokens):
-        spec_decoding_generation_length = torch.full((self.max_num_requests, ),
-                                                     max_total_draft_tokens + 1)
-        self.spec_decoding_generation_lengths[:self.max_num_requests].copy_(
+        self.spec_decoding_position_offsets.reshape(-1)[:(max_draft_len + 1) *
+                                                        batch_size].copy_(
+                                                            position_offset,
+                                                            non_blocking=True)
+
+    def generate_spec_decoding_packed_mask(self, batch_size, max_draft_len):
+        # FIXME: remove this limitation
+        assert max_draft_len < 32, "max_draft_len should be less than 32, will be fixed later"
+        dummy_idx = torch.arange(max_draft_len + 1)
+        spec_decoding_packed_mask = torch.pow(
+            2, dummy_idx + 1) - 1  # [max_draft_len + 1]
+        spec_decoding_packed_mask = spec_decoding_packed_mask.repeat(
+            batch_size)  # [batch_size * (max_draft_len + 1)]
+        self.spec_decoding_packed_mask.reshape(
+            -1)[:(max_draft_len + 1) * batch_size].copy_(
+                spec_decoding_packed_mask, non_blocking=True)
+
+    def generate_spec_decoding_generation_length(self, batch_size,
+                                                 max_draft_len):
+        spec_decoding_generation_length = torch.full((batch_size, ),
+                                                     max_draft_len + 1)
+        self.spec_decoding_generation_lengths[:batch_size].copy_(
             spec_decoding_generation_length, non_blocking=True)

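For the linear-tree path, the rewritten generate_* helpers above fill only the first batch_size * (max_draft_len + 1) entries of the flattened, max_num_requests-sized buffers (the padding-removed layout mentioned in the added comment), and the packed mask for draft position i is the causal bitmask 2^(i+1) - 1. A small standalone sketch of that layout with made-up sizes; the names are illustrative stand-ins, not the attention metadata class itself:

import torch

max_num_requests, max_draft_len, batch_size = 8, 2, 3
gen_len = max_draft_len + 1  # tokens queried per request in one step

# Preallocated like self.spec_decoding_* above: sized by max_num_requests.
position_offsets = torch.zeros(max_num_requests, gen_len, dtype=torch.int32)
packed_mask = torch.zeros(max_num_requests, gen_len, 1, dtype=torch.int32)

# Per-token values for one request: offsets 0..max_draft_len and causal
# bitmasks 2^(i+1) - 1, i.e. token i attends to tokens 0..i.
offsets_one_req = torch.arange(gen_len, dtype=torch.int32)
mask_one_req = torch.pow(2, offsets_one_req + 1) - 1  # [1, 3, 7]

# Padding-removed write: only the first batch_size * gen_len flattened slots
# are populated; requests are packed back to back, not strided by the maximum.
position_offsets.reshape(-1)[:batch_size * gen_len] = offsets_one_req.repeat(batch_size)
packed_mask.reshape(-1)[:batch_size * gen_len] = mask_one_req.repeat(batch_size)

print(position_offsets.reshape(-1)[:batch_size * gen_len])  # 0 1 2 0 1 2 0 1 2
print(packed_mask.reshape(-1)[:batch_size * gen_len])       # 1 3 7 1 3 7 1 3 7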

tensorrt_llm/_torch/pyexecutor/model_engine.py

Lines changed: 2 additions & 2 deletions
@@ -2331,8 +2331,8 @@ def forward(
                 spec_resource_manager, self.is_draft_model, self.attn_backend,
                 self.model_is_wrapped, spec_metadata.is_spec_dec_tree)
             attn_metadata.update_spec_dec_param(
-                is_spec_dec_mode, spec_metadata, spec_tree_manager,
-                self.original_max_draft_len,
+                scheduled_requests.batch_size, is_spec_dec_mode, spec_metadata,
+                spec_tree_manager, self.original_max_draft_len,
                 self.original_max_total_draft_tokens, spec_decoding_tensor)
         else:
             spec_resource_manager = None

tensorrt_llm/_torch/pyexecutor/resource_manager.py

Lines changed: 1 addition & 1 deletion
@@ -581,7 +581,7 @@ def update_kv_cache_draft_token_location(self,
         requests = scheduled_batch.all_requests()
         accepted_draft_token_offsets, packed_accepted_draft_tokens_indices, rewind_draft_token_separate_adjustments = self.locate_accepted_draft_tokens(
             requests)
-        past_key_value_lengths = attn_metadata.kv_lens_cuda
+        past_key_value_lengths = attn_metadata.kv_lens_cuda[:len(requests)]
         if attn_metadata.kv_cache_block_offsets is not None and attn_metadata.host_kv_cache_block_offsets is not None and attn_metadata.host_kv_cache_pool_pointers is not None and attn_metadata.host_kv_cache_pool_mapping is not None:
             use_paged_kv_cache = True
         else:
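
The slice above matters because the attention metadata buffers are presumably sized for the maximum batch, while update_kv_cache_draft_token_location only handles the currently scheduled requests. A toy illustration with hypothetical sizes and names:

import torch

max_num_requests = 8
kv_lens_cuda = torch.zeros(max_num_requests, dtype=torch.int32)
kv_lens_cuda[:3] = torch.tensor([17, 42, 9], dtype=torch.int32)  # three live requests

requests = ["req0", "req1", "req2"]             # stand-in for scheduled_batch.all_requests()
past_key_value_lengths = kv_lens_cuda[:len(requests)]
print(past_key_value_lengths.tolist())          # [17, 42, 9], unused slots excluded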

tensorrt_llm/_torch/speculative/drafting_loops.py

Lines changed: 15 additions & 13 deletions
@@ -192,6 +192,9 @@ def prepare_for_generation_with_tree_decoding(
                        batch_size] -= prev_layer_gen_len_per_req  # reset to original length before the drafter loop.
    attn_metadata.kv_lens_cuda[:batch_size] += next_layer_gen_len_per_req

+    # FIXME, update without D2H
+    # attn_metadata.kv_lens[:batch_size] = attn_metadata.kv_lens_cuda[:batch_size].cpu()
+
    ## 3.2) _seq_lens, _seq_lens_cuda
    attn_metadata._seq_lens[:batch_size].fill_(next_layer_gen_len_per_req)
    attn_metadata._seq_lens_cuda[:batch_size].fill_(next_layer_gen_len_per_req)
@@ -207,23 +210,22 @@ def prepare_for_generation_with_tree_decoding(
    attn_metadata.use_spec_decoding = True

    ## 3.6) spec_decoding_position_offsets
-    attn_metadata.spec_decoding_position_offsets[:, :
-                                                 next_layer_gen_len_per_req] = spec_tree_manager.spec_dec_position_offsets_for_drafter_model[
-                                                     prepare_for_layer_idx -
-                                                     1].unsqueeze(0)
-    attn_metadata.spec_decoding_position_offsets[:,
-                                                 next_layer_gen_len_per_req:] = 0
+    attn_metadata.spec_decoding_position_offsets.reshape(
+        -1
+    )[:batch_size *
+      next_layer_gen_len_per_req] = spec_tree_manager.spec_dec_position_offsets_for_drafter_model[
+          prepare_for_layer_idx - 1].repeat(batch_size)

    ## 3.7) spec_decoding_packed_mask
-    attn_metadata.spec_decoding_packed_mask[:, :
-                                            next_layer_gen_len_per_req, :] = spec_tree_manager.spec_dec_packed_mask_for_drafter_model[
-                                                prepare_for_layer_idx -
-                                                1].unsqueeze(0)
-    attn_metadata.spec_decoding_packed_mask[:,
-                                            next_layer_gen_len_per_req:, :] = 0
+    attn_metadata.spec_decoding_packed_mask.reshape(
+        -1, attn_metadata.spec_decoding_packed_mask.size(-1)
+    )[:batch_size *
+      next_layer_gen_len_per_req, :] = spec_tree_manager.spec_dec_packed_mask_for_drafter_model[
+          prepare_for_layer_idx - 1].repeat(batch_size, 1)

    ## 3.8) spec_decoding_generation_lengths
-    attn_metadata.spec_decoding_generation_lengths[:] = next_layer_gen_len_per_req
+    attn_metadata.spec_decoding_generation_lengths[:
+                                                   batch_size] = next_layer_gen_len_per_req

    # 4) spec_metadata
    ## 4.1) num_tokens
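
Steps 3.6-3.8 now write the drafter-layer offsets and masks for all batch_size requests contiguously into the flattened metadata buffers instead of broadcasting along the padded request dimension and zeroing the tail. A toy sketch of that write pattern; the [0, 0, 1, 1, 1] / [1, 2, 5, 9, 18] values mirror the expectations in the unit test at the end of this commit, and the buffer names are stand-ins:

import torch

max_num_requests, max_total_draft_tokens, batch_size = 4, 12, 2
gen_len = 5  # next_layer_gen_len_per_req for this drafter layer

# Buffers shaped like attn_metadata.spec_decoding_* in the diff.
position_offsets = torch.zeros(max_num_requests, max_total_draft_tokens + 1, dtype=torch.int32)
packed_mask = torch.zeros(max_num_requests, max_total_draft_tokens + 1, 1, dtype=torch.int32)

# Hypothetical per-layer values from the spec tree manager for one request.
layer_offsets = torch.tensor([0, 0, 1, 1, 1], dtype=torch.int32)          # [gen_len]
layer_mask = torch.tensor([[1], [2], [5], [9], [18]], dtype=torch.int32)  # [gen_len, 1]

# Flattened, padding-removed writes: only the first batch_size * gen_len slots/rows.
position_offsets.reshape(-1)[:batch_size * gen_len] = layer_offsets.repeat(batch_size)
packed_mask.reshape(-1, packed_mask.size(-1))[:batch_size * gen_len, :] = layer_mask.repeat(batch_size, 1)

print(position_offsets.reshape(-1)[:batch_size * gen_len].tolist())
# [0, 0, 1, 1, 1, 0, 0, 1, 1, 1]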

tensorrt_llm/_torch/speculative/interface.py

Lines changed: 20 additions & 8 deletions
@@ -130,20 +130,32 @@ def attention_need_spec_dec_mode(
         spec_resource_manager: BaseResourceManager,
         is_draft_model: bool,
         attention_backend: Type[AttentionBackend],
-        use_chain_drafter: bool,
+        use_chain_drafter: bool,  # CDL
         is_spec_dec_tree: bool,
     ):
         """
         If true, the attention backend kernel needs to run in spec-dec mode (multi-token query mode).
+        Args:
+            spec_resource_manager: the resource manager for the spec-dec mode.
+            is_draft_model: whether the model is a draft model.
+            attention_backend: the attention backend.
+            use_chain_drafter: whether to use capturable drafting loops (CDL). For the target model, it is always False.
+            is_spec_dec_tree: whether the spec-dec mode is a tree, i.e., static tree or dynamic tree.
         """
         is_trtllm_attention = issubclass(attention_backend, TrtllmAttention)
-        return (self.is_eagle3_one_model()  # one model
-                or (self.is_eagle3() and spec_resource_manager.is_first_draft
-                    and is_trtllm_attention and use_chain_drafter
-                    and is_draft_model)  # two model + first drafter + CDL
-                or (self.is_eagle3() and is_trtllm_attention
-                    and is_spec_dec_tree)  # two model + tree
-                )
+        # Case 1: one model
+        use_case_1 = self.is_eagle3_one_model()
+        # Case 2: eagle3 two model + draft model + CDL + is_first_draft + TRTLLM attention
+        use_case_2 = self.is_eagle3(
+        ) and spec_resource_manager.is_first_draft and use_chain_drafter and is_draft_model and is_trtllm_attention
+        # Case 3: eagle3 two model + tree decoding + draft model + CDL + TRTLLM attention
+        use_case_3 = self.is_eagle3(
+        ) and is_spec_dec_tree and is_draft_model and use_chain_drafter and is_trtllm_attention
+        # Case 4: eagle3 two model + tree decoding + target model + TRTLLM attention
+        use_case_4 = self.is_eagle3(
+        ) and is_spec_dec_tree and not is_draft_model and is_trtllm_attention
+
+        return use_case_1 or use_case_2 or use_case_3 or use_case_4

     @staticmethod
     def from_string(name: Optional[str]) -> "SpeculativeDecodingMode":
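
A self-contained distillation of the four cases above, with the spec-dec mode, backend, and resource-manager state reduced to plain booleans; this only illustrates the predicate's truth conditions, not the real method:

def need_spec_dec_mode(is_eagle3_one_model: bool, is_eagle3: bool,
                       is_first_draft: bool, use_chain_drafter: bool,
                       is_draft_model: bool, is_spec_dec_tree: bool,
                       is_trtllm_attention: bool) -> bool:
    case_1 = is_eagle3_one_model                                      # one model
    case_2 = (is_eagle3 and is_first_draft and use_chain_drafter      # two model, first drafter, CDL
              and is_draft_model and is_trtllm_attention)
    case_3 = (is_eagle3 and is_spec_dec_tree and is_draft_model       # two model, tree, drafter, CDL
              and use_chain_drafter and is_trtllm_attention)
    case_4 = (is_eagle3 and is_spec_dec_tree and not is_draft_model   # two model, tree, target
              and is_trtllm_attention)
    return case_1 or case_2 or case_3 or case_4

# Example: target model with tree decoding and TRTLLM attention -> True (case 4).
print(need_spec_dec_mode(False, True, False, False, False, True, True))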

tensorrt_llm/_torch/speculative/spec_tree_manager.py

Lines changed: 1 addition & 1 deletion
@@ -122,7 +122,7 @@ def __init__(self, max_num_requests: int, use_dynamic_tree: bool,
         else:
             self.init_tree_info_for_static_tree()

-        self.dump_tree_info()
+        # self.dump_tree_info()

     def init_tree_info_for_dynamic_tree(self):
         # For the dynamic tree

tests/unittest/_torch/speculative/test_draft_token_prepare_for_generation.py

Lines changed: 25 additions & 36 deletions
@@ -188,12 +188,22 @@ def run_test(max_batch_size, prepare_for_layer_idx, max_total_draft_tokens,
    assert torch.all(
        torch.tensor(attn_metadata.num_contexts) == torch.tensor(
            ref_attn_metadata['num_contexts']))
-    assert torch.all(attn_metadata.spec_decoding_position_offsets ==
-                     ref_attn_metadata['spec_decoding_position_offsets'])
-    assert torch.all(attn_metadata.spec_decoding_packed_mask ==
-                     ref_attn_metadata['spec_decoding_packed_mask'])
    assert torch.all(attn_metadata.spec_decoding_generation_lengths ==
                     ref_attn_metadata['spec_decoding_generation_lengths'])
+    total_process_tokens = attn_metadata.spec_decoding_generation_lengths.sum(
+    )
+    print(f"total_process_tokens: {total_process_tokens}")
+    assert torch.all(
+        attn_metadata.spec_decoding_position_offsets.reshape(
+            -1)[:total_process_tokens] ==
+        ref_attn_metadata['spec_decoding_position_offsets']
+        [:total_process_tokens])
+    assert torch.all(
+        attn_metadata.spec_decoding_packed_mask.reshape(
+            -1, attn_metadata.spec_decoding_packed_mask.size(
+                -1))[:total_process_tokens, :] ==
+        ref_attn_metadata['spec_decoding_packed_mask']
+        [:total_process_tokens, :])

    assert torch.all(
        torch.tensor(spec_metadata.num_tokens) == torch.tensor(
@@ -267,13 +277,9 @@ def run_test(max_batch_size, prepare_for_layer_idx, max_total_draft_tokens,
        device='cuda')
    ref_attn_metadata['num_contexts'] = 0
    ref_attn_metadata['spec_decoding_position_offsets'] = torch.tensor(
-        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
-        dtype=torch.int32,
-        device='cuda')
+        [0, 0, 0], dtype=torch.int32, device='cuda')
    ref_attn_metadata['spec_decoding_packed_mask'] = torch.tensor(
-        [1, 2, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
-        dtype=torch.int32,
-        device='cuda').reshape(1, max_total_draft_tokens + 1, 1)
+        [1, 2, 4], dtype=torch.int32, device='cuda').unsqueeze(1)
    ref_attn_metadata['spec_decoding_generation_lengths'] = torch.tensor(
        [3], dtype=torch.int32, device='cuda')

@@ -361,14 +367,9 @@ def run_test(max_batch_size, prepare_for_layer_idx, max_total_draft_tokens,
        device='cuda')
    ref_attn_metadata['num_contexts'] = 0
    ref_attn_metadata['spec_decoding_position_offsets'] = torch.tensor(
-        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
-        dtype=torch.int32,
-        device='cuda').repeat(max_batch_size, 1)
+        [0, 0, 0, 0, 0, 0], dtype=torch.int32, device='cuda')
    ref_attn_metadata['spec_decoding_packed_mask'] = torch.tensor(
-        [1, 2, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
-        dtype=torch.int32,
-        device='cuda').reshape(1, max_total_draft_tokens + 1,
-                               1).repeat(max_batch_size, 1, 1)
+        [1, 2, 4, 1, 2, 4], dtype=torch.int32, device='cuda').unsqueeze(1)
    ref_attn_metadata['spec_decoding_generation_lengths'] = torch.tensor(
        [3, 3], dtype=torch.int32, device='cuda')

@@ -455,14 +456,9 @@ def run_test(max_batch_size, prepare_for_layer_idx, max_total_draft_tokens,
        device='cuda')
    ref_attn_metadata['num_contexts'] = 0
    ref_attn_metadata['spec_decoding_position_offsets'] = torch.tensor(
-        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
-        dtype=torch.int32,
-        device='cuda').repeat(max_batch_size, 1)
+        [0, 0, 0, 0, 0, 0], dtype=torch.int32, device='cuda')
    ref_attn_metadata['spec_decoding_packed_mask'] = torch.tensor(
-        [1, 2, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
-        dtype=torch.int32,
-        device='cuda').reshape(1, max_total_draft_tokens + 1,
-                               1).repeat(max_batch_size, 1, 1)
+        [1, 2, 4, 1, 2, 4], dtype=torch.int32, device='cuda').unsqueeze(1)
    ref_attn_metadata['spec_decoding_generation_lengths'] = torch.tensor(
        [3, 3], dtype=torch.int32, device='cuda')

@@ -545,13 +541,9 @@ def run_test(max_batch_size, prepare_for_layer_idx, max_total_draft_tokens,
        device='cuda')
    ref_attn_metadata['num_contexts'] = 0
    ref_attn_metadata['spec_decoding_position_offsets'] = torch.tensor(
-        [0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0],
-        dtype=torch.int32,
-        device='cuda')
+        [0, 0, 1, 1, 1], dtype=torch.int32, device='cuda')
    ref_attn_metadata['spec_decoding_packed_mask'] = torch.tensor(
-        [1, 2, 5, 9, 18, 0, 0, 0, 0, 0, 0, 0, 0],
-        dtype=torch.int32,
-        device='cuda').reshape(1, max_total_draft_tokens + 1, 1)
+        [1, 2, 5, 9, 18], dtype=torch.int32, device='cuda').unsqueeze(1)
    ref_attn_metadata['spec_decoding_generation_lengths'] = torch.tensor(
        [5], dtype=torch.int32, device='cuda')

@@ -637,13 +629,10 @@ def run_test(max_batch_size, prepare_for_layer_idx, max_total_draft_tokens,
        device='cuda')
    ref_attn_metadata['num_contexts'] = 0
    ref_attn_metadata['spec_decoding_position_offsets'] = torch.tensor(
-        [0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0],
-        dtype=torch.int32,
-        device='cuda')
+        [0, 0, 1, 1, 1, 0, 0, 1, 1, 1], dtype=torch.int32, device='cuda')
    ref_attn_metadata['spec_decoding_packed_mask'] = torch.tensor(
-        [1, 2, 5, 9, 18, 0, 0, 0, 0, 0, 0, 0, 0],
-        dtype=torch.int32,
-        device='cuda').reshape(1, max_total_draft_tokens + 1, 1)
+        [1, 2, 5, 9, 18, 1, 2, 5, 9, 18], dtype=torch.int32,
+        device='cuda').unsqueeze(1)
    ref_attn_metadata['spec_decoding_generation_lengths'] = torch.tensor(
        [5, 5], dtype=torch.int32, device='cuda')
