
Commit 655723b

fix ci
Signed-off-by: Yue Weng <[email protected]>
1 parent e0aed01 commit 655723b

File tree

7 files changed: +91 / -48 lines


tensorrt_llm/_torch/attention_backend/interface.py

Lines changed: 6 additions & 2 deletions

@@ -11,6 +11,8 @@
 
 if TYPE_CHECKING:
     from ..speculative.utils import SpecDecodingTensor
+    from ..speculative.interface import SpecMetadata
+    from ..speculative.spec_tree_manager import SpecTreeManager
 
 from tensorrt_llm.functional import (PositionEmbeddingType, RopeEmbeddingUtils,
                                      RotaryScalingType)
@@ -337,10 +339,12 @@ def update_spec_dec_param(
         self,
         batch_size,
         is_spec_decoding_enabled,
-        spec_metadata,
-        spec_tree_manager,
+        is_spec_dec_tree,
+        is_spec_dec_dynamic_tree,
         max_draft_len,
         max_total_draft_tokens,
+        spec_metadata: Optional['SpecMetadata'] = None,
+        spec_tree_manager: Optional['SpecTreeManager'] = None,
         spec_decoding_tensor: Optional['SpecDecodingTensor'] = None):
         """
         Hook to be called when using TRTLLM attention backend in spec-dec mode.
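Under the reworked signature, the tree-shape booleans are passed explicitly and the heavier objects (spec_metadata, spec_tree_manager, spec_decoding_tensor) become optional keywords. A minimal sketch of a linear-tree call against this hook; only the parameter names come from the diff above, the values are placeholders:

    # Hypothetical linear-tree invocation of the reworked hook.
    attn_metadata.update_spec_dec_param(
        batch_size=8,
        is_spec_decoding_enabled=True,
        is_spec_dec_tree=False,
        is_spec_dec_dynamic_tree=False,
        max_draft_len=3,
        max_total_draft_tokens=3,
    )  # spec_metadata / spec_tree_manager / spec_decoding_tensor default to None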

tensorrt_llm/_torch/attention_backend/trtllm.py

Lines changed: 47 additions & 30 deletions

@@ -8,6 +8,8 @@
 
 if TYPE_CHECKING:
     from ..speculative.utils import SpecDecodingTensor
+    from ..speculative.interface import SpecMetadata
+    from ..speculative.spec_tree_manager import SpecTreeManager
 
 from tensorrt_llm._utils import get_sm_version
 from tensorrt_llm.bindings.internal import thop
@@ -1057,25 +1059,26 @@ def update_spec_dec_param(
         self,
         batch_size,
         is_spec_decoding_enabled,
-        spec_metadata,
-        spec_tree_manager,
+        is_spec_dec_tree,
+        is_spec_dec_dynamic_tree,
         max_draft_len,
         max_total_draft_tokens,
+        spec_metadata: Optional['SpecMetadata'] = None,
+        spec_tree_manager: Optional['SpecTreeManager'] = None,
         spec_decoding_tensor: Optional['SpecDecodingTensor'] = None,
     ):
         if spec_decoding_tensor is not None:
-            spec_decoding_tensor.position_offsets
-            spec_decoding_tensor.packed_mask
-            spec_decoding_tensor.generation_lengths
+            spec_decoding_position_offsets = spec_decoding_tensor.position_offsets
+            spec_decoding_packed_mask = spec_decoding_tensor.packed_mask
+            spec_decoding_generation_lengths = spec_decoding_tensor.generation_lengths
         else:
-            pass
+            spec_decoding_position_offsets = None
+            spec_decoding_packed_mask = None
+            spec_decoding_generation_lengths = None
         # spec_dec mode should only be enabled for pre-Blackwell machines and when there's a spec-dec tree.
         self.is_spec_decoding_enabled = is_spec_decoding_enabled and get_sm_version(
         ) < 100
 
-        self.is_spec_dec_tree = False if spec_tree_manager is None else True
-        self.is_spec_dec_dynamic_tree = False if spec_tree_manager is None else spec_tree_manager.use_dynamic_tree
-
         if get_sm_version() >= 100:
             if self.is_spec_dec_tree or self.is_spec_dec_dynamic_tree:
                 assert not self.is_spec_dec_tree, "Spec-dec tree is not supported on this machine. Please use a pre-Blackwell machine for a spec-dec tree."
@@ -1084,6 +1087,9 @@ def update_spec_dec_param(
         # use_spec_decoding is default to true by default, change in runtime by layers / requests
         self.use_spec_decoding = self.is_spec_decoding_enabled
 
+        self.is_spec_dec_tree = is_spec_dec_tree
+        self.is_spec_dec_dynamic_tree = is_spec_dec_dynamic_tree
+
         # Parameters can be fixed and not changed during runtime if the
         if self.is_spec_decoding_enabled:
             # These buffers are accessed more like removing input padding,
@@ -1109,28 +1115,40 @@ def update_spec_dec_param(
                 device='cuda',
             )
 
-            # Prepare the spec-dec mask, position offset and generation length for static tree of dynamic tree.
-            # We only prepare the spec-dec mask, position offset and generation length for the target model here.
-            # For the drafter model, we will prepare them in the drafting loops.
-            is_target_model = not spec_metadata.is_draft_model
-            is_using_tree = self.is_spec_dec_tree or self.is_spec_dec_dynamic_tree
-            if is_target_model and is_using_tree:
+            is_target_model = not spec_metadata.is_draft_model if hasattr(
+                spec_metadata, 'is_draft_model') else False
+
+            if self.is_spec_dec_tree and self.is_spec_dec_dynamic_tree:
+                # dynamic tree
+                assert spec_decoding_position_offsets is not None, "spec_decoding_position_offsets is required for dynamic tree"
+                assert spec_decoding_packed_mask is not None, "spec_decoding_packed_mask is required for dynamic tree"
+                self.spec_decoding_position_offsets.copy_(
+                    spec_decoding_position_offsets, non_blocking=True)
+                self.spec_decoding_packed_mask.copy_(spec_decoding_packed_mask,
+                                                     non_blocking=True)
+                if spec_decoding_generation_lengths is not None:
+                    self.spec_decoding_generation_lengths.copy_(
+                        spec_decoding_generation_lengths, non_blocking=True)
+                else:
+                    self.generate_spec_decoding_generation_length(
+                        batch_size=batch_size,
+                        max_draft_len=max_total_draft_tokens)
+            elif self.is_spec_dec_tree and not self.is_spec_dec_dynamic_tree and spec_metadata is not None and is_target_model:
+                # static tree and target model
+                # Prepare the spec-dec mask, position offset and generation length for static tree.
+                # We only prepare the spec-dec mask, position offset and generation length for the target model here.
+                # For the drafter model, we will prepare them in the drafting loops.
+
                 assert spec_metadata.spec_dec_mode.is_eagle3(
                 ), "Tree decoding is only supported for Eagle3 now"
-                # If is the dynamic tree
-                if self.is_spec_dec_dynamic_tree:
-                    # TODO: add dynamic tree logic
-                    assert False, "Dynamic tree is not supported yet"
-                # If is the static tree
-                else:
-                    self.spec_decoding_position_offsets[:batch_size, :].copy_(
-                        spec_tree_manager.spec_dec_position_offsets[0, :],
-                        non_blocking=True)
-                    self.spec_decoding_packed_mask[:batch_size, :, :].copy_(
-                        spec_tree_manager.spec_dec_packed_mask[0, :, :],
-                        non_blocking=True)
-                    self.spec_decoding_generation_lengths[:batch_size].fill_(
-                        spec_tree_manager.max_total_draft_tokens + 1)
+                self.spec_decoding_position_offsets[:batch_size, :].copy_(
+                    spec_tree_manager.spec_dec_position_offsets[0, :],
+                    non_blocking=True)
+                self.spec_decoding_packed_mask[:batch_size, :, :].copy_(
+                    spec_tree_manager.spec_dec_packed_mask[0, :, :],
+                    non_blocking=True)
+                self.spec_decoding_generation_lengths[:batch_size].fill_(
+                    spec_tree_manager.max_total_draft_tokens + 1)
             else:
                 # Prepare for the linear-tree.
                 # Populate the mask that won't change during inference phase.
@@ -1147,7 +1165,6 @@ def generate_spec_decoding_position_offsets(self, batch_size,
             dtype=torch.int,
             device='cpu',
             pin_memory=True).repeat(batch_size)
-        #
         # fill all the batches with same position offset
         self.spec_decoding_position_offsets.reshape(-1)[:(max_draft_len + 1) *
                                                         batch_size].copy_(
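The rewritten body now dispatches three ways: a dynamic tree takes its position offsets and packed mask from spec_decoding_tensor (falling back to generate_spec_decoding_generation_length when lengths are absent), a static tree on the target model copies them from spec_tree_manager, and everything else falls through to the linear-tree path. As background on the packed-mask buffer being filled here, a self-contained sketch of how a linear chain's causal mask packs into int32 words; this assumes the common bit-packing layout (bit j of word j // 32 marks token j as visible) and is not code from this commit:

    import numpy as np
    import torch

    def linear_tree_packed_mask(num_tokens: int) -> torch.Tensor:
        # Row i lets draft token i attend to tokens 0..i.
        num_words = (num_tokens + 31) // 32
        mask = np.zeros((num_tokens, num_words), dtype=np.uint32)
        for i in range(num_tokens):
            for j in range(i + 1):
                mask[i, j // 32] |= np.uint32(1) << np.uint32(j % 32)
        # Reinterpret the bits as int32 to match the kernel-facing dtype.
        return torch.from_numpy(mask.view(np.int32))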

tensorrt_llm/_torch/pyexecutor/model_engine.py

Lines changed: 12 additions & 4 deletions

@@ -1342,7 +1342,9 @@ def _prepare_tp_inputs(
         if spec_config is not None:
             spec_resource_manager = resource_manager.get_resource_manager(
                 ResourceManagerType.SPEC_RESOURCE_MANAGER)
-            spec_tree_manager = spec_resource_manager.spec_tree_manager
+            if spec_resource_manager is not None and hasattr(
+                    spec_resource_manager, 'spec_tree_manager'):
+                spec_tree_manager = spec_resource_manager.spec_tree_manager
 
         # will contain previous batch indices of generation requests
         previous_batch_indices = []
@@ -2331,9 +2333,15 @@ def forward(
                 spec_resource_manager, self.is_draft_model, self.attn_backend,
                 self.model_is_wrapped, spec_metadata.is_spec_dec_tree)
             attn_metadata.update_spec_dec_param(
-                scheduled_requests.batch_size, is_spec_dec_mode, spec_metadata,
-                spec_tree_manager, self.original_max_draft_len,
-                self.original_max_total_draft_tokens, spec_decoding_tensor)
+                batch_size=scheduled_requests.batch_size,
+                is_spec_decoding_enabled=is_spec_dec_mode,
+                is_spec_dec_tree=spec_metadata.is_spec_dec_tree,
+                is_spec_dec_dynamic_tree=spec_metadata.is_spec_dec_dynamic_tree,
+                max_draft_len=spec_metadata.max_draft_len,
+                max_total_draft_tokens=spec_metadata.max_total_draft_tokens,
+                spec_metadata=spec_metadata,
+                spec_tree_manager=spec_tree_manager,
+                spec_decoding_tensor=spec_decoding_tensor)
         else:
             spec_resource_manager = None
             spec_metadata = None
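Switching this call site to keyword arguments is what makes the parameter reorder safe: the old positional call would have silently fed spec_metadata into the new is_spec_dec_tree slot. The hasattr() guard in _prepare_tp_inputs could also be written more compactly; a sketch of an equivalent, relying on getattr() returning its default both when the manager is None and when the attribute is missing:

    # Equivalent one-line guard (a sketch, not the committed code):
    spec_tree_manager = getattr(spec_resource_manager, 'spec_tree_manager', None)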

tensorrt_llm/_torch/speculative/eagle3.py

Lines changed: 10 additions & 8 deletions

@@ -48,10 +48,10 @@ def __init__(self, config: "EagleDecodingConfig", dtype: torch.dtype,
         self.max_total_draft_tokens = self.max_draft_len
 
         # empty hidden states tensor
-        max_num_tokens = min(
-            max_num_tokens, max_num_requests *
-            (self.max_total_draft_tokens + 1)) + (self.max_total_draft_tokens +
-                                                  1)
+        max_num_tokens = min(max_num_tokens, max_num_requests *
+                             self.max_seq_len) + (self.max_total_draft_tokens +
+                                                  1) * max_num_requests
+
         self.hidden_states = torch.empty(
             (max_num_tokens, self.hidden_size * config.num_capture_layers),
             dtype=self.dtype,
@@ -165,6 +165,7 @@ def __post_init__(self):
 
     def prepare(self):
         is_first_draft = self.eagle3_resource_manager.is_first_draft
+        spec_tree_manager = self.eagle3_resource_manager.spec_tree_manager
         # Update start indices
         # Here, we assume the sequence lengths (seq_lens) during the draft model
         # forward will not exceed those of the target model. So pre-allocate
@@ -186,18 +187,19 @@ def prepare(self):
         for req_id, seq_len in zip(self.request_ids, self.seq_lens):
             slot_id = self.eagle3_resource_manager.slot_manager.get_slot(req_id)
             start_idx = self.eagle3_resource_manager.start_indices[slot_id]
-            # 1) target model
+            # 1) target model or (is_first_draft and is_linear_tree)
             # If this is the first draft or the target model forward, we need to
             # read/write all of the hidden states
-            if not self.is_draft_model:
+            if not self.is_draft_model or (is_first_draft
+                                           and spec_tree_manager is None):
                 hidden_states_read_indices.extend(
                     list(range(start_idx, start_idx + seq_len)))
                 hidden_states_write_indices.extend(
                     list(range(start_idx, start_idx + seq_len)))
-            # 2) is_first_draft
+            # 2) is_first_draft and draft_token_tree
             # After target model forward, some draft tokens will be accepted.
             # These draft tokens' hidden states will be used for draft model's first drafter layer.
-            elif is_first_draft:
+            elif is_first_draft and spec_tree_manager is not None:
                 assert req_id in self.request_accepted_path.keys(
                 ), f"Request {req_id} not found in request_accepted_path"
                 accepted_path = self.request_accepted_path[req_id]
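The resizing of the hidden-states buffer in __init__ is easiest to see with numbers. A worked sketch under assumed, purely illustrative values:

    # Illustrative values only; none of these come from the commit.
    max_num_tokens, max_num_requests = 8192, 16
    max_seq_len, max_total_draft_tokens = 1024, 3

    # Old sizing: capped at requests * (draft tokens + 1), one extra group.
    old = min(max_num_tokens,
              max_num_requests * (max_total_draft_tokens + 1)) \
          + (max_total_draft_tokens + 1)                          # 64 + 4 = 68
    # New sizing: capped at requests * max_seq_len, plus per-request draft slots.
    new = min(max_num_tokens, max_num_requests * max_seq_len) \
          + (max_total_draft_tokens + 1) * max_num_requests       # 8192 + 64 = 8256

The buffer is now sized for full target-model sequences rather than only for the draft tokens, which the earlier formula appears to have undercounted.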

tensorrt_llm/_torch/speculative/ngram.py

Lines changed: 2 additions & 0 deletions

@@ -171,6 +171,8 @@ def __init__(
         assert ngram_pool_manager is not None, "NGram needs a resource manager to maintain the pool."
         self.spec_config = spec_config
         self.max_draft_len = spec_config.max_draft_len
+        self.max_total_draft_tokens = spec_config.max_total_draft_tokens
+        assert self.max_draft_len == self.max_total_draft_tokens, "NGram only supports linear tree."
         self.spec_resource_manager = ngram_pool_manager
 
     def prepare_draft_tokens(
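The two added lines make the linear-tree restriction fail fast at construction time instead of misbehaving later. A hypothetical config that would now trip the assertion (the tree shape is invented for illustration):

    # max_draft_len is the longest root-to-leaf path; max_total_draft_tokens
    # counts every node in the draft tree.
    spec_config.max_draft_len = 3
    spec_config.max_total_draft_tokens = 7   # e.g. a depth-3 binary tree: 1 + 2 + 4
    # Constructing the NGram drafter with this config now raises:
    # AssertionError: NGram only supports linear tree.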

tensorrt_llm/llmapi/llm_args.py

Lines changed: 4 additions & 0 deletions

@@ -520,6 +520,10 @@ def spec_dec_mode(self):
         return TorchSpeculativeDecodingMode.from_string(
             self.decoding_type.upper())
 
+    @functools.cached_property
+    def is_linear_tree(self) -> bool:
+        return self.max_draft_len == self.max_total_draft_tokens
+
 
 class KvCacheConnectorConfig(StrictBaseModel):
     """

tests/unittest/_torch/modeling/test_modeling_llama.py

Lines changed: 10 additions & 4 deletions

@@ -516,7 +516,7 @@ def run_forward(input_ids, position_ids, attn_metadata):
     use_spec_decoding = True
     is_spec_dec_tree = True
     is_spec_dec_dynamic_tree = True
-    max_draft_tokens = gen_input_ids_0.size(-1) - 1
+    max_total_draft_tokens = gen_input_ids_0.size(-1) - 1
 
     attn_metadata_gen_phase_0 = metadata_cls(
         seq_lens=torch.tensor([gen_input_ids_0.size(-1)], dtype=torch.int),
@@ -540,10 +540,12 @@ def run_forward(input_ids, position_ids, attn_metadata):
         packed_mask=spec_decoding_packed_mask)
 
     attn_metadata_gen_phase_0.update_spec_dec_param(
+        batch_size=batch_size,
         is_spec_decoding_enabled=is_spec_decoding_enabled,
         is_spec_dec_dynamic_tree=is_spec_dec_dynamic_tree,
         is_spec_dec_tree=is_spec_dec_tree,
-        max_draft_tokens=max_draft_tokens,
+        max_draft_len=max_total_draft_tokens,
+        max_total_draft_tokens=max_total_draft_tokens,
         spec_decoding_tensor=spec_decoding_tensor,
     )
@@ -586,10 +588,12 @@ def run_forward(input_ids, position_ids, attn_metadata):
         [gen_input_ids_1.size(-1)], dtype=torch.int)
     attn_metadata_gen_phase_0.kv_cache_params.num_cached_tokens_per_seq = num_cached_tokens_per_seq_1
     attn_metadata_gen_phase_0.update_spec_dec_param(
+        batch_size=batch_size,
         is_spec_decoding_enabled=is_spec_decoding_enabled,
         is_spec_dec_tree=is_spec_dec_tree,
         is_spec_dec_dynamic_tree=False,
-        max_draft_tokens=gen_input_ids_1.size(-1) - 1)
+        max_draft_len=gen_input_ids_1.size(-1) - 1,
+        max_total_draft_tokens=gen_input_ids_1.size(-1) - 1)
 
     gen_position_ids_1 = [
         torch.full(
@@ -630,10 +634,12 @@ def run_forward(input_ids, position_ids, attn_metadata):
         is_spec_dec_tree=is_spec_dec_tree,
         is_spec_dec_dynamic_tree=False)
     attn_metadata_ref.update_spec_dec_param(
+        batch_size=batch_size,
         is_spec_decoding_enabled=is_spec_decoding_enabled,
         is_spec_dec_tree=is_spec_dec_tree,
         is_spec_dec_dynamic_tree=False,
-        max_draft_tokens=gen_input_ids_ref.size(-1) - 1,
+        max_draft_len=gen_input_ids_ref.size(-1) - 1,
+        max_total_draft_tokens=gen_input_ids_ref.size(-1) - 1,
     )
 
     gen_position_ids_ref = [
