
Commit 00161b3

[https://nvbugs/5549111][fix] Fix 2-model overlap scheduler accuracy on very long prompts (#8076)
Signed-off-by: Mike Iovine <[email protected]>
Signed-off-by: Michael Iovine <[email protected]>
1 parent 083f363 commit 00161b3

File tree

2 files changed (+64, -0 lines)


tensorrt_llm/_torch/pyexecutor/py_executor.py

Lines changed: 11 additions & 0 deletions
@@ -2345,6 +2345,17 @@ def _handle_speculative_decoding(self, scheduled_batch, previous_tensors):
         else:
             self.has_previous_draft_tokens = False
             target_inputs, draft_outputs, draft_batch = None, None, None
+            # We are not running the draft model. Remove the draft tokens and turn off spec
+            # decode so that the requests get handled correctly.
+            # One corner case: when we have at least one context request, we have to keep spec
+            # dec on. This ensures that we capture hidden states for requests that haven't done
+            # prefill yet.
+            self.use_spec_decode = False
+            self.model_engine.enable_spec_decode = len(
+                scheduled_batch.context_requests) > 0
+            if not self.model_engine.enable_spec_decode:
+                for request in scheduled_batch.all_requests():
+                    request.py_draft_tokens = []
 
         return target_inputs, draft_outputs, draft_batch
 

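A minimal illustrative sketch of what this hunk does when the draft model is skipped, pulled out of the diff context. The function name and the executor/engine/batch parameters are hypothetical stand-ins for self, self.model_engine, and scheduled_batch in the real method.

def skip_speculation_for_batch(executor, engine, batch):
    # The draft model is not running this iteration, so the executor stops
    # treating it as a spec-decode step.
    executor.use_spec_decode = False
    # Corner case: keep the engine's spec-decode path enabled while at least
    # one context (prefill) request is present, so its hidden states are
    # still captured for requests that have not finished prefill.
    engine.enable_spec_decode = len(batch.context_requests) > 0
    if not engine.enable_spec_decode:
        # With spec decode fully off, clear any stale draft tokens so each
        # request is handled as a plain decode request.
        for request in batch.all_requests():
            request.py_draft_tokens = []
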
tests/unittest/_torch/speculative/test_eagle3.py

Lines changed: 53 additions & 0 deletions
@@ -155,6 +155,59 @@ def test_llama_eagle3(use_cuda_graph: bool, attn_backend: str,
     assert text_spec == text_ref
 
 
+@pytest.mark.parametrize("use_cuda_graph", [True, False])
+@pytest.mark.high_cuda_memory
+def test_llama_eagle3_long_prompt(use_cuda_graph):
+    # Eagle3 one model works with overlap scheduler and block reuse.
+    total_mem_gb = torch.cuda.get_device_properties(0).total_memory / 1e9
+    if total_mem_gb < 35:
+        pytest.skip("Not enough memory to load target + draft model")
+
+    models_path = llm_models_root()
+    eagle_model_dir = f"{models_path}/EAGLE3-LLaMA3.1-Instruct-8B"
+    target_model_dir = f"{models_path}/llama-3.1-model/Llama-3.1-8B-Instruct"
+
+    spec_config = EagleDecodingConfig(
+        max_draft_len=3,
+        speculative_model_dir=eagle_model_dir,
+        eagle3_one_model=False,
+    )
+
+    if use_cuda_graph:
+        cuda_graph_config = CudaGraphConfig(batch_sizes=[1])
+    else:
+        cuda_graph_config = None
+
+    llm_spec = LLM(model=target_model_dir,
+                   speculative_config=spec_config,
+                   max_batch_size=1,
+                   cuda_graph_config=cuda_graph_config,
+                   disable_overlap_scheduler=False)
+
+    prompt = [", ".join(str(i) for i in range(1000))]
+
+    sampling_params = SamplingParams(max_tokens=10, temperature=0)
+    results_spec = llm_spec.generate(prompt, sampling_params)
+
+    generated_text_spec = [result.outputs[0].text for result in results_spec]
+    llm_spec.shutdown()
+
+    llm_ref = LLM(model=target_model_dir,
+                  max_batch_size=1,
+                  cuda_graph_config=None,
+                  disable_overlap_scheduler=False)
+
+    results_ref = llm_ref.generate(prompt, sampling_params)
+
+    generated_text_ref = [result.outputs[0].text for result in results_ref]
+    llm_ref.shutdown()
+
+    # The LLM with speculation on should dynamically turn it off in this
+    # test since it goes beyond the max seqlen. Thus, the text should be
+    # _exactly_ the same, no need to use similarity scoring.
+    assert generated_text_spec[0] == generated_text_ref[0]
+
+
 def test_deepseek_eagle3():
     use_cuda_graph = True
     attn_backend = "TRTLLM"
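
Assuming a source checkout with the unit-test dependencies installed and the referenced checkpoints available under llm_models_root(), the new test can be run on its own with pytest, for example:

pytest tests/unittest/_torch/speculative/test_eagle3.py -k test_llama_eagle3_long_prompt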
