a

yizhang-nv · yizhang-nv · commit 05fff03d90c7 · 2025-10-28T14:09:00.000+08:00
Signed-off-by: yizhang-nv <187001205+yizhang-nv@users.noreply.github.com>
diff --git a/tensorrt_llm/_torch/compilation/piecewise_optimizer.py b/tensorrt_llm/_torch/compilation/piecewise_optimizer.py
@@ -172,7 +172,10 @@ def __call__(self, *args):
             return self.default_callable(*args)
 
         if self.is_first_runner or self.is_last_runner:
-            set_piecewise_running(self.is_first_runner)
+            if self.is_first_runner == self.is_last_runner:
+                set_piecewise_running(False)
+            else:
+                set_piecewise_running(self.is_first_runner)
 
         entry = self.entries[runtime_num_of_token]
 
diff --git a/tensorrt_llm/_torch/modules/attention.py b/tensorrt_llm/_torch/modules/attention.py
@@ -77,10 +77,14 @@ def extract_extra_attrs(layer_idx: str, attn_type: str):
 
 
 def maybe_compile(func):
-    if is_piecewise_running():
-        # When piecewise running, we don't need to compile the function to avoid host overhead in attention op.
-        return func
-    return torch.compile(func)
+
+    def wrapper(*args, **kwargs):
+        if is_piecewise_running():
+            # When piecewise running, we don't need to compile the function to avoid host overhead in attention op.
+            return func(*args, **kwargs)
+        return torch.compile(func)(*args, **kwargs)
+
+    return wrapper
 
 
 @maybe_compile