Comments

Andrew Briand · Andrew Briand · commit d9068b54c159 · 2025-12-01T08:57:49.000-08:00
diff --git a/vllm/model_executor/layers/quantization/modelopt.py b/vllm/model_executor/layers/quantization/modelopt.py
@@ -1501,10 +1501,13 @@ def apply(
             router_logits=router_logits,
         )
 
+        # EPLB path
         if (
             self.allow_flashinfer
             and self.flashinfer_moe_backend == FlashinferMoeBackend.TENSORRT_LLM
         ):
+            # Pack top-k ids (upper 16 bits) and bf16 expert weights (lower 16
+            # bits) into a single int32 tensor, as required by TRT-LLM
             packed_tensor = (topk_ids.to(torch.int32) << 16) | topk_weights.to(
                 torch.bfloat16
             ).view(torch.int16)