Update MNNVL comm: lazily create default MpiComm, reorder workspace args, tidy custom-comm test

wenscarl · wenscarl · commit 28b4dd490123 · 2025-11-14T21:00:51.000Z
diff --git a/flashinfer/comm/mnnvl.py b/flashinfer/comm/mnnvl.py
@@ -547,6 +547,7 @@ def supports_mnnvl() -> bool:
 
 class McastDeviceMemory:
     """Python port of McastDeviceMemory from TensorRT-LLM"""
+
     def __init__(
         self,
         buf_size: int,
@@ -753,7 +754,7 @@ def get_world_size(self) -> int:
         """Get the total number of devices in the group"""
         return self.group_size
 
-    def _alloc_mn_mcast_mem(self, buf_size: int, comm: Any=MpiComm()):
+    def _alloc_mn_mcast_mem(self, buf_size: int, comm: Any = None):
         """Allocate multi-node multicast memory using MNNVL"""
 
         # Verify CUDA context
@@ -766,7 +767,8 @@ def _alloc_mn_mcast_mem(self, buf_size: int, comm: Any=MpiComm()):
                 )
         except Exception as e:
             print(f"Error checking CUDA context: {e}")
-
+        if comm is None:
+            comm = MpiComm()
         # Set up allocation properties
         handle_type = cuda.CUmemAllocationHandleType.CU_MEM_HANDLE_TYPE_FABRIC
 
diff --git a/flashinfer/comm/trtllm_mnnvl_ar.py b/flashinfer/comm/trtllm_mnnvl_ar.py
@@ -15,7 +15,7 @@
 
 from ..jit import gen_trtllm_mnnvl_comm_module
 from ..utils import register_custom_op
-from .mnnvl import (McastGPUBuffer, CommBackend)
+from .mnnvl import McastGPUBuffer, CommBackend
 
 
 def mpi_barrier():
@@ -122,9 +122,10 @@ def trtllm_mnnvl_rmsnorm(
 
 
 def get_allreduce_mnnvl_workspace(
-    mapping: Mapping, dtype: torch.dtype,
-    buffer_size_in_bytes: Optional[int] = None,
+    mapping: Mapping,
+    dtype: torch.dtype,
     comm: Optional[CommBackend] = None,
+    buffer_size_in_bytes: Optional[int] = None,
 ) -> Tuple[McastGPUBuffer, torch.Tensor, int]:
     """Get workspace buffers needed for multi-node NVLink all-reduce operation.
 
@@ -140,8 +141,8 @@ def get_allreduce_mnnvl_workspace(
     Args:
         mapping: Tensor parallel mapping configuration containing rank info
         dtype: Data type of the tensors being reduced
-        buffer_size_in_bytes: Optional buffer size. Practically, assign this to 3 * 2 * dtype.itemsize * hidden_dim * max_tokens
         comm: Optional communication backend for multi-node synchronization
+        buffer_size_in_bytes: Optional buffer size. Practically, assign this to 3 * 2 * dtype.itemsize * hidden_dim * max_tokens
 
     Returns:
         Tuple containing:
diff --git a/tests/comm/test_trtllm_mnnvl_allreduce_custom_comm.py b/tests/comm/test_trtllm_mnnvl_allreduce_custom_comm.py
@@ -6,19 +6,16 @@
 import pytest
 import torch
 import torch.distributed as dist
-from mpi4py import MPI  # Added MPI import
 
 import flashinfer.comm.trtllm_mnnvl_ar as trtllm_mnnvl_ar
 from flashinfer.comm.mapping import Mapping
-
-# Use flashinfer.norm.rmsnorm as reference implementation.
-from flashinfer.norm import rmsnorm
 from flashinfer.comm.mnnvl import CommBackend as CommBackend
 
 import pynvml
 
 pynvml.nvmlInit()
 
+
 class CustomCommunicator(CommBackend):
     def __init__(self, group):
         self._group = group
@@ -59,7 +56,7 @@ def bcast(self, data, root: int = 0):
         # broadcast_object_list mutates obj_list in-place
         dist.broadcast_object_list(obj_list, src=root, group=self._group)
         return obj_list[0]
-    
+
     def barrier(self):
         """
         Synchronize all ranks in this communicator.
@@ -69,6 +66,7 @@ def barrier(self):
     def Split(self, color: int, key: int) -> "CustomCommunicator":
         return self
 
+
 def get_open_port() -> int:
     try:
         with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
@@ -78,7 +76,8 @@ def get_open_port() -> int:
         with socket.socket(socket.AF_INET6, socket.SOCK_STREAM) as s:
             s.bind(("::1", 0))
             return s.getsockname()[1]
-      
+
+
 def multi_process_parallel(
     world_size: int, dtype: torch.dtype, test_target: Any, target_args: tuple = ()
 ) -> None:
@@ -98,6 +97,7 @@ def multi_process_parallel(
             f"Process {i} failed with exit code {procs[i].exitcode}"
         )
 
+
 @torch.inference_mode()
 def row_linear_residual_norm_forward(
     x: torch.Tensor,
@@ -182,6 +182,7 @@ def func(
         atol=0.15,
     )
 
+
 def _run_mnnvl_ar(world_size, rank, dtype, distributed_init_port, seq_len, hidden_size):
     # Set CUDA device based on rank
     device = torch.device(f"cuda:{rank}")
@@ -223,8 +224,11 @@ def _run_mnnvl_ar(world_size, rank, dtype, distributed_init_port, seq_len, hidde
         # Get workspace buffers using MPI rank - allocate once per seq_lens list and reuse within the list
         # This workspace is sized for the maximum expected sequence length and can be reused within each list
         # Each parameterized list gets its own fresh workspace allocation
+        explicit_workspace_bytes = 3 * 2 * dtype.itemsize * hidden_size * seq_len
         mcast_buffer_mnnvl, buffer_flags_mnnvl, max_num_elements_mnnvl = (
-            trtllm_mnnvl_ar.get_allreduce_mnnvl_workspace(mapping, dtype, comm)
+            trtllm_mnnvl_ar.get_allreduce_mnnvl_workspace(
+                mapping, dtype, comm, explicit_workspace_bytes
+            )
         )
 
         multicast_ptr = mcast_buffer_mnnvl.get_multicast_ptr()
@@ -282,16 +286,16 @@ def _run_mnnvl_ar(world_size, rank, dtype, distributed_init_port, seq_len, hidde
         # Synchronize before next test
         comm.barrier()
 
-        print(
-            f"PASSED[rank={rank}]: seq_len={seq_len}, dtype={dtype}"
-        )
+        print(f"PASSED[rank={rank}]: seq_len={seq_len}, dtype={dtype}")
 
     except Exception as e:
         rank_failed = True
-        failure_message = f"FAILED[rank={rank}]: seq_lens={seq_len}, dtype={dtype} failed: {e}"
+        failure_message = (
+            f"FAILED[rank={rank}]: seq_lens={seq_len}, dtype={dtype} failed: {e}"
+        )
         print(failure_message)
         # Gather failure status from all ranks
-        all_failures = MPI.COMM_WORLD.allgather(rank_failed)
+        all_failures = comm.allgather(rank_failed)
 
         # If any rank failed, fail the test
         if any(all_failures):
@@ -302,7 +306,7 @@ def _run_mnnvl_ar(world_size, rank, dtype, distributed_init_port, seq_len, hidde
             # Fail the test on all ranks
             pytest.fail(f"Test failed on ranks {failed_ranks}")
             comm.barrier()
-  
+
     finally:
         # Ensure cleanup happens for this list's workspace
         if "mcast_buffer_mnnvl" in locals():
@@ -311,10 +315,14 @@ def _run_mnnvl_ar(world_size, rank, dtype, distributed_init_port, seq_len, hidde
     # Final synchronization and check for failures across all ranks
     comm.barrier()
 
+
 """Main test function that runs on each MPI rank"""
+
+
 @pytest.mark.parametrize("world_size", [2, 4])
 def test_mnnvl_allreduce_custom_communicator(
-    monkeypatch, world_size,
+    monkeypatch,
+    world_size,
 ):
     monkeypatch.setenv("TRTLLM_FORCE_MNNVL_AR", "1")  # force multi-node allreduce.
     seq_len = 24