Commit a6e46c7

barrier working
1 parent 6584aac commit a6e46c7

2 files changed: +83 −153 lines changed

torchtitan/distributed/expert_parallel.py

Lines changed: 53 additions & 125 deletions
@@ -25,101 +25,47 @@
 
 class HookCoordinator:
     def __init__(self):
-        # Only need 2 semaphores - one for forward, one for backward
-        self._forward_semaphore = threading.Semaphore(0)  # Forward waits
-        self._backward_semaphore = threading.Semaphore(1)  # Backward starts first
-
-        # Semaphore mapping
-        self._semaphores = {
-            'forward': self._forward_semaphore,
-            'backward': self._backward_semaphore
-        }
-
-        # Cross-signaling mapping (forward signals backward, backward signals forward)
-        self._signal_targets = {
-            'forward': self._backward_semaphore,
-            'backward': self._forward_semaphore
-        }
+        # Barrier for 2 threads (forward and backward) to synchronize.
+        # This ensures we always alternate, executing one compute and one comm op together.
+        self._execution_barrier = threading.Barrier(2)
 
         self._coordination_enabled = False
         self._cycle_count = 0
+        self._num_layers = None
 
-    def _acquire_execution(self, direction: str, timeout: float = 20.0):
-        """Generic acquire method for both forward and backward"""
-        if not self._coordination_enabled:
-            return
-
-        direction_upper = direction.upper()
-        print(f"[{direction_upper}] Attempting acquire {direction} execution")
-        self._semaphores[direction].acquire(timeout=timeout)
-        print(f"[{direction_upper}] Acquired {direction} execution")
-
-    def _release_execution(self, direction: str, async_tensor: Optional[torch.Tensor] = None):
-        """Generic release method for both forward and backward"""
-        if not self._coordination_enabled:
+    def barrier(self):
+        """Barrier for the 2 threads to synchronize."""
+        if not self.is_coordination_enabled():
             return
 
-        direction_upper = direction.upper()
-
-        # Signal the other direction
-        other_direction = 'backward' if direction == 'forward' else 'forward'
-        print(f"[{direction_upper}] Releasing {direction}, signaling {other_direction}")
-        self._signal_targets[direction].release()
-
-        # Forward-specific logic
-        if direction == 'forward':
-            self._cycle_count += 1
-            print(f"cycle count {self._cycle_count}")
-            self.check_should_enable_coordination()
-
-    # Simple wrapper methods
-    def acquire_forward_execution(self):
-        self._acquire_execution('forward')
-
-    def release_forward_execution(self, async_tensor: Optional[torch.Tensor] = None):
-        self._release_execution('forward', async_tensor)
-
-    def acquire_backward_execution(self):
-        self._acquire_execution('backward')
-
-    def release_backward_execution(self, async_tensor: Optional[torch.Tensor] = None):
-        self._release_execution('backward', async_tensor)
+        try:
+            self._execution_barrier.wait()
+            print("Both threads ready, proceeding")
+        except threading.BrokenBarrierError:
+            print("Barrier broken - one thread has finished!")
 
     def enable_coordination(self, num_layers: Optional[int] = None):
-        self._coordination_enabled = True
-        self._cycle_count = 0
+        if num_layers is not None and num_layers > 0:
+            self._coordination_enabled = True
+            self._cycle_count = 0
 
-        # Reset semaphores
-        self._forward_semaphore = threading.Semaphore(0)
-        self._backward_semaphore = threading.Semaphore(1)
-
-        # Update semaphore references
-        self._semaphores['forward'] = self._forward_semaphore
-        self._semaphores['backward'] = self._backward_semaphore
-        self._signal_targets['forward'] = self._backward_semaphore
-        self._signal_targets['backward'] = self._forward_semaphore
-
-        self._num_layers = num_layers
-        self.check_should_enable_coordination()
-        print(f"[COORDINATION] Simplified hook coordination ENABLED with {num_layers} MoE layers")
+            # Reset barrier
+            self._execution_barrier = threading.Barrier(2)
+
+            self._num_layers = num_layers
+            print(f"Compute/Comm hook coordination ENABLED with {num_layers} MoE layers")
 
     def disable_coordination(self):
         self._coordination_enabled = False
-        # Release both semaphores to unblock any waiting threads
-        try:
-            self._forward_semaphore.release()
-            self._backward_semaphore.release()
-        except ValueError:
-            pass
-        print("[COORDINATION] Simplified hook coordination DISABLED")
-
-    def check_should_enable_coordination(self):
-        # TODO: better way to determine when to disable coordination
-        moe_multipler = 4
-        if self._num_layers is not None and self._cycle_count >= moe_multipler * self._num_layers:
+        self._cycle_count = 0
+        self._execution_barrier.abort()  # Break barrier to unblock threads
+        print("[COORDINATION] Compute/Comm hook coordination DISABLED")
+
+    def check_should_continue_coordination(self):
+        if self._num_layers is not None and self._cycle_count >= self._num_layers:
             print("[COORDINATION] Reached target number of cycles, disabling coordination")
-            self.disable_coordination()
-            return
+            return False
+        return True
 
     def is_coordination_enabled(self):
         return self._coordination_enabled
@@ -129,41 +75,34 @@ def is_coordination_enabled(self):
 
 class SyncHook(torch.autograd.Function):
     @staticmethod
-    def forward(ctx, x, hook_name):
+    def forward(ctx, x, hook_name=""):
         ctx.hook_name = hook_name
-        _hook_coordinator.acquire_forward_execution()
-
-
-        try:
-            if _hook_coordinator.is_coordination_enabled():
-                if hook_name == "dispatch_A":
-                    # TODO: is this right?
-                    print("Calling torch.cuda.synchronize() from dispatch_A")
-                    # This does GPU-CPU sync so we need to wait explicitly before starting
-                    torch.cuda.synchronize()
-                print(f"[FORWARD] {hook_name}_fwd")
-            return x
-        finally:
-            _hook_coordinator.release_forward_execution(x)
+        # Handle edge case for the transformer-level boundary
+        if _hook_coordinator._coordination_enabled and hook_name == "D":
+            _hook_coordinator._cycle_count += 1
+            print(f"[FORWARD] cycle count: {_hook_coordinator._cycle_count}", "=" * 40)
+            if not _hook_coordinator.check_should_continue_coordination():
+                _hook_coordinator.disable_coordination()
+                return x
+
+        _hook_coordinator.barrier()
+
+        if _hook_coordinator.is_coordination_enabled():
+            print(f"[FORWARD] finished {hook_name}_fwd")
+        return x
 
     @staticmethod
     def backward(ctx, grad_output):
         hook_name = ctx.hook_name
-        _hook_coordinator.acquire_backward_execution()
-
 
-        try:
-            if _hook_coordinator.is_coordination_enabled():
-                if hook_name == "dispatch_B":
-                    # TODO: is this right?
-                    print("Calling torch.cuda.synchronize() from dispatch_B")
-                    # This does GPU-CPU sync so we need to wait explicitly before starting
-                    torch.cuda.synchronize()
-                print(f"[BACKWARD] {hook_name}_bwd")
-            # grad_output.record_stream(torch.cuda.current_stream())
+        # Edge case: skip the initial barrier; all subsequent backward hooks will wait on it
+        if hook_name == "D" and _hook_coordinator._cycle_count == 0:
             return grad_output, None
-        finally:
-            _hook_coordinator.release_backward_execution(grad_output)
+
+        _hook_coordinator.barrier()
+        if _hook_coordinator.is_coordination_enabled():
+            print(f"[BACKWARD] finished {hook_name}_bwd")
+        return grad_output, None
 
 TOKEN_GROUP_ALIGN_SIZE_M = 8
 ValidTokenGroupAlignmentSize = Literal[8, 16, 32]
@@ -231,17 +170,6 @@ def _token_dispatch(self, mod, inputs, device_mesh):
         # annotate module input placements/sharding with input_layouts
         routed_input, num_tokens_per_expert = inputs
         ep_size = device_mesh.shape[0]
-
-        # TODO: what is causing the IMAs???
-        if not torch.isfinite(routed_input).all():
-            raise RuntimeError(f"routed_input contains non-finite values: {routed_input}")
-
-        if not torch.isfinite(num_tokens_per_expert).all():
-            raise RuntimeError(f"num_tokens_per_expert contains non-finite values: {num_tokens_per_expert}")
-
-        if routed_input.shape[0] > 1000000:  # Reasonable limit
-            raise RuntimeError(f"routed_input suspiciously large: {routed_input.shape}")
-
         # generate the input splits and output splits for all-to-all
         with torch.no_grad():
             num_tokens_per_expert_group = all_to_all_single(
@@ -325,18 +253,18 @@ def _apply(self, module: nn.Module, device_mesh: DeviceMesh) -> nn.Module:
 
     def _wrap_with_inner_hooks(self, module):
        def inner_pre_hook(module, input):
-            return (SyncHook.apply(input[0], "dispatch_A"),) + input[1:]
+            return (SyncHook.apply(input[0], "A"),) + input[1:]
        def inner_post_hook(module, input, output):
-            return SyncHook.apply(output, "combine_C")
+            return SyncHook.apply(output, "C")
        module.register_forward_pre_hook(inner_pre_hook)
        module.register_forward_hook(inner_post_hook)
        return module
 
     def _wrap_with_outer_hooks(self, module):
        def outer_pre_hook(module, input):
-            return (SyncHook.apply(input[0], "dispatch_B"),) + input[1:]
+            return (SyncHook.apply(input[0], "B"),) + input[1:]
        def outer_post_hook(module, input, output):
-            return SyncHook.apply(output, "combine_D")
+            return SyncHook.apply(output, "D")
        module.register_forward_pre_hook(outer_pre_hook)
        module.register_forward_hook(outer_post_hook)
        return module
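
The core change above replaces the forward/backward semaphore pair with a single two-party threading.Barrier. A minimal, self-contained sketch of that pattern (toy thread names, not the torchtitan API) shows how Barrier(2).wait() keeps a compute thread and a comm thread advancing in lockstep, and how abort() plays the role of disable_coordination() when one side runs out of work:

    # Toy illustration of the HookCoordinator barrier pattern (assumed/simplified names).
    import threading

    barrier = threading.Barrier(2)  # two parties: the "forward" and "backward" threads

    def worker(name: str, steps: int) -> None:
        for i in range(steps):
            print(f"[{name}] step {i}")         # this thread's compute or comm chunk
            try:
                barrier.wait()                  # rendezvous: both threads finish step i together
            except threading.BrokenBarrierError:
                print(f"[{name}] peer finished, continuing unsynchronized")
                return
        barrier.abort()                         # like disable_coordination(): unblock the peer

    t_fwd = threading.Thread(target=worker, args=("forward", 3))
    t_bwd = threading.Thread(target=worker, args=("backward", 5))
    t_fwd.start(); t_bwd.start()
    t_fwd.join(); t_bwd.join()

The same BrokenBarrierError handling is what lets SyncHook's barrier() call fall through once coordination is disabled mid-step.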

torchtitan/train.py

Lines changed: 30 additions & 28 deletions
@@ -12,6 +12,7 @@
 
 import torch
 from torch.distributed.elastic.multiprocessing.errors import record
+from torch.profiler import record_function
 
 import torchtitan.protocols.train_spec as train_spec_module
 from torchtitan.components.checkpoint import CheckpointManager
@@ -663,7 +664,6 @@ def _count_moe_modules(model):
     return moe_count
 
 def overlap_callback(action: _Action, ctx: _PipelineContext):
-    print("overlap_callback begin", "=" * 80, torch.distributed.get_rank())
     """Custom callback for OVERLAP_F_B computation that mimics the original implementation."""
     schedule = ctx.schedule_ref
     assert isinstance(schedule, _PipelineScheduleRuntime)
@@ -700,6 +700,7 @@ def overlap_callback(action: _Action, ctx: _PipelineContext):
     assert backward_mb_index is not None
     bwd_recv_ops = schedule.bwd_recv_ops
 
+    print(f"overlap_callback begin {forward_stage_index}:{forward_mb_index}, {backward_stage_index}:{backward_mb_index}", "=" * 80, torch.distributed.get_rank())
     # PP communication ========================================================
 
     # Fwd receives
@@ -744,26 +745,27 @@ def run_backward():
             torch.cuda.set_stream(main_cuda_stream)
             print(f"BACKWARD {backward_stage_index} {torch.cuda.current_stream()}")
             # Backward ========================================================
-            loss = schedule._maybe_get_loss(backward_stage, backward_mb_index)
-            schedule.backward_counter[backward_stage_index] += 1
-            last_backward = (
-                schedule.backward_counter[backward_stage_index] == schedule._n_microbatches
-            )
-            backward_stage.backward_one_chunk(
-                backward_mb_index,
-                loss=loss,
-                full_backward=True,
-                last_backward=last_backward,
-            )
-            grad_scale_factor = schedule._n_microbatches if schedule.scale_grads else 1
-            if last_backward:
-                backward_stage.scale_grads(grad_scale_factor)
-
-            if backward_is_prev_stage_on_this_rank:
-                stage_index_to_stage[backward_stage_index - 1].set_local_bwd_input(
-                    backward_stage.get_local_bwd_output(backward_mb_index),
+            with record_function(f"backward_stage_{backward_stage_index}_mb_{backward_mb_index}"):
+                loss = schedule._maybe_get_loss(backward_stage, backward_mb_index)
+                schedule.backward_counter[backward_stage_index] += 1
+                last_backward = (
+                    schedule.backward_counter[backward_stage_index] == schedule._n_microbatches
+                )
+                backward_stage.backward_one_chunk(
                     backward_mb_index,
+                    loss=loss,
+                    full_backward=True,
+                    last_backward=last_backward,
                 )
+                grad_scale_factor = schedule._n_microbatches if schedule.scale_grads else 1
+                if last_backward:
+                    backward_stage.scale_grads(grad_scale_factor)
+
+                if backward_is_prev_stage_on_this_rank:
+                    stage_index_to_stage[backward_stage_index - 1].set_local_bwd_input(
+                        backward_stage.get_local_bwd_output(backward_mb_index),
+                        backward_mb_index,
+                    )
 
 
         # Forward ========================================================
@@ -783,25 +785,25 @@ def run_forward():
             )
 
         # Run forward and backward in parallel
-        if _hook_coordinator.is_coordination_enabled():
-            thread = threading.Thread(target=run_backward, daemon=True)
-            thread.start()
-            run_forward()
-            thread.join()
+        # if _hook_coordinator.is_coordination_enabled():
+        thread = threading.Thread(target=run_backward, daemon=True)
+        thread.start()
+        run_forward()
+        thread.join()
         # with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
         #     forward_future = executor.submit(run_forward)
        #     backward_future = executor.submit(run_backward)
 
        #     # Wait for both to complete simultaneously
        #     done, not_done = concurrent.futures.wait([forward_future, backward_future])
        #     output = forward_future.result()
-        else:
-            run_forward()
-            run_backward()
+        # else:
+        #     run_forward()
+        #     run_backward()
 
     _hook_coordinator.disable_coordination()
     forward_backward_overlapped()
-    print("overlap_callback end", "=" * 80)
+    print(f"overlap_callback end {forward_stage_index}:{forward_mb_index}, {backward_stage_index}:{backward_mb_index}", "=" * 80)
 
     import fbvscode
     fbvscode.attach_debugger()
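
The train.py side pairs the barrier with a simple overlap pattern: backward work runs on a daemon thread while forward work stays on the main thread, and torch.profiler.record_function labels each region so the overlap is visible in a trace. A minimal runnable sketch under assumed names (dummy matmuls stand in for the real stage calls, and the region labels are illustrative):

    # Sketch of the thread-overlap + record_function pattern (not the torchtitan code).
    import threading

    import torch
    from torch.profiler import profile, record_function

    def run_backward() -> None:
        with record_function("backward_stage_mb"):    # named block in the profiler trace
            torch.randn(512, 512) @ torch.randn(512, 512)

    def run_forward() -> None:
        with record_function("forward_stage_mb"):
            torch.randn(512, 512) @ torch.randn(512, 512)

    with profile() as prof:
        t = threading.Thread(target=run_backward, daemon=True)
        t.start()       # backward overlaps with the forward call below
        run_forward()   # forward stays on the main thread (and its default stream)
        t.join()        # wait for backward before the next scheduler action

    print(prof.key_averages().table(sort_by="cpu_time_total", row_limit=10))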
