Skip to content

Commit 6e4ef27

Browse files
committed
contain changes into pipeline_parallel.py
1 parent 5810c54 commit 6e4ef27

File tree

5 files changed

+268
-285
lines changed

5 files changed

+268
-285
lines changed

torchtitan/config/job_config.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -365,6 +365,11 @@ class Parallelism:
365365
The global training batch size must be evenly divisible by pipeline_parallel_microbatch_size.
366366
"""
367367

368+
pipeline_parallel_expert_parallel_overlap: bool = True
369+
"""Whether to turn on the optimization to overlap expert parallel and pipeline parallel
370+
communication. This is only effective when the pipeline parallel schedule is DualPipeV and
371+
pipeline_parallel_degree > 1 and expert_parallel_degree > 1."""
372+
368373
context_parallel_degree: int = 1
369374
"""Context parallelism degree. 1 means disabled."""
370375

@@ -693,7 +698,7 @@ class Comm:
693698
init_timeout_seconds: int = 300
694699
"""Timeout for communication operations, during initialization and first train step."""
695700

696-
train_timeout_seconds: int = 30
701+
train_timeout_seconds: int = 100
697702
"""
698703
Timeout for communication operations after the first train step --
699704
usually a tighter bound than during initialization.

torchtitan/distributed/expert_parallel.py

Lines changed: 4 additions & 75 deletions
Original file line numberDiff line numberDiff line change
@@ -5,8 +5,7 @@
55
# LICENSE file in the root directory of this source tree.
66

77

8-
import threading
9-
from typing import Callable, Literal, Optional
8+
from typing import Callable, Literal
109

1110
import torch
1211
import torch.nn as nn
@@ -22,80 +21,9 @@
2221
Shard,
2322
)
2423
from torch.distributed.tensor.parallel import ParallelStyle
25-
from torchtitan.tools.utils import _round_up
26-
27-
class HookCoordinator:
28-
def __init__(self):
29-
# Barrier for 2 threads (forward and backward) to synchronize
30-
# This ensures that we always alternate at executing one compute and one comm op together
31-
self._execution_barrier = threading.Barrier(2)
32-
33-
self._coordination_enabled = False
34-
self._cycle_count = 0
35-
self._num_layers = None
36-
37-
def barrier(self):
38-
"""Barrier for 2 threads to synchronize"""
39-
if not self.is_coordination_enabled():
40-
return
41-
42-
try:
43-
self._execution_barrier.wait()
44-
except threading.BrokenBarrierError:
45-
pass
46-
47-
def enable_coordination(self, num_layers: Optional[int] = None):
48-
if num_layers is not None and num_layers > 0:
49-
self._coordination_enabled = True
50-
self._cycle_count = 0
51-
52-
# Reset barrier
53-
self._execution_barrier = threading.Barrier(2)
54-
self._num_layers = num_layers
55-
56-
def disable_coordination(self):
57-
self._coordination_enabled = False
58-
self._cycle_count = 0
59-
self._execution_barrier.abort() # Break barrier to unblock threads
60-
61-
def check_should_continue_coordination(self):
62-
if self._num_layers is not None and self._cycle_count >= self._num_layers:
63-
return False
64-
return True
65-
66-
def is_coordination_enabled(self):
67-
return self._coordination_enabled
6824

69-
70-
# Global coordinator
71-
_hook_coordinator = HookCoordinator()
72-
73-
74-
class SyncHook(torch.autograd.Function):
75-
@staticmethod
76-
def forward(ctx, x, hook_name=""):
77-
ctx.hook_name = hook_name
78-
# handle edge case for transformer level boundary
79-
if _hook_coordinator._coordination_enabled and hook_name == "D":
80-
_hook_coordinator._cycle_count += 1
81-
# print(f"[FORWARD] cycle count: {_hook_coordinator._cycle_count}", "=" * 40)
82-
if not _hook_coordinator.check_should_continue_coordination():
83-
_hook_coordinator.disable_coordination()
84-
return x
85-
86-
_hook_coordinator.barrier()
87-
return x
88-
89-
@staticmethod
90-
def backward(ctx, grad_output):
91-
hook_name = ctx.hook_name
92-
93-
# Edge case, skip initial barrier, all subsequent backward hooks will acquire
94-
if hook_name == "D" and _hook_coordinator._cycle_count == 0:
95-
return grad_output, None
96-
97-
_hook_coordinator.barrier()
98-
return grad_output, None
25+
from torchtitan.distributed.pipeline_parallel import SyncHook
26+
from torchtitan.tools.utils import _round_up
9927

10028

10129
TOKEN_GROUP_ALIGN_SIZE_M = 8
@@ -164,6 +92,7 @@ def _token_dispatch(self, mod, inputs, device_mesh):
16492
# annotate module input placements/sharding with input_layouts
16593
routed_input, num_tokens_per_expert = inputs
16694
ep_size = device_mesh.shape[0]
95+
16796
# generate the input splits and output splits for all-to-all
16897
with torch.no_grad():
16998
num_tokens_per_expert_group = all_to_all_single(

0 commit comments

Comments
 (0)