Commit 0f7a7c9

[wip] current state working, needs cleanup
1 parent 71dea16 commit 0f7a7c9

File tree

8 files changed: +532, -33 lines

run_train.sh

Lines changed: 5 additions & 2 deletions
@@ -10,8 +10,11 @@ set -ex
 # use envs as local overwrites for convenience
 # e.g.
 # LOG_RANK=0,1 NGPU=4 ./run_train.sh
-NGPU=${NGPU:-"8"}
-export LOG_RANK=${LOG_RANK:-0}
+# NGPU=${NGPU:-"8"}
+NGPU=${NGPU:-"4"}
+# export LOG_RANK=${LOG_RANK:-0,1,2,3,4,5,6,7}
+# export LOG_RANK=${LOG_RANK:-0,1,2,3}
+export LOG_RANK=${LOG_RANK:-3}
 CONFIG_FILE=${CONFIG_FILE:-"./torchtitan/models/llama3/train_configs/debug_model.toml"}
 TRAIN_FILE=${TRAIN_FILE:-"torchtitan.train"}

torchtitan/config/job_config.py

Lines changed: 2 additions & 2 deletions
@@ -623,10 +623,10 @@ class MX:
 
 @dataclass
 class Comm:
-    init_timeout_seconds: int = 300
+    init_timeout_seconds: int = 30
     """Timeout for communication operations, during initialization and first train step."""
 
-    train_timeout_seconds: int = 100
+    train_timeout_seconds: int = 10
     """
     Timeout for communication operations after the first train step --
     usually a tighter bound than during initialization.

torchtitan/distributed/expert_parallel.py

Lines changed: 240 additions & 3 deletions
@@ -5,7 +5,7 @@
 # LICENSE file in the root directory of this source tree.
 
 
-from typing import Callable, Literal
+from typing import Callable, Literal, Dict
 
 import torch
 import torch.nn as nn
@@ -22,6 +22,202 @@
 )
 from torch.distributed.tensor.parallel import ParallelStyle
 
+import threading
+import torch
+from typing import Optional
+import time
+
+class SimplifiedHookCoordinator:
+    """
+    TODO: this hangs because FWD is doing dispatch, and BWD is doing combine
+    the two communications are conflicting??
+    """
+
+    """Alternating forward/backward coordination with just 2 semaphores"""
+
+    def __init__(self):
+        self._lock = threading.Lock()
+
+        # Only need 2 semaphores - one for forward, one for backward
+        self._forward_semaphore = threading.Semaphore(0)  # Forward waits
+        self._backward_semaphore = threading.Semaphore(1)  # Backward starts first
+
+        # CUDA event tracking
+        self._forward_cuda_event = torch.cuda.Event()
+        self._backward_cuda_event = torch.cuda.Event()
+        self._forward_event_recorded = False
+        self._backward_event_recorded = False
+
+        # Store AsyncCollectiveTensors from previous operations
+        self._stored_forward_async_tensor = None
+        self._stored_backward_async_tensor = None
+
+        self._coordination_enabled = False
+        self._cycle_count = 0
+
+    def is_coordination_enabled(self) -> bool:
+        """Check if coordination is currently enabled"""
+        return self._coordination_enabled
+
+    def enable_coordination(self, num_layers: Optional[int] = None):
+        self._coordination_enabled = True
+        self._cycle_count = 0
+
+        # Reset semaphores
+        self._forward_semaphore = threading.Semaphore(0)
+        self._backward_semaphore = threading.Semaphore(1)
+
+        # Reset CUDA events
+        self._forward_cuda_event = torch.cuda.Event()
+        self._backward_cuda_event = torch.cuda.Event()
+        self._forward_event_recorded = False
+        self._backward_event_recorded = False
+
+        # num layers
+        self._num_layers = num_layers
+
+        print("[COORDINATION] Simplified hook coordination with CUDA events ENABLED")
+
+    def disable_coordination(self):
+        self._coordination_enabled = False
+        # Release both semaphores to unblock any waiting threads
+        try:
+            self._forward_semaphore.release()
+            self._backward_semaphore.release()
+        except ValueError:
+            pass
+        print("[COORDINATION] Simplified hook coordination DISABLED")
+
+    def acquire_forward_execution(self):
+        if not self._coordination_enabled:
+            return
+
+        print("[FORWARD] Attempting acquire forward execution")
+        self._forward_semaphore.acquire()
+
+        # 2. Wait for PREVIOUS FORWARD CUDA operations to complete
+        if self._forward_event_recorded:
+            print("[FORWARD] Waiting for previous FORWARD CUDA operations to complete...")
+            self._forward_cuda_event.wait()
+            print("[FORWARD] Previous FORWARD CUDA operations completed")
+
+        # Wait for forward's own previously stored AsyncCollectiveTensor
+        if self._stored_forward_async_tensor is not None:
+            from torch.distributed._functional_collectives import AsyncCollectiveTensor
+            if isinstance(self._stored_forward_async_tensor, AsyncCollectiveTensor):
+                print("[FORWARD] Waiting for forward's own previous AsyncCollectiveTensor...")
+                torch.ops._c10d_functional.wait_tensor(self._stored_forward_async_tensor)
+                print("[FORWARD] Forward's previous AsyncCollectiveTensor completed")
+            self._stored_forward_async_tensor = None  # Clear after waiting
+
+        print("[FORWARD] Acquired forward execution")
+
+    def release_forward_execution(self, async_tensor: Optional[torch.Tensor] = None):
+        if not self._coordination_enabled:
+            return
+
+        # 1. Record CUDA event for current forward operations
+        current_stream = torch.cuda.current_stream()
+        self._forward_cuda_event = torch.cuda.Event()  # Create new event
+        self._forward_cuda_event.record(current_stream)
+        self._forward_event_recorded = True
+        print("[FORWARD] Recorded forward CUDA completion event")
+
+        # Store forward's AsyncCollectiveTensor for forward's own future use
+        self._stored_forward_async_tensor = async_tensor
+        if async_tensor is not None:
+            from torch.distributed._functional_collectives import AsyncCollectiveTensor
+            if isinstance(async_tensor, AsyncCollectiveTensor):
+                print("[FORWARD] Stored forward AsyncCollectiveTensor for forward's future use")
+
+        print("[FORWARD] Releasing forward, signaling backward")
+        self._backward_semaphore.release()  # Signal backward can start
+
+        self._cycle_count += 1
+        print(f"cycle count {self._cycle_count}")
+        # TODO: better way to determine when to disable coordination
+        moe_multipler = 4
+        if self._num_layers is not None and self._cycle_count >= moe_multipler * self._num_layers:
+            print("[COORDINATION] Reached target number of cycles, disabling coordination")
+            self.disable_coordination()
+            return  # Exit early since coordination is now disabled
+
+    def acquire_backward_execution(self):
+        if not self._coordination_enabled:
+            return
+
+        print("[BACKWARD] Attempting acquire backward execution")
+        self._backward_semaphore.acquire()
+
+        # # 2. Wait for PREVIOUS BACKWARD CUDA operations to complete
+        if self._backward_event_recorded:
+            print("[BACKWARD] Waiting for previous BACKWARD CUDA operations to complete...")
+            self._backward_cuda_event.wait()
+            print("[BACKWARD] Previous BACKWARD CUDA operations completed")
+
+        # Wait for backward's own previously stored AsyncCollectiveTensor
+        if self._stored_backward_async_tensor is not None:
+            from torch.distributed._functional_collectives import AsyncCollectiveTensor
+            if isinstance(self._stored_backward_async_tensor, AsyncCollectiveTensor):
+                print("[BACKWARD] Waiting for backward's own previous AsyncCollectiveTensor...")
+                torch.ops._c10d_functional.wait_tensor(self._stored_backward_async_tensor)
+                print("[BACKWARD] Backward's previous AsyncCollectiveTensor completed")
+            self._stored_backward_async_tensor = None  # Clear after waiting
+
+        print("[BACKWARD] Acquired backward execution")
+
+    def release_backward_execution(self, async_tensor: Optional[torch.Tensor] = None):
+        if not self._coordination_enabled:
+            return
+
+        # 1. Record CUDA event for current backward operations
+        current_stream = torch.cuda.current_stream()
+        self._backward_cuda_event = torch.cuda.Event()  # Create new event
+        self._backward_cuda_event.record(current_stream)
+        self._backward_event_recorded = True
+        print("[BACKWARD] Recorded backward CUDA completion event")
+
+        # Store backward's AsyncCollectiveTensor for backward's own future use
+        self._stored_backward_async_tensor = async_tensor
+        if async_tensor is not None:
+            from torch.distributed._functional_collectives import AsyncCollectiveTensor
+            if isinstance(async_tensor, AsyncCollectiveTensor):
+                print("[BACKWARD] Stored backward AsyncCollectiveTensor for backward's future use")
+
+        print("[BACKWARD] Releasing backward, signaling next forward")
+        self._forward_semaphore.release()  # Signal next forward can start
+        # self._cycle_count += 1
+        # print(f"[CYCLE] Completed cycle {self._cycle_count}")
+
+# Global coordinator
+_hook_coordinator = SimplifiedHookCoordinator()
+
+class SyncHook(torch.autograd.Function):
+    @staticmethod
+    def forward(ctx, x, hook_name):
+        ctx.hook_name = hook_name
+
+        _hook_coordinator.acquire_forward_execution()
+
+        try:
+            if _hook_coordinator.is_coordination_enabled():
+                print(f"[FORWARD] {hook_name}_fwd")
+            return x
+        finally:
+            _hook_coordinator.release_forward_execution(x)
+
+    @staticmethod
+    def backward(ctx, grad_output):
+        hook_name = ctx.hook_name
+
+        _hook_coordinator.acquire_backward_execution()
+
+        try:
+            if _hook_coordinator.is_coordination_enabled():
+                print(f"[BACKWARD] {hook_name}_bwd")
+            return grad_output, None
+        finally:
+            _hook_coordinator.release_backward_execution(grad_output)
 
 TOKEN_GROUP_ALIGN_SIZE_M = 8
 ValidTokenGroupAlignmentSize = Literal[8, 16, 32]
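
Note (annotation, not part of the diff): the scheme above boils down to a two-semaphore handshake — the forward side starts blocked, the backward side starts holding the token, and each side hands the token to the other when it releases. A CPU-only sketch of just that handshake, with illustrative names rather than the torchtitan API:

import threading

forward_sem = threading.Semaphore(0)   # forward waits for backward to go first
backward_sem = threading.Semaphore(1)  # backward holds the initial token

def forward_steps(n):
    for i in range(n):
        forward_sem.acquire()          # mirrors acquire_forward_execution()
        print(f"forward {i}")
        backward_sem.release()         # mirrors release_forward_execution()

def backward_steps(n):
    for i in range(n):
        backward_sem.acquire()         # mirrors acquire_backward_execution()
        print(f"backward {i}")
        forward_sem.release()          # mirrors release_backward_execution()

t_fwd = threading.Thread(target=forward_steps, args=(3,))
t_bwd = threading.Thread(target=backward_steps, args=(3,))
t_fwd.start(); t_bwd.start()
t_fwd.join(); t_bwd.join()
# Strictly alternates: backward 0, forward 0, backward 1, forward 1, ...

The real coordinator additionally gates each side on its own CUDA event and any pending AsyncCollectiveTensor before proceeding.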
@@ -109,11 +305,15 @@ def _token_dispatch(self, mod, inputs, device_mesh):
             .to(torch.device("cpu"), non_blocking=True)
         )
         # NOTE: this would incur a device-to-host sync
+        # CPU-GPU sync here!!!
+        # start_time = time.time()
         output_splits = (
             num_tokens_per_expert_group.view(ep_size, -1)
             .sum(dim=1)
             .to(torch.device("cpu"), non_blocking=False)
         )
+        # sync_time = time.time() - start_time
+        # print(f"CPU-GPU sync took {sync_time:.4f}s")
         self.input_splits = input_splits.tolist()
         self.output_splits = output_splits.tolist()
 
@@ -125,6 +325,11 @@ def _token_dispatch(self, mod, inputs, device_mesh):
             device_mesh.get_group(),
         )
 
+        # TODO: FIX NEEDING THIS???
+        routed_input = torch.ops._c10d_functional.wait_tensor(
+            routed_input
+        )
+
         # NOTE: After this all-to-all, the routed input is put on proper EP rank.
         # However, the num_tokens_per_expert_group is not of the final target format
         # [#tokens for local expert 0, #tokens for local expert 1, ...]
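
Note (annotation, not part of the diff): the explicit wait on routed_input follows the usual functional-collectives pattern — the all-to-all returns an AsyncCollectiveTensor, and wait_tensor forces the collective to finish before the result is consumed on another path. A minimal helper sketch of that pattern (wait_if_async is a hypothetical name, not a torchtitan helper):

import torch
from torch.distributed._functional_collectives import AsyncCollectiveTensor

def wait_if_async(t: torch.Tensor) -> torch.Tensor:
    # Block until the collective backing `t` has completed; plain tensors pass through.
    if isinstance(t, AsyncCollectiveTensor):
        return torch.ops._c10d_functional.wait_tensor(t)
    return t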
@@ -152,16 +357,48 @@ def _token_combine(self, mod, routed_output, device_mesh):
             self.output_splits,
             device_mesh.get_group(),
         )
+        # TODO: FIX NEEDING THIS???
+        # CRITICAL: Wait for AsyncCollectiveTensor BEFORE coordination
+        from torch.distributed._functional_collectives import AsyncCollectiveTensor
+        if isinstance(routed_output, AsyncCollectiveTensor):
+            routed_output = torch.ops._c10d_functional.wait_tensor(routed_output)
+
         return routed_output
 
     def _apply(self, module: nn.Module, device_mesh: DeviceMesh) -> nn.Module:
-        return distribute_module(
-            module,
+        """
+        hooks are called in the order they are registered:
+        A, dispatch, B (pre hooks)
+        C, combine, D (post hooks)
+        """
+        inner_wrapped_module = self._wrap_with_inner_hooks(module)
+        distributed_module = distribute_module(
+            inner_wrapped_module,
             device_mesh,
             partition_fn=ExpertParallel._partition_fn,
             input_fn=self._token_dispatch,
             output_fn=self._token_combine,
         )
+        final_module = self._wrap_with_outer_hooks(distributed_module)
+        return final_module
+
+    def _wrap_with_inner_hooks(self, module):
+        def inner_pre_hook(module, input):
+            return (SyncHook.apply(input[0], "dispatch_A"),) + input[1:]
+        def inner_post_hook(module, input, output):
+            return SyncHook.apply(output, "combine_C")
+        module.register_forward_pre_hook(inner_pre_hook)
+        module.register_forward_hook(inner_post_hook)
+        return module
+
+    def _wrap_with_outer_hooks(self, module):
+        def outer_pre_hook(module, input):
+            return (SyncHook.apply(input[0], "dispatch_B"),) + input[1:]
+        def outer_post_hook(module, input, output):
+            return SyncHook.apply(output, "combine_D")
+        module.register_forward_pre_hook(outer_pre_hook)
+        module.register_forward_hook(outer_post_hook)
+        return module
 
 
 # This class is for dp2ep with TP (without TP we can just use ExpertParallel)
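
Note (annotation, not part of the diff): the _apply docstring relies on PyTorch running forward pre-hooks and forward hooks in registration order, which is why the inner hooks are registered before distribute_module and the outer hooks after it. A standalone sketch of that ordering, assuming distribute_module installs input_fn/output_fn as ordinary (non-prepended) hooks:

import torch
import torch.nn as nn

mod = nn.Identity()
for name in ("A (inner pre)", "dispatch (input_fn)", "B (outer pre)"):
    # Pre-hooks fire in the order they were registered.
    mod.register_forward_pre_hook(lambda m, inp, name=name: print(name))
for name in ("C (inner post)", "combine (output_fn)", "D (outer post)"):
    # Post-hooks likewise fire in registration order.
    mod.register_forward_hook(lambda m, inp, out, name=name: print(name))

mod(torch.zeros(1))
# Prints A, dispatch, B on the way in, then C, combine, D on the way out.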

torchtitan/models/deepseek_v3/__init__.py

Lines changed: 4 additions & 3 deletions
@@ -11,7 +11,7 @@
 from torchtitan.components.optimizer import build_optimizers_with_moe_load_balancing
 from torchtitan.components.tokenizer import build_hf_tokenizer
 from torchtitan.datasets.hf_datasets import build_hf_dataloader
-from torchtitan.models.llama3.infra.pipeline import pipeline_llama
+from torchtitan.models.llama3.infra.pipeline import pipeline_llama, pipeline_llama_tracer
 from torchtitan.models.moe import MoEArgs
 
 from torchtitan.protocols.train_spec import register_train_spec, TrainSpec
@@ -32,10 +32,11 @@
 deepseekv3_configs = {
     "debugmodel": DeepSeekV3ModelArgs(
         vocab_size=2000,
-        dim=256,
+        # needs at least dim 8?
+        dim=8,
         inter_dim=1024,
         moe_inter_dim=256,
-        n_layers=6,
+        n_layers=16,
         n_dense_layers=1,
         n_heads=16,
         moe_args=MoEArgs(

torchtitan/models/deepseek_v3/train_configs/debug_model.toml

Lines changed: 10 additions & 9 deletions
@@ -4,9 +4,9 @@ description = "DeepSeek-V3 debug training"
 print_args = false
 
 [profiling]
-enable_profiling = false
+enable_profiling = true
 save_traces_folder = "profile_trace"
-profile_freq = 10
+profile_freq = 5
 enable_memory_snapshot = false
 save_memory_snapshot_folder = "memory_snapshot"
 
@@ -36,22 +36,23 @@ decay_type = "linear"
 min_lr_factor = 0.0
 
 [training]
-local_batch_size = 8
-seq_len = 2048
+local_batch_size = 4
+seq_len = 4
 max_norm = 1.0 # grad norm clipping
-steps = 10
+steps = 6
 dataset = "c4_test" # supported datasets: c4_test (2K), c4 (177M)
+# dataset = "c4"
 
 [parallelism]
 data_parallel_replicate_degree = 1
 data_parallel_shard_degree = -1
 fsdp_reshard_after_forward = "default" # default / never / always
 tensor_parallel_degree = 1
 enable_async_tensor_parallel = false
-pipeline_parallel_degree = 1
-pipeline_parallel_schedule = "1F1B"
+pipeline_parallel_degree = 2
+expert_parallel_degree = 2
 context_parallel_degree = 1
-expert_parallel_degree = 1
+pipeline_parallel_schedule = "DualPipeV"
 expert_tensor_parallel_degree = 1
 
 [checkpoint]
@@ -63,7 +64,7 @@ export_dtype = "float32"
 async_mode = "disabled" # ["disabled", "async", "async_with_pinned_mem"]
 
 [activation_checkpoint]
-mode = "selective" # ["none", "selective", "full"]
+mode = "none" # ["none", "selective", "full"]
 selective_ac_option = 'op' # 'int' = ac every positive int layer or 'op', ac based on ops policy
 
 [compile]
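
Note (rough sanity check, not stated anywhere in the commit): with NGPU=4 from run_train.sh, the new degrees are assumed to decompose the world size as sketched below, reading the -1 shard degree as "fill whatever is left".

world_size = 4                      # NGPU in run_train.sh
pipeline_parallel_degree = 2
tensor_parallel_degree = 1
context_parallel_degree = 1
data_parallel_replicate_degree = 1

# Assumed meaning of data_parallel_shard_degree = -1: the leftover factor.
dp_shard = world_size // (
    pipeline_parallel_degree
    * tensor_parallel_degree
    * context_parallel_degree
    * data_parallel_replicate_degree
)
print(dp_shard)  # 2; expert_parallel_degree = 2 is assumed to map onto these dp_shard ranks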
