
Commit 4f8e621

clean up train.py
1 parent 6e4ef27 commit 4f8e621

4 files changed: +72 -106 lines changed

torchtitan/distributed/expert_parallel.py

Lines changed: 3 additions & 3 deletions
@@ -159,9 +159,9 @@ def _token_combine(self, mod, routed_output, device_mesh):
 
     def _apply(self, module: nn.Module, device_mesh: DeviceMesh) -> nn.Module:
         """
-        hooks are called in the order they are registered:
-        A, dispatch, B (pre hooks)
-        C, combine, D (post hooks)
+        Hooks are called in the order they are registered:
+        SyncHookA, _token_dispatch, SyncHookB (pre hooks)
+        SyncHookC, _token_combine, SyncHookD (post hooks)
         """
         inner_wrapped_module = self._wrap_with_inner_hooks(module)
         distributed_module = distribute_module(
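
Note: the ordering the updated docstring relies on comes straight from nn.Module hook semantics: forward pre-hooks fire in registration order before forward(), and forward (post) hooks fire in registration order after it. A minimal standalone sketch (not torchtitan code; the hook names below only mirror the docstring):

    import torch
    import torch.nn as nn

    calls = []

    def pre_hook(name):
        def hook(module, args):          # runs before module.forward
            calls.append(name)
        return hook

    def post_hook(name):
        def hook(module, args, output):  # runs after module.forward
            calls.append(name)
        return hook

    m = nn.Linear(4, 4)
    for name in ("SyncHookA", "_token_dispatch", "SyncHookB"):
        m.register_forward_pre_hook(pre_hook(name))
    for name in ("SyncHookC", "_token_combine", "SyncHookD"):
        m.register_forward_hook(post_hook(name))

    m(torch.randn(2, 4))
    print(calls)
    # ['SyncHookA', '_token_dispatch', 'SyncHookB', 'SyncHookC', '_token_combine', 'SyncHookD']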

torchtitan/distributed/pipeline_parallel.py

Lines changed: 58 additions & 85 deletions
@@ -455,14 +455,17 @@ def _count_moe_modules(model):
     from torchtitan.models.moe import MoE
 
     moe_count = 0
-    for name, module in model.named_modules():
+    for _, module in model.named_modules():
         if isinstance(module, MoE):
             moe_count += 1
     return moe_count
 
 
 def overlap_callback(action: _Action, ctx: _PipelineContext):
-    """Custom callback for OVERLAP_F_B computation that mimics the original implementation."""
+    """
+    Custom callback for OVERLAP_F_B computation that allows expert parallel communication
+    and pipeline parallel computation to overlap.
+    """
     schedule = ctx.schedule_ref
     assert isinstance(schedule, _PipelineScheduleRuntime)
     stage_index_to_stage: dict[int, _PipelineStageBase] = {
@@ -482,6 +485,7 @@ def overlap_callback(action: _Action, ctx: _PipelineContext):
     # Forward setup
     arg_mbs = ctx.arg_mbs
     kwarg_mbs = ctx.kwarg_mbs
+    assert arg_mbs is not None and kwarg_mbs is not None
     fwd_recv_ops = schedule.fwd_recv_ops
     forward_stage = stage_index_to_stage[forward_stage_index]
     forward_is_next_stage_on_this_rank = forward_stage_index + 1 in stage_index_to_stage
@@ -498,13 +502,6 @@ def overlap_callback(action: _Action, ctx: _PipelineContext):
     assert backward_mb_index is not None
     bwd_recv_ops = schedule.bwd_recv_ops
 
-    # print(
-    #     f"overlap_callback begin {forward_stage_index}:{forward_mb_index}, {backward_stage_index}:{backward_mb_index}",
-    #     "=" * 80,
-    #     torch.distributed.get_rank(),
-    # )
-    # PP communication ========================================================
-
     # Fwd receives
     if (
         not forward_stage.is_first
@@ -529,85 +526,61 @@ def overlap_callback(action: _Action, ctx: _PipelineContext):
     ) in bwd_recv_ops, f"Attempted to run compute {action=} before receiving input"
     _wait_batch_p2p(bwd_recv_ops.pop((backward_stage_index, backward_mb_index)))
 
+    # We count num layers in case the stage layers differ
+    # If they differ than we only want coordination to happen for the min amount of layers
+    min_num_layers = min(
+        _count_moe_modules(forward_stage.submod),
+        _count_moe_modules(backward_stage.submod),
+    )
     # PP computation ========================================================
-    def forward_backward_overlapped():
-        from torchtitan.distributed.pipeline_parallel import _hook_coordinator
+    _hook_coordinator.enable_coordination(num_layers=min_num_layers)
+    main_cuda_stream = torch.cuda.current_stream()
+
+    def run_backward():
+        # Set the backward thread to use the same stream as forward
+        torch.cuda.set_stream(main_cuda_stream)
+        with record_function(
+            f"backward_stage_{backward_stage_index}_mb_{backward_mb_index}"
+        ):
+            loss = schedule._maybe_get_loss(backward_stage, backward_mb_index)
+            schedule.backward_counter[backward_stage_index] += 1
+            last_backward = (
+                schedule.backward_counter[backward_stage_index]
+                == schedule._n_microbatches
+            )
+            backward_stage.backward_one_chunk(
+                backward_mb_index,
+                loss=loss,
+                full_backward=True,
+                last_backward=last_backward,
+            )
+            grad_scale_factor = schedule._n_microbatches if schedule.scale_grads else 1
+            if last_backward:
+                backward_stage.scale_grads(grad_scale_factor)
 
-        # TODO: Num layers is needed in case the stage layers differ, we need to ensure there is no coordination
-        min_num_layers = min(
-            _count_moe_modules(forward_stage.submod),
-            _count_moe_modules(backward_stage.submod),
-        )
-        _hook_coordinator.enable_coordination(num_layers=min_num_layers)
-        main_cuda_stream = torch.cuda.current_stream()
-
-        def run_backward():
-            # Set the backward thread to use the same stream as forward
-            torch.cuda.set_stream(main_cuda_stream)
-            # Backward ========================================================
-            with record_function(
-                f"backward_stage_{backward_stage_index}_mb_{backward_mb_index}"
-            ):
-                loss = schedule._maybe_get_loss(backward_stage, backward_mb_index)
-                schedule.backward_counter[backward_stage_index] += 1
-                last_backward = (
-                    schedule.backward_counter[backward_stage_index]
-                    == schedule._n_microbatches
-                )
-                backward_stage.backward_one_chunk(
+        if backward_is_prev_stage_on_this_rank:
+            stage_index_to_stage[backward_stage_index - 1].set_local_bwd_input(
+                backward_stage.get_local_bwd_output(backward_mb_index),
                 backward_mb_index,
-                    loss=loss,
-                    full_backward=True,
-                    last_backward=last_backward,
             )
-                grad_scale_factor = (
-                    schedule._n_microbatches if schedule.scale_grads else 1
-                )
-                if last_backward:
-                    backward_stage.scale_grads(grad_scale_factor)
-
-            if backward_is_prev_stage_on_this_rank:
-                stage_index_to_stage[backward_stage_index - 1].set_local_bwd_input(
-                    backward_stage.get_local_bwd_output(backward_mb_index),
-                    backward_mb_index,
-                )
-
-        # Forward ========================================================
-        def run_forward():
-            output = forward_stage.forward_one_chunk(
-                forward_mb_index,
-                arg_mbs[forward_mb_index],
-                kwarg_mbs[forward_mb_index],
-            )
-            schedule._maybe_compute_loss(
-                forward_stage, output, ctx.target_mbs, forward_mb_index
+
+    def run_forward():
+        output = forward_stage.forward_one_chunk(
+            forward_mb_index,
+            arg_mbs[forward_mb_index],
+            kwarg_mbs[forward_mb_index],
+        )
+        schedule._maybe_compute_loss(
+            forward_stage, output, ctx.target_mbs, forward_mb_index
+        )
+        if forward_is_next_stage_on_this_rank:
+            stage_index_to_stage[forward_stage_index + 1].set_local_fwd_input(
+                output, forward_mb_index
             )
-            if forward_is_next_stage_on_this_rank:
-                stage_index_to_stage[forward_stage_index + 1].set_local_fwd_input(
-                    output, forward_mb_index
-                )
 
-        # Run forward and backward in parallel
-        # if _hook_coordinator.is_coordination_enabled():
-        thread = threading.Thread(target=run_backward, daemon=True)
-        thread.start()
-        run_forward()
-        thread.join()
-        # with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
-        #     forward_future = executor.submit(run_forward)
-        #     backward_future = executor.submit(run_backward)
-
-        #     # Wait for both to complete simultaneously
-        #     done, not_done = concurrent.futures.wait([forward_future, backward_future])
-        #     output = forward_future.result()
-        # else:
-        #     run_forward()
-        #     run_backward()
-
-        _hook_coordinator.disable_coordination()
-
-    forward_backward_overlapped()
-    # print(
-    #     f"overlap_callback end {forward_stage_index}:{forward_mb_index}, {backward_stage_index}:{backward_mb_index}",
-    #     "=" * 80,
-    # )
+    # Run forward and backward in parallel
+    thread = threading.Thread(target=run_backward, daemon=True)
+    thread.start()
+    run_forward()
+    thread.join()
+    _hook_coordinator.disable_coordination()
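
Note: stripped of the pipeline bookkeeping, the control flow this hunk sets up is "run the backward chunk on a side thread that shares the main thread's CUDA stream, run the forward chunk on the main thread, then join". A simplified sketch with dummy work standing in for the real stage calls (assumed sizes; run_backward/run_forward here are illustrative, not the functions above):

    import threading
    import torch

    device = "cuda" if torch.cuda.is_available() else "cpu"
    main_stream = torch.cuda.current_stream() if device == "cuda" else None

    def run_backward():
        if main_stream is not None:
            # A new thread starts on the default stream, so explicitly reuse the
            # main thread's stream to keep forward/backward kernels ordered on it.
            torch.cuda.set_stream(main_stream)
        x = torch.randn(1024, 1024, device=device, requires_grad=True)
        (x @ x).sum().backward()  # stand-in for backward_one_chunk(...)

    def run_forward():
        y = torch.randn(1024, 1024, device=device)
        _ = y @ y                 # stand-in for forward_one_chunk(...)

    thread = threading.Thread(target=run_backward, daemon=True)
    thread.start()   # backward proceeds on its own Python thread
    run_forward()    # forward runs concurrently on the main thread
    thread.join()    # both finish before coordination is disabled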

torchtitan/models/deepseek_v3/train_configs/deepseek_v3_16b.toml

Lines changed: 2 additions & 3 deletions
@@ -4,7 +4,7 @@ description = "DeepSeek-V3 16B model training"
 print_args = false
 
 [profiling]
-enable_profiling = false
+enable_profiling = true
 save_traces_folder = "profile_trace"
 profile_freq = 10
 enable_memory_snapshot = false
@@ -56,15 +56,14 @@ expert_tensor_parallel_degree = 1
 enable = false
 folder = "checkpoint"
 interval = 10
-last_save_model_only = false # This does stuff with causing compile?
+last_save_model_only = true
 export_dtype = "float32"
 async_mode = "disabled" # ["disabled", "async", "async_with_pinned_mem]"
 
 [activation_checkpoint]
 mode = "none" # ["none", "selective", "full"]
 selective_ac_option = 'op' # 'int' = ac every positive int layer or 'op', ac based on ops policy
 
-# we cannot compile model with dI-dW split
 [compile]
 enable=true
 components = ["loss"] # ["model", "loss"]

torchtitan/train.py

Lines changed: 9 additions & 15 deletions
@@ -493,17 +493,15 @@ def train_step(
             loss = self.forward_backward_step(input_dict, labels)
             accumulated_losses.append(loss.detach())
 
-        # TODO: parameters are not DTensors which im not sure why
-        # grad_norm = dist_utils.clip_grad_norm_(
-        #     [p for m in self.model_parts for p in m.parameters()],
-        #     self.job_config.training.max_norm,
-        #     foreach=True,
-        #     pp_mesh=(
-        #         parallel_dims.world_mesh["pp"] if parallel_dims.pp_enabled else None
-        #     ),
-        #     ep_enabled=parallel_dims.ep_enabled,
-        # )
-        grad_norm = torch.tensor([0.0], device=self.device)
+        grad_norm = dist_utils.clip_grad_norm_(
+            [p for m in self.model_parts for p in m.parameters()],
+            self.job_config.training.max_norm,
+            foreach=True,
+            pp_mesh=(
+                parallel_dims.world_mesh["pp"] if parallel_dims.pp_enabled else None
+            ),
+            ep_enabled=parallel_dims.ep_enabled,
+        )
         self.checkpointer.maybe_wait_for_staging()
         self.optimizers.step()
         self.lr_schedulers.step()
@@ -648,10 +646,6 @@ def close(self) -> None:
        self.metrics_processor.close()
 
 
-import fbvscode
-
-fbvscode.attach_debugger()
-
 if __name__ == "__main__":
     init_logger()
     config_manager = ConfigManager()
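
Note: dist_utils.clip_grad_norm_ is torchtitan's distributed wrapper, which, per its arguments here, also accounts for the pp mesh and expert parallelism. As a rough single-process analogue of the restored call, using the stock PyTorch utility and made-up model parts and max_norm:

    import torch
    import torch.nn as nn

    model_parts = [nn.Linear(8, 8), nn.Linear(8, 8)]  # stand-ins for pipeline stages
    max_norm = 1.0                                    # cf. job_config.training.max_norm

    # Dummy forward/backward so every parameter has a gradient to clip.
    loss = sum(m(torch.randn(4, 8)).sum() for m in model_parts)
    loss.backward()

    grad_norm = torch.nn.utils.clip_grad_norm_(
        [p for m in model_parts for p in m.parameters()],
        max_norm,
        foreach=True,  # multi-tensor implementation, as in the diff
    )
    print(f"total grad norm before clipping: {grad_norm:.4f}")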
