
Commit 07c0ff4

fix sink
1 parent bb8ee6f commit 07c0ff4

3 files changed: +30, -21 lines changed


torchtitan/experiments/gpt_oss/infra/parallelize.py

Lines changed: 19 additions & 16 deletions
@@ -8,6 +8,7 @@
     PrepareModuleInput,
     RowwiseParallel,
     SequenceParallel,
+    PrepareModuleOutput,
 )
 
 if torch.__version__ >= "2.9":
@@ -22,7 +23,6 @@
 from torchtitan.models.llama3.infra.parallelize import apply_ac, apply_ddp
 from torchtitan.experiments.llama4.infra.parallelize import (
     apply_fsdp,
-    apply_moe_ep_tp,
 )
 
 from torchtitan.tools.logging import logger
@@ -212,49 +212,52 @@ def apply_non_moe_tp(
             Float8ColwiseParallel,
             Float8RowwiseParallel,
             PrepareFloat8ModuleInput,
+            PrepareFloat8ModuleOutput,
         )
 
-        rowwise_parallel, colwise_parallel, prepare_module_input = (
+        rowwise_parallel, colwise_parallel, prepare_module_input, prepare_module_output = (
            Float8RowwiseParallel,
            Float8ColwiseParallel,
            PrepareFloat8ModuleInput,
+           PrepareFloat8ModuleOutput,
        )
    else:
-        rowwise_parallel, colwise_parallel, prepare_module_input = (
+        rowwise_parallel, colwise_parallel, prepare_module_input, prepare_module_output = (
            RowwiseParallel,
            ColwiseParallel,
            PrepareModuleInput,
+           PrepareModuleOutput,
        )
 
    # Apply tensor + sequence parallelism to every transformer block
    for transformer_block in model.layers.values():
        layer_plan = {
            "attention_norm": SequenceParallel(),
            "attention": prepare_module_input(
-                input_layouts=(Shard(1), Replicate()),
-                desired_input_layouts=(Replicate(), Replicate()),
+                input_layouts=(Shard(1), None),
+                desired_input_layouts=(Replicate(), None),
            ),
-            "attention.wq": colwise_parallel(use_local_output=False),
-            "attention.wk": colwise_parallel(use_local_output=False),
-            "attention.wv": colwise_parallel(use_local_output=False),
+            "attention.wq": colwise_parallel(),
+            "attention.wk": colwise_parallel(),
+            "attention.wv": colwise_parallel(),
+            "attention.attn": prepare_module_output(output_layouts=(Shard(1), Shard(1)), desired_output_layouts=(Shard(1), Shard(1)), use_local_output=False),
            "attention.wo": rowwise_parallel(output_layouts=Shard(1)),
            "ffn_norm": SequenceParallel(),
        }
 
+        # shard attention.sinks across heads
+        attn = transformer_block.attention
+        attn.register_parameter(
+            "sinks",
+            nn.Parameter(distribute_tensor(attn.sinks, tp_mesh, [Shard(0)])),
+        )
+
        parallelize_module(
            module=transformer_block,
            device_mesh=tp_mesh,
            parallelize_plan=layer_plan,
        )
 
-        # shard attention.sinks across heads
-        # TODO(jianiw): Fix the sink implementation
-        # attn = transformer_block.attention
-        # attn.register_parameter(
-        #     "sinks",
-        #     nn.Parameter(distribute_tensor(attn.sinks, tp_mesh, [Replicate()])),
-        # )
-
    if enable_async_tp:
        from torch.distributed._symmetric_memory import enable_symm_mem_for_group

torchtitan/experiments/gpt_oss/model/model.py

Lines changed: 6 additions & 2 deletions
@@ -245,14 +245,18 @@ def forward(
                 k,
                 v,
                 scale=None,
-                return_lse=False,
+                return_lse=True,
            )
 
            # Apply attention sink rescaling: rescale by σ(lse - w[h])
            # This is mathematically equivalent to concatenating learnable sink weights
+            # TODO: self.sinks is registered as a DTensor under TP, while lse is a plain tensor
+            # q, k, v are already sharded by TP: [batch, local_heads, seq_len, head_dim] (plain tensor)
+            # sinks shape needs to match: [local_heads]
+            # [rank0]: lse.shape torch.Size([8, 32, 2048]), <class 'torch.Tensor'>
            sink_scale = torch.sigmoid(lse - self.sinks.view(1, -1, 1)).unsqueeze(
                -1
-            )  # [B,H,S,1]
+            )
            output = output * sink_scale.to(output.dtype)
 
        else:
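
The equivalence stated in the comment above can be checked numerically: appending a per-head sink logit w[h] as one extra attention score and softmaxing (then dropping the sink column) matches plain softmax attention rescaled by sigmoid(lse - w[h]). A small self-contained check with made-up shapes (not the model's code):

import torch

torch.manual_seed(0)
B, H, S, D = 2, 4, 5, 8
scores = torch.randn(B, H, S, S)            # q @ k^T / sqrt(D), pre-softmax
v = torch.randn(B, H, S, D)
sinks = torch.randn(H)                      # one learnable sink logit per head

# Reference: treat the sink as an extra "key" that carries no value.
sink_col = sinks.view(1, H, 1, 1).expand(B, H, S, 1)
probs = torch.softmax(torch.cat([scores, sink_col], dim=-1), dim=-1)
ref = probs[..., :-1] @ v                   # drop the sink column's weight

# Rescaling form used in the model: softmax(scores) @ v, scaled by sigmoid(lse - w[h]).
lse = torch.logsumexp(scores, dim=-1)       # [B, H, S]
out = torch.softmax(scores, dim=-1) @ v
out = out * torch.sigmoid(lse - sinks.view(1, -1, 1)).unsqueeze(-1)

print(torch.allclose(ref, out, atol=1e-6))  # expected: True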

torchtitan/experiments/gpt_oss/train_configs/debug_model.toml

Lines changed: 5 additions & 3 deletions
@@ -46,10 +46,12 @@ dataset = "c4_test" # supported datasets: c4_test (2K), c4 (177M)
 data_parallel_replicate_degree = 1
 data_parallel_shard_degree = -1
 fsdp_reshard_after_forward = "default" # default / never / always
-tensor_parallel_degree = 2
+tensor_parallel_degree = 1
 enable_async_tensor_parallel = false
-pipeline_parallel_degree = 1
-context_parallel_degree = 1
+expert_parallel_degree = 4
+
+
+
 
 [checkpoint]
 enable = false
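
A quick, illustrative sanity check that the edited snippet parses and now carries expert_parallel_degree in place of the pipeline/context degrees; the [parallelism] table name is an assumption here, and the keys are copied from the hunk above:

import tomllib  # Python 3.11+

snippet = """
[parallelism]
data_parallel_replicate_degree = 1
data_parallel_shard_degree = -1
fsdp_reshard_after_forward = "default"
tensor_parallel_degree = 1
enable_async_tensor_parallel = false
expert_parallel_degree = 4
"""

cfg = tomllib.loads(snippet)["parallelism"]  # table name assumed for this sketch
assert cfg["tensor_parallel_degree"] == 1 and cfg["expert_parallel_degree"] == 4
assert "pipeline_parallel_degree" not in cfg and "context_parallel_degree" not in cfg
print(cfg)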
