
Commit 589ce62

fix flexattn
1 parent 122e93a commit 589ce62

2 files changed: +53 -52 lines changed

torchtitan/experiments/gpt_oss/model/model.py

Lines changed: 22 additions & 19 deletions
@@ -180,16 +180,17 @@ def apply_rotary_emb(q: torch.Tensor, k: torch.Tensor, freqs_cis: torch.Tensor):
     return q_out, k_out
 
 # Torch Attention backup implementation (for debugging and sampling) from HuggingFace
-def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
-    """
-    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
-    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
-    """
-    batch, num_key_value_heads, slen, head_dim = hidden_states.shape
+def repeat_kv(x: torch.Tensor, n_rep: int) -> torch.Tensor:
+    """torch.repeat_interleave(x, dim=2, repeats=n_rep)"""
+    bs, slen, n_kv_heads, head_dim = x.shape
     if n_rep == 1:
-        return hidden_states
-    hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
-    return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
+        return x
+    return (
+        torch.unsqueeze(x, dim=3)
+        .expand(bs, slen, n_kv_heads, n_rep, head_dim)
+        .reshape(bs, slen, n_kv_heads * n_rep, head_dim)
+    )
+
 
 # TODO(jianw): This is eager version from HuggingFace
 def eager_attention_forward(
@@ -200,12 +201,9 @@ def eager_attention_forward(
     attention_mask: torch.Tensor,
     scaling: float,
     dropout: float = 0.0,
-    num_key_value_groups: int = 1,
     **kwargs,
 ):
-    key_states = repeat_kv(key, num_key_value_groups)
-    value_states = repeat_kv(value, num_key_value_groups)
-    attn_weights = torch.matmul(query, key_states.transpose(2, 3)) * scaling
+    attn_weights = torch.matmul(query, key.transpose(2, 3)) * scaling
     if attention_mask is not None:
         # attention_mask can be [Tq, Tk] or [B, H, Tq, Tk]
         # Convert boolean "allowed" -> additive mask
@@ -230,7 +228,7 @@ def eager_attention_forward(
     probs = nn.functional.softmax(combined_logits, dim=-1, dtype=combined_logits.dtype)
     scores = probs[..., :-1]  # we drop the sink here
     attn_weights = nn.functional.dropout(scores, p=dropout, training=False)
-    attn_output = torch.matmul(attn_weights, value_states)
+    attn_output = torch.matmul(attn_weights, value)
     return attn_output
 
 class Attention(nn.Module):
@@ -243,6 +241,10 @@ def __init__(self, model_args: GptOssModelArgs, use_sliding_attention: bool = Fa
 
         self.sliding_window = model_args.sliding_window if use_sliding_attention else None
         self.head_dim = model_args.head_dim
+        self.n_heads = model_args.num_attention_heads
+        self.n_kv_heads = model_args.num_key_value_heads
+
+        self.n_rep = self.n_heads // self.n_kv_heads
 
         self.wq = nn.Linear(
             model_args.hidden_size, model_args.num_attention_heads * model_args.head_dim, bias=True
@@ -294,17 +296,19 @@ def forward(
 
         q, k = apply_rotary_emb(q, k, freqs_cis)
 
+        # repeat k/v heads if n_kv_heads < n_heads
+        keys = repeat_kv(k, self.n_rep)
+        values = repeat_kv(v, self.n_rep)
+
         q = q.transpose(1, 2).contiguous()
-        k = k.transpose(1, 2).contiguous()
-        v = v.transpose(1, 2).contiguous()
+        k = keys.transpose(1, 2).contiguous()
+        v = values.transpose(1, 2).contiguous()
 
         if self.use_flex_attn:
             output = self.attn(
                 q, k, v,
                 scale=None,
                 sink_weights=self.sinks.to_local() if isinstance(self.sinks, DTensor) else self.sinks,
-                # sliding_window=self.sliding_window,
-                enable_gqa=True if self.sliding_window else False,
             )
         else:
             # eager attention forward
@@ -313,7 +317,6 @@ def forward(
                 attention_mask=self.sliding_window_causal(seqlen, x.device),
                 scaling=self.head_dim**-0.5,
                 dropout=0.0,
-                num_key_value_groups=8,
             )
         output = output.transpose(1, 2).contiguous()  # (B, H, T, D) -> (B, T, H, D)

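Note on the model-side change: repeat_kv now works in the (bs, slen, n_kv_heads, head_dim) layout and k/v are repeated up to the full head count before the transpose, which is presumably why the enable_gqa flag and the eager path's num_key_value_groups argument could be dropped. Below is a minimal, self-contained sketch checking the new helper against the torch.repeat_interleave equivalence its docstring claims; the tensor sizes are made up for illustration.

import torch

def repeat_kv(x: torch.Tensor, n_rep: int) -> torch.Tensor:
    """torch.repeat_interleave(x, dim=2, repeats=n_rep)"""
    bs, slen, n_kv_heads, head_dim = x.shape
    if n_rep == 1:
        return x
    return (
        torch.unsqueeze(x, dim=3)
        .expand(bs, slen, n_kv_heads, n_rep, head_dim)
        .reshape(bs, slen, n_kv_heads * n_rep, head_dim)
    )

x = torch.randn(2, 5, 4, 16)  # (bs, slen, n_kv_heads, head_dim), arbitrary sizes
out = repeat_kv(x, n_rep=2)
print(out.shape)                                               # torch.Size([2, 5, 8, 16])
print(torch.equal(out, torch.repeat_interleave(x, 2, dim=2)))  # True

Because the copies of each kv head stay adjacent, downstream code can index the result as if the full set of query heads had been projected.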
torchtitan/models/attention.py

Lines changed: 31 additions & 33 deletions
@@ -25,7 +25,9 @@
 # FlexAttention mask type. For each mask type, we initialize it at most once per
 # batch. To record what it is initialized, FLEX_ATTN_MASK_T is used as the key to
 # track the initialized mask.
-FLEX_ATTN_MASK_T = tuple[str, int | None, int | None]  # (mask_type, fixed_block_size, sliding_window)
+FLEX_ATTN_MASK_T = tuple[
+    str, int | None, int | None
+]  # (mask_type, fixed_block_size, sliding_window)
 
 
 class FlexAttention(torch.nn.Module):
@@ -64,7 +66,10 @@ class FlexAttention(torch.nn.Module):
     attn_mask_type: str
 
     def __init__(
-        self, attn_mask_type: str, fixed_block_size: int | None = None, sliding_window: int | None = None
+        self,
+        attn_mask_type: str,
+        fixed_block_size: int | None = None,
+        sliding_window: int | None = None,
     ) -> None:
         super().__init__()
         if attn_mask_type not in ["causal", "block_causal", "sliding_window"]:
@@ -73,7 +78,6 @@ def __init__(
         self.fixed_block_size = fixed_block_size
         self.sliding_window = sliding_window
 
-        self.mask_cache = {}
         FlexAttention.used_attn_mask_types.add(self.mask_key)
 
     @property
@@ -87,57 +91,44 @@ def forward(
         v: torch.Tensor,
         scale: float | None = None,
         sink_weights: torch.Tensor | None = None,
-        # sliding_window: int = 0,
-        enable_gqa: bool = False,
     ) -> torch.Tensor:
-
+
         # Use sink logic when sliding_window is used and sink_weights is provided
         if self.attn_mask_type == "sliding_window" and sink_weights is not None:
-            return self._forward_with_sink(q, k, v, scale, sink_weights, enable_gqa)
-
-        # Regular path without sink - use pre-compiled block masks
+            return self._forward_with_sink(q, k, v, scale, sink_weights)
+
+        # Regular path without sink
        block_mask = FlexAttention.block_masks[self.mask_key]
         return FlexAttention.flex_attn(q, k, v, block_mask=block_mask, scale=scale)
-
+
     def _forward_with_sink(
         self,
         q: torch.Tensor,
-        k: torch.Tensor,
+        k: torch.Tensor,
         v: torch.Tensor,
         scale: float | None = None,
         sink_weights: torch.Tensor | None = None,
-        enable_gqa: bool = False,
     ) -> torch.Tensor:
         """Forward pass with attention sink for sliding window attention."""
-        B, H_q, S_q, D = q.shape
-        _, H_kv, S_kv, _ = k.shape
-
-        if self.sliding_window is None or self.sliding_window <= 0:
-            raise RuntimeError("sliding_window must be configured for sliding_window attention type")
-        mask_key = ("sliding_window_sink", self.sliding_window, S_q, S_kv)
-        if mask_key not in self.mask_cache:
-            mask_mod = FlexAttention._get_sliding_window_mask_mod(self.sliding_window)
-            block_mask = create_block_mask(
-                mask_mod, B, H_q, S_q, S_kv,
-                _compile=True, device=q.device
-            )
-            self.mask_cache[mask_key] = block_mask
-        block_mask = self.mask_cache[mask_key]
+        # Use the pre-compiled static block mask
+        block_mask = FlexAttention.block_masks[self.mask_key]
 
         # Run flex_attn and return LSE for sink computation
         out, lse = FlexAttention.flex_attn(
-            q, k, v,
+            q,
+            k,
+            v,
             block_mask=block_mask,
-            enable_gqa=enable_gqa,
             return_lse=True,
-            scale=scale
+            scale=scale,
         )
 
         # Apply attention sink rescaling: rescale by σ(lse - w[h])
         # This is mathematically equivalent to concatenating learnable sink weights
         if sink_weights is not None:
-            w = sink_weights  # [H]
-            sink_scale = torch.sigmoid(lse - w.view(1, -1, 1)).unsqueeze(-1)  # [B,H,S,1]
+            sink_scale = torch.sigmoid(lse - sink_weights.view(1, -1, 1)).unsqueeze(
+                -1
+            )  # [B,H,S,1]
             out = out * sink_scale
 
         return out.to(q.dtype)
@@ -149,10 +140,12 @@ def _get_sliding_window_mask_mod(window: int):
         - only allows kv_idx ≤ q_idx (causal)
         - and only if (q_idx - kv_idx) ≤ window
         """
+
         def sliding_mod(b, h, q_idx, kv_idx):
             # causal within window
             keep = (kv_idx <= q_idx) & (q_idx - kv_idx <= window)
             return keep
+
         return sliding_mod
 
     @staticmethod
@@ -248,7 +241,9 @@ def init_attention_mask(batch: torch.Tensor, eos_id: int | None) -> None:
                     # We don't care about batch dimension --
                     # all samples have the same sliding window mask.
                     batch_dimension = 1
-                    mask_mod = FlexAttention._get_sliding_window_mask_mod(sliding_window)
+                    mask_mod = FlexAttention._get_sliding_window_mask_mod(
+                        sliding_window
+                    )
                 case _:
                     raise RuntimeError(f"Shouldn't reach here. {attn_mask_type}")
 
@@ -303,7 +298,10 @@ def forward(
 
 
 def build_attention(
-    use_flex_attn: bool, attn_mask_type: str, fixed_block_size: int | None = None, sliding_window: int | None = None
+    use_flex_attn: bool,
+    attn_mask_type: str,
+    fixed_block_size: int | None = None,
+    sliding_window: int | None = None,
 ):
     if use_flex_attn:
         return FlexAttention(attn_mask_type, fixed_block_size, sliding_window)
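Note on the sink rescaling: the comment in _forward_with_sink states that multiplying the masked-softmax output by σ(lse - w[h]) is mathematically equivalent to appending a learnable sink logit, softmaxing, and dropping the sink column (which is what the eager path in model.py does). A small numeric sketch of that equivalence using plain tensor ops; head and sequence sizes are arbitrary and the attention mask is omitted for brevity.

import torch

torch.manual_seed(0)
H, Tq, Tk, D = 2, 4, 4, 8                # heads, query len, kv len, head dim (arbitrary)
q, k, v = torch.randn(H, Tq, D), torch.randn(H, Tk, D), torch.randn(H, Tk, D)
w = torch.randn(H)                        # one learnable sink logit per head

scale = D ** -0.5
logits = torch.einsum("htd,hsd->hts", q, k) * scale            # [H, Tq, Tk]

# Reference (eager path): append the sink logit, softmax, drop the sink column.
sink_col = w.view(H, 1, 1).expand(H, Tq, 1)
probs = torch.softmax(torch.cat([logits, sink_col], dim=-1), dim=-1)
ref = torch.einsum("hts,hsd->htd", probs[..., :-1], v)

# Flex path: ordinary softmax output rescaled by sigmoid(lse - w).
lse = torch.logsumexp(logits, dim=-1)                          # [H, Tq]
out = torch.einsum("hts,hsd->htd", torch.softmax(logits, dim=-1), v)
out = out * torch.sigmoid(lse - w.view(H, 1)).unsqueeze(-1)

print(torch.allclose(ref, out, atol=1e-6))                     # True

The identity is p_i = exp(s_i) / (sum_j exp(s_j) + exp(w)) = softmax(s)_i * sigmoid(lse - w), with lse = logsumexp(s), which is why returning the LSE from flex_attn is enough to fold the sink in after the fact.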

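For reference, the sliding-window mask_mod keeps a key position only when kv_idx <= q_idx and q_idx - kv_idx <= window. A toy sketch (sizes are made up) that materializes the predicate densely to show the causal band it produces:

import torch

window, seqlen = 4, 8                     # illustrative sizes only

def sliding_mod(b, h, q_idx, kv_idx):
    # causal within window
    return (kv_idx <= q_idx) & (q_idx - kv_idx <= window)

q_idx = torch.arange(seqlen).view(-1, 1)
kv_idx = torch.arange(seqlen).view(1, -1)
print(sliding_mod(0, 0, q_idx, kv_idx).int())
# Row i allows columns max(0, i - window) .. i: a causal band of width window + 1.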