Commit afba179
[RFC] Refactor attention and make attention mask an argument to the model
**Status**

The PR is not landable yet but serves as an RFC. If people are okay with this design, this PR requires the following changes and verifications:

1. Change all models, including the experimental ones.
2. E2E loss verification (a functional check has been done, but loss verification has not been done yet).
3. Add a unit test for attention. Since we don't have GPU unit tests, this can be done in a separate PR.

**Summary**

This PR refactors how TorchTitan builds the attention masks and passes them to the model. Before this PR, `init_attention_masks()` was called in the Trainer, but the masks were stored as a class variable of `FlexAttentionWrapper()`. We chose this shortcut to support the case where a single model requires multiple masks. The previous design has several issues, one in particular being #1723. Now that pytorch/pytorch#164111 shows that we can let PP split a BlockMask, this PR performs the refactor to pass masks as an argument of `model.forward()`.

The new design:

1. The model needs to provide `get_attention_masks()`, which accepts `create_mask_fn`, `batch`, and `eos_id`. If the attention op is SDPA, this API should return None, since SDPA currently doesn't support varlen; once it does, we may have to return some tuple of ints that represents the mask. Justification: attention logic is technically part of the model, but it requires information from the trainer/dataloader, so it is the model author's responsibility to provide an API the trainer can call to get the masks.
2. `get_attention_masks()` is called from the trainer, and the resulting masks are passed to `model.forward()` (see the sketch below). Justification: this allows us to fix #1723 with pytorch/pytorch#164111 and this PR.
3. Provide a single `AttentionOp` instead of two. Justification: since the masking logic is moved outside, we no longer need to do bookkeeping of masks in `FlexAttentionWrapper`, and the remaining logic is simple enough that one `AttentionOp` makes things cleaner. Note: we still have two very thin op wrappers that are used for CP; I keep these two for CP education purposes, but this can certainly be confusing for Titan's users, and I'm open to merging them into `AttentionOp`. See the discussion in #1723.

ghstack-source-id: 35aa425
Pull Request resolved: #1776
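A minimal sketch of the intended trainer-side flow, assuming a model that implements the new `get_attention_masks()` API. `train_step`, `input_ids`, and `eos_id` are illustrative names (not the actual Trainer code), and `create_block_mask` is FlexAttention's mask constructor from `torch.nn.attention.flex_attention`:

```python
# Sketch only: not the actual TorchTitan Trainer code.
import torch
from torch.nn.attention.flex_attention import create_block_mask


def train_step(model: torch.nn.Module, input_ids: torch.Tensor, eos_id: int) -> torch.Tensor:
    # 1. The trainer asks the model for its attention masks. SDPA-based models
    #    return None; FlexAttention-based models return one or more BlockMasks.
    attention_masks = model.get_attention_masks(
        create_mask_fn=create_block_mask, batch=input_ids, eos_id=eos_id
    )
    # 2. The masks are passed explicitly through forward() instead of being
    #    stored as class-level state on an attention wrapper.
    return model(input_ids, attention_masks=attention_masks)
```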

File tree: 6 files changed, +323 -226 lines
torchtitan/experiments/llama4/model/model.py

Lines changed: 47 additions & 8 deletions
@@ -4,14 +4,21 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
+from typing import Callable
 
 import torch
 import torch.nn.functional as F
 from torch import nn
 
-from torchtitan.models.attention import build_attention
+from torchtitan.models.attention import (
+    AttentionOp,
+    get_causal_mask_mod,
+    get_document_mask_mod,
+    get_fixed_block_mask_mod,
+)
 from torchtitan.models.moe import MoE
-from torchtitan.protocols import ModelProtocol
+from torchtitan.protocols.model import AttentionMasksType
+from torchtitan.protocols.train_spec import ModelProtocol
 
 from .args import TransformerModelArgs
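`AttentionMasksType` is imported from `torchtitan.protocols.model`, whose definition is not part of this diff. From its usage in this file (a dict of BlockMasks under FlexAttention, or None under SDPA), it plausibly amounts to an alias along these lines (an assumption, not the actual source):

```python
# Assumed shape of the alias, inferred from how it is used in this file;
# the real definition lives in torchtitan/protocols/model.py and may differ.
from torch.nn.attention.flex_attention import BlockMask

AttentionMasksType = dict[str, BlockMask] | BlockMask | None
```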

@@ -155,9 +162,7 @@ def __init__(
         # values of these two variables.
         self.use_rope = use_rope
 
-        self.sdpa = build_attention(
-            model_args.use_flex_attn, model_args.attn_mask_type, fixed_block_size
-        )
+        self.attention_op = AttentionOp(model_args.use_flex_attn)
 
     def init_weights(self, init_std: float):
         for linear in (self.wq, self.wk, self.wv):
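The new `AttentionOp` replaces the old `build_attention()` wrapper pair. Its implementation (presumably in `torchtitan/models/attention.py`, per the import above) is not part of this hunk; below is a minimal sketch of what such a single op could look like, assuming it simply dispatches on `use_flex_attn`:

```python
# Hypothetical sketch of a single attention op; the real torchtitan
# AttentionOp may differ (e.g. it may wrap flex_attention in torch.compile).
import torch
import torch.nn.functional as F
from torch.nn.attention.flex_attention import BlockMask, flex_attention


class AttentionOp(torch.nn.Module):
    def __init__(self, use_flex_attn: bool) -> None:
        super().__init__()
        self.use_flex_attn = use_flex_attn

    def forward(
        self,
        q: torch.Tensor,
        k: torch.Tensor,
        v: torch.Tensor,
        attention_mask: BlockMask | None = None,
    ) -> torch.Tensor:
        if self.use_flex_attn:
            # FlexAttention consumes a BlockMask that the trainer built once.
            return flex_attention(q, k, v, block_mask=attention_mask)
        # SDPA path: no varlen/BlockMask support yet, so the mask is None and
        # we fall back to plain causal attention.
        return F.scaled_dot_product_attention(q, k, v, is_causal=True)
```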
@@ -168,6 +173,7 @@ def forward(
         self,
         x: torch.Tensor,
         freqs_cis: torch.Tensor,
+        attention_masks: AttentionMasksType,
     ):
         """
         Forward pass of the attention module.
@@ -202,7 +208,12 @@ def forward(
         xk = keys.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)
         xv = values.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)
 
-        output = self.sdpa(xq, xk, xv)
+        assert isinstance(attention_masks, dict) or attention_masks is None
+        if self.attention_op.use_flex_attn:
+            attention_mask = attention_masks["rope" if self.use_rope else "nope"]
+            output = self.attention_op(xq, xk, xv, attention_mask=attention_mask)
+        else:
+            output = self.attention_op(xq, xk, xv, attention_mask=None)
 
         output = output.transpose(
             1, 2
@@ -335,6 +346,7 @@ def forward(
         self,
         x: torch.Tensor,
         freqs_cis: torch.Tensor,
+        attention_masks: AttentionMasksType,
     ):
         """
         Perform a forward pass through the TransformerBlock.
@@ -347,7 +359,7 @@ def forward(
             torch.Tensor: Output tensor after applying attention and feedforward layers.
 
         """
-        h = x + self.attention(self.attention_norm(x), freqs_cis)
+        h = x + self.attention(self.attention_norm(x), freqs_cis, attention_masks)
         if self.moe_enabled:
             out = h + self.moe(self.ffn_norm(h))
         else:
@@ -447,9 +459,36 @@ def _precompute_freqs_cis(self) -> torch.Tensor:
             self.model_args.rope_theta,
         )
 
+    def get_attention_masks(
+        self, create_mask_fn: Callable, batch: torch.Tensor, eos_id: int
+    ) -> AttentionMasksType:
+        if not self.model_args.use_flex_attn:
+            return None
+
+        nope_mask_mod = get_causal_mask_mod()
+        match self.model_args.attn_mask_type:
+            case "causal":
+                B = 1
+            case "block_causal":
+                B = batch.shape[0]
+                rope_mask_mod = get_document_mask_mod(nope_mask_mod, batch, eos_id)
+            case _:
+                raise ValueError(f"Unknown attention mask type: {self.attn_mask_type}")
+
+        rope_mask_mod = get_fixed_block_mask_mod(
+            nope_mask_mod, self.model_args.fixed_attn_block_size
+        )
+
+        seqlen = batch.shape[1]
+        return {
+            "rope": create_mask_fn(rope_mask_mod, B, None, seqlen, seqlen),
+            "nope": create_mask_fn(nope_mask_mod, B, None, seqlen, seqlen),
+        }
+
     def forward(
         self,
         tokens: torch.Tensor,
+        attention_masks: AttentionMasksType,
         input_batch: torch.Tensor | None = None,
     ):
         """
@@ -473,7 +512,7 @@ def forward(
         h = self.tok_embeddings(tokens) if self.tok_embeddings else tokens
 
         for layer in self.layers.values():
-            h = layer(h, self.freqs_cis)
+            h = layer(h, self.freqs_cis, attention_masks)
 
         h = self.norm(h) if self.norm else h
         output = self.output(h) if self.output else h
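For intuition about the masks that `get_attention_masks()` builds above, here is an illustrative, hand-rolled sketch of the kinds of mask_mods involved (causal, document, and fixed-block) in raw FlexAttention terms. This is for exposition only; the torchtitan helpers `get_causal_mask_mod`, `get_document_mask_mod`, and `get_fixed_block_mask_mod` are the actual implementation and may differ in detail. `batch`, `eos_id`, and `block_size` are placeholder inputs:

```python
# Illustrative only: hand-rolled mask_mods of the kinds composed in
# get_attention_masks(); the torchtitan helpers encapsulate similar logic.
import torch
from torch.nn.attention.flex_attention import and_masks, create_block_mask

batch = torch.randint(1, 100, (2, 128))  # (B, seqlen) packed token ids, placeholder
eos_id, block_size = 0, 32


def causal(b, h, q_idx, kv_idx):
    return q_idx >= kv_idx


# Document mask: tokens only attend within their own document, where documents
# in the packed batch are delimited by EOS tokens.
doc_ids = (batch == eos_id).cumsum(dim=-1)


def same_document(b, h, q_idx, kv_idx):
    return doc_ids[b, q_idx] == doc_ids[b, kv_idx]


# Fixed-block ("chunked") mask: attention is confined to fixed-size chunks.
def same_block(b, h, q_idx, kv_idx):
    return (q_idx // block_size) == (kv_idx // block_size)


B, seqlen = batch.shape
block_causal_mask = create_block_mask(
    and_masks(causal, same_document), B, None, seqlen, seqlen, device="cpu"
)
chunked_causal_mask = create_block_mask(
    and_masks(causal, same_block), B, None, seqlen, seqlen, device="cpu"
)
```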
