Commit 4389efd

Remove caching for attention masks (#2117)
We remove the `lru_cache` for attention masks because, in the `get_attention_mask()` function, `and_masks(*mask_mods)` returns a new callable (a different object id) on every call. Since `create_attention_mask` uses all of its arguments as the cache key, the fresh object id always causes a cache miss, so the cache never hits.

Before the change (llama3 debugmodel_flex_attn):
[Screenshot 2025-12-09 at 1.27.45 PM: https://github.com/user-attachments/assets/e9af2597-9d94-4478-8136-8b9b8c35d9e6]

After the change:
[Screenshot 2025-12-09 at 1.29.56 PM: https://github.com/user-attachments/assets/756a7d09-b47f-434f-8ff6-40098b265a03]
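For illustration, here is a minimal, self-contained sketch of that failure mode. It is not torchtitan code: `compose_masks` and `cached_create_mask` are made-up stand-ins for `and_masks` and the previously cached `create_attention_mask`.

```python
# Minimal sketch of the cache-miss behavior described above.
# `compose_masks` and `cached_create_mask` are hypothetical stand-ins, not torchtitan APIs.
import functools


def compose_masks(*mask_mods):
    """Like flex_attention.and_masks: returns a NEW callable on every call."""

    def combined(b, h, q_idx, kv_idx):
        out = mask_mods[0](b, h, q_idx, kv_idx)
        for mod in mask_mods[1:]:
            out = out & mod(b, h, q_idx, kv_idx)
        return out

    return combined


@functools.lru_cache(4)
def cached_create_mask(mask_mod, seq_len):
    # lru_cache keys on every argument; a distinct callable object means a distinct key.
    print(f"cache miss (mask_mod id={id(mask_mod)}, seq_len={seq_len})")
    return (mask_mod, seq_len)  # placeholder for the real BlockMask


def causal(b, h, q_idx, kv_idx):
    return q_idx >= kv_idx


for _ in range(3):
    # Each iteration composes a new callable, so the cache never hits.
    cached_create_mask(compose_masks(causal), 128)
```

After this change, `create_attention_mask` is just a thin wrapper around the `torch.compile`d `create_block_mask`, with no caching layer in between.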
1 parent a632855 commit 4389efd

File tree: 1 file changed (+8, -17 lines)

torchtitan/models/attention.py: 8 additions & 17 deletions
@@ -6,7 +6,6 @@
 #
 # Copyright (c) Meta Platforms, Inc. All Rights Reserved.
 
-import functools
 from collections.abc import Callable
 from typing import ClassVar, NamedTuple
 
@@ -171,22 +170,19 @@ def forward(
         return F.scaled_dot_product_attention(q, k, v, scale=scale, is_causal=True)
 
 
-# We cannot do inner function/closure because we won't be able to cache it --
-# if we an inner function, a new closure will be created every time
-# `get_causal_mask_mod` is called.
-def _causal_mask(
-    b: torch.Tensor, h: torch.Tensor, q_idx: torch.Tensor, kv_idx: torch.Tensor
-) -> torch.Tensor:
-    """Causal mask that prevents attention to future tokens."""
-    return q_idx >= kv_idx
-
-
 def get_causal_mask_mod() -> _mask_mod_signature:
     """Returns a causal mask modifier for flex attention.
 
     Returns:
         A mask modifier function that implements causal masking.
     """
+
+    def _causal_mask(
+        b: torch.Tensor, h: torch.Tensor, q_idx: torch.Tensor, kv_idx: torch.Tensor
+    ) -> torch.Tensor:
+        """Causal mask that prevents attention to future tokens."""
+        return q_idx >= kv_idx
+
     return _causal_mask
 
 
@@ -275,13 +271,8 @@ def sliding_window_mod(
 _compiled_create_block_mask = torch.compile(create_block_mask)
 
 
-@functools.lru_cache(4)
 def create_attention_mask(*args, **kwargs):
-    """Create an attention mask using compiled create_block_mask.
-
-    This function is cached to avoid recreating BlockMasks for the same
-    arguments.
-    """
+    """Create an attention mask using compiled create_block_mask."""
     return _compiled_create_block_mask(*args, **kwargs)
 
 
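For context, here is a rough usage sketch of how a mask mod like the one returned by `get_causal_mask_mod()` feeds into FlexAttention's block-mask construction. It is a simplified illustration, not the torchtitan call site; `and_masks` and `create_block_mask` are existing `torch.nn.attention.flex_attention` APIs in recent PyTorch releases, while the surrounding wiring is assumed. With the identity-keyed cache gone, returning a fresh closure from `get_causal_mask_mod()` on each call is harmless.

```python
# Simplified usage sketch, not the torchtitan call site.
import torch
from torch.nn.attention.flex_attention import and_masks, create_block_mask


def get_causal_mask_mod():
    # Returning a fresh closure per call is fine now that nothing caches on its identity.
    def _causal_mask(b, h, q_idx, kv_idx):
        return q_idx >= kv_idx

    return _causal_mask


mask_mod = and_masks(get_causal_mask_mod())  # composes one or more mask mods into one
block_mask = create_block_mask(mask_mod, B=None, H=None, Q_LEN=128, KV_LEN=128, device="cpu")
print(block_mask)
```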