
Commit 70e5920

[CP][RFC] Enable FlexCP for llama3 with parallelize_module
Similar to #1696, but this PR uses parallelize_module in the same way TP/SP do. It also requires pytorch/pytorch#162542.
1 parent bd3850b commit 70e5920
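
In outline, the change wraps the compiled flex_attention call in a small nn.Module so that context parallelism can be attached with parallelize_module, the same entry point used for TP/SP, instead of the context-manager approach in #1696. The sketch below pulls the pieces of the three-file diff together; it is a minimal illustration, assuming the experimental _ContextParallel / _DispatchMode APIs from pytorch/pytorch#162542 are available and the world mesh has a "cp" dimension. The helper name apply_flex_cp is hypothetical; the module path block.attention.sdpa.attention_fn_wrapper comes from the diff below.

import torch
from torch.distributed.tensor.experimental import _attention as dtensor_attention
from torch.distributed.tensor.experimental._attention import _ContextParallel, _DispatchMode
from torch.distributed.tensor.parallel import parallelize_module
from torch.nn.attention.flex_attention import flex_attention


class FlexAttentionWrapper(torch.nn.Module):
    # One compiled flex_attention shared by every attention layer.
    _flex_attn = torch.compile(flex_attention, mode="max-autotune-no-cudagraphs")

    def forward(self, *args, **kwargs):
        return FlexAttentionWrapper._flex_attn(*args, **kwargs)


def apply_flex_cp(model, cp_mesh):
    # Route CP through the module wrapper, then register each layer's wrapper
    # with the CP plan, mirroring how TP/SP plans are applied.
    dtensor_attention._dispatch_mode = _DispatchMode.MODULE_WRAPPER
    for block in model.layers.values():
        parallelize_module(
            module=block.attention.sdpa.attention_fn_wrapper,
            device_mesh=cp_mesh,
            parallelize_plan=_ContextParallel(
                seq_dim=2,  # q/k/v are [batch, heads, seq_len, head_dim]
                attention_type=_ContextParallel.AttentionType.FLEX,
            ),
        )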

File tree

3 files changed: +41, -15 lines

torchtitan/models/attention.py

Lines changed: 28 additions & 6 deletions
@@ -15,6 +15,7 @@
 from torch.nn.attention import sdpa_kernel, SDPBackend
 from torch.nn.attention.flex_attention import (
     _mask_mod_signature,
+    AuxOutput,
     BlockMask,
     create_block_mask,
     flex_attention,

@@ -28,6 +29,26 @@
 FLEX_ATTN_MASK_T = tuple[str, int | None]


+class FlexAttentionWrapper(torch.nn.Module):
+    _flex_attn: ClassVar[Callable] = torch.compile(
+        flex_attention, mode="max-autotune-no-cudagraphs"
+    )
+
+    def __init__(self) -> None:
+        super().__init__()
+
+    def forward(self, *args: object, **kwargs: object) -> (
+        torch.Tensor | tuple[torch.Tensor, torch.Tensor]
+        | tuple[torch.Tensor, AuxOutput]
+    ):
+        # 1. _flex_attn has to be a class variable; otherwise there will
+        #    be multiple compiled flex_attention instances, which can be slow.
+        # 2. `self._flex_attn` is not correct: `self` will be passed in
+        #    as the first argument, which will cause an error.
+        #    `FlexAttentionWrapper._flex_attn` is correct.
+        return FlexAttentionWrapper._flex_attn(*args, **kwargs)
+
+
 class FlexAttention(torch.nn.Module):
     """FlexAttention module that uses torch.nn.attention.flex_attention.

@@ -46,11 +67,6 @@ class FlexAttention(torch.nn.Module):
         to the keys within the same block.
     """

-    # We registered flex_attention related attributes as class variables as we
-    # need to amortize the cost of compilation.
-    flex_attn: ClassVar[Callable] = torch.compile(
-        flex_attention, mode="max-autotune-no-cudagraphs"
-    )
     compiled_create_block_mask: ClassVar[Callable] = torch.compile(create_block_mask)
     used_attn_mask_types: ClassVar[set[FLEX_ATTN_MASK_T]] = set()
     # Attention mask type to the created BlockMask.

@@ -71,6 +87,7 @@ def __init__(
             raise ValueError(f"Unrecognized attn_mask_type {attn_mask_type}.")
         self.attn_mask_type = attn_mask_type
         self.fixed_block_size = fixed_block_size
+        self.attention_fn_wrapper = FlexAttentionWrapper()

         FlexAttention.used_attn_mask_types.add(self.mask_key)

@@ -86,7 +103,7 @@ def forward(
         scale: float | None = None,
     ) -> torch.Tensor:
         block_mask = FlexAttention.block_masks[self.mask_key]
-        return FlexAttention.flex_attn(q, k, v, block_mask=block_mask, scale=scale)
+        return self.attention_fn_wrapper(q, k, v, block_mask=block_mask, scale=scale)

     @staticmethod
     def _get_causal_mask_mod() -> _mask_mod_signature:

@@ -251,6 +268,11 @@ def init_attention_mask(
     # while we continue debugging accuracy issues. However, we want to evaluate
     # the user experience with CP enabled.
     if cp_mesh is not None:
+        from torch.distributed.tensor.experimental._attention import _DispatchMode
+
+        torch.distributed.tensor.experimental._attention._dispatch_mode = (
+            _DispatchMode.MODULE_WRAPPER
+        )
         FlexAttention.compiled_create_block_mask = functools.partial(
             create_cp_block_mask, device_mesh=cp_mesh
         )
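
A note on the inline comments in FlexAttentionWrapper.forward: looking the compiled function up through `self` would pass `self` as the first argument (ordinary Python method binding on a function-valued class attribute), which is why the wrapper calls it through the class instead. A tiny, hypothetical illustration of that binding pitfall (not code from this PR):

def add(a, b):
    return a + b


class Holder:
    _fn = add  # plain function stored as a class attribute

    def bad(self, a, b):
        # self._fn binds like a method: this calls add(self, a, b) -> TypeError.
        return self._fn(a, b)

    def good(self, a, b):
        # Looking it up on the class avoids binding, as FlexAttentionWrapper does.
        return Holder._fn(a, b)


h = Holder()
print(h.good(1, 2))  # 3
# h.bad(1, 2) raises: add() takes 2 positional arguments but 3 were given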

torchtitan/models/llama3/infra/parallelize.py

Lines changed: 13 additions & 2 deletions
@@ -14,6 +14,8 @@
 from torch.distributed.device_mesh import DeviceMesh
 from torch.distributed.fsdp import CPUOffloadPolicy, fully_shard, MixedPrecisionPolicy
 from torch.distributed.tensor import Replicate, Shard
+
+from torch.distributed.tensor.experimental._attention import _ContextParallel
 from torch.distributed.tensor.parallel import (
     ColwiseParallel,
     parallelize_module,

@@ -67,8 +69,6 @@ def parallelize_llama(
     """

     use_flex_attn = getattr(model.model_args, "use_flex_attn", False)
-    if job_config.parallelism.context_parallel_degree > 1 and use_flex_attn:
-        raise NotImplementedError("CP support for FlexAttention is still in progress.")

     if parallel_dims.tp_enabled:
         enable_float8_linear = "float8" in job_config.model.converters

@@ -90,6 +90,17 @@
         )
         maybe_enable_async_tp(job_config, world_mesh["tp"])

+    if parallel_dims.cp_enabled:
+        for block in model.layers.values():
+            parallelize_module(
+                module=block.attention.sdpa.attention_fn_wrapper,
+                device_mesh=world_mesh["cp"],
+                parallelize_plan=_ContextParallel(
+                    seq_dim=2,
+                    attention_type=_ContextParallel.AttentionType.FLEX,
+                ),
+            )
+
     model_compile_enabled = (
         job_config.compile.enable and "model" in job_config.compile.components
     )
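
A note on seq_dim=2: flex_attention takes q/k/v in [batch, heads, seq_len, head_dim] layout, so dimension 2 is the sequence dimension that the _ContextParallel plan shards across the "cp" mesh. A small shape sketch with illustrative numbers, assuming that layout:

import torch

bs, n_heads, seq_len, head_dim = 2, 8, 8192, 64
cp_degree = 4  # size of the "cp" mesh dimension

q = torch.randn(bs, n_heads, seq_len, head_dim)
# dim 0: batch, dim 1: heads, dim 2: sequence, dim 3: head_dim
# With _ContextParallel(seq_dim=2, ...), each CP rank holds a
# seq_len // cp_degree slice of the sequence dimension locally.
local_seq_len = q.shape[2] // cp_degree
assert local_seq_len == 2048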

torchtitan/models/llama3/model/args.py

Lines changed: 0 additions & 7 deletions
@@ -45,13 +45,6 @@ def update_from_config(self, job_config: JobConfig, **kwargs) -> None:
         )
         self.max_seq_len = seq_len

-        if job_config.parallelism.context_parallel_degree > 1 and self.use_flex_attn:
-            raise NotImplementedError(
-                "CP support for FlexAttention is still in progress."
-            )
-
-        self.max_seq_len = seq_len
-
     def get_nparams_and_flops(self, model: nn.Module, seq_len: int) -> tuple[int, int]:
         nparams = sum(p.numel() for p in model.parameters())
         nparams_embedding = sum(
