@@ -564,7 +564,6 @@ def benchmark(func, *args, **kwargs):
 #
 # * Cross Attention
 # * Fully masked rows no longer cause NaNs
-# * Modifying attention score: ALiBi with FlexAttention and NJT
 # * Packed Projection
 
 ###############################################################################
@@ -668,66 +667,6 @@ def benchmark(func, *args, **kwargs):
 # appropriately makes it possible to properly express empty sequences.
 
 
-################################################################################
-# FlexAttention + NJT
-# ---------------------------------------------------------------------
-# NJT also composes with the ``FlexAttention`` module. This is a generalization
-# of the ``MultiheadAttention`` layer that allows for arbitrary modifications
-# to the attention score. The example below takes the ``alibi_mod``
-# that implements `ALiBi <https://arxiv.org/abs/2108.12409>`_ from
-# `attention gym <https://github.com/meta-pytorch/attention-gym>`_ and uses it
-# with nested input tensors.
-
-from torch.nn.attention.flex_attention import flex_attention
-
-
-def generate_alibi_bias(H: int):
-    """Returns an alibi bias score_mod given the number of heads H
-    Args:
-        H: number of heads
-    Returns:
-        alibi_bias: alibi bias score_mod
-    """
-
-    def alibi_mod(score, b, h, q_idx, kv_idx):
-        scale = torch.exp2(-((h + 1) * 8.0 / H))
-        bias = (q_idx - kv_idx) * scale
-        return score + bias
-
-    return alibi_mod
-
-
-query, key, value, _ = gen_batch(N, E_q, E_k, E_v, device)
-n_heads, D = 8, E_q // 8
-alibi_score_mod = generate_alibi_bias(n_heads)
-query = query.unflatten(-1, [n_heads, D]).transpose(1, 2).detach().requires_grad_()
-key = key.unflatten(-1, [n_heads, D]).transpose(1, 2).detach().requires_grad_()
-value = value.unflatten(-1, [n_heads, D]).transpose(1, 2).detach().requires_grad_()
-out_flex2 = flex_attention(query, key, value, score_mod=alibi_score_mod)
-
-###############################################################################
-# In addition, one can also use the ``block_mask`` utility of ``FlexAttention``
-# with NJTs via the ``create_nested_block_mask`` function. This is useful for
-# taking advantage of the sparsity of the mask to speed up the attention computation.
-# In particular, the function creates a sparse block mask for a "stacked sequence" of all
-# the variable length sequences in the NJT combined into one, while properly masking out
-# inter-sequence attention. In the following example, we show how to create a
-# causal block mask using this utility.
-
-from torch.nn.attention.flex_attention import create_nested_block_mask
-
-
-def causal_mask(b, h, q_idx, kv_idx):
-    return q_idx >= kv_idx
-
-
-query, key, value, _ = gen_batch(N, E_q, E_k, E_v, device)
-block_mask = create_nested_block_mask(causal_mask, 1, 1, query, _compile=True)
-query = query.unflatten(-1, [n_heads, D]).transpose(1, 2).detach().requires_grad_()
-key = key.unflatten(-1, [n_heads, D]).transpose(1, 2).detach().requires_grad_()
-value = value.unflatten(-1, [n_heads, D]).transpose(1, 2).detach().requires_grad_()
-out_flex = flex_attention(query, key, value, block_mask=block_mask)
-
 ###############################################################################
 # Packed Projection
 # -----------------