comfyanonymous · kijai · Sep 30, 2025 · Oct 3, 2025 · Oct 3, 2025 · Oct 3, 2025
@@ -87,7 +87,7 @@ def qkv_fn_k(x):
         )
 
         x = self.o(x)
-        return x
+        return x, q, k
 
 
 class WanT2VCrossAttention(WanSelfAttention):
@@ -225,14 +225,16 @@ def forward(
         """
         # assert e.dtype == torch.float32
 
+        patches = transformer_options.get("patches", {})
+
         if e.ndim < 4:
             e = (comfy.model_management.cast_to(self.modulation, dtype=x.dtype, device=x.device) + e).chunk(6, dim=1)
         else:
             e = (comfy.model_management.cast_to(self.modulation, dtype=x.dtype, device=x.device).unsqueeze(0) + e).unbind(2)
         # assert e[0].dtype == torch.float32
 
         # self-attention
-        y = self.self_attn(
+        y, q, k = self.self_attn(
             torch.addcmul(repeat_e(e[0], x), self.norm1(x), 1 + repeat_e(e[1], x)),
             freqs, transformer_options=transformer_options)
 
@@ -241,6 +243,11 @@ def forward(
 
         # cross-attention & ffn
         x = x + self.cross_attn(self.norm3(x), context, context_img_len=context_img_len, transformer_options=transformer_options)
+
+        if "cross_attn" in patches:
+            for p in patches["cross_attn"]:
+                x = x + p({"x": x, "q": q, "k": k, "transformer_options": transformer_options})
+
         y = self.ffn(torch.addcmul(repeat_e(e[3], x), self.norm2(x), 1 + repeat_e(e[4], x)))
         x = torch.addcmul(x, y, repeat_e(e[5], x))
         return x
@@ -487,7 +494,7 @@ def __init__(self,
         self.blocks = nn.ModuleList([
             wan_attn_block_class(cross_attn_type, dim, ffn_dim, num_heads,
                                  window_size, qk_norm, cross_attn_norm, eps, operation_settings=operation_settings)
-            for _ in range(num_layers)
+            for i in range(num_layers)
         ])
 
         # head
@@ -540,6 +547,7 @@ def forward_orig(
         # embeddings
         x = self.patch_embedding(x.float()).to(x.dtype)
         grid_sizes = x.shape[2:]
+        transformer_options["grid_sizes"] = grid_sizes
         x = x.flatten(2).transpose(1, 2)
 
         # time embeddings
@@ -568,6 +576,7 @@ def forward_orig(
         patches_replace = transformer_options.get("patches_replace", {})
         blocks_replace = patches_replace.get("dit", {})
         for i, block in enumerate(self.blocks):
+            transformer_options["block_idx"] = i
             if ("double_block", i) in blocks_replace:
                 def block_wrap(args):
                     out = {}
@@ -734,6 +743,7 @@ def forward_orig(
         # embeddings
         x = self.patch_embedding(x.float()).to(x.dtype)
         grid_sizes = x.shape[2:]
+        transformer_options["grid_sizes"] = grid_sizes
         x = x.flatten(2).transpose(1, 2)
 
         # time embeddings
@@ -763,6 +773,7 @@ def forward_orig(
         patches_replace = transformer_options.get("patches_replace", {})
         blocks_replace = patches_replace.get("dit", {})
         for i, block in enumerate(self.blocks):
+            transformer_options["block_idx"] = i
             if ("double_block", i) in blocks_replace:
                 def block_wrap(args):
                     out = {}