code · pull · Dec 3, 2025 · Dec 3, 2025 · Dec 3, 2025 · Dec 3, 2025
diff --git a/comfy/ldm/hunyuan_video/upsampler.py b/comfy/ldm/hunyuan_video/upsampler.py
@@ -1,7 +1,8 @@
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
-from comfy.ldm.hunyuan_video.vae_refiner import RMS_norm, ResnetBlock, VideoConv3d
+from comfy.ldm.modules.diffusionmodules.model import ResnetBlock, VideoConv3d
+from comfy.ldm.hunyuan_video.vae_refiner import RMS_norm
 import model_management, model_patcher
 
 class SRResidualCausalBlock3D(nn.Module):

diff --git a/comfy/ldm/hunyuan_video/vae_refiner.py b/comfy/ldm/hunyuan_video/vae_refiner.py
@@ -1,42 +1,12 @@
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
-from comfy.ldm.modules.diffusionmodules.model import ResnetBlock, AttnBlock, VideoConv3d, Normalize
+from comfy.ldm.modules.diffusionmodules.model import ResnetBlock, AttnBlock, CarriedConv3d, Normalize, conv_carry_causal_3d, torch_cat_if_needed
 import comfy.ops
 import comfy.ldm.models.autoencoder
 import comfy.model_management
 ops = comfy.ops.disable_weight_init
 
-class NoPadConv3d(nn.Module):
-    def __init__(self, n_channels, out_channels, kernel_size, stride=1, dilation=1, padding=0, **kwargs):
-        super().__init__()
-        self.conv = ops.Conv3d(n_channels, out_channels, kernel_size, stride=stride, dilation=dilation, **kwargs)
-
-    def forward(self, x):
-        return self.conv(x)
-
-
-def conv_carry_causal_3d(xl, op, conv_carry_in=None, conv_carry_out=None):
-
-    x = xl[0]
-    xl.clear()
-
-    if conv_carry_out is not None:
-        to_push = x[:, :, -2:, :, :].clone()
-        conv_carry_out.append(to_push)
-
-    if isinstance(op, NoPadConv3d):
-        if conv_carry_in is None:
-            x = torch.nn.functional.pad(x, (1, 1, 1, 1, 2, 0), mode = 'replicate')
-        else:
-            carry_len = conv_carry_in[0].shape[2]
-            x = torch.cat([conv_carry_in.pop(0), x], dim=2)
-            x = torch.nn.functional.pad(x, (1, 1, 1, 1, 2 - carry_len, 0), mode = 'replicate')
-
-    out = op(x)
-
-    return out
-
 
 class RMS_norm(nn.Module):
     def __init__(self, dim):
@@ -49,7 +19,7 @@ def forward(self, x):
         return F.normalize(x, dim=1) * self.scale * comfy.model_management.cast_to(self.gamma, dtype=x.dtype, device=x.device)
 
 class DnSmpl(nn.Module):
-    def __init__(self, ic, oc, tds=True, refiner_vae=True, op=VideoConv3d):
+    def __init__(self, ic, oc, tds, refiner_vae, op):
         super().__init__()
         fct = 2 * 2 * 2 if tds else 1 * 2 * 2
         assert oc % fct == 0
@@ -109,7 +79,7 @@ def forward(self, x, conv_carry_in=None, conv_carry_out=None):
 
 
 class UpSmpl(nn.Module):
-    def __init__(self, ic, oc, tus=True, refiner_vae=True, op=VideoConv3d):
+    def __init__(self, ic, oc, tus, refiner_vae, op):
         super().__init__()
         fct = 2 * 2 * 2 if tus else 1 * 2 * 2
         self.conv = op(ic, oc * fct, kernel_size=3, stride=1, padding=1)
@@ -163,23 +133,6 @@ def forward(self, x, conv_carry_in=None, conv_carry_out=None):
 
         return h + x
 
-class HunyuanRefinerResnetBlock(ResnetBlock):
-    def __init__(self, in_channels, out_channels, conv_op=NoPadConv3d, norm_op=RMS_norm):
-        super().__init__(in_channels=in_channels, out_channels=out_channels, temb_channels=0, conv_op=conv_op, norm_op=norm_op)
-
-    def forward(self, x, conv_carry_in=None, conv_carry_out=None):
-        h = x
-        h = [ self.swish(self.norm1(x)) ]
-        h = conv_carry_causal_3d(h, self.conv1, conv_carry_in=conv_carry_in, conv_carry_out=conv_carry_out)
-
-        h = [ self.dropout(self.swish(self.norm2(h))) ]
-        h = conv_carry_causal_3d(h, self.conv2, conv_carry_in=conv_carry_in, conv_carry_out=conv_carry_out)
-
-        if self.in_channels != self.out_channels:
-            x = self.nin_shortcut(x)
-
-        return x+h
-
 class Encoder(nn.Module):
     def __init__(self, in_channels, z_channels, block_out_channels, num_res_blocks,
                  ffactor_spatial, ffactor_temporal, downsample_match_channel=True, refiner_vae=True, **_):
@@ -191,7 +144,7 @@ def __init__(self, in_channels, z_channels, block_out_channels, num_res_blocks,
 
         self.refiner_vae = refiner_vae
         if self.refiner_vae:
-            conv_op = NoPadConv3d
+            conv_op = CarriedConv3d
             norm_op = RMS_norm
         else:
             conv_op = ops.Conv3d
@@ -206,9 +159,10 @@ def __init__(self, in_channels, z_channels, block_out_channels, num_res_blocks,
 
         for i, tgt in enumerate(block_out_channels):
             stage = nn.Module()
-            stage.block = nn.ModuleList([HunyuanRefinerResnetBlock(in_channels=ch if j == 0 else tgt,
-                                                                   out_channels=tgt,
-                                                                   conv_op=conv_op, norm_op=norm_op)
+            stage.block = nn.ModuleList([ResnetBlock(in_channels=ch if j == 0 else tgt,
+                                                     out_channels=tgt,
+                                                     temb_channels=0,
+                                                     conv_op=conv_op, norm_op=norm_op)
                                         for j in range(num_res_blocks)])
             ch = tgt
             if i < depth:
@@ -218,9 +172,9 @@ def __init__(self, in_channels, z_channels, block_out_channels, num_res_blocks,
             self.down.append(stage)
 
         self.mid = nn.Module()
-        self.mid.block_1 = HunyuanRefinerResnetBlock(in_channels=ch, out_channels=ch, conv_op=conv_op, norm_op=norm_op)
+        self.mid.block_1 = ResnetBlock(in_channels=ch, out_channels=ch, conv_op=conv_op, norm_op=norm_op)
         self.mid.attn_1 = AttnBlock(ch, conv_op=ops.Conv3d, norm_op=norm_op)
-        self.mid.block_2 = HunyuanRefinerResnetBlock(in_channels=ch, out_channels=ch, conv_op=conv_op, norm_op=norm_op)
+        self.mid.block_2 = ResnetBlock(in_channels=ch, out_channels=ch, conv_op=conv_op, norm_op=norm_op)
 
         self.norm_out = norm_op(ch)
         self.conv_out = conv_op(ch, z_channels << 1, 3, 1, 1)
@@ -246,22 +200,20 @@ def forward(self, x):
             conv_carry_out = []
             if i == len(x) - 1:
                 conv_carry_out = None
+
             x1 = [ x1 ]
             x1 = conv_carry_causal_3d(x1, self.conv_in, conv_carry_in, conv_carry_out)
 
             for stage in self.down:
                 for blk in stage.block:
-                    x1 = blk(x1, conv_carry_in, conv_carry_out)
+                    x1 = blk(x1, None, conv_carry_in, conv_carry_out)
                 if hasattr(stage, 'downsample'):
                     x1 = stage.downsample(x1, conv_carry_in, conv_carry_out)
 
             out.append(x1)
             conv_carry_in = conv_carry_out
 
-        if len(out) > 1:
-            out = torch.cat(out, dim=2)
-        else:
-            out = out[0]
+        out = torch_cat_if_needed(out, dim=2)
 
         x = self.mid.block_2(self.mid.attn_1(self.mid.block_1(out)))
         del out
@@ -288,7 +240,7 @@ def __init__(self, z_channels, out_channels, block_out_channels, num_res_blocks,
 
         self.refiner_vae = refiner_vae
         if self.refiner_vae:
-            conv_op = NoPadConv3d
+            conv_op = CarriedConv3d
             norm_op = RMS_norm
         else:
             conv_op = ops.Conv3d
@@ -298,19 +250,20 @@ def __init__(self, z_channels, out_channels, block_out_channels, num_res_blocks,
         self.conv_in = conv_op(z_channels, ch, kernel_size=3, stride=1, padding=1)
 
         self.mid = nn.Module()
-        self.mid.block_1 = HunyuanRefinerResnetBlock(in_channels=ch, out_channels=ch, conv_op=conv_op, norm_op=norm_op)
+        self.mid.block_1 = ResnetBlock(in_channels=ch, out_channels=ch, conv_op=conv_op, norm_op=norm_op)
         self.mid.attn_1 = AttnBlock(ch, conv_op=ops.Conv3d, norm_op=norm_op)
-        self.mid.block_2 = HunyuanRefinerResnetBlock(in_channels=ch, out_channels=ch,  conv_op=conv_op, norm_op=norm_op)
+        self.mid.block_2 = ResnetBlock(in_channels=ch, out_channels=ch,  conv_op=conv_op, norm_op=norm_op)
 
         self.up = nn.ModuleList()
         depth = (ffactor_spatial >> 1).bit_length()
         depth_temporal = (ffactor_temporal >> 1).bit_length()
 
         for i, tgt in enumerate(block_out_channels):
             stage = nn.Module()
-            stage.block = nn.ModuleList([HunyuanRefinerResnetBlock(in_channels=ch if j == 0 else tgt,
-                                                                   out_channels=tgt,
-                                                                   conv_op=conv_op, norm_op=norm_op)
+            stage.block = nn.ModuleList([ResnetBlock(in_channels=ch if j == 0 else tgt,
+                                                     out_channels=tgt,
+                                                     temb_channels=0,
+                                                     conv_op=conv_op, norm_op=norm_op)
                                         for j in range(num_res_blocks + 1)])
             ch = tgt
             if i < depth:
@@ -340,7 +293,7 @@ def forward(self, z):
                 conv_carry_out = None
             for stage in self.up:
                 for blk in stage.block:
-                    x1 = blk(x1, conv_carry_in, conv_carry_out)
+                    x1 = blk(x1, None, conv_carry_in, conv_carry_out)
                 if hasattr(stage, 'upsample'):
                     x1 = stage.upsample(x1, conv_carry_in, conv_carry_out)
 
@@ -350,10 +303,7 @@ def forward(self, z):
             conv_carry_in = conv_carry_out
         del x
 
-        if len(out) > 1:
-            out = torch.cat(out, dim=2)
-        else:
-            out = out[0]
+        out = torch_cat_if_needed(out, dim=2)
 
         if not self.refiner_vae:
             if z.shape[-3] == 1:

diff --git a/comfy/ldm/lumina/controlnet.py b/comfy/ldm/lumina/controlnet.py
@@ -0,0 +1,113 @@
+import torch
+from torch import nn
+
+from .model import JointTransformerBlock
+
+class ZImageControlTransformerBlock(JointTransformerBlock):
+    def __init__(
+        self,
+        layer_id: int,
+        dim: int,
+        n_heads: int,
+        n_kv_heads: int,
+        multiple_of: int,
+        ffn_dim_multiplier: float,
+        norm_eps: float,
+        qk_norm: bool,
+        modulation=True,
+        block_id=0,
+        operation_settings=None,
+    ):
+        super().__init__(layer_id, dim, n_heads, n_kv_heads, multiple_of, ffn_dim_multiplier, norm_eps, qk_norm, modulation, z_image_modulation=True, operation_settings=operation_settings)
+        self.block_id = block_id
+        if block_id == 0:
+            self.before_proj = operation_settings.get("operations").Linear(self.dim, self.dim, device=operation_settings.get("device"), dtype=operation_settings.get("dtype"))
+        self.after_proj = operation_settings.get("operations").Linear(self.dim, self.dim, device=operation_settings.get("device"), dtype=operation_settings.get("dtype"))
+
+    def forward(self, c, x, **kwargs):
+        if self.block_id == 0:
+            c = self.before_proj(c) + x
+        c = super().forward(c, **kwargs)
+        c_skip = self.after_proj(c)
+        return c_skip, c
+
+class ZImage_Control(torch.nn.Module):
+    def __init__(
+        self,
+        dim: int = 3840,
+        n_heads: int = 30,
+        n_kv_heads: int = 30,
+        multiple_of: int = 256,
+        ffn_dim_multiplier: float = (8.0 / 3.0),
+        norm_eps: float = 1e-5,
+        qk_norm: bool = True,
+        dtype=None,
+        device=None,
+        operations=None,
+        **kwargs
+    ):
+        super().__init__()
+        operation_settings = {"operations": operations, "device": device, "dtype": dtype}
+
+        self.additional_in_dim = 0
+        self.control_in_dim = 16
+        n_refiner_layers = 2
+        self.n_control_layers = 6
+        self.control_layers = nn.ModuleList(
+            [
+                ZImageControlTransformerBlock(
+                    i,
+                    dim,
+                    n_heads,
+                    n_kv_heads,
+                    multiple_of,
+                    ffn_dim_multiplier,
+                    norm_eps,
+                    qk_norm,
+                    block_id=i,
+                    operation_settings=operation_settings,
+                )
+                for i in range(self.n_control_layers)
+            ]
+        )
+
+        all_x_embedder = {}
+        patch_size = 2
+        f_patch_size = 1
+        x_embedder = operations.Linear(f_patch_size * patch_size * patch_size * self.control_in_dim, dim, bias=True, device=device, dtype=dtype)
+        all_x_embedder[f"{patch_size}-{f_patch_size}"] = x_embedder
+
+        self.control_all_x_embedder = nn.ModuleDict(all_x_embedder)
+        self.control_noise_refiner = nn.ModuleList(
+            [
+                JointTransformerBlock(
+                    layer_id,
+                    dim,
+                    n_heads,
+                    n_kv_heads,
+                    multiple_of,
+                    ffn_dim_multiplier,
+                    norm_eps,
+                    qk_norm,
+                    modulation=True,
+                    z_image_modulation=True,
+                    operation_settings=operation_settings,
+                )
+                for layer_id in range(n_refiner_layers)
+            ]
+        )
+
+    def forward(self, cap_feats, control_context, x_freqs_cis, adaln_input):
+        patch_size = 2
+        f_patch_size = 1
+        pH = pW = patch_size
+        B, C, H, W = control_context.shape
+        control_context = self.control_all_x_embedder[f"{patch_size}-{f_patch_size}"](control_context.view(B, C, H // pH, pH, W // pW, pW).permute(0, 2, 4, 3, 5, 1).flatten(3).flatten(1, 2))
+
+        x_attn_mask = None
+        for layer in self.control_noise_refiner:
+            control_context = layer(control_context, x_attn_mask, x_freqs_cis[:control_context.shape[0], :control_context.shape[1]], adaln_input)
+        return control_context
+
+    def forward_control_block(self, layer_id, control_context, x, x_attn_mask, x_freqs_cis, adaln_input):
+        return self.control_layers[layer_id](control_context, x, x_mask=x_attn_mask, freqs_cis=x_freqs_cis[:control_context.shape[0], :control_context.shape[1]], adaln_input=adaln_input)
diff --git a/comfy/ldm/lumina/model.py b/comfy/ldm/lumina/model.py
@@ -568,7 +568,7 @@ def forward(self, x, timesteps, context, num_tokens, attention_mask=None, **kwar
         ).execute(x, timesteps, context, num_tokens, attention_mask, **kwargs)
 
     # def forward(self, x, t, cap_feats, cap_mask):
-    def _forward(self, x, timesteps, context, num_tokens, attention_mask=None, **kwargs):
+    def _forward(self, x, timesteps, context, num_tokens, attention_mask=None, transformer_options={}, **kwargs):
         t = 1.0 - timesteps
         cap_feats = context
         cap_mask = attention_mask
@@ -585,16 +585,24 @@ def _forward(self, x, timesteps, context, num_tokens, attention_mask=None, **kwa
 
         cap_feats = self.cap_embedder(cap_feats)  # (N, L, D)  # todo check if able to batchify w.o. redundant compute
 
+        patches = transformer_options.get("patches", {})
         transformer_options = kwargs.get("transformer_options", {})
         x_is_tensor = isinstance(x, torch.Tensor)
-        x, mask, img_size, cap_size, freqs_cis = self.patchify_and_embed(x, cap_feats, cap_mask, t, num_tokens, transformer_options=transformer_options)
-        freqs_cis = freqs_cis.to(x.device)
-
-        for layer in self.layers:
-            x = layer(x, mask, freqs_cis, adaln_input, transformer_options=transformer_options)
-
-        x = self.final_layer(x, adaln_input)
-        x = self.unpatchify(x, img_size, cap_size, return_tensor=x_is_tensor)[:,:,:h,:w]
-
-        return -x
+        img, mask, img_size, cap_size, freqs_cis = self.patchify_and_embed(x, cap_feats, cap_mask, t, num_tokens, transformer_options=transformer_options)
+        freqs_cis = freqs_cis.to(img.device)
+
+        for i, layer in enumerate(self.layers):
+            img = layer(img, mask, freqs_cis, adaln_input, transformer_options=transformer_options)
+            if "double_block" in patches:
+                for p in patches["double_block"]:
+                    out = p({"img": img[:, cap_size[0]:], "txt": img[:, :cap_size[0]], "pe": freqs_cis[:, cap_size[0]:], "vec": adaln_input, "x": x, "block_index": i, "transformer_options": transformer_options})
+                    if "img" in out:
+                        img[:, cap_size[0]:] = out["img"]
+                    if "txt" in out:
+                        img[:, :cap_size[0]] = out["txt"]
+
+        img = self.final_layer(img, adaln_input)
+        img = self.unpatchify(img, img_size, cap_size, return_tensor=x_is_tensor)[:, :, :h, :w]
+
+        return -img