code · pull · Aug 27, 2025 · Aug 27, 2025 · Aug 27, 2025 · Aug 27, 2025
diff --git a/comfy/ldm/flux/model.py b/comfy/ldm/flux/model.py
@@ -158,7 +158,7 @@ def block_wrap(args):
                 if i < len(control_i):
                     add = control_i[i]
                     if add is not None:
-                        img += add
+                        img[:, :add.shape[1]] += add
 
         if img.dtype == torch.float16:
             img = torch.nan_to_num(img, nan=0.0, posinf=65504, neginf=-65504)
@@ -189,7 +189,7 @@ def block_wrap(args):
                 if i < len(control_o):
                     add = control_o[i]
                     if add is not None:
-                        img[:, txt.shape[1] :, ...] += add
+                        img[:, txt.shape[1] : txt.shape[1] + add.shape[1], ...] += add
 
         img = img[:, txt.shape[1] :, ...]
 

diff --git a/comfy/ldm/qwen_image/model.py b/comfy/ldm/qwen_image/model.py
@@ -459,7 +459,7 @@ def block_wrap(args):
                 if i < len(control_i):
                     add = control_i[i]
                     if add is not None:
-                        hidden_states += add
+                        hidden_states[:, :add.shape[1]] += add
 
         hidden_states = self.norm_out(hidden_states, temb)
         hidden_states = self.proj_out(hidden_states)

diff --git a/comfy/ldm/wan/model.py b/comfy/ldm/wan/model.py
@@ -1255,6 +1255,7 @@ def forward_orig(
             audio_emb = None
 
         # embeddings
+        bs, _, time, height, width = x.shape
         x = self.patch_embedding(x.float()).to(x.dtype)
         if control_video is not None:
             x = x + self.cond_encoder(control_video)
@@ -1272,7 +1273,7 @@ def forward_orig(
         if reference_latent is not None:
             ref = self.patch_embedding(reference_latent.float()).to(x.dtype)
             ref = ref.flatten(2).transpose(1, 2)
-            freqs_ref = self.rope_encode(reference_latent.shape[-3], reference_latent.shape[-2], reference_latent.shape[-1], t_start=30, device=x.device, dtype=x.dtype)
+            freqs_ref = self.rope_encode(reference_latent.shape[-3], reference_latent.shape[-2], reference_latent.shape[-1], t_start=max(30, time + 9), device=x.device, dtype=x.dtype)
             ref = ref + cond_mask_weight[1]
             x = torch.cat([x, ref], dim=1)
             freqs = torch.cat([freqs, freqs_ref], dim=1)
@@ -1296,7 +1297,6 @@ def forward_orig(
         # context
         context = self.text_embedding(context)
 
-
         patches_replace = transformer_options.get("patches_replace", {})
         blocks_replace = patches_replace.get("dit", {})
         for i, block in enumerate(self.blocks):

diff --git a/comfy/weight_adapter/lokr.py b/comfy/weight_adapter/lokr.py
@@ -97,6 +97,9 @@ def create_train(cls, weight, rank=1, alpha=1.0):
             (mat1, mat2, alpha, None, None, None, None, None, None)
         )
 
+    def to_train(self):
+        return LokrDiff(self.weights)
+
     @classmethod
     def load(
         cls,

diff --git a/comfy_extras/nodes_model_patch.py b/comfy_extras/nodes_model_patch.py
@@ -89,6 +89,7 @@ def __init__(self, model_patch, vae, image, strength, mask=None):
         self.strength = strength
         self.mask = mask
         self.encoded_image = model_patch.model.process_input_latent_image(self.encode_latent_cond(image))
+        self.encoded_image_size = (image.shape[1], image.shape[2])
 
     def encode_latent_cond(self, image):
         latent_image = self.vae.encode(image)
@@ -106,14 +107,15 @@ def __call__(self, kwargs):
         x = kwargs.get("x")
         img = kwargs.get("img")
         block_index = kwargs.get("block_index")
-        if self.encoded_image is None or self.encoded_image.shape[1:] != img.shape[1:]:
-            spacial_compression = self.vae.spacial_compression_encode()
+        spacial_compression = self.vae.spacial_compression_encode()
+        if self.encoded_image is None or self.encoded_image_size != (x.shape[-1] * spacial_compression, x.shape[-2] * spacial_compression):
             image_scaled = comfy.utils.common_upscale(self.image.movedim(-1, 1), x.shape[-1] * spacial_compression, x.shape[-2] * spacial_compression, "area", "center")
             loaded_models = comfy.model_management.loaded_models(only_currently_used=True)
             self.encoded_image = self.model_patch.model.process_input_latent_image(self.encode_latent_cond(image_scaled.movedim(1, -1)))
+            self.encoded_image_size = (image_scaled.shape[-2], image_scaled.shape[-1])
             comfy.model_management.load_models_gpu(loaded_models)
 
-        img = img + (self.model_patch.model.control_block(img, self.encoded_image.to(img.dtype), block_index) * self.strength)
+        img[:, :self.encoded_image.shape[1]] += (self.model_patch.model.control_block(img[:, :self.encoded_image.shape[1]], self.encoded_image.to(img.dtype), block_index) * self.strength)
         kwargs['img'] = img
         return kwargs
 

diff --git a/comfy_extras/nodes_wan.py b/comfy_extras/nodes_wan.py
@@ -920,7 +920,7 @@ def execute(cls, positive, negative, vae, width, height, length, batch_size, ref
                 audio_embed_bucket = audio_embed_bucket.permute(0, 2, 3, 1)
 
             positive = node_helpers.conditioning_set_values(positive, {"audio_embed": audio_embed_bucket})
-            negative = node_helpers.conditioning_set_values(negative, {"audio_embed": audio_embed_bucket})
+            negative = node_helpers.conditioning_set_values(negative, {"audio_embed": audio_embed_bucket * 0.0})
 
         if ref_image is not None:
             ref_image = comfy.utils.common_upscale(ref_image[:1].movedim(-1, 1), width, height, "bilinear", "center").movedim(1, -1)