Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions comfy/ldm/flux/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -158,7 +158,7 @@ def block_wrap(args):
if i < len(control_i):
add = control_i[i]
if add is not None:
img += add
img[:, :add.shape[1]] += add

if img.dtype == torch.float16:
img = torch.nan_to_num(img, nan=0.0, posinf=65504, neginf=-65504)
Expand Down Expand Up @@ -189,7 +189,7 @@ def block_wrap(args):
if i < len(control_o):
add = control_o[i]
if add is not None:
img[:, txt.shape[1] :, ...] += add
img[:, txt.shape[1] : txt.shape[1] + add.shape[1], ...] += add

img = img[:, txt.shape[1] :, ...]

Expand Down
2 changes: 1 addition & 1 deletion comfy/ldm/qwen_image/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -459,7 +459,7 @@ def block_wrap(args):
if i < len(control_i):
add = control_i[i]
if add is not None:
hidden_states += add
hidden_states[:, :add.shape[1]] += add

hidden_states = self.norm_out(hidden_states, temb)
hidden_states = self.proj_out(hidden_states)
Expand Down
4 changes: 2 additions & 2 deletions comfy/ldm/wan/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -1255,6 +1255,7 @@ def forward_orig(
audio_emb = None

# embeddings
bs, _, time, height, width = x.shape
x = self.patch_embedding(x.float()).to(x.dtype)
if control_video is not None:
x = x + self.cond_encoder(control_video)
Expand All @@ -1272,7 +1273,7 @@ def forward_orig(
if reference_latent is not None:
ref = self.patch_embedding(reference_latent.float()).to(x.dtype)
ref = ref.flatten(2).transpose(1, 2)
freqs_ref = self.rope_encode(reference_latent.shape[-3], reference_latent.shape[-2], reference_latent.shape[-1], t_start=30, device=x.device, dtype=x.dtype)
freqs_ref = self.rope_encode(reference_latent.shape[-3], reference_latent.shape[-2], reference_latent.shape[-1], t_start=max(30, time + 9), device=x.device, dtype=x.dtype)
ref = ref + cond_mask_weight[1]
x = torch.cat([x, ref], dim=1)
freqs = torch.cat([freqs, freqs_ref], dim=1)
Expand All @@ -1296,7 +1297,6 @@ def forward_orig(
# context
context = self.text_embedding(context)


patches_replace = transformer_options.get("patches_replace", {})
blocks_replace = patches_replace.get("dit", {})
for i, block in enumerate(self.blocks):
Expand Down
3 changes: 3 additions & 0 deletions comfy/weight_adapter/lokr.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,9 @@ def create_train(cls, weight, rank=1, alpha=1.0):
(mat1, mat2, alpha, None, None, None, None, None, None)
)

def to_train(self):
    """Build a ``LokrDiff`` from the weights held on this instance.

    Returns:
        A new ``LokrDiff`` constructed with ``self.weights`` as its only
        argument.

    NOTE(review): judging by the method name, ``LokrDiff`` is presumably
    the trainable form of this adapter -- confirm against its definition.
    """
    stored_weights = self.weights
    return LokrDiff(stored_weights)

@classmethod
def load(
cls,
Expand Down
8 changes: 5 additions & 3 deletions comfy_extras/nodes_model_patch.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,7 @@ def __init__(self, model_patch, vae, image, strength, mask=None):
self.strength = strength
self.mask = mask
self.encoded_image = model_patch.model.process_input_latent_image(self.encode_latent_cond(image))
self.encoded_image_size = (image.shape[1], image.shape[2])

def encode_latent_cond(self, image):
latent_image = self.vae.encode(image)
Expand All @@ -106,14 +107,15 @@ def __call__(self, kwargs):
x = kwargs.get("x")
img = kwargs.get("img")
block_index = kwargs.get("block_index")
if self.encoded_image is None or self.encoded_image.shape[1:] != img.shape[1:]:
spacial_compression = self.vae.spacial_compression_encode()
spacial_compression = self.vae.spacial_compression_encode()
if self.encoded_image is None or self.encoded_image_size != (x.shape[-1] * spacial_compression, x.shape[-2] * spacial_compression):
image_scaled = comfy.utils.common_upscale(self.image.movedim(-1, 1), x.shape[-1] * spacial_compression, x.shape[-2] * spacial_compression, "area", "center")
loaded_models = comfy.model_management.loaded_models(only_currently_used=True)
self.encoded_image = self.model_patch.model.process_input_latent_image(self.encode_latent_cond(image_scaled.movedim(1, -1)))
self.encoded_image_size = (image_scaled.shape[-2], image_scaled.shape[-1])
comfy.model_management.load_models_gpu(loaded_models)

img = img + (self.model_patch.model.control_block(img, self.encoded_image.to(img.dtype), block_index) * self.strength)
img[:, :self.encoded_image.shape[1]] += (self.model_patch.model.control_block(img[:, :self.encoded_image.shape[1]], self.encoded_image.to(img.dtype), block_index) * self.strength)
kwargs['img'] = img
return kwargs

Expand Down
2 changes: 1 addition & 1 deletion comfy_extras/nodes_wan.py
Original file line number Diff line number Diff line change
Expand Up @@ -920,7 +920,7 @@ def execute(cls, positive, negative, vae, width, height, length, batch_size, ref
audio_embed_bucket = audio_embed_bucket.permute(0, 2, 3, 1)

positive = node_helpers.conditioning_set_values(positive, {"audio_embed": audio_embed_bucket})
negative = node_helpers.conditioning_set_values(negative, {"audio_embed": audio_embed_bucket})
negative = node_helpers.conditioning_set_values(negative, {"audio_embed": audio_embed_bucket * 0.0})

if ref_image is not None:
ref_image = comfy.utils.common_upscale(ref_image[:1].movedim(-1, 1), width, height, "bilinear", "center").movedim(1, -1)
Expand Down
Loading