Add support for Flux Controls models (VERSION_FLUX_CONTROLS)

stduhpf · stduhpf · commit b2c408753294 · 2025-05-27T00:42:31.000+02:00
diff --git a/flux.hpp b/flux.hpp
@@ -845,6 +845,14 @@ namespace Flux {
                 control = patchify(ctx, control, patch_size);
 
                 img = ggml_concat(ctx, img, ggml_concat(ctx, ggml_concat(ctx, masked, mask, 0), control, 0), 0);
+            } else if (version == VERSION_FLUX_CONTROLS) {
+                GGML_ASSERT(c_concat != NULL);
+
+                ggml_tensor* control = ggml_pad(ctx, c_concat, pad_w, pad_h, 0, 0);
+
+                control = patchify(ctx, control, patch_size);
+
+                img = ggml_concat(ctx, img, control, 0);
             }
 
             auto out = forward_orig(ctx, img, context, timestep, y, guidance, pe, skip_layers);  // [N, h*w, C * patch_size * patch_size]
@@ -877,6 +885,8 @@ namespace Flux {
             flux_params.depth_single_blocks = 0;
             if (version == VERSION_FLUX_FILL) {
                 flux_params.in_channels = 384;
+            } else if (version == VERSION_FLUX_CONTROLS) {
+                flux_params.in_channels = 128;
             } else if (version == VERSION_FLEX_2) {
                 flux_params.in_channels = 196;
             }
diff --git a/model.cpp b/model.cpp
@@ -1551,10 +1551,12 @@ SDVersion ModelLoader::get_sd_version() {
     }
 
     if (is_flux) {
-        is_inpaint = input_block_weight.ne[0] == 384;
-        if (is_inpaint) {
+        if (input_block_weight.ne[0] == 384) {
             return VERSION_FLUX_FILL;
         }
+        if (input_block_weight.ne[0] == 128) {
+            return VERSION_FLUX_CONTROLS;
+        }
         if(input_block_weight.ne[0] == 196){
             return VERSION_FLEX_2;
         }
diff --git a/model.h b/model.h
@@ -31,12 +31,13 @@ enum SDVersion {
     VERSION_SD3,
     VERSION_FLUX,
     VERSION_FLUX_FILL,
+    VERSION_FLUX_CONTROLS,
     VERSION_FLEX_2,
     VERSION_COUNT,
 };
 
 static inline bool sd_version_is_flux(SDVersion version) {
-    if (version == VERSION_FLUX || version == VERSION_FLUX_FILL || version == VERSION_FLEX_2 ) {
+    if (version == VERSION_FLUX || version == VERSION_FLUX_FILL || version == VERSION_FLUX_CONTROLS || version == VERSION_FLEX_2 ) {
         return true;
     }
     return false;
@@ -70,15 +71,16 @@ static inline bool sd_version_is_sdxl(SDVersion version) {
     return false;
 }
 
-static inline bool sd_version_is_inpaint(SDVersion version) {
-    if (version == VERSION_SD1_INPAINT || version == VERSION_SD2_INPAINT || version == VERSION_SDXL_INPAINT || version == VERSION_FLUX_FILL || version == VERSION_FLEX_2) {
+
+static inline bool sd_version_is_dit(SDVersion version) {
+    if (sd_version_is_flux(version) || sd_version_is_sd3(version)) {
         return true;
     }
     return false;
 }
 
-static inline bool sd_version_is_dit(SDVersion version) {
-    if (sd_version_is_flux(version) || sd_version_is_sd3(version)) {
+static inline bool sd_version_is_inpaint(SDVersion version) {
+    if (version == VERSION_SD1_INPAINT || version == VERSION_SD2_INPAINT || version == VERSION_SDXL_INPAINT || version == VERSION_FLUX_FILL || version == VERSION_FLEX_2) {
         return true;
     }
     return false;
@@ -88,8 +90,12 @@ static inline bool sd_version_is_edit(SDVersion version) {
     return version == VERSION_SD1_PIX2PIX || version == VERSION_SDXL_PIX2PIX;
 }
 
+static inline bool sd_version_is_control(SDVersion version) {
+    return version == VERSION_FLUX_CONTROLS || version == VERSION_FLEX_2;
+}
+
 static bool sd_version_use_concat(SDVersion version) {
-    return sd_version_is_edit(version) || sd_version_is_inpaint(version);
+    return sd_version_is_edit(version) || sd_version_is_inpaint(version)|| sd_version_is_control(version);
 }
 
 enum PMVersion {
diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp
@@ -301,7 +301,7 @@ class StableDiffusionGGML {
             // TODO: shift_factor
         }
 
-        if(version == VERSION_FLEX_2){
+        if (sd_version_is_control(version)) {
             // Might need vae encode for control cond
             vae_decode_only = false;
         }
@@ -815,15 +815,15 @@ class StableDiffusionGGML {
                         const std::vector<float>& sigmas,
                         int start_merge_step,
                         SDCondition id_cond,
-                        ggml_tensor* denoise_mask    = NULL) {
+                        ggml_tensor* denoise_mask = NULL) {
         std::vector<int> skip_layers(guidance.slg.layers, guidance.slg.layers + guidance.slg.layer_count);
 
         // TODO (Pix2Pix): separate image guidance params (right now it's reusing distilled guidance)
 
-        float cfg_scale = guidance.txt_cfg;
+        float cfg_scale     = guidance.txt_cfg;
         float img_cfg_scale = guidance.img_cfg;
-        float slg_scale = guidance.slg.scale;
-        
+        float slg_scale     = guidance.slg.scale;
+
         float min_cfg = guidance.min_cfg;
 
         if (img_cfg_scale != cfg_scale && !sd_version_use_concat(version)) {
@@ -1475,6 +1475,17 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx,
     int W = width / 8;
     int H = height / 8;
     LOG_INFO("sampling using %s method", sampling_methods_str[sample_method]);
+
+    struct ggml_tensor* control_latent = NULL;
+    if (sd_version_is_control(sd_ctx->sd->version) && image_hint != NULL) {
+        if (!sd_ctx->sd->use_tiny_autoencoder) {
+            struct ggml_tensor* control_moments = sd_ctx->sd->encode_first_stage(work_ctx, image_hint);
+            control_latent                      = sd_ctx->sd->get_first_stage_encoding(work_ctx, control_moments);
+        } else {
+            control_latent = sd_ctx->sd->encode_first_stage(work_ctx, image_hint);
+        }
+    }
+
     if (sd_version_is_inpaint(sd_ctx->sd->version)) {
         int64_t mask_channels = 1;
         if (sd_ctx->sd->version == VERSION_FLUX_FILL) {
@@ -1507,50 +1518,44 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx,
                 }
             }
         }
-        if (sd_ctx->sd->version == VERSION_FLEX_2 && image_hint != NULL && sd_ctx->sd->control_net == NULL) {
+
+        if (sd_ctx->sd->version == VERSION_FLEX_2 && control_latent != NULL && sd_ctx->sd->control_net == NULL) {
             bool no_inpaint = concat_latent == NULL;
             if (no_inpaint) {
                 concat_latent = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, init_latent->ne[0], init_latent->ne[1], mask_channels + init_latent->ne[2], 1);
             }
             // fill in the control image here
-            struct ggml_tensor* control_latents = NULL;
-            if (!sd_ctx->sd->use_tiny_autoencoder) {
-                struct ggml_tensor* control_moments = sd_ctx->sd->encode_first_stage(work_ctx, image_hint);
-                control_latents                     = sd_ctx->sd->get_first_stage_encoding(work_ctx, control_moments);
-            } else {
-                control_latents = sd_ctx->sd->encode_first_stage(work_ctx, image_hint);
-            }
-            for (int64_t x = 0; x < concat_latent->ne[0]; x++) {
-                for (int64_t y = 0; y < concat_latent->ne[1]; y++) {
+            for (int64_t x = 0; x < control_latent->ne[0]; x++) {
+                for (int64_t y = 0; y < control_latent->ne[1]; y++) {
                     if (no_inpaint) {
-                        for (int64_t c = 0; c < concat_latent->ne[2] - control_latents->ne[2]; c++) {
+                        for (int64_t c = 0; c < concat_latent->ne[2] - control_latent->ne[2]; c++) {
                             // 0x16,1x1,0x16
                             ggml_tensor_set_f32(concat_latent, c == init_latent->ne[2], x, y, c);
                         }
                     }
-                    for (int64_t c = 0; c < control_latents->ne[2]; c++) {
-                        float v = ggml_tensor_get_f32(control_latents, x, y, c);
-                        ggml_tensor_set_f32(concat_latent, v, x, y, concat_latent->ne[2] - control_latents->ne[2] + c);
+                    for (int64_t c = 0; c < control_latent->ne[2]; c++) {
+                        float v = ggml_tensor_get_f32(control_latent, x, y, c);
+                        ggml_tensor_set_f32(concat_latent, v, x, y, concat_latent->ne[2] - control_latent->ne[2] + c);
                     }
                 }
             }
-            // Disable controlnet
-            image_hint = NULL;
         } else if (concat_latent == NULL) {
             concat_latent = empty_latent;
         }
         cond.c_concat   = concat_latent;
         uncond.c_concat = empty_latent;
         denoise_mask    = NULL;
-    } else if (sd_version_is_edit(sd_ctx->sd->version)) {
+    } else if (sd_version_is_edit(sd_ctx->sd->version) || sd_version_is_control(sd_ctx->sd->version)) {
         auto empty_latent = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, init_latent->ne[0], init_latent->ne[1], init_latent->ne[2], init_latent->ne[3]);
         ggml_set_f32(empty_latent, 0);
         uncond.c_concat = empty_latent;
+        if (sd_version_is_control(sd_ctx->sd->version) && control_latent != NULL && sd_ctx->sd->control_net == NULL) {
+            concat_latent = control_latent;
+        }
         if (concat_latent == NULL) {
             concat_latent = empty_latent;
         }
-        cond.c_concat   = concat_latent;
-
+        cond.c_concat = concat_latent;
     }
     for (int b = 0; b < batch_count; b++) {
         int64_t sampling_start = ggml_time_ms();
@@ -1823,7 +1828,7 @@ sd_image_t* img2img(sd_ctx_t* sd_ctx,
         ggml_tensor* masked_latent = NULL;
         if (!sd_ctx->sd->use_tiny_autoencoder) {
             ggml_tensor* moments = sd_ctx->sd->encode_first_stage(work_ctx, masked_img);
-            masked_latent      = sd_ctx->sd->get_first_stage_encoding(work_ctx, moments);
+            masked_latent        = sd_ctx->sd->get_first_stage_encoding(work_ctx, moments);
         } else {
             masked_latent = sd_ctx->sd->encode_first_stage(work_ctx, masked_img);
         }
@@ -1894,8 +1899,8 @@ sd_image_t* img2img(sd_ctx_t* sd_ctx,
         } else {
             concat_latent = init_latent;
         }
-    } 
-    
+    }
+
     {
         // LOG_WARN("Inpainting with a base model is not great");
         denoise_mask = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width / 8, height / 8, 1, 1);

Original file line number	Diff line number	Diff line change
`@@ -1551,10 +1551,12 @@ SDVersion ModelLoader::get_sd_version() {`
`1551`	`1551`	`}`
`1552`	`1552`
`1553`	`1553`	`if (is_flux) {`
`1554`		`- is_inpaint = input_block_weight.ne[0] == 384;`
`1555`		`- if (is_inpaint) {`
	`1554`	`+ if (input_block_weight.ne[0] == 384) {`
`1556`	`1555`	`return VERSION_FLUX_FILL;`
`1557`	`1556`	`}`
	`1557`	`+ if (input_block_weight.ne[0] == 128) {`
	`1558`	`+ return VERSION_FLUX_CONTROLS;`
	`1559`	`+ }`
`1558`	`1560`	`if(input_block_weight.ne[0] == 196){`
`1559`	`1561`	`return VERSION_FLEX_2;`
`1560`	`1562`	`}`