leejet
diff --git a/‎examples/common/common.cpp‎
Lines changed: 27 additions & 0 deletions b/‎examples/common/common.cpp‎
Lines changed: 27 additions & 0 deletions
diff --git a/‎examples/common/common.h‎
Lines changed: 1 addition & 0 deletions b/‎examples/common/common.h‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎include/stable-diffusion.h‎
Lines changed: 9 additions & 0 deletions b/‎include/stable-diffusion.h‎
Lines changed: 9 additions & 0 deletions
diff --git a/‎src/conditioner.hpp‎
Lines changed: 63 additions & 9 deletions b/‎src/conditioner.hpp‎
Lines changed: 63 additions & 9 deletions
diff --git a/‎src/llm.hpp‎
Lines changed: 56 additions & 3 deletions b/‎src/llm.hpp‎
Lines changed: 56 additions & 3 deletions
diff --git a/‎src/lora.hpp‎
Lines changed: 0 additions & 1 deletion b/‎src/lora.hpp‎
Lines changed: 0 additions & 1 deletion
diff --git a/‎src/ltx_audio_vae.h‎
Lines changed: 2 additions & 2 deletions b/‎src/ltx_audio_vae.h‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎src/model.cpp‎
Lines changed: 3 additions & 0 deletions b/‎src/model.cpp‎
Lines changed: 3 additions & 0 deletions
diff --git a/‎src/model.h‎
Lines changed: 10 additions & 1 deletion b/‎src/model.h‎
Lines changed: 10 additions & 1 deletion
@@ -35,6 +35,22 @@ const char* const modes_str[] = {
     "metadata",
 };
 
+static sd_vae_format_t str_to_vae_format(const std::string& value) {  // maps a vae_format option string to its sd_vae_format_t value
+    if (value == "auto") {  // names are compared exactly (case-sensitive, no trimming)
+        return SD_VAE_FORMAT_AUTO;
+    }
+    if (value == "flux") {
+        return SD_VAE_FORMAT_FLUX;
+    }
+    if (value == "sd3") {
+        return SD_VAE_FORMAT_SD3;
+    }
+    if (value == "flux2") {
+        return SD_VAE_FORMAT_FLUX2;
+    }
+    return SD_VAE_FORMAT_COUNT;  // no name matched: COUNT doubles as the "invalid" sentinel checked by SDContextParams::validate()
+}
+
 #if defined(_WIN32)
 static std::string utf16_to_utf8(const std::wstring& wstr) {
     if (wstr.empty())
@@ -348,6 +364,10 @@ ArgOptions SDContextParams::get_options() {
          "--vae",
          "path to standalone vae model",
          &vae_path},
+        {"",
+         "--vae-format",
+         "VAE latent format override: auto, flux, sd3, or flux2 (default: auto)",
+         &vae_format},
         {"",
          "--audio-vae",
          "path to standalone LTX audio vae model",
@@ -639,6 +659,11 @@ bool SDContextParams::validate(SDMode mode) {
         }
     }
 
+    if (str_to_vae_format(vae_format) == SD_VAE_FORMAT_COUNT) {
+        LOG_ERROR("error: vae_format must be 'auto', 'flux', 'sd3', or 'flux2'");
+        return false;
+    }
+
     return true;
 }
 
@@ -679,6 +704,7 @@ std::string SDContextParams::to_string() const {
         << "  high_noise_diffusion_model_path: \"" << high_noise_diffusion_model_path << "\",\n"
         << "  embeddings_connectors_path: \"" << embeddings_connectors_path << "\",\n"
         << "  vae_path: \"" << vae_path << "\",\n"
+        << "  vae_format: \"" << vae_format << "\",\n"
         << "  audio_vae_path: \"" << audio_vae_path << "\",\n"
         << "  taesd_path: \"" << taesd_path << "\",\n"
         << "  esrgan_path: \"" << esrgan_path << "\",\n"
@@ -772,6 +798,7 @@ sd_ctx_params_t SDContextParams::to_sd_ctx_params_t(bool vae_decode_only, bool f
         chroma_use_t5_mask,
         chroma_t5_mask_pad,
         qwen_image_zero_cond_t,
+        str_to_vae_format(vae_format),
         max_vram,
         backend.c_str(),
         params_backend.c_str(),
 
@@ -94,6 +94,7 @@ struct SDContextParams {
     std::string high_noise_diffusion_model_path;
     std::string embeddings_connectors_path;
     std::string vae_path;
+    std::string vae_format = "auto";
     std::string audio_vae_path;
     std::string taesd_path;
     std::string esrgan_path;
 
@@ -168,6 +168,14 @@ typedef struct {
     const char* path;
 } sd_embedding_t;
 
+enum sd_vae_format_t {  // VAE latent format selector; type of the vae_format context parameter
+    SD_VAE_FORMAT_AUTO = -1,  // explicit -1, so the concrete formats below number from 0
+    SD_VAE_FORMAT_FLUX,  // 0
+    SD_VAE_FORMAT_SD3,  // 1
+    SD_VAE_FORMAT_FLUX2,  // 2
+    SD_VAE_FORMAT_COUNT,  // 3 = number of concrete formats; not a format itself (the CLI parser returns it for unknown names)
+};
+
 typedef struct {
     const char* model_path;
     const char* clip_l_path;
@@ -212,6 +220,7 @@ typedef struct {
     bool chroma_use_t5_mask;
     int chroma_t5_mask_pad;
     bool qwen_image_zero_cond_t;
+    enum sd_vae_format_t vae_format;
     float max_vram;  // GiB budget for graph-cut segmented param offload (0 = disabled, -1 = auto free VRAM minus 1 GiB)
     const char* backend;
     const char* params_backend;
 
@@ -1171,7 +1171,6 @@ struct FluxCLIPEmbedder : public Conditioner {
         return true;
     }
 
-
     void free_params_buffer() override {
         if (clip_l) {
             clip_l->free_params_buffer();
@@ -1601,8 +1600,8 @@ struct AnimaConditioner : public Conditioner {
 
     bool alloc_params_buffer() override {
         if (!llm->alloc_params_buffer()) {
-                return false;
-            }
+            return false;
+        }
         return true;
     }
 
@@ -1719,13 +1718,17 @@ struct LLMEmbedder : public Conditioner {
             arch = LLM::LLMArch::MINISTRAL_3_3B;
         } else if (sd_version_is_lens(version)) {
             arch = LLM::LLMArch::GPT_OSS_20B;
+        } else if (sd_version_is_pid(version)) {
+            arch = LLM::LLMArch::GEMMA2_2B;
         } else if (sd_version_is_z_image(version) || version == VERSION_OVIS_IMAGE || version == VERSION_FLUX2_KLEIN) {
             arch = LLM::LLMArch::QWEN3;
         }
         if (arch == LLM::LLMArch::MISTRAL_SMALL_3_2 || arch == LLM::LLMArch::MINISTRAL_3_3B) {
             tokenizer = std::make_shared<MistralTokenizer>();
         } else if (arch == LLM::LLMArch::GPT_OSS_20B) {
             tokenizer = std::make_shared<GPTOSSTokenizer>();
+        } else if (arch == LLM::LLMArch::GEMMA2_2B) {
+            tokenizer = std::make_shared<Gemma2Tokenizer>();
         } else {
             tokenizer = std::make_shared<Qwen2Tokenizer>();
         }
@@ -1743,7 +1746,7 @@ struct LLMEmbedder : public Conditioner {
 
     bool alloc_params_buffer() override {
         if (!llm->alloc_params_buffer()) {
-                return false;
+            return false;
         }
         return true;
     }
@@ -1847,12 +1850,16 @@ struct LLMEmbedder : public Conditioner {
         sd::Tensor<int32_t> input_ids({static_cast<int64_t>(tokens.size())}, tokens);
         sd::Tensor<float> attention_mask;
         if (!mask.empty()) {
-            attention_mask = sd::Tensor<float>({static_cast<int64_t>(mask.size()), static_cast<int64_t>(mask.size())});
+            attention_mask                     = sd::Tensor<float>({static_cast<int64_t>(mask.size()), static_cast<int64_t>(mask.size())});
+            const float masked_attention_value = -std::numeric_limits<float>::max() / 4.0f;
             for (size_t i1 = 0; i1 < mask.size(); ++i1) {
                 for (size_t i0 = 0; i0 < mask.size(); ++i0) {
                     float value = 0.0f;
-                    if (mask[i0] == 0.0f || i0 > i1) {
-                        value = -INFINITY;
+                    if (mask[i0] == 0.0f) {
+                        value += masked_attention_value;
+                    }
+                    if (i0 > i1) {
+                        value += masked_attention_value;
                     }
                     attention_mask[static_cast<int64_t>(i0 + mask.size() * i1)] = value;
                 }
@@ -2126,6 +2133,53 @@ struct LLMEmbedder : public Conditioner {
             prompt_attn_range.second = static_cast<int>(prompt.size());
 
             prompt += "<|im_end|>\n<|im_start|>assistant\n<think>\n\n</think>\n\n";
+        } else if (sd_version_is_pid(version)) {
+            constexpr int pixeldit_max_length = 300;
+            const std::string chi_prompt =
+                "Given a user prompt, generate an \"Enhanced prompt\" that provides detailed visual descriptions suitable for image generation. Evaluate the level of detail in the user prompt:\n"
+                "- If the prompt is simple, focus on adding specifics about colors, shapes, sizes, textures, and spatial relationships to create vivid and concrete scenes.\n"
+                "- If the prompt is already detailed, refine and enhance the existing details slightly without overcomplicating.\n"
+                "Here are examples of how to transform or refine prompts:\n"
+                "- User Prompt: A cat sleeping -> Enhanced: A small, fluffy white cat curled up in a round shape, sleeping peacefully on a warm sunny windowsill, surrounded by pots of blooming red flowers.\n"
+                "- User Prompt: A busy city street -> Enhanced: A bustling city street scene at dusk, featuring glowing street lamps, a diverse crowd of people in colorful clothing, and a double-decker bus passing by towering glass skyscrapers.\n"
+                "Please generate only the enhanced description for the prompt below and avoid including any additional commentary or evaluations:\n"
+                "User Prompt: ";
+            auto chi_tokens       = std::get<0>(tokenize(chi_prompt, {0, 0}));
+            size_t num_chi_tokens = chi_tokens.size();
+            max_length            = (int)num_chi_tokens + pixeldit_max_length - 2;
+            min_length            = max_length;
+
+            prompt_attn_range.first = static_cast<int>(prompt.size());
+            prompt += " " + conditioner_params.text;
+            prompt_attn_range.second = static_cast<int>(prompt.size());
+
+            auto hidden_states = encode_prompt(n_threads,
+                                               prompt,
+                                               prompt_attn_range,
+                                               min_length,
+                                               0,
+                                               image_embeds,
+                                               out_layers,
+                                               0,
+                                               false,
+                                               max_length);
+            GGML_ASSERT(!hidden_states.empty());
+
+            if (hidden_states.shape()[1] > pixeldit_max_length) {
+                auto bos      = sd::ops::slice(hidden_states, 1, 0, 1);
+                auto tail     = sd::ops::slice(hidden_states,
+                                               1,
+                                               hidden_states.shape()[1] - (pixeldit_max_length - 1),
+                                               hidden_states.shape()[1]);
+                hidden_states = sd::ops::concat(bos, tail, 1);
+            }
+
+            int64_t t1 = ggml_time_ms();
+            LOG_DEBUG("computing condition graph completed, taking %" PRId64 " ms", t1 - t0);
+
+            SDCondition result;
+            result.c_crossattn = std::move(hidden_states);
+            return result;
         } else {
             GGML_ABORT("unknown version %d", version);
         }
@@ -2268,10 +2322,10 @@ struct LTXAVEmbedder : public Conditioner {
 
     bool alloc_params_buffer() override {
         if (!llm->alloc_params_buffer()) {
-                return false;
+            return false;
         }
         if (!projector->alloc_params_buffer()) {
-                return false;
+            return false;
         }
         return true;
     }
 
@@ -37,6 +37,7 @@ namespace LLM {
         MISTRAL_SMALL_3_2,
         MINISTRAL_3_3B,
         GEMMA3_12B,
+        GEMMA2_2B,
         GPT_OSS_20B,
         ARCH_COUNT,
     };
@@ -48,6 +49,7 @@ namespace LLM {
         "mistral_small3.2",
         "ministral3.3b",
         "gemma3_12b",
+        "gemma2_2b",
         "gpt_oss_20b",
     };
 
@@ -900,6 +902,33 @@ namespace LLM {
                                                  1.f,
                                                  32.f,
                                                  1.f);
+            } else if (arch == LLMArch::GEMMA2_2B) {
+                q = ggml_rope_ext(ctx->ggml_ctx,
+                                  q,
+                                  input_pos,
+                                  nullptr,
+                                  head_dim,
+                                  GGML_ROPE_TYPE_NEOX,
+                                  8192,
+                                  10000.f,
+                                  1.f,
+                                  0.f,
+                                  1.f,
+                                  32.f,
+                                  1.f);
+                k = ggml_rope_ext(ctx->ggml_ctx,
+                                  k,
+                                  input_pos,
+                                  nullptr,
+                                  head_dim,
+                                  GGML_ROPE_TYPE_NEOX,
+                                  8192,
+                                  10000.f,
+                                  1.f,
+                                  0.f,
+                                  1.f,
+                                  32.f,
+                                  1.f);
             } else if (arch == LLMArch::QWEN3_VL) {
                 int sections[4] = {24, 20, 20, 0};
                 q               = ggml_rope_multi(ctx->ggml_ctx, q, input_pos, nullptr, head_dim, sections, GGML_ROPE_TYPE_IMROPE, 262144, 5000000.f, 1.f, 0.f, 1.f, 32.f, 1.f);
@@ -957,10 +986,18 @@ namespace LLM {
             : arch(params.arch),
               sliding_attention(0) {
             if (params.arch == LLMArch::GEMMA3_12B) {
-                post_attention_norm_name = "post_attention_norm";
-                post_ffw_norm_name       = "post_ffw_norm";
+                post_attention_norm_name = "post_attention_norm";       // attn_post_norm
+                pre_ffw_norm_name        = "post_attention_layernorm";  // ffn_norm
+                post_ffw_norm_name       = "post_ffw_norm";             // ffn_post_norm
+            } else if (params.arch == LLMArch::GEMMA2_2B) {
+                post_attention_norm_name = "post_attention_layernorm";  // ffn_norm
+                pre_ffw_norm_name        = "pre_feedforward_layernorm";
+                post_ffw_norm_name       = "post_feedforward_layernorm";
+            } else if (params.arch == LLMArch::GPT_OSS_20B) {
+                pre_ffw_norm_name = "post_attention_norm";  // attn_post_norm
+            } else {
+                pre_ffw_norm_name = "post_attention_layernorm";  // ffn_norm
             }
-            pre_ffw_norm_name = params.arch == LLMArch::GPT_OSS_20B ? "post_attention_norm" : "post_attention_layernorm";
 
             blocks["self_attn"] = std::make_shared<Attention>(params);
             if (params.arch == LLMArch::GPT_OSS_20B) {
@@ -1447,6 +1484,21 @@ namespace LLM {
                 params.rope_thetas             = {1000000.f, 10000.f};
                 params.rope_scales             = {8.f, 1.f};
                 params.sliding_attention       = {1024, 1024, 1024, 1024, 1024, 0};
+            } else if (arch == LLMArch::GEMMA2_2B) {
+                params.head_dim                = 256;
+                params.num_heads               = 8;
+                params.num_kv_heads            = 4;
+                params.qkv_bias                = false;
+                params.qk_norm                 = false;
+                params.rms_norm_eps            = 1e-6f;
+                params.rms_norm_add            = true;
+                params.normalize_input         = true;
+                params.max_position_embeddings = 8192;
+                params.mlp_activation          = MLPActivation::GELU_TANH;
+                params.hidden_size             = 2304;
+                params.intermediate_size       = 9216;
+                params.num_layers              = 26;
+                params.vocab_size              = 256000;
             } else if (arch == LLMArch::GPT_OSS_20B) {
                 params.head_dim                = 64;
                 params.num_heads               = 64;
@@ -1585,6 +1637,7 @@ namespace LLM {
                 params.arch == LLMArch::MINISTRAL_3_3B ||
                 params.arch == LLMArch::QWEN3 ||
                 params.arch == LLMArch::GEMMA3_12B ||
+                params.arch == LLMArch::GEMMA2_2B ||
                 params.arch == LLMArch::GPT_OSS_20B) {
                 input_pos_vec.resize(n_tokens);
                 for (int i = 0; i < n_tokens; ++i) {
 
@@ -91,7 +91,6 @@ struct LoraModel : public GGMLRunner {
             return false;
         }
 
-
         dry_run = false;
         model_loader.load_tensors(on_new_tensor_cb, n_threads);
 
 
@@ -1069,8 +1069,8 @@ namespace LTXV {
                                                                      prefix);
 
             if (!ltx_audio_vae->alloc_params_buffer()) {
-               LOG_ERROR("ltx audio vae buffer allocation failed");
-               return;
+                LOG_ERROR("ltx audio vae buffer allocation failed");
+                return;
             }
 
             std::map<std::string, ggml_tensor*> tensors;
 
@@ -432,6 +432,9 @@ SDVersion ModelLoader::get_sd_version() {
             tensor_storage.name.find("model.diffusion_model.single_transformer_blocks.") != std::string::npos) {
             is_flux = true;
         }
+        if (tensor_storage.name.find("model.diffusion_model.net.lq_proj.latent_proj.0.weight") != std::string::npos) {
+            return VERSION_PID;
+        }
         if (tensor_storage.name.find("model.diffusion_model.nerf_final_layer_conv.") != std::string::npos) {
             return VERSION_CHROMA_RADIANCE;
         }
 
@@ -49,6 +49,7 @@ enum SDVersion {
     VERSION_ERNIE_IMAGE,
     VERSION_LENS,
     VERSION_LONGCAT,
+    VERSION_PID,
     VERSION_COUNT,
 };
 
@@ -164,6 +165,13 @@ static inline bool sd_version_is_lens(SDVersion version) {
     return false;
 }
 
+static inline bool sd_version_is_pid(SDVersion version) {  // predicate: is this model version VERSION_PID?
+    if (version == VERSION_PID) {
+        return true;
+    }
+    return false;  // every other SDVersion value
+}
+
 static inline bool sd_version_uses_flux2_vae(SDVersion version) {
     if (sd_version_is_flux2(version) || sd_version_is_ernie_image(version) || sd_version_is_lens(version)) {
         return true;
@@ -194,7 +202,8 @@ static inline bool sd_version_is_dit(SDVersion version) {
         sd_version_is_z_image(version) ||
         sd_version_is_ernie_image(version) ||
         sd_version_is_lens(version) ||
-        sd_version_is_longcat(version)) {
+        sd_version_is_longcat(version) ||
+        sd_version_is_pid(version)) {
         return true;
     }
     return false;
Original file line number	Diff line number	Diff line change
`@@ -91,7 +91,6 @@ struct LoraModel : public GGMLRunner {`
`91`	`91`	`return false;`
`92`	`92`	`}`
`93`	`93`
`94`		`-`
`95`	`94`	`dry_run = false;`
`96`	`95`	`model_loader.load_tensors(on_new_tensor_cb, n_threads);`
`97`	`96`
Original file line number	Diff line number	Diff line change
`@@ -1069,8 +1069,8 @@ namespace LTXV {`
`1069`	`1069`	`prefix);`
`1070`	`1070`
`1071`	`1071`	`if (!ltx_audio_vae->alloc_params_buffer()) {`
`1072`		`- LOG_ERROR("ltx audio vae buffer allocation failed");`
`1073`		`- return;`
	`1072`	`+ LOG_ERROR("ltx audio vae buffer allocation failed");`
	`1073`	`+ return;`
`1074`	`1074`	`}`
`1075`	`1075`
`1076`	`1076`	`std::map<std::string, ggml_tensor*> tensors;`
Original file line number	Diff line number	Diff line change
`@@ -432,6 +432,9 @@ SDVersion ModelLoader::get_sd_version() {`
`432`	`432`	`tensor_storage.name.find("model.diffusion_model.single_transformer_blocks.") != std::string::npos) {`
`433`	`433`	`is_flux = true;`
`434`	`434`	`}`
	`435`	`+ if (tensor_storage.name.find("model.diffusion_model.net.lq_proj.latent_proj.0.weight") != std::string::npos) {`
	`436`	`+ return VERSION_PID;`
	`437`	`+ }`
`435`	`438`	`if (tensor_storage.name.find("model.diffusion_model.nerf_final_layer_conv.") != std::string::npos) {`
`436`	`439`	`return VERSION_CHROMA_RADIANCE;`
`437`	`440`	`}`