diff --git a/examples/cli/main.cpp b/examples/cli/main.cpp index 85901be67..e2854158d 100644 --- a/examples/cli/main.cpp +++ b/examples/cli/main.cpp @@ -623,8 +623,6 @@ int main(int argc, const char* argv[]) { } } - bool vae_decode_only = true; - auto load_image_and_update_size = [&](const std::string& path, SDImageOwner& image, bool resize_image = true, @@ -646,21 +644,18 @@ int main(int argc, const char* argv[]) { }; if (gen_params.init_image_path.size() > 0) { - vae_decode_only = false; if (!load_image_and_update_size(gen_params.init_image_path, gen_params.init_image)) { return 1; } } if (gen_params.end_image_path.size() > 0) { - vae_decode_only = false; if (!load_image_and_update_size(gen_params.end_image_path, gen_params.end_image)) { return 1; } } if (gen_params.ref_image_paths.size() > 0) { - vae_decode_only = false; gen_params.ref_images.clear(); for (auto& path : gen_params.ref_image_paths) { SDImageOwner ref_image({0, 0, 3, nullptr}); @@ -735,18 +730,7 @@ int main(int argc, const char* argv[]) { } } - if (cli_params.mode == VID_GEN) { - vae_decode_only = false; - } - - if (gen_params.hires_enabled && - (gen_params.resolved_hires_upscaler == SD_HIRES_UPSCALER_MODEL || - gen_params.resolved_hires_upscaler == SD_HIRES_UPSCALER_LANCZOS || - gen_params.resolved_hires_upscaler == SD_HIRES_UPSCALER_NEAREST)) { - vae_decode_only = false; - } - - sd_ctx_params_t sd_ctx_params = ctx_params.to_sd_ctx_params_t(vae_decode_only, cli_params.taesd_preview); + sd_ctx_params_t sd_ctx_params = ctx_params.to_sd_ctx_params_t(cli_params.taesd_preview); SDImageVec results; int num_results = 0; diff --git a/examples/common/common.cpp b/examples/common/common.cpp index a6e396d87..cb19331ea 100644 --- a/examples/common/common.cpp +++ b/examples/common/common.cpp @@ -757,7 +757,7 @@ std::string SDContextParams::to_string() const { return oss.str(); } -sd_ctx_params_t SDContextParams::to_sd_ctx_params_t(bool vae_decode_only, bool taesd_preview) { +sd_ctx_params_t SDContextParams::to_sd_ctx_params_t(bool taesd_preview) { embedding_vec.clear(); embedding_vec.reserve(embedding_map.size()); for (const auto& kv : embedding_map) { @@ -787,7 +787,6 @@ sd_ctx_params_t SDContextParams::to_sd_ctx_params_t(bool vae_decode_only, bool t static_cast(embedding_vec.size()), photo_maker_path.c_str(), tensor_type_rules.c_str(), - vae_decode_only, n_threads, wtype, rng_type, diff --git a/examples/common/common.h b/examples/common/common.h index a6cf17b3b..8f97ac95b 100644 --- a/examples/common/common.h +++ b/examples/common/common.h @@ -179,7 +179,7 @@ struct SDContextParams { bool validate(SDMode mode); bool resolve_and_validate(SDMode mode); std::string to_string() const; - sd_ctx_params_t to_sd_ctx_params_t(bool vae_decode_only, bool taesd_preview); + sd_ctx_params_t to_sd_ctx_params_t(bool taesd_preview); }; struct SDGenerationParams { diff --git a/examples/server/main.cpp b/examples/server/main.cpp index 1d8aa9bd1..dce35c118 100644 --- a/examples/server/main.cpp +++ b/examples/server/main.cpp @@ -85,7 +85,7 @@ int main(int argc, const char** argv) { LOG_DEBUG("%s", ctx_params.to_string().c_str()); LOG_DEBUG("%s", default_gen_params.to_string().c_str()); - sd_ctx_params_t sd_ctx_params = ctx_params.to_sd_ctx_params_t(false, false); + sd_ctx_params_t sd_ctx_params = ctx_params.to_sd_ctx_params_t(false); SDCtxPtr sd_ctx(new_sd_ctx(&sd_ctx_params)); if (sd_ctx == nullptr) { diff --git a/include/stable-diffusion.h b/include/stable-diffusion.h index ecd01fd3d..02e5b6175 100644 --- a/include/stable-diffusion.h +++ b/include/stable-diffusion.h @@ -196,7 +196,6 @@ typedef struct { uint32_t embedding_count; const char* photo_maker_path; const char* tensor_type_rules; - bool vae_decode_only; int n_threads; enum sd_type_t wtype; enum rng_type_t rng_type; diff --git a/src/model/vae/ltx_vae.hpp b/src/model/vae/ltx_vae.hpp index 7eeff31bc..a19ce820d 100644 --- a/src/model/vae/ltx_vae.hpp +++ b/src/model/vae/ltx_vae.hpp @@ -1426,7 +1426,7 @@ struct LTXVideoVAE : public VAE { const sd::Tensor& z, bool decode_graph) override { if (!decode_graph && decode_only) { - LOG_ERROR("LTX video VAE encode requires encoder weights; create the context with vae_decode_only=false"); + LOG_ERROR("LTX video VAE encode requires encoder weights"); return {}; } sd::Tensor input = z; diff --git a/src/stable-diffusion.cpp b/src/stable-diffusion.cpp index 1b3b5dba0..cf44014bf 100644 --- a/src/stable-diffusion.cpp +++ b/src/stable-diffusion.cpp @@ -163,7 +163,6 @@ class StableDiffusionGGML { SDBackendManager backend_manager; SDVersion version; - bool vae_decode_only = false; bool external_vae_is_invalid = false; bool circular_x = false; @@ -318,7 +317,6 @@ class StableDiffusionGGML { bool init(const sd_ctx_params_t* sd_ctx_params) { n_threads = sd_ctx_params->n_threads; - vae_decode_only = sd_ctx_params->vae_decode_only; offload_params_to_cpu = sd_ctx_params->offload_params_to_cpu; enable_mmap = sd_ctx_params->enable_mmap; max_vram = sd_ctx_params->max_vram; @@ -560,10 +558,6 @@ class StableDiffusionGGML { size_t control_net_params_mem_size = 0; size_t extension_params_mem_size = 0; - if (sd_version_is_control(version)) { - // Might need vae encode for control cond - vae_decode_only = false; - } bool tae_preview_only = sd_ctx_params->tae_preview_only; if (version == VERSION_SDXS_512_DS || version == VERSION_SDXS_09) { tae_preview_only = false; @@ -591,7 +585,6 @@ class StableDiffusionGGML { "model.diffusion_model", model_manager); } else if (sd_version_is_pid(version)) { - vae_decode_only = false; cond_stage_model = std::make_shared(backend_for(SDBackendModule::TE), tensor_storage_map, version, @@ -706,15 +699,11 @@ class StableDiffusionGGML { } } } else if (sd_version_is_qwen_image(version)) { - bool enable_vision = false; - if (!vae_decode_only) { - enable_vision = true; - } cond_stage_model = std::make_shared(backend_for(SDBackendModule::TE), tensor_storage_map, version, "", - enable_vision, + true, model_manager); diffusion_model = std::make_shared(backend_for(SDBackendModule::DIFFUSION), tensor_storage_map, @@ -723,15 +712,11 @@ class StableDiffusionGGML { sd_ctx_params->qwen_image_zero_cond_t, model_manager); } else if (sd_version_is_longcat(version)) { - bool enable_vision = false; - if (!vae_decode_only) { - enable_vision = true; - } cond_stage_model = std::make_shared(backend_for(SDBackendModule::TE), tensor_storage_map, version, "", - enable_vision, + true, model_manager); diffusion_model = std::make_shared(backend_for(SDBackendModule::DIFFUSION), tensor_storage_map, @@ -827,10 +812,6 @@ class StableDiffusionGGML { return false; } - if (sd_version_is_unet_edit(version)) { - vae_decode_only = false; - } - if (high_noise_diffusion_model) { high_noise_diffusion_model->set_max_graph_vram_bytes(max_graph_vram_bytes); high_noise_diffusion_model->set_stream_layers_enabled(stream_layers); @@ -846,7 +827,7 @@ class StableDiffusionGGML { return false; } - auto create_tae = [&]() -> std::shared_ptr { + auto create_tae = [&](bool decode_only) -> std::shared_ptr { if (sd_version_is_wan(version) || sd_version_is_qwen_image(version) || sd_version_is_anima(version) || @@ -854,7 +835,7 @@ class StableDiffusionGGML { return std::make_shared(backend_for(SDBackendModule::VAE), tensor_storage_map, "decoder", - vae_decode_only, + decode_only, version, model_manager); @@ -862,7 +843,7 @@ class StableDiffusionGGML { auto model = std::make_shared(backend_for(SDBackendModule::VAE), tensor_storage_map, "decoder.layers", - vae_decode_only, + decode_only, version, model_manager); return model; @@ -884,7 +865,7 @@ class StableDiffusionGGML { return std::make_shared(backend_for(SDBackendModule::VAE), tensor_storage_map, "first_stage_model", - vae_decode_only, + false, version, model_manager); } else if (sd_version_is_wan(version) || @@ -893,14 +874,14 @@ class StableDiffusionGGML { return std::make_shared(backend_for(SDBackendModule::VAE), tensor_storage_map, "first_stage_model", - vae_decode_only, + false, version, model_manager); } else { auto model = std::make_shared(backend_for(SDBackendModule::VAE), tensor_storage_map, "first_stage_model", - vae_decode_only, + false, false, vae_version, model_manager); @@ -930,7 +911,7 @@ class StableDiffusionGGML { } } else if (use_tae && !tae_preview_only) { LOG_INFO("using TAE for encoding / decoding"); - first_stage_model = create_tae(); + first_stage_model = create_tae(false); first_stage_model->set_max_graph_vram_bytes(max_graph_vram_bytes); if (!register_runner_params("VAE", first_stage_model, @@ -950,7 +931,7 @@ class StableDiffusionGGML { } if (use_tae && tae_preview_only) { LOG_INFO("using TAE for preview"); - preview_vae = create_tae(); + preview_vae = create_tae(true); preview_vae->set_max_graph_vram_bytes(max_graph_vram_bytes); if (!register_runner_params("preview VAE", preview_vae, @@ -1080,13 +1061,6 @@ class StableDiffusionGGML { ignore_tensors.insert("model.diffusion_model.__32x32__"); ignore_tensors.insert("model.diffusion_model.__index_timestep_zero__"); - if (vae_decode_only) { - ignore_tensors.insert("first_stage_model.encoder"); - ignore_tensors.insert("first_stage_model.conv1"); - ignore_tensors.insert("first_stage_model.quant"); - ignore_tensors.insert("tae.encoder"); - ignore_tensors.insert("text_encoders.llm.visual."); - } if (audio_vae_model) { ignore_tensors.insert("audio_vae.encoder"); } @@ -2642,7 +2616,6 @@ void sd_hires_params_init(sd_hires_params_t* hires_params) { void sd_ctx_params_init(sd_ctx_params_t* sd_ctx_params) { *sd_ctx_params = {}; - sd_ctx_params->vae_decode_only = true; sd_ctx_params->n_threads = sd_get_num_physical_cores(); sd_ctx_params->wtype = SD_TYPE_COUNT; sd_ctx_params->rng_type = CUDA_RNG; @@ -2691,7 +2664,6 @@ char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params) { "control_net_path: %s\n" "photo_maker_path: %s\n" "tensor_type_rules: %s\n" - "vae_decode_only: %s\n" "n_threads: %d\n" "wtype: %s\n" "rng_type: %s\n" @@ -2730,7 +2702,6 @@ char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params) { SAFE_STR(sd_ctx_params->control_net_path), SAFE_STR(sd_ctx_params->photo_maker_path), SAFE_STR(sd_ctx_params->tensor_type_rules), - BOOL_STR(sd_ctx_params->vae_decode_only), sd_ctx_params->n_threads, sd_type_name(sd_ctx_params->wtype), sd_rng_type_name(sd_ctx_params->rng_type), @@ -3913,7 +3884,7 @@ static std::optional prepare_image_generation_latents(sd } } - if (!control_image_tensor.empty() && !sd_ctx->sd->vae_decode_only) { + if (!control_image_tensor.empty()) { control_latent = sd_ctx->sd->encode_first_stage(control_image_tensor); if (control_latent.empty()) { LOG_ERROR("failed to encode control image"); @@ -4255,11 +4226,6 @@ static sd::Tensor upscale_hires_latent(sd_ctx_t* sd_ctx, } else if (request.hires.upscaler == SD_HIRES_UPSCALER_MODEL || request.hires.upscaler == SD_HIRES_UPSCALER_LANCZOS || request.hires.upscaler == SD_HIRES_UPSCALER_NEAREST) { - if (sd_ctx->sd->vae_decode_only) { - LOG_ERROR("hires %s upscaler requires VAE encoder weights; create the context with vae_decode_only=false", - sd_hires_upscaler_name(request.hires.upscaler)); - return {}; - } if (request.hires.upscaler == SD_HIRES_UPSCALER_MODEL && upscaler == nullptr) { LOG_ERROR("hires model upscaler context is null"); return {}; @@ -4607,11 +4573,6 @@ static std::optional prepare_video_generation_latents(sd } if (!start_image.empty() || !end_image.empty()) { - if (sd_ctx->sd->vae_decode_only) { - LOG_ERROR("LTXAV image conditioning requires VAE encoder weights; create the context with vae_decode_only=false"); - return std::nullopt; - } - if (!start_image.empty() && !end_image.empty()) { LOG_INFO("FLF2V"); } else if (!start_image.empty()) { @@ -5076,11 +5037,6 @@ static bool apply_ltxv_refine_image_conditioning(sd_ctx_t* sd_ctx, sd_vid_gen_params->end_image.data == nullptr) { return true; } - if (sd_ctx->sd->vae_decode_only) { - LOG_ERROR("LTXV refine image conditioning requires VAE encoder weights; create the context with vae_decode_only=false"); - return false; - } - constexpr float conditioning_strength = 1.f; int latent_channels = sd_ctx->sd->get_latent_channel(); sd::Tensor video_latent = *latent;