@@ -200,6 +200,18 @@ class StableDiffusionGGML {
200200
201201 std::map<std::string, ggml_tensor*> tensors;
202202
203+ // --- Sequential (lazy) component loading -------------------------------
204+ // Load the conditioner, run it, free it, THEN allocate + load the diffusion
205+ // model. Cuts peak device memory from sum(cond, DiT, VAE) to ~max(cond,
206+ // DiT+VAE), so the fast text-encoder-on-GPU recipe fits cards that can't
207+ // hold all three at once. Opt-in (--sequential-load), single diffusion model
208+ // only, and backend-agnostic (Vulkan/CPU/CUDA).
// State machine, as used by the code in this class:
//   - seq_load_requested is copied from sd_ctx_params->sequential_load during init.
//   - seq_load is then computed as: seq_load_requested && diffusion_model is set
//     && no high_noise_diffusion_model. When it is true, init skips the
//     diffusion model's alloc_params_buffer() and excludes deferred_dit_keys
//     from the first load_tensors() pass.
//   - dit_params_loaded defaults to true so the non-sequential path never
//     triggers a deferred load; it is set to false when the load is deferred
//     and back to true by ensure_diffusion_model_loaded() on success.
209+ bool seq_load_requested = false ; // requested via --sequential-load
210+ bool seq_load = false ; // effective (requested && single DiT)
211+ bool dit_params_loaded = true ; // false while DiT load is deferred
// Filled from diffusion_model->get_param_tensors() when seq_load is enabled;
// read again by the deferred load to select entries out of `tensors`.
212+ std::set<std::string> deferred_dit_keys; // diffusion-model tensor keys to load later
// Previously a local of the init routine; now a member so it outlives init.
// NOTE(review): this keeps the loader alive for the whole lifetime of the
// context even when sequential load is not requested — confirm that any
// resources it holds are acceptable to retain.
213+ ModelLoader model_loader; // retained so the deferred DiT load can read the file
214+
203215 // lora_name => multiplier
204216 std::unordered_map<std::string, float > curr_lora_state;
205217
@@ -293,7 +305,9 @@ class StableDiffusionGGML {
293305 }
294306 max_vram = sd::ggml_graph_cut::resolve_max_vram_gib (max_vram, backend_for (SDBackendModule::DIFFUSION ));
295307
296- ModelLoader model_loader;
308+ // model_loader is retained as a member so the deferred diffusion-model
309+ // load can read the file after the conditioner has run + been freed.
310+ seq_load_requested = sd_ctx_params->sequential_load ;
297311
298312 if (strlen (SAFE_STR (sd_ctx_params->model_path )) > 0 ) {
299313 LOG_INFO (" loading model from '%s'" , sd_ctx_params->model_path );
@@ -774,6 +788,21 @@ class StableDiffusionGGML {
774788 get_param_tensors (high_noise_diffusion_model, module_can_mmap (SDBackendModule::DIFFUSION ));
775789 }
776790
791+ // Sequential load only applies to the single-diffusion-model case (no
792+ // high-noise model). Capture the DiT tensor keys so we can skip its
793+ // up-front alloc/load and bring it in after the conditioner is freed.
794+ seq_load = seq_load_requested && diffusion_model && !high_noise_diffusion_model;
795+ if (seq_load) {
796+ std::map<std::string, ggml_tensor*> dit_temp;
797+ diffusion_model->get_param_tensors (dit_temp);
798+ for (const auto & [k, t] : dit_temp) {
799+ deferred_dit_keys.insert (k);
800+ }
801+ dit_params_loaded = false ;
802+ LOG_INFO (" sequential load: deferring %zu diffusion-model tensors until after conditioning" ,
803+ deferred_dit_keys.size ());
804+ }
805+
777806 if (!ensure_backend_pair (SDBackendModule::VAE )) {
778807 return false ;
779808 }
@@ -1048,7 +1077,7 @@ class StableDiffusionGGML {
10481077 ggml_free (ctx);
10491078 return false ;
10501079 }
1051- if (diffusion_model && !diffusion_model->alloc_params_buffer ()) {
1080+ if (!seq_load && diffusion_model && !diffusion_model->alloc_params_buffer ()) {
10521081 LOG_ERROR (" Diffusion model params buffer allocation failed" );
10531082 ggml_free (ctx);
10541083 return false ;
@@ -1081,7 +1110,19 @@ class StableDiffusionGGML {
10811110 }
10821111 }
10831112
1084- bool success = model_loader.load_tensors (tensors, ignore_tensors, n_threads, sd_ctx_params->enable_mmap );
1113+ bool success;
1114+ if (seq_load) {
1115+ // First pass: load everything except the deferred diffusion-model tensors.
1116+ std::map<std::string, ggml_tensor*> first_tensors = tensors;
1117+ for (const auto & k : deferred_dit_keys) {
1118+ first_tensors.erase (k);
1119+ }
1120+ std::set<std::string> first_ignore = ignore_tensors;
1121+ first_ignore.insert (" model.diffusion_model." ); // deferred — loaded after conditioning
1122+ success = model_loader.load_tensors (first_tensors, first_ignore, n_threads, sd_ctx_params->enable_mmap );
1123+ } else {
1124+ success = model_loader.load_tensors (tensors, ignore_tensors, n_threads, sd_ctx_params->enable_mmap );
1125+ }
10851126 if (!success) {
10861127 LOG_ERROR (" load tensors from model loader failed" );
10871128 ggml_free (ctx);
@@ -1890,6 +1931,40 @@ class StableDiffusionGGML {
18901931 *controls = std::move (*control_result);
18911932 }
18921933
1934+ // Allocate + load the diffusion-model params that sequential loading deferred.
1935+ // No-op unless seq_load deferred them. Reads the retained model_loader, so the
1936+ // model file(s) are re-opened for the DiT tensors only (conditioner/VAE ignored).
//
// Returns true if dit_params_loaded is already set, or once the params buffer
// has been allocated and the deferred tensors have been loaded. Returns false
// (after LOG_ERROR) if alloc_params_buffer() or load_tensors() fails; in that
// case dit_params_loaded is left false.
//
// NOTE(review): if load_tensors() fails after alloc_params_buffer() succeeded,
// nothing here releases the buffer, and because dit_params_loaded stays false
// the next call runs alloc_params_buffer() again on the same runner. Confirm
// that a repeated allocation is safe, or release the buffer on that path.
// NOTE(review): diffusion_model is dereferenced without a null check. The
// deferral code only clears dit_params_loaded when diffusion_model is set,
// so this presumably cannot be null here — verify no other writer exists.
1937+ bool ensure_diffusion_model_loaded () {
// Fast path: nothing was deferred, or the deferred load already completed.
1938+ if (dit_params_loaded) {
1939+ return true ;
1940+ }
// Start time, used only for the elapsed-time log line at the end.
1941+ int64_t t0 = ggml_time_ms ();
1942+ if (!diffusion_model->alloc_params_buffer ()) {
1943+ LOG_ERROR (" sequential load: diffusion model params buffer allocation failed" );
1944+ return false ;
1945+ }
// Build the subset of `tensors` whose keys were recorded as deferred.
// NOTE(review): keys absent from `tensors` are skipped silently, so a key
// mismatch would leave those tensors unloaded without any error — consider
// comparing dit_tensors.size() against deferred_dit_keys.size().
1946+ std::map<std::string, ggml_tensor*> dit_tensors;
1947+ for (const auto & k : deferred_dit_keys) {
1948+ auto it = tensors.find (k);
1949+ if (it != tensors.end ()) {
1950+ dit_tensors[k] = it->second ;
1951+ }
1952+ }
1953+ // Ignore the (already-loaded) non-DiT components so they don't log as unknown.
// NOTE(review): the last two entries have no trailing '.', unlike the
// others — presumably they are matched as prefixes; confirm against
// load_tensors().
1954+ std::set<std::string> ignore = {
1955+ " text_encoders." , " cond_stage_model." , " first_stage_model." ,
1956+ " vae." , " audio_vae" , " alphas_cumprod" ,
1957+ };
// The last argument is hard-coded to false here, whereas the up-front load
// passes sd_ctx_params->enable_mmap in the same position — presumably mmap
// is disabled for the deferred pass; TODO confirm this is intentional.
1958+ if (!model_loader.load_tensors (dit_tensors, ignore, n_threads, false )) {
1959+ LOG_ERROR (" sequential load: deferred diffusion model tensor load failed" );
1960+ return false ;
1961+ }
// Mark the load complete so later calls take the fast path.
1962+ dit_params_loaded = true ;
// Elapsed time is converted from milliseconds to seconds for the log.
1963+ LOG_INFO (" sequential load: diffusion model allocated + loaded in %.2fs" ,
1964+ (ggml_time_ms () - t0) * 1 .0f / 1000 );
1965+ return true ;
1966+ }
1967+
18931968 sd::Tensor<float > sample (const std::shared_ptr<DiffusionModelRunner>& work_diffusion_model,
18941969 bool inverse_noise_scaling,
18951970 const sd::Tensor<float >& init_latent,
@@ -1915,6 +1990,12 @@ class StableDiffusionGGML {
19151990 float frame_rate,
19161991 const sd_cache_params_t * cache_params,
19171992 const sd::Tensor<float >& video_positions = {}) {
1993+ // Sequential load: bring in the diffusion model now (after the conditioner
1994+ // has run and freed its buffer), just before the first denoise step.
1995+ if (work_diffusion_model == diffusion_model && !ensure_diffusion_model_loaded ()) {
1996+ LOG_ERROR (" sequential load: diffusion model not available for sampling" );
1997+ return {};
1998+ }
19181999 std::vector<int > skip_layers (guidance.slg .layers , guidance.slg .layers + guidance.slg .layer_count );
19192000 float cfg_scale = guidance.txt_cfg ;
19202001 float img_cfg_scale = guidance.img_cfg ;
@@ -2703,6 +2784,7 @@ void sd_ctx_params_init(sd_ctx_params_t* sd_ctx_params) {
27032784 sd_ctx_params->vae_format = SD_VAE_FORMAT_AUTO ;
27042785 sd_ctx_params->backend = nullptr ;
27052786 sd_ctx_params->params_backend = nullptr ;
2787+ sd_ctx_params->sequential_load = false ;
27062788}
27072789
27082790char * sd_ctx_params_to_str (const sd_ctx_params_t * sd_ctx_params) {
0 commit comments