Skip to content

Commit 2805e92

Browse files
Mark Caldwell and claude committed
feat: optional sequential component loading (--sequential-load)
Load the conditioner, run it, free it, then allocate and load the diffusion model -- instead of holding all components resident at once. Lowers peak device memory from ~sum(conditioner, diffusion, VAE) to ~max(conditioner, diffusion + VAE), so the fast "text encoder on GPU" path fits memory-constrained cards that otherwise cannot hold all three simultaneously. Opt-in via --sequential-load (default off; no behavior change otherwise). Single diffusion model only (skipped when a high-noise/refiner model is also present). Backend-agnostic -- implemented in stable-diffusion.cpp using the existing alloc_params_buffer() / ModelLoader::load_tensors(), with no backend patches. Validated on Strix Halo 8060S (Vulkan, LTX-2) and RX 6700 XT (RDNA2, 12GB): bit-identical output to the default path at a fixed seed, and the flag-off path is byte-identical to before. Peak device memory on RDNA2 (Flux Schnell Q4, 512^2) drops 9.78 -> 7.22 GB with no perf regression. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
1 parent 19bdfe2 commit 2805e92

4 files changed

Lines changed: 92 additions & 3 deletions

File tree

examples/common/common.cpp

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -473,6 +473,10 @@ ArgOptions SDContextParams::get_options() {
473473
"--vae-on-cpu",
474474
"keep vae in cpu (for low vram)",
475475
true, &vae_on_cpu},
476+
{"",
477+
"--sequential-load",
478+
"load the conditioner, run it, free it, then load the diffusion model; lowers peak device memory so the text-encoder-on-GPU path fits smaller cards (single diffusion model only)",
479+
true, &sequential_load},
476480
{"",
477481
"--fa",
478482
"use flash attention",
@@ -817,6 +821,7 @@ sd_ctx_params_t SDContextParams::to_sd_ctx_params_t(bool vae_decode_only, bool f
817821
stream_layers,
818822
backend.c_str(),
819823
params_backend.c_str(),
824+
sequential_load,
820825
};
821826
return sd_ctx_params;
822827
}

examples/common/common.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -152,6 +152,7 @@ struct SDContextParams {
152152
bool control_net_cpu = false;
153153
bool clip_on_cpu = false;
154154
bool vae_on_cpu = false;
155+
bool sequential_load = false;
155156
bool flash_attn = false;
156157
bool diffusion_flash_attn = false;
157158
bool diffusion_conv_direct = false;

include/stable-diffusion.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -226,6 +226,7 @@ typedef struct {
226226
bool stream_layers; // Enable residency+prefetch streaming on top of --max-vram (no effect without --max-vram)
227227
const char* backend;
228228
const char* params_backend;
229+
bool sequential_load; // load conditioner -> run -> free -> then load the diffusion model (lowers peak device memory)
229230
} sd_ctx_params_t;
230231

231232
typedef struct {

src/stable-diffusion.cpp

Lines changed: 85 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -200,6 +200,18 @@ class StableDiffusionGGML {
200200

201201
std::map<std::string, ggml_tensor*> tensors;
202202

203+
// --- Sequential (lazy) component loading -------------------------------
204+
// Load the conditioner, run it, free it, THEN allocate + load the diffusion
205+
// model. Cuts peak device memory from sum(cond, DiT, VAE) to ~max(cond,
206+
// DiT+VAE), so the fast text-encoder-on-GPU recipe fits cards that can't
207+
// hold all three at once. Opt-in (--sequential-load), single diffusion model
208+
// only, and backend-agnostic (Vulkan/CPU/CUDA).
209+
bool seq_load_requested = false; // requested via --sequential-load
210+
bool seq_load = false; // effective (requested && single DiT)
211+
bool dit_params_loaded = true; // false while DiT load is deferred
212+
std::set<std::string> deferred_dit_keys; // diffusion-model tensor keys to load later
213+
ModelLoader model_loader; // retained so the deferred DiT load can read the file
214+
203215
// lora_name => multiplier
204216
std::unordered_map<std::string, float> curr_lora_state;
205217

@@ -293,7 +305,9 @@ class StableDiffusionGGML {
293305
}
294306
max_vram = sd::ggml_graph_cut::resolve_max_vram_gib(max_vram, backend_for(SDBackendModule::DIFFUSION));
295307

296-
ModelLoader model_loader;
308+
// model_loader is retained as a member so the deferred diffusion-model
309+
// load can read the file after the conditioner has run + been freed.
310+
seq_load_requested = sd_ctx_params->sequential_load;
297311

298312
if (strlen(SAFE_STR(sd_ctx_params->model_path)) > 0) {
299313
LOG_INFO("loading model from '%s'", sd_ctx_params->model_path);
@@ -774,6 +788,21 @@ class StableDiffusionGGML {
774788
get_param_tensors(high_noise_diffusion_model, module_can_mmap(SDBackendModule::DIFFUSION));
775789
}
776790

791+
// Sequential load only applies to the single-diffusion-model case (no
792+
// high-noise model). Capture the DiT tensor keys so we can skip its
793+
// up-front alloc/load and bring it in after the conditioner is freed.
794+
seq_load = seq_load_requested && diffusion_model && !high_noise_diffusion_model;
795+
if (seq_load) {
796+
std::map<std::string, ggml_tensor*> dit_temp;
797+
diffusion_model->get_param_tensors(dit_temp);
798+
for (const auto& [k, t] : dit_temp) {
799+
deferred_dit_keys.insert(k);
800+
}
801+
dit_params_loaded = false;
802+
LOG_INFO("sequential load: deferring %zu diffusion-model tensors until after conditioning",
803+
deferred_dit_keys.size());
804+
}
805+
777806
if (!ensure_backend_pair(SDBackendModule::VAE)) {
778807
return false;
779808
}
@@ -1048,7 +1077,7 @@ class StableDiffusionGGML {
10481077
ggml_free(ctx);
10491078
return false;
10501079
}
1051-
if (diffusion_model && !diffusion_model->alloc_params_buffer()) {
1080+
if (!seq_load && diffusion_model && !diffusion_model->alloc_params_buffer()) {
10521081
LOG_ERROR("Diffusion model params buffer allocation failed");
10531082
ggml_free(ctx);
10541083
return false;
@@ -1081,7 +1110,19 @@ class StableDiffusionGGML {
10811110
}
10821111
}
10831112

1084-
bool success = model_loader.load_tensors(tensors, ignore_tensors, n_threads, sd_ctx_params->enable_mmap);
1113+
bool success;
1114+
if (seq_load) {
1115+
// First pass: load everything except the deferred diffusion-model tensors.
1116+
std::map<std::string, ggml_tensor*> first_tensors = tensors;
1117+
for (const auto& k : deferred_dit_keys) {
1118+
first_tensors.erase(k);
1119+
}
1120+
std::set<std::string> first_ignore = ignore_tensors;
1121+
first_ignore.insert("model.diffusion_model."); // deferred — loaded after conditioning
1122+
success = model_loader.load_tensors(first_tensors, first_ignore, n_threads, sd_ctx_params->enable_mmap);
1123+
} else {
1124+
success = model_loader.load_tensors(tensors, ignore_tensors, n_threads, sd_ctx_params->enable_mmap);
1125+
}
10851126
if (!success) {
10861127
LOG_ERROR("load tensors from model loader failed");
10871128
ggml_free(ctx);
@@ -1890,6 +1931,40 @@ class StableDiffusionGGML {
18901931
*controls = std::move(*control_result);
18911932
}
18921933

1934+
// Allocate + load the diffusion-model params that sequential loading deferred.
1935+
// No-op unless seq_load deferred them. Reads the retained model_loader, so the
1936+
// model file(s) are re-opened for the DiT tensors only (conditioner/VAE ignored).
1937+
bool ensure_diffusion_model_loaded() {
1938+
if (dit_params_loaded) {
1939+
return true;
1940+
}
1941+
int64_t t0 = ggml_time_ms();
1942+
if (!diffusion_model->alloc_params_buffer()) {
1943+
LOG_ERROR("sequential load: diffusion model params buffer allocation failed");
1944+
return false;
1945+
}
1946+
std::map<std::string, ggml_tensor*> dit_tensors;
1947+
for (const auto& k : deferred_dit_keys) {
1948+
auto it = tensors.find(k);
1949+
if (it != tensors.end()) {
1950+
dit_tensors[k] = it->second;
1951+
}
1952+
}
1953+
// Ignore the (already-loaded) non-DiT components so they don't log as unknown.
1954+
std::set<std::string> ignore = {
1955+
"text_encoders.", "cond_stage_model.", "first_stage_model.",
1956+
"vae.", "audio_vae", "alphas_cumprod",
1957+
};
1958+
if (!model_loader.load_tensors(dit_tensors, ignore, n_threads, false)) {
1959+
LOG_ERROR("sequential load: deferred diffusion model tensor load failed");
1960+
return false;
1961+
}
1962+
dit_params_loaded = true;
1963+
LOG_INFO("sequential load: diffusion model allocated + loaded in %.2fs",
1964+
(ggml_time_ms() - t0) * 1.0f / 1000);
1965+
return true;
1966+
}
1967+
18931968
sd::Tensor<float> sample(const std::shared_ptr<DiffusionModelRunner>& work_diffusion_model,
18941969
bool inverse_noise_scaling,
18951970
const sd::Tensor<float>& init_latent,
@@ -1915,6 +1990,12 @@ class StableDiffusionGGML {
19151990
float frame_rate,
19161991
const sd_cache_params_t* cache_params,
19171992
const sd::Tensor<float>& video_positions = {}) {
1993+
// Sequential load: bring in the diffusion model now (after the conditioner
1994+
// has run and freed its buffer), just before the first denoise step.
1995+
if (work_diffusion_model == diffusion_model && !ensure_diffusion_model_loaded()) {
1996+
LOG_ERROR("sequential load: diffusion model not available for sampling");
1997+
return {};
1998+
}
19181999
std::vector<int> skip_layers(guidance.slg.layers, guidance.slg.layers + guidance.slg.layer_count);
19192000
float cfg_scale = guidance.txt_cfg;
19202001
float img_cfg_scale = guidance.img_cfg;
@@ -2703,6 +2784,7 @@ void sd_ctx_params_init(sd_ctx_params_t* sd_ctx_params) {
27032784
sd_ctx_params->vae_format = SD_VAE_FORMAT_AUTO;
27042785
sd_ctx_params->backend = nullptr;
27052786
sd_ctx_params->params_backend = nullptr;
2787+
sd_ctx_params->sequential_load = false;
27062788
}
27072789

27082790
char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params) {

0 commit comments

Comments
 (0)