Skip to content

Commit d13cf71

Browse files
pwilkin and claude committed
feat: auto-fit component placement and per-component backend devices
Add an auto-fit planner that picks DiT / VAE / Conditioner device placements from free GPU memory, treating each component as atomic (no intra-tensor row split — equivalent to llama.cpp's LLAMA_SPLIT_MODE_LAYER at component granularity, so views never land on a split buffer and no ggml patch is needed).

Also adopt the PR #1184 CLI conventions:
- new: --main-backend-device, --diffusion-backend-device, --clip-backend-device, --vae-backend-device, --control-net-backend-device, --tae-backend-device, --upscaler-backend-device, --photomaker-backend-device, --vision-backend-device, --list-devices
- removed: --clip-on-cpu, --vae-on-cpu, --control-net-cpu (and the matching keep_*_on_cpu fields on sd_ctx_params_t)

Auto-fit knobs: --auto-fit / --no-auto-fit, --no-multi-gpu, --fit-target, --fit-compute-reserve-{dit,vae,cond}, --fit-dry-run.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent f3fd359 commit d13cf71

7 files changed

Lines changed: 694 additions & 60 deletions

File tree

examples/common/common.cpp

Lines changed: 55 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -435,6 +435,23 @@ ArgOptions SDContextParams::get_options() {
435435
"--chroma-t5-mask-pad",
436436
"t5 mask pad size of chroma",
437437
&chroma_t5_mask_pad},
438+
{"",
439+
"--fit-target",
440+
"auto-fit: MiB of free memory to leave on each GPU (default: 512)",
441+
&auto_fit_target_mb},
442+
{"",
443+
"--fit-compute-reserve-dit",
444+
"auto-fit: MiB reserved on the DiT's GPU for its compute buffer "
445+
"(0 keeps the built-in default)",
446+
&auto_fit_compute_reserve_dit_mb},
447+
{"",
448+
"--fit-compute-reserve-vae",
449+
"auto-fit: MiB reserved on the VAE's GPU for its compute buffer",
450+
&auto_fit_compute_reserve_vae_mb},
451+
{"",
452+
"--fit-compute-reserve-cond",
453+
"auto-fit: MiB reserved on the conditioner's GPU for its compute buffer",
454+
&auto_fit_compute_reserve_cond_mb},
438455
};
439456

440457
options.float_options = {
@@ -461,18 +478,6 @@ ArgOptions SDContextParams::get_options() {
461478
"--mmap",
462479
"whether to memory-map model",
463480
true, &enable_mmap},
464-
{"",
465-
"--control-net-cpu",
466-
"keep controlnet in cpu (for low vram)",
467-
true, &control_net_cpu},
468-
{"",
469-
"--clip-on-cpu",
470-
"keep clip in cpu (for low vram)",
471-
true, &clip_on_cpu},
472-
{"",
473-
"--vae-on-cpu",
474-
"keep vae in cpu (for low vram)",
475-
true, &vae_on_cpu},
476481
{"",
477482
"--fa",
478483
"use flash attention",
@@ -513,6 +518,24 @@ ArgOptions SDContextParams::get_options() {
513518
"--chroma-enable-t5-mask",
514519
"enable t5 mask for chroma",
515520
true, &chroma_use_t5_mask},
521+
{"",
522+
"--auto-fit",
523+
"automatically pick DiT/VAE/Conditioner device placements based on "
524+
"free GPU memory (default ON)",
525+
true, &auto_fit},
526+
{"",
527+
"--no-auto-fit",
528+
"disable auto-fit and use the explicit --backend / --params-backend flags",
529+
false, &auto_fit},
530+
{"",
531+
"--no-multi-gpu",
532+
"auto-fit: keep all components on a single GPU when they fit "
533+
"(by default, multi-GPU placements are preferred to balance load)",
534+
false, &auto_multi_gpu},
535+
{"",
536+
"--fit-dry-run",
537+
"auto-fit: print the computed plan and exit without loading models",
538+
true, &auto_fit_dry_run},
516539
};
517540

518541
auto on_type_arg = [&](int argc, const char** argv, int index) {
@@ -611,6 +634,15 @@ ArgOptions SDContextParams::get_options() {
611634
"but it usually offers faster inference speed and, in some cases, lower memory usage. "
612635
"The at_runtime mode, on the other hand, is exactly the opposite.",
613636
on_lora_apply_mode_arg},
637+
{"",
638+
"--list-devices",
639+
"list available ggml backend devices (one per line, "
640+
"name<TAB>description) and exit",
641+
[](int /*argc*/, const char** /*argv*/, int /*index*/) {
642+
sd_list_devices();
643+
std::exit(0);
644+
return 0;
645+
}},
614646
};
615647

616648
return options;
@@ -736,9 +768,10 @@ std::string SDContextParams::to_string() const {
736768
<< " backend: \"" << backend << "\",\n"
737769
<< " params_backend: \"" << params_backend << "\",\n"
738770
<< " enable_mmap: " << (enable_mmap ? "true" : "false") << ",\n"
739-
<< " control_net_cpu: " << (control_net_cpu ? "true" : "false") << ",\n"
740-
<< " clip_on_cpu: " << (clip_on_cpu ? "true" : "false") << ",\n"
741-
<< " vae_on_cpu: " << (vae_on_cpu ? "true" : "false") << ",\n"
771+
<< " auto_fit: " << (auto_fit ? "true" : "false") << ",\n"
772+
<< " auto_fit_target_mb: " << auto_fit_target_mb << ",\n"
773+
<< " auto_fit_dry_run: " << (auto_fit_dry_run ? "true" : "false") << ",\n"
774+
<< " auto_multi_gpu: " << (auto_multi_gpu ? "true" : "false") << ",\n"
742775
<< " flash_attn: " << (flash_attn ? "true" : "false") << ",\n"
743776
<< " diffusion_flash_attn: " << (diffusion_flash_attn ? "true" : "false") << ",\n"
744777
<< " diffusion_conv_direct: " << (diffusion_conv_direct ? "true" : "false") << ",\n"
@@ -797,9 +830,6 @@ sd_ctx_params_t SDContextParams::to_sd_ctx_params_t(bool vae_decode_only, bool f
797830
lora_apply_mode,
798831
offload_params_to_cpu,
799832
enable_mmap,
800-
clip_on_cpu,
801-
control_net_cpu,
802-
vae_on_cpu,
803833
flash_attn,
804834
diffusion_flash_attn,
805835
taesd_preview,
@@ -817,6 +847,13 @@ sd_ctx_params_t SDContextParams::to_sd_ctx_params_t(bool vae_decode_only, bool f
817847
stream_layers,
818848
backend.c_str(),
819849
params_backend.c_str(),
850+
auto_fit,
851+
auto_fit_target_mb,
852+
auto_fit_dry_run,
853+
auto_fit_compute_reserve_dit_mb,
854+
auto_fit_compute_reserve_vae_mb,
855+
auto_fit_compute_reserve_cond_mb,
856+
auto_multi_gpu,
820857
};
821858
return sd_ctx_params;
822859
}

examples/common/common.h

Lines changed: 15 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -148,14 +148,11 @@ struct SDContextParams {
148148
bool stream_layers = false;
149149
std::string backend;
150150
std::string params_backend;
151-
bool enable_mmap = false;
152-
bool control_net_cpu = false;
153-
bool clip_on_cpu = false;
154-
bool vae_on_cpu = false;
155-
bool flash_attn = false;
156-
bool diffusion_flash_attn = false;
157-
bool diffusion_conv_direct = false;
158-
bool vae_conv_direct = false;
151+
bool enable_mmap = false;
152+
bool flash_attn = false;
153+
bool diffusion_flash_attn = false;
154+
bool diffusion_conv_direct = false;
155+
bool vae_conv_direct = false;
159156

160157
bool circular = false;
161158
bool circular_x = false;
@@ -167,6 +164,16 @@ struct SDContextParams {
167164

168165
bool qwen_image_zero_cond_t = false;
169166

167+
// Auto-fit defaults — placement is computed automatically based on free
168+
// VRAM. Pass --no-auto-fit to disable and use explicit *-backend-device.
169+
bool auto_fit = true;
170+
int auto_fit_target_mb = 512;
171+
bool auto_fit_dry_run = false;
172+
int auto_fit_compute_reserve_dit_mb = 0;
173+
int auto_fit_compute_reserve_vae_mb = 0;
174+
int auto_fit_compute_reserve_cond_mb = 0;
175+
bool auto_multi_gpu = true;
176+
170177
prediction_t prediction = PREDICTION_COUNT;
171178
lora_apply_mode_t lora_apply_mode = LORA_APPLY_AUTO;
172179

include/stable-diffusion.h

Lines changed: 27 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -206,9 +206,6 @@ typedef struct {
206206
enum lora_apply_mode_t lora_apply_mode;
207207
bool offload_params_to_cpu;
208208
bool enable_mmap;
209-
bool keep_clip_on_cpu;
210-
bool keep_control_net_on_cpu;
211-
bool keep_vae_on_cpu;
212209
bool flash_attn;
213210
bool diffusion_flash_attn;
214211
bool tae_preview_only;
@@ -226,6 +223,28 @@ typedef struct {
226223
bool stream_layers; // Enable residency+prefetch streaming on top of --max-vram (no effect without --max-vram)
227224
const char* backend;
228225
const char* params_backend;
226+
227+
// Auto-fit: pick DiT/VAE/Conditioner devices based on free GPU memory.
228+
// When `auto_fit` is true (default), `backend` / `params_backend` are
229+
// ignored and the placement is computed automatically (the plan is fed
230+
// into the same backend assignment that `backend` / `params_backend` use).
231+
// `auto_fit_target_mb` is the memory to leave free per GPU (default 512).
232+
// `auto_fit_dry_run` prints the plan and aborts init before loading.
233+
// `auto_fit_compute_reserve_{dit,vae,cond}_mb` let the user tune the
234+
// per-component compute-buffer reserve; 0 means use the built-in default.
235+
bool auto_fit;
236+
int auto_fit_target_mb;
237+
bool auto_fit_dry_run;
238+
int auto_fit_compute_reserve_dit_mb;
239+
int auto_fit_compute_reserve_vae_mb;
240+
int auto_fit_compute_reserve_cond_mb;
241+
242+
// When more than one GPU device is present, prefer placing different
243+
// components on different GPUs to balance load and fit larger total
244+
// working sets. Set false to keep all components on a single GPU when
245+
// they fit. Defaults to true. Each component still lives entirely on
246+
// one device — no intra-tensor row split.
247+
bool auto_multi_gpu;
229248
} sd_ctx_params_t;
230249

231250
typedef struct {
@@ -491,6 +510,11 @@ SD_API bool preprocess_canny(sd_image_t image,
491510
SD_API const char* sd_commit(void);
492511
SD_API const char* sd_version(void);
493512

513+
// List available ggml backend devices to stdout, in `name<TAB>description<NL>`
514+
// per-line format. The output is intended to be parsed by tools and used as
515+
// device names in the --backend / --params-backend assignment specs.
516+
SD_API void sd_list_devices(void);
517+
494518
#ifdef __cplusplus
495519
}
496520
#endif

0 commit comments

Comments
 (0)