@@ -2432,12 +2432,17 @@ struct GGMLRunner {
24322432 GGML_ASSERT (gf != nullptr );
24332433
24342434 // Keep the plan and resident params under the same live-VRAM cap.
2435+ // Add back our own resident buffer so we don't see chunk-K's
2436+ // allocation as "taken" VRAM and shrink the budget on every step.
24352437 size_t effective_budget = max_graph_vram_bytes;
24362438 if (stream_layers_enabled && max_graph_vram_bytes > 0 && runtime_backend != nullptr ) {
24372439 ggml_backend_dev_t dev = ggml_backend_get_device (runtime_backend);
24382440 if (dev != nullptr && ggml_backend_dev_type (dev) != GGML_BACKEND_DEVICE_TYPE_CPU ) {
24392441 size_t free_vram = 0 , total_vram = 0 ;
24402442 ggml_backend_dev_memory (dev, &free_vram, &total_vram);
2443+ if (resident_runtime_params_buffer != nullptr ) {
2444+ free_vram += ggml_backend_buffer_get_size (resident_runtime_params_buffer);
2445+ }
24412446 constexpr size_t safety_margin = 512ull * 1024 * 1024 ;
24422447 size_t free_clamp = (free_vram > safety_margin) ? (free_vram - safety_margin) : 0 ;
24432448 if (free_clamp < effective_budget) {
@@ -2815,39 +2820,36 @@ struct GGMLRunner {
28152820 bool no_return = false ) {
28162821 GGML_ASSERT (gf != nullptr );
28172822
2818- // Runtime LoRA mutates CPU weights between calls, so resident GPU
2819- // copies would go stale.
2820- if (weight_adapter != nullptr ) {
2821- restore_resident_params ();
2822- } else {
2823- sd::ggml_graph_cut::Plan& base_plan = graph_cut_plan_cache_.graph_cut_plan ;
2824- if (base_plan.available ) {
2825- sd::ggml_graph_cut::annotate_residency (base_plan, residency_budget_bytes);
2826-
2827- std::vector<ggml_tensor*> resident_params;
2828- uint64_t token = 0 ;
2829- for (const auto & segment : base_plan.segments ) {
2830- if (segment.residency != sd::ggml_graph_cut::SegmentResidency::RESIDENT ) {
2823+ // Runtime LoRA composes `weight + diff` in the compute graph via
2824+ // ggml_add; the resident weight tensor's data is never mutated, so
2825+ // chunk-K residency stays valid across sampling steps.
2826+ sd::ggml_graph_cut::Plan& base_plan = graph_cut_plan_cache_.graph_cut_plan ;
2827+ if (base_plan.available ) {
2828+ sd::ggml_graph_cut::annotate_residency (base_plan, residency_budget_bytes);
2829+
2830+ std::vector<ggml_tensor*> resident_params;
2831+ uint64_t token = 0 ;
2832+ for (const auto & segment : base_plan.segments ) {
2833+ if (segment.residency != sd::ggml_graph_cut::SegmentResidency::RESIDENT ) {
2834+ continue ;
2835+ }
2836+ auto seg_params = sd::ggml_graph_cut::param_tensors (gf, segment);
2837+ for (ggml_tensor* t : seg_params) {
2838+ if (t == nullptr )
28312839 continue ;
2832- }
2833- auto seg_params = sd::ggml_graph_cut::param_tensors (gf, segment);
2834- for (ggml_tensor* t : seg_params) {
2835- if (t == nullptr )
2836- continue ;
2837- resident_params.push_back (t);
2838- token ^= reinterpret_cast <uintptr_t >(t) * 0x9E3779B97F4A7C15ull ;
2839- }
2840+ resident_params.push_back (t);
2841+ token ^= reinterpret_cast <uintptr_t >(t) * 0x9E3779B97F4A7C15ull ;
28402842 }
2841- if (token != resident_state_token) {
2842- restore_resident_params ();
2843- if (!resident_params. empty ()) {
2844- if (offload_resident_params ( resident_params)) {
2845- resident_state_token = token;
2846- } else {
2847- LOG_ERROR ( " %s chunk-K: resident offload failed; continuing with per-segment streaming " ,
2848- get_desc (). c_str ());
2849- restore_resident_params ( );
2850- }
2843+ }
2844+ if (token != resident_state_token) {
2845+ restore_resident_params ();
2846+ if (! resident_params. empty ( )) {
2847+ if ( offload_resident_params (resident_params)) {
2848+ resident_state_token = token;
2849+ } else {
2850+ LOG_ERROR ( " %s chunk-K: resident offload failed; continuing with per-segment streaming " ,
2851+ get_desc (). c_str () );
2852+ restore_resident_params ();
28512853 }
28522854 }
28532855 }
0 commit comments