Skip to content

Commit 5401fb1

Browse files
committed
Keep chunk-K residency engaged with runtime LoRA
1 parent 2d40a8b commit 5401fb1

1 file changed

Lines changed: 33 additions & 31 deletions

File tree

src/ggml_extend.hpp

Lines changed: 33 additions & 31 deletions
Original file line number | Diff line number | Diff line change
@@ -2432,12 +2432,17 @@ struct GGMLRunner {
24322432
GGML_ASSERT(gf != nullptr);
24332433

24342434
// Keep the plan and resident params under the same live-VRAM cap.
2435+
// Add back our own resident buffer so we don't see chunk-K's
2436+
// allocation as "taken" VRAM and shrink the budget on every step.
24352437
size_t effective_budget = max_graph_vram_bytes;
24362438
if (stream_layers_enabled && max_graph_vram_bytes > 0 && runtime_backend != nullptr) {
24372439
ggml_backend_dev_t dev = ggml_backend_get_device(runtime_backend);
24382440
if (dev != nullptr && ggml_backend_dev_type(dev) != GGML_BACKEND_DEVICE_TYPE_CPU) {
24392441
size_t free_vram = 0, total_vram = 0;
24402442
ggml_backend_dev_memory(dev, &free_vram, &total_vram);
2443+
if (resident_runtime_params_buffer != nullptr) {
2444+
free_vram += ggml_backend_buffer_get_size(resident_runtime_params_buffer);
2445+
}
24412446
constexpr size_t safety_margin = 512ull * 1024 * 1024;
24422447
size_t free_clamp = (free_vram > safety_margin) ? (free_vram - safety_margin) : 0;
24432448
if (free_clamp < effective_budget) {
@@ -2815,39 +2820,36 @@ struct GGMLRunner {
28152820
bool no_return = false) {
28162821
GGML_ASSERT(gf != nullptr);
28172822

2818-
// Runtime LoRA mutates CPU weights between calls, so resident GPU
2819-
// copies would go stale.
2820-
if (weight_adapter != nullptr) {
2821-
restore_resident_params();
2822-
} else {
2823-
sd::ggml_graph_cut::Plan& base_plan = graph_cut_plan_cache_.graph_cut_plan;
2824-
if (base_plan.available) {
2825-
sd::ggml_graph_cut::annotate_residency(base_plan, residency_budget_bytes);
2826-
2827-
std::vector<ggml_tensor*> resident_params;
2828-
uint64_t token = 0;
2829-
for (const auto& segment : base_plan.segments) {
2830-
if (segment.residency != sd::ggml_graph_cut::SegmentResidency::RESIDENT) {
2823+
// Runtime LoRA composes `weight + diff` in the compute graph via
2824+
// ggml_add; the resident weight tensor's data is never mutated, so
2825+
// chunk-K residency stays valid across sampling steps.
2826+
sd::ggml_graph_cut::Plan& base_plan = graph_cut_plan_cache_.graph_cut_plan;
2827+
if (base_plan.available) {
2828+
sd::ggml_graph_cut::annotate_residency(base_plan, residency_budget_bytes);
2829+
2830+
std::vector<ggml_tensor*> resident_params;
2831+
uint64_t token = 0;
2832+
for (const auto& segment : base_plan.segments) {
2833+
if (segment.residency != sd::ggml_graph_cut::SegmentResidency::RESIDENT) {
2834+
continue;
2835+
}
2836+
auto seg_params = sd::ggml_graph_cut::param_tensors(gf, segment);
2837+
for (ggml_tensor* t : seg_params) {
2838+
if (t == nullptr)
28312839
continue;
2832-
}
2833-
auto seg_params = sd::ggml_graph_cut::param_tensors(gf, segment);
2834-
for (ggml_tensor* t : seg_params) {
2835-
if (t == nullptr)
2836-
continue;
2837-
resident_params.push_back(t);
2838-
token ^= reinterpret_cast<uintptr_t>(t) * 0x9E3779B97F4A7C15ull;
2839-
}
2840+
resident_params.push_back(t);
2841+
token ^= reinterpret_cast<uintptr_t>(t) * 0x9E3779B97F4A7C15ull;
28402842
}
2841-
if (token != resident_state_token) {
2842-
restore_resident_params();
2843-
if (!resident_params.empty()) {
2844-
if (offload_resident_params(resident_params)) {
2845-
resident_state_token = token;
2846-
} else {
2847-
LOG_ERROR("%s chunk-K: resident offload failed; continuing with per-segment streaming",
2848-
get_desc().c_str());
2849-
restore_resident_params();
2850-
}
2843+
}
2844+
if (token != resident_state_token) {
2845+
restore_resident_params();
2846+
if (!resident_params.empty()) {
2847+
if (offload_resident_params(resident_params)) {
2848+
resident_state_token = token;
2849+
} else {
2850+
LOG_ERROR("%s chunk-K: resident offload failed; continuing with per-segment streaming",
2851+
get_desc().c_str());
2852+
restore_resident_params();
28512853
}
28522854
}
28532855
}

0 commit comments

Comments
 (0)