From 49f9271148a8f45257d4c7ab5f1e0f0a96c7a1bd Mon Sep 17 00:00:00 2001
From: Rob Kim
Date: Tue, 1 Jul 2025 12:15:45 -0400
Subject: [PATCH] Implement context-length dependent, KV-cache and compute
 buffer aware layer distribution for heterogeneous multi-GPU inference

Solves the problem of running setups with mixed VRAM sizes (e.g. 24 GB cards
alongside 6 GB cards); previously, layers were assigned without accounting
for the compute buffer, so loading failed when one or more of the smaller
GPUs could not hold its compute buffer.

- Add a requested_n_ctx parameter to llama_model_params
- Implement a 3-pass allocation algorithm that accounts for compute buffers
- Exclude devices with insufficient memory (GPUs too small to hold
  1 layer + KV cache + compute buffer)
- Redistribute layers to make equitable use of the included GPUs (may not be
  truly optimal)
---
 common/common.cpp   |   1 +
 include/llama.h     |   3 +
 src/llama-model.cpp | 308 ++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 312 insertions(+)
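
Illustrative memory math (not part of the patch): assuming a hypothetical
32-layer model with n_head = n_head_kv = 32, n_embd_head_k = 128, an f16 KV
cache, and the default planning context of 32768 with n_ubatch = 512, the
formulas used by the code below give:

    KV cache total = 2 * n_ctx * n_layer * (n_embd_head_k * n_head_kv) * 2 bytes
                   = 2 * 32768 * 32 * 4096 * 2  = 16384 MB  (whole model)
    compute buffer = n_ctx * n_ubatch * n_head * sizeof(float)
                   = 32768 * 512 * 32 * 4       =  2048 MB  (per device)
    base overhead  = 512 MB                                 (per device)

Under these assumptions, a 6 GB card that cannot hold one layer of weights
plus its KV-cache share and roughly 2.5 GB of compute buffer and overhead is
excluded up front instead of failing at allocation time.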

diff --git a/common/common.cpp b/common/common.cpp
index e4e71ad13fb59..7a3b7e67c7597 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -1107,6 +1107,7 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
     mparams.use_mmap        = params.use_mmap;
     mparams.use_mlock       = params.use_mlock;
     mparams.check_tensors   = params.check_tensors;
+    mparams.requested_n_ctx = params.n_ctx;
 
     if (params.kv_overrides.empty()) {
         mparams.kv_overrides = NULL;
diff --git a/include/llama.h b/include/llama.h
index 3eda9bc68608c..084c550f86d14 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -322,6 +322,9 @@ extern "C" {
         // override key-value pairs of the model meta data
         const struct llama_model_kv_override * kv_overrides;
 
+        // expected context size for memory allocation planning (0 = auto)
+        uint32_t requested_n_ctx;
+
         // Keep the booleans together to avoid misalignment during copy-by-value.
         bool vocab_only;    // only load the vocabulary, no weights
         bool use_mmap;      // use mmap if possible
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index b15bf73c2a29a..4f6812576f92a 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -17,10 +17,12 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -1580,6 +1582,311 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
         splits[i] /= split_sum;
     }
 
+    // KV-cache aware layer distribution for heterogeneous GPUs
+    if (all_zero && n_devices() > 1 && split_mode == LLAMA_SPLIT_MODE_LAYER) {
+        // Determine context size for memory planning
+        uint32_t n_ctx_for_kv = 0;
+        if (params.requested_n_ctx > 0) {
+            // Use the explicitly requested context size from model params
+            n_ctx_for_kv = params.requested_n_ctx;
+            LLAMA_LOG_INFO("%s: Using requested_n_ctx=%u for KV cache calculation\n",
+                    __func__, n_ctx_for_kv);
+        } else {
+            // Use a conservative default for memory planning
+            n_ctx_for_kv = std::min(32768u, hparams.n_ctx_train);
+            LLAMA_LOG_INFO("%s: Using default n_ctx=%u for KV cache calculation (training context: %u)\n",
+                    __func__, n_ctx_for_kv, hparams.n_ctx_train);
+            LLAMA_LOG_INFO("%s: (set requested_n_ctx in model params to match your actual context size)\n", __func__);
+        }
+
+        // Only apply KV-aware distribution if we have a valid context size
+        if (n_ctx_for_kv > 0 && n_gpu_layers > 0) {
+            LLAMA_LOG_INFO("%s: Implementing KV-cache aware layer distribution\n", __func__);
+
+            // Calculate memory requirements per layer
+            const int64_t n_head_kv   = hparams.n_head_kv();
+            const int64_t n_embd_head = hparams.n_embd_head_k;
+            const int64_t n_embd_kv   = n_embd_head * n_head_kv;
+
+            // KV cache element size (typically f16 = 2 bytes, but can be quantized)
+            const size_t kv_size_element = 2; // sizeof(ggml_fp16_t)
+
+            // Total KV cache size for all layers (K and V)
+            // KV cache = 2 (K+V) * n_ctx * n_layers * n_embd_kv * element_size
+            const size_t kv_cache_size_total = 2ULL * n_ctx_for_kv * n_layer * n_embd_kv * kv_size_element;
+
+            // Estimate model weight size per layer
+            const size_t model_size_total      = ml.n_bytes;
+            const size_t weight_size_per_layer = model_size_total / n_layer;
+
+            // Calculate actual compute buffer size based on attention matrix requirements
+            // Attention matrix: n_kv × n_ubatch × n_head × sizeof(float)
+            // This is the dominant memory consumer during inference
+            const int64_t n_head   = hparams.n_head();
+            const size_t  n_ubatch = 512; // Default physical batch size (from context params)
+            const size_t  compute_buffer_size = n_ctx_for_kv * n_ubatch * n_head * sizeof(float);
+            const size_t  min_overhead        = 512ULL * 1024 * 1024; // 512 MB base overhead
+
+            LLAMA_LOG_INFO("%s: Compute buffer size: %.2f MB (context=%u, ubatch=%zu, heads=%lld)\n",
+                    __func__,
+                    compute_buffer_size / 1024.0 / 1024.0,
+                    n_ctx_for_kv, n_ubatch, (long long) n_head);
+
+            // For memory calculation, we need to account for the KV cache being shared across layers on each device
+            // We'll calculate this dynamically during layer assignment
+
+            LLAMA_LOG_INFO("%s: Per-layer memory: weights=%.2f MB\n",
+                    __func__,
+                    weight_size_per_layer / 1024.0 / 1024.0);
+            LLAMA_LOG_INFO("%s: Total KV cache size: %.2f MB\n",
+                    __func__,
+                    kv_cache_size_total / 1024.0 / 1024.0);
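+
+            // The distribution below proceeds in three passes:
+            //   1) Greedy pass: reserve the compute buffer and base overhead on each usable
+            //      device, then assign layers by weight size to the devices with the most
+            //      free memory.
+            //   2) KV-fit pass: each device's share of the KV cache is proportional to its
+            //      layer count, so re-check every device and shed layers until
+            //      weights + KV share + compute buffer + overhead fit in free memory.
+            //   3) Redistribution pass: hand any shed layers back to devices that still
+            //      have headroom.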
+
+            // Get memory info and calculate layer assignments
+            std::vector<int>    layers_per_gpu(n_devices(), 0);
+            std::vector<size_t> gpu_free_memory(n_devices());
+
+            // Get free memory for each device and check if they can handle compute buffers
+            std::vector<bool> device_excluded(n_devices(), false);
+            for (size_t i = 0; i < n_devices(); ++i) {
+                ggml_backend_dev_t dev = devices[i];
+                size_t total, free;
+                ggml_backend_dev_memory(dev, &free, &total);
+                gpu_free_memory[i] = free;
+
+                // Check if device can handle minimum requirements (1 layer + compute buffer + KV cache)
+                size_t min_kv_cache = kv_cache_size_total / n_devices(); // Conservative estimate
+                size_t min_required = weight_size_per_layer + min_kv_cache + compute_buffer_size + min_overhead;
+
+                if (free < min_required) {
+                    device_excluded[i] = true;
+                    LLAMA_LOG_WARN("%s: Device %zu [%s]: %.2f MB free - excluding (needs %.2f MB minimum)\n",
+                            __func__, i, ggml_backend_dev_name(dev),
+                            free / 1024.0 / 1024.0, min_required / 1024.0 / 1024.0);
+                }
+            }
+
+            // Estimate total memory requirements and warn if insufficient
+            size_t total_gpu_memory = 0;
+            for (size_t i = 0; i < n_devices(); ++i) {
+                total_gpu_memory += gpu_free_memory[i];
+            }
+
+            // Rough estimate: KV cache + model weights + compute buffers (conservative estimate)
+            size_t estimated_compute_buffers = kv_cache_size_total; // Compute buffers often similar to KV cache size
+            size_t estimated_total_needed    = kv_cache_size_total + model_size_total + estimated_compute_buffers;
+
+            if (estimated_total_needed > total_gpu_memory) {
+                LLAMA_LOG_WARN("%s: Memory estimate: %.2f GB needed vs %.2f GB available\n",
+                        __func__,
+                        estimated_total_needed / 1024.0 / 1024.0 / 1024.0,
+                        total_gpu_memory / 1024.0 / 1024.0 / 1024.0);
+                LLAMA_LOG_WARN("%s: Context size may be too large for available memory\n", __func__);
+            }
+
+            // Sort devices by available memory (largest first), excluding unusable devices
+            std::vector<size_t> gpu_indices;
+            for (size_t i = 0; i < n_devices(); ++i) {
+                if (!device_excluded[i]) {
+                    gpu_indices.push_back(i);
+                }
+            }
+            std::sort(gpu_indices.begin(), gpu_indices.end(),
+                    [&gpu_free_memory](size_t a, size_t b) {
+                        return gpu_free_memory[a] > gpu_free_memory[b];
+                    });
+
+            if (gpu_indices.empty()) {
+                LLAMA_LOG_ERROR("%s: No GPUs have sufficient memory for compute buffers\n", __func__);
+                // Fall back to original allocation
+                return true;
+            }
+
+            // Assign layers greedily to GPUs with most memory first
+            int act_gpu_layers   = n_gpu_layers; // Local copy that can be modified
+            int remaining_layers = act_gpu_layers;
+
+            // First pass: assign layers based on weights only (KV cache and compute buffers handled separately)
+            size_t weight_per_layer = weight_size_per_layer;
+
+            for (size_t idx : gpu_indices) {
+                // Reserve memory for compute buffer and base overhead
+                size_t reserved = compute_buffer_size + min_overhead;
+                if (gpu_free_memory[idx] <= reserved) {
+                    LLAMA_LOG_WARN("%s: Device %zu [%s]: %zu MB free, can't fit compute buffer (%.2f MB)\n",
+                            __func__, idx, ggml_backend_dev_name(devices[idx]),
+                            gpu_free_memory[idx] / 1024 / 1024,
+                            reserved / 1024.0 / 1024.0);
+                    continue;
+                }
+
+                size_t available_for_model = gpu_free_memory[idx] - reserved;
+                int layers_that_fit = available_for_model / weight_per_layer;
+
+                if (layers_that_fit > 0 && remaining_layers > 0) {
+                    int layers_to_assign = std::min(layers_that_fit, remaining_layers);
+                    layers_per_gpu[idx]  = layers_to_assign;
+                    remaining_layers    -= layers_to_assign;
+
+                    LLAMA_LOG_INFO("%s: Device %zu [%s]: %zu MB free, assigned %d layers (%.2f MB weights, %.2f MB compute buffer)\n",
+                            __func__, idx, ggml_backend_dev_name(devices[idx]),
+                            gpu_free_memory[idx] / 1024 / 1024,
+                            layers_per_gpu[idx],
+                            (layers_to_assign * weight_per_layer) / 1024.0 / 1024.0,
+                            compute_buffer_size / 1024.0 / 1024.0);
+                } else {
+                    LLAMA_LOG_WARN("%s: Device %zu [%s]: %zu MB free, assigned 0 layers (need %.2f MB per layer + %.2f MB compute buffer)\n",
+                            __func__, idx, ggml_backend_dev_name(devices[idx]),
+                            gpu_free_memory[idx] / 1024 / 1024,
+                            weight_per_layer / 1024.0 / 1024.0,
+                            compute_buffer_size / 1024.0 / 1024.0);
+                }
+            }
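+
+            // Why a second pass is needed: the first pass budgets for weights only, but each
+            // device's KV-cache share is proportional to the layers it ends up holding, and
+            // that proportion is not known until every device has been assigned. The loop
+            // below therefore re-evaluates the full per-device footprint and sheds layers
+            // (bounded by max_iterations) until every assignment fits.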
+
+            // Second pass: iteratively check if KV cache can fit proportionally
+            bool kv_fit_check_needed = (remaining_layers == 0);
+            int iterations           = 0;
+            const int max_iterations = 10;
+
+            while (kv_fit_check_needed && iterations < max_iterations) {
+                kv_fit_check_needed = false;
+                iterations++;
+
+                // Calculate current total assigned layers
+                int total_assigned = 0;
+                for (size_t idx = 0; idx < n_devices(); ++idx) {
+                    total_assigned += layers_per_gpu[idx];
+                }
+
+                if (total_assigned == 0) break;
+
+                // Check KV cache distribution for each device
+                for (size_t idx = 0; idx < n_devices(); ++idx) {
+                    if (layers_per_gpu[idx] > 0) {
+                        double layer_ratio         = (double) layers_per_gpu[idx] / total_assigned;
+                        size_t kv_cache_for_device = (size_t) (kv_cache_size_total * layer_ratio);
+                        size_t weights             = layers_per_gpu[idx] * weight_per_layer;
+                        size_t total_memory_needed = weights + kv_cache_for_device + compute_buffer_size + min_overhead;
+
+                        if (total_memory_needed > gpu_free_memory[idx]) {
+                            // Device can't fit current allocation, reduce layers
+                            size_t available_memory = gpu_free_memory[idx];
+                            if (available_memory > min_overhead + kv_cache_for_device + compute_buffer_size) {
+                                size_t available_for_weights = available_memory - min_overhead - kv_cache_for_device - compute_buffer_size;
+                                int new_layer_count = available_for_weights / weight_per_layer;
+                                new_layer_count     = std::max(0, new_layer_count);
+
+                                if (new_layer_count < layers_per_gpu[idx]) {
+                                    LLAMA_LOG_WARN("%s: Device %zu: Reducing layers from %d to %d due to KV cache requirements (%.2f MB KV cache)\n",
+                                            __func__, idx, layers_per_gpu[idx], new_layer_count,
+                                            kv_cache_for_device / 1024.0 / 1024.0);
+                                    remaining_layers   += layers_per_gpu[idx] - new_layer_count;
+                                    layers_per_gpu[idx] = new_layer_count;
+                                    kv_fit_check_needed = true;
+                                }
+                            } else {
+                                // Device can't even fit the minimum requirements
+                                LLAMA_LOG_WARN("%s: Device %zu: Removing all %d layers - insufficient memory for KV cache\n",
+                                        __func__, idx, layers_per_gpu[idx]);
+                                remaining_layers   += layers_per_gpu[idx];
+                                layers_per_gpu[idx] = 0;
+                                kv_fit_check_needed = true;
+                            }
+                        }
+                    }
+                }
+            }
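+
+            // The redistribution pass below re-tests feasibility one extra layer at a time:
+            // giving a device more layers also raises its proportional KV-cache share, so
+            // each candidate layer count is checked against the device's free memory before
+            // it is accepted.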
+
+            // Third pass: redistribute any remaining layers to devices with available capacity
+            if (remaining_layers > 0) {
+                LLAMA_LOG_INFO("%s: Attempting to redistribute %d remaining layers\n", __func__, remaining_layers);
+
+                // Calculate current memory usage for each device that has layers assigned
+                for (size_t idx : gpu_indices) {
+                    if (layers_per_gpu[idx] > 0 && remaining_layers > 0) {
+                        // Calculate current memory usage
+                        int current_assigned = 0;
+                        for (size_t i = 0; i < n_devices(); ++i) {
+                            current_assigned += layers_per_gpu[i];
+                        }
+
+                        double layer_ratio      = (double) layers_per_gpu[idx] / current_assigned;
+                        size_t current_kv_cache = (size_t) (kv_cache_size_total * layer_ratio);
+                        size_t current_weights  = layers_per_gpu[idx] * weight_per_layer;
+                        size_t current_usage    = current_weights + current_kv_cache + compute_buffer_size + min_overhead;
+
+                        if (gpu_free_memory[idx] > current_usage) {
+                            // Calculate how many additional layers could fit
+                            // We need to account for proportional increase in KV cache
+                            int additional_layers = 0;
+                            for (int test_layers = 1; test_layers <= remaining_layers; test_layers++) {
+                                int new_total_layers   = layers_per_gpu[idx] + test_layers;
+                                int new_total_assigned = current_assigned + test_layers;
+                                double new_layer_ratio = (double) new_total_layers / new_total_assigned;
+                                size_t new_kv_cache    = (size_t) (kv_cache_size_total * new_layer_ratio);
+                                size_t new_weights     = new_total_layers * weight_per_layer;
+                                size_t new_total_usage = new_weights + new_kv_cache + compute_buffer_size + min_overhead;
+
+                                if (new_total_usage <= gpu_free_memory[idx]) {
+                                    additional_layers = test_layers;
+                                } else {
+                                    break;
+                                }
+                            }
+
+                            if (additional_layers > 0) {
+                                int layers_to_add    = std::min(additional_layers, remaining_layers);
+                                layers_per_gpu[idx] += layers_to_add;
+                                remaining_layers    -= layers_to_add;
+
+                                LLAMA_LOG_INFO("%s: Device %zu [%s]: redistributed %d additional layers (total now %d)\n",
+                                        __func__, idx, ggml_backend_dev_name(devices[idx]),
+                                        layers_to_add, layers_per_gpu[idx]);
+                            }
+                        }
+                    }
+                }
+            }
+
+            // Warn if we couldn't place all layers
+            if (remaining_layers > 0) {
+                LLAMA_LOG_ERROR("%s: WARNING: Could not assign %d layers to GPUs. Consider:\n",
+                        __func__, remaining_layers);
+                LLAMA_LOG_ERROR("%s: - Reducing context size (current: %u)\n",
+                        __func__, n_ctx_for_kv);
+                LLAMA_LOG_ERROR("%s: - Using fewer layers (-ngl)\n", __func__);
+                LLAMA_LOG_ERROR("%s: - Adding more GPU memory\n", __func__);
+
+                // Put remaining layers on CPU (will be updated below)
+            }
+
+            // Convert layer counts to split ratios
+            splits.clear();
+            splits.resize(n_devices());
+            float cumsum = 0.0f;
+
+            // Calculate total layers actually assigned
+            int total_assigned_layers = 0;
+            for (size_t i = 0; i < n_devices(); ++i) {
+                total_assigned_layers += layers_per_gpu[i];
+            }
+
+            // Update act_gpu_layers to match what we actually assigned
+            act_gpu_layers = total_assigned_layers;
+
+            for (size_t i = 0; i < n_devices(); ++i) {
+                cumsum   += (float) layers_per_gpu[i] / act_gpu_layers;
+                splits[i] = cumsum;
+            }
+
+            LLAMA_LOG_INFO("%s: Final split ratios: ", __func__);
+            for (size_t i = 0; i < n_devices(); ++i) {
+                LLAMA_LOG_CONT("%.3f ", splits[i]);
+            }
+            LLAMA_LOG_CONT("\n");
+        }
+    }
+
     ggml_backend_dev_t cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
     if (cpu_dev == nullptr) {
         throw std::runtime_error(format("%s: no CPU backend found", __func__));
@@ -14837,6 +15144,7 @@ llama_model_params llama_model_default_params() {
         /*.progress_callback           =*/ nullptr,
         /*.progress_callback_user_data =*/ nullptr,
         /*.kv_overrides                =*/ nullptr,
+        /*.requested_n_ctx             =*/ 0,
         /*.vocab_only                  =*/ false,
         /*.use_mmap                    =*/ true,
         /*.use_mlock                   =*/ false,