Skip to content

Commit 284af90

Browse files
committed
perf: allocate CPU-offloaded params from runtime device pinned host buffer
1 parent 5401fb1 commit 284af90

1 file changed

Lines changed: 12 additions & 1 deletion

File tree

src/ggml_extend.hpp

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2999,7 +2999,18 @@ struct GGMLRunner {
29992999
LOG_DEBUG("%s skipping params allocation (no tensors)", get_desc().c_str());
30003000
return true;
30013001
}
3002-
params_buffer = ggml_backend_alloc_ctx_tensors(params_ctx, params_backend);
3002+
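// When params live on a different backend than the runtime backend (CPU offload),
// allocate them from the runtime device's pinned host buffer type so host-to-device
// (H2D) copies can use direct DMA. Falls back to the params backend's default
// buffer type when the runtime device is unavailable or exposes no host buffer type.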
3003+
ggml_backend_buffer_type_t params_buft = nullptr;
3004+
if (params_backend != runtime_backend) {
3005+
ggml_backend_dev_t runtime_dev = ggml_backend_get_device(runtime_backend);
3006+
if (runtime_dev != nullptr) {
3007+
params_buft = ggml_backend_dev_host_buffer_type(runtime_dev);
3008+
}
3009+
}
3010+
if (params_buft == nullptr) {
3011+
params_buft = ggml_backend_get_default_buffer_type(params_backend);
3012+
}
3013+
params_buffer = ggml_backend_alloc_ctx_tensors_from_buft(params_ctx, params_buft);
30033014
if (params_buffer == nullptr) {
30043015
LOG_ERROR("%s alloc params backend buffer failed, num_tensors = %i",
30053016
get_desc().c_str(),

0 commit comments

Comments
 (0)