Skip to content

Commit 858f6b7

Browse files
Add an option to build without CUDA VMM (ggml-org#7067)
Add an option to build ggml CUDA without CUDA VMM. Resolves ggml-org#6889. See: https://forums.developer.nvidia.com/t/potential-nvshmem-allocated-memory-performance-issue/275416/4
1 parent b3a995b commit 858f6b7

File tree

2 files changed

+13
-4
lines changed

2 files changed

+13
-4
lines changed

CMakeLists.txt

+10-1
Original file line numberDiff line numberDiff line change
@@ -103,6 +103,8 @@ set(LLAMA_CUDA_KQUANTS_ITER "2" CACHE STRING "llama: iters./thread per block for
103103
set(LLAMA_CUDA_PEER_MAX_BATCH_SIZE "128" CACHE STRING
104104
"llama: max. batch size for using peer access")
105105
option(LLAMA_CUDA_NO_PEER_COPY "llama: do not use peer to peer copies" OFF)
106+
option(LLAMA_CUDA_NO_VMM "llama: do not try to use CUDA VMM" OFF)
107+
106108
option(LLAMA_CURL "llama: use libcurl to download model from an URL" OFF)
107109
option(LLAMA_HIPBLAS "llama: use hipBLAS" OFF)
108110
option(LLAMA_HIP_UMA "llama: use HIP unified memory architecture" OFF)
@@ -409,6 +411,9 @@ if (LLAMA_CUDA)
409411
if (LLAMA_CUDA_FORCE_MMQ)
410412
add_compile_definitions(GGML_CUDA_FORCE_MMQ)
411413
endif()
414+
if (LLAMA_CUDA_NO_VMM)
415+
add_compile_definitions(GGML_CUDA_NO_VMM)
416+
endif()
412417
add_compile_definitions(GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X})
413418
add_compile_definitions(GGML_CUDA_MMV_Y=${LLAMA_CUDA_MMV_Y})
414419
if (DEFINED LLAMA_CUDA_DMMV_Y)
@@ -434,7 +439,11 @@ if (LLAMA_CUDA)
434439
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart CUDA::cublas CUDA::cublasLt)
435440
endif()
436441

437-
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cuda_driver)
442+
if (LLAMA_CUDA_NO_VMM)
443+
# No VMM requested, no need to link directly with the cuda driver lib (libcuda.so)
444+
else()
445+
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cuda_driver) # required by cuDeviceGetAttribute(), cuMemGetAllocationGranularity(...), ...
446+
endif()
438447

439448
if (NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
440449
# 52 == lowest CUDA 12 standard

ggml-cuda.cu

+3-3
Original file line numberDiff line numberDiff line change
@@ -113,7 +113,7 @@ static ggml_cuda_device_info ggml_cuda_init() {
113113
for (int id = 0; id < info.device_count; ++id) {
114114
int device_vmm = 0;
115115

116-
#if !defined(GGML_USE_HIPBLAS)
116+
#if !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM)
117117
CUdevice device;
118118
CU_CHECK(cuDeviceGet(&device, id));
119119
CU_CHECK(cuDeviceGetAttribute(&device_vmm, CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED, device));
@@ -259,7 +259,7 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool {
259259
};
260260

261261
// pool with virtual memory
262-
#if !defined(GGML_USE_HIPBLAS)
262+
#if !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM)
263263
struct ggml_cuda_pool_vmm : public ggml_cuda_pool {
264264
static const size_t CUDA_POOL_VMM_MAX_SIZE = 1ull << 35; // 32 GB
265265

@@ -356,7 +356,7 @@ struct ggml_cuda_pool_vmm : public ggml_cuda_pool {
356356
#endif // !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM)
357357

358358
std::unique_ptr<ggml_cuda_pool> ggml_backend_cuda_context::new_pool_for_device(int device) {
359-
#if !defined(GGML_USE_HIPBLAS)
359+
#if !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM)
360360
if (ggml_cuda_info().devices[device].vmm) {
361361
return std::unique_ptr<ggml_cuda_pool>(new ggml_cuda_pool_vmm(device));
362362
}

0 commit comments

Comments
 (0)