Add option not to abort on CUDA OOM
Warning: Not ready for merge.
Add an option not to abort on CUDA OOM but to return a ggml_status instead.
The goal is NOT to continue decoding after an OOM, but to allow a clean,
controlled exit at a higher level.
Requires cmake GGML_NO_ABORT_ON_OOM=ON (default OFF).
Retouch ggml_tallocr_alloc to return a ggml_status.
Retouch init_tensor to return a ggml_status.
Add a bool parameter to ggml_cuda_error() controlling whether to abort (default true).
Add a new macro CUDA_CHECK_NO_ABORT().
Add a new unit test checking the GGML_NO_ABORT_ON_OOM flow.
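
As an illustration of the intended higher-level flow (hypothetical caller code, not part of this commit), assuming the allocation status is propagated up through graph compute:

    // Hypothetical caller: exit cleanly instead of aborting inside ggml.
    enum ggml_status st = ggml_backend_graph_compute(backend, graph);
    if (st != GGML_STATUS_SUCCESS) {
        fprintf(stderr, "compute failed: %s\n", ggml_status_to_string(st));
        ggml_backend_free(backend); // release resources at the application level
        return 1;
    }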
WilliamTambellini committed Feb 11, 2025
1 parent 98a61a0 commit db9a2af
Showing 17 changed files with 349 additions and 37 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -1,4 +1,5 @@
build/
release/
build-*/
out/
tmp/
1 change: 1 addition & 0 deletions CMakeLists.txt
@@ -87,6 +87,7 @@ option(GGML_GPROF "ggml: enable gprof"

# build
option(GGML_FATAL_WARNINGS "ggml: enable -Werror flag" OFF)
option(GGML_NO_ABORT_ON_OOM "ggml: do not abort on OOM (experimental)" OFF)

# sanitizers
option(GGML_SANITIZE_THREAD "ggml: enable thread sanitizer" OFF)
2 changes: 1 addition & 1 deletion include/ggml-alloc.h
@@ -19,7 +19,7 @@ struct ggml_tallocr {
};

GGML_API struct ggml_tallocr ggml_tallocr_new(ggml_backend_buffer_t buffer);
GGML_API void ggml_tallocr_alloc(struct ggml_tallocr * talloc, struct ggml_tensor * tensor);
GGML_API enum ggml_status ggml_tallocr_alloc(struct ggml_tallocr * talloc, struct ggml_tensor * tensor);

// Graph allocator
/*
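With the new signature, a caller can handle allocation failure instead of relying on the internal abort. A minimal sketch (hypothetical caller code, assuming GGML_NO_ABORT_ON_OOM is enabled):

    struct ggml_tallocr talloc = ggml_tallocr_new(buffer);
    enum ggml_status st = ggml_tallocr_alloc(&talloc, tensor);
    if (st != GGML_STATUS_SUCCESS) {
        // propagate the failure upward for a controlled exit
        return st;
    }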
4 changes: 4 additions & 0 deletions src/CMakeLists.txt
@@ -60,6 +60,10 @@ if (GGML_FATAL_WARNINGS)
endif()
endif()

if (GGML_NO_ABORT_ON_OOM)
add_compile_definitions(GGML_NO_ABORT_ON_OOM)
endif()

if (GGML_ALL_WARNINGS)
if (NOT MSVC)
list(APPEND WARNING_FLAGS -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function)
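The flag is opt-in at configure time, e.g.:

    cmake -B build -DGGML_NO_ABORT_ON_OOM=ON
    cmake --build build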
20 changes: 17 additions & 3 deletions src/ggml-alloc.c
@@ -89,14 +89,18 @@ struct ggml_tallocr ggml_tallocr_new(ggml_backend_buffer_t buffer) {
return talloc;
}

void ggml_tallocr_alloc(struct ggml_tallocr * talloc, struct ggml_tensor * tensor) {
enum ggml_status ggml_tallocr_alloc(struct ggml_tallocr * talloc, struct ggml_tensor * tensor) {
size_t size = ggml_backend_buffer_get_alloc_size(talloc->buffer, tensor);
size = GGML_PAD(size, talloc->alignment);

if (talloc->offset + size > ggml_backend_buffer_get_size(talloc->buffer)) {
GGML_LOG_ERROR("%s: not enough space in the buffer to allocate %s (needed %zu, available %zu)\n",
GGML_LOG_ERROR("%s: not enough space in the buffer to allocate tensor '%s' (needed %zu, available %zu)\n",
__func__, tensor->name, size, ggml_backend_buffer_get_size(talloc->buffer) - talloc->offset);
#ifdef GGML_NO_ABORT_ON_OOM
return GGML_STATUS_ALLOC_FAILED;
#else
GGML_ABORT("not enough space in the buffer");
#endif
}

void * addr = (char *)ggml_backend_buffer_get_base(talloc->buffer) + talloc->offset;
@@ -105,6 +109,7 @@ void ggml_tallocr_alloc(struct ggml_tallocr * talloc, struct ggml_tensor * tenso
assert(((uintptr_t)addr % talloc->alignment) == 0);

ggml_backend_tensor_alloc(talloc->buffer, tensor, addr);
return GGML_STATUS_SUCCESS;
}

// dynamic tensor allocator
@@ -150,6 +155,7 @@ static void remove_allocated_tensor(struct ggml_dyn_tallocr * alloc, size_t offs
}
#endif

// Check with reviewer: could this function return a ggml_status (with the offset as an out arg)?
static size_t ggml_dyn_tallocr_alloc(struct ggml_dyn_tallocr * alloc, size_t size, const struct ggml_tensor * tensor) {
size = aligned_offset(NULL, size, alloc->alignment);

@@ -179,6 +185,7 @@ static size_t ggml_dyn_tallocr_alloc(struct ggml_dyn_tallocr * alloc, size_t siz
// this should never happen
GGML_LOG_ERROR("%s: not enough space in the buffer to allocate %zu bytes, largest block available %zu bytes\n",
__func__, size, max_avail);
// Note: no way to honor GGML_NO_ABORT_ON_OOM since this function returns the offset, not a ggml_status
GGML_ABORT("not enough space in the buffer");
}
}
@@ -378,6 +385,7 @@ struct ggml_gallocr {
};

ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs) {
//GGML_LOG_TRACE("%s: nbufs=%d\n", __func__, n_bufs);
ggml_gallocr_t galloc = (ggml_gallocr_t)calloc(1, sizeof(struct ggml_gallocr));
GGML_ASSERT(galloc != NULL);

@@ -670,7 +678,10 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr
}
}

// Returns true on success, false otherwise.
// Check with reviewers: any cons to returning a ggml_status?
bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids, const int * leaf_buffer_ids) {
//GGML_LOG_TRACE("ggml_gallocr_reserve_n\n");
size_t min_hash_size = graph->n_nodes + graph->n_leafs;
// add 25% margin to avoid hash collisions
min_hash_size += min_hash_size / 4;
@@ -865,6 +876,7 @@ static bool ggml_gallocr_needs_realloc(ggml_gallocr_t galloc, struct ggml_cgraph
return false;
}

// Check with reviewers: any cons to returning a ggml_status here?
bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph * graph) {
if (ggml_gallocr_needs_realloc(galloc, graph)) {
if (galloc->n_buffers == 1) {
@@ -954,7 +966,9 @@ static bool alloc_tensor_range(struct ggml_context * ctx,
for (struct ggml_tensor * t = first; t != last; t = ggml_get_next_tensor(ctx, t)) {
if (t->data == NULL) {
if (t->view_src == NULL) {
ggml_tallocr_alloc(&tallocr, t);
enum ggml_status s = ggml_tallocr_alloc(&tallocr, t);
if (s != GGML_STATUS_SUCCESS) {
    GGML_LOG_WARN("%s: failed to allocate tensor '%s'\n", __func__, t->name);
}
} else if (t->buffer == NULL) {
ggml_backend_view_init(t);
}
2 changes: 1 addition & 1 deletion src/ggml-backend-impl.h
@@ -44,7 +44,7 @@ extern "C" {
// base address of the buffer
void * (*get_base) (ggml_backend_buffer_t buffer);
// (optional) initialize a tensor in the buffer (eg. add tensor extras)
void (*init_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
enum ggml_status (*init_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
// tensor data access
void (*memset_tensor)(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size);
void (*set_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
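Backends implementing init_tensor now report failure through the return value rather than aborting. A minimal sketch of a conforming implementation (hypothetical backend, illustration only):

    static enum ggml_status ggml_backend_example_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
        void * extra = example_alloc_extras(tensor); // hypothetical helper
        if (extra == NULL) {
            return GGML_STATUS_ALLOC_FAILED; // propagate instead of aborting
        }
        tensor->extra = extra;
        GGML_UNUSED(buffer);
        return GGML_STATUS_SUCCESS;
    }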
3 changes: 2 additions & 1 deletion src/ggml-cpu/amx/amx.cpp
@@ -50,10 +50,11 @@ static void * ggml_backend_amx_buffer_get_base(ggml_backend_buffer_t buffer) {
return (void *) (buffer->context);
}

static void ggml_backend_amx_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
static ggml_status ggml_backend_amx_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
tensor->extra = (void *) ggml::cpu::amx::get_tensor_traits(buffer, tensor);

GGML_UNUSED(buffer);
return GGML_STATUS_SUCCESS;
}

static void ggml_backend_amx_buffer_memset_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor,
3 changes: 2 additions & 1 deletion src/ggml-cpu/ggml-cpu-aarch64.cpp
@@ -4135,10 +4135,11 @@ static const ggml::cpu::tensor_traits * ggml_aarch64_get_optimal_repack_type(con
return nullptr;
}

static void ggml_backend_cpu_aarch64_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
static ggml_status ggml_backend_cpu_aarch64_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
tensor->extra = (void *) const_cast<ggml::cpu::tensor_traits *>(ggml_aarch64_get_optimal_repack_type(tensor));

GGML_UNUSED(buffer);
return GGML_STATUS_SUCCESS;
}

static void ggml_backend_cpu_aarch64_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor,
16 changes: 9 additions & 7 deletions src/ggml-cuda/common.cuh
@@ -79,18 +79,19 @@

#define GGML_CUDA_MAX_STREAMS 8

[[noreturn]]
void ggml_cuda_error(const char * stmt, const char * func, const char * file, int line, const char * msg);
// Print the error; also abort if 'abort' is true.
void ggml_cuda_error(const char * stmt, const char * func, const char * file, int line, const char * msg, bool abort);

#define CUDA_CHECK_GEN(err, success, error_fn) \
#define CUDA_CHECK_GEN(err, success, error_fn, abort) \
do { \
auto err_ = (err); \
if (err_ != (success)) { \
ggml_cuda_error(#err, __func__, __FILE__, __LINE__, error_fn(err_)); \
ggml_cuda_error(#err, __func__, __FILE__, __LINE__, error_fn(err_), abort); \
} \
} while (0)

#define CUDA_CHECK(err) CUDA_CHECK_GEN(err, cudaSuccess, cudaGetErrorString)
#define CUDA_CHECK(err) CUDA_CHECK_GEN(err, cudaSuccess, cudaGetErrorString, true)
#define CUDA_CHECK_NO_ABORT(err) CUDA_CHECK_GEN(err, cudaSuccess, cudaGetErrorString, false)

#if CUDART_VERSION >= 12000 || defined(GGML_USE_MUSA)
static const char * cublas_get_error_str(const cublasStatus_t err) {
@@ -113,15 +114,16 @@ void ggml_cuda_error(const char * stmt, const char * func, const char * file, in
}
#endif // CUDART_VERSION >= 12000

#define CUBLAS_CHECK(err) CUDA_CHECK_GEN(err, CUBLAS_STATUS_SUCCESS, cublas_get_error_str)
#define CUBLAS_CHECK(err) CUDA_CHECK_GEN(err, CUBLAS_STATUS_SUCCESS, cublas_get_error_str, true)

#if !defined(GGML_USE_HIP)
static const char * cu_get_error_str(CUresult err) {
const char * err_str;
cuGetErrorString(err, &err_str);
return err_str;
}
#define CU_CHECK(err) CUDA_CHECK_GEN(err, CUDA_SUCCESS, cu_get_error_str)
// Prints the error and aborts on failure.
#define CU_CHECK(err) CUDA_CHECK_GEN(err, CUDA_SUCCESS, cu_get_error_str, true)
#endif

#if CUDART_VERSION >= 11100 || defined(GGML_USE_MUSA)
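A sketch of how the non-aborting check could be used in an allocation path (hypothetical helper, not part of this diff):

    static enum ggml_status ggml_cuda_try_malloc(void ** ptr, size_t size) {
        cudaError_t err = cudaMalloc(ptr, size);
        if (err != cudaSuccess) {
            CUDA_CHECK_NO_ABORT(err); // prints the error but does not abort
            return GGML_STATUS_ALLOC_FAILED;
        }
        return GGML_STATUS_SUCCESS;
    }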