Add an option not to abort on cuda OOM #1110

Open · wants to merge 1 commit into base: master
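The changes below give CUDA out-of-memory errors a recoverable path: when the GGML_CUDA_NO_ABORT environment variable is set, ggml_cuda_error throws instead of aborting, the exception is caught at the backend boundary, and the failure is surfaced to the caller as a ggml_status. A minimal caller-side sketch of how this could be used (hypothetical code, not part of the diff; it assumes a backend and graph that were built elsewhere):

    // Hypothetical usage sketch (not part of this PR): recover from a CUDA OOM
    // instead of crashing. Assumes a ggml_backend_t and a built ggml_cgraph.
    #include "ggml-backend.h"

    #include <cstdio>
    #include <cstdlib>

    static bool try_compute(ggml_backend_t backend, struct ggml_cgraph * graph) {
        // the variable is read once and cached, so set it before the first CUDA
        // error can occur (use _putenv_s on Windows)
        setenv("GGML_CUDA_NO_ABORT", "1", /*overwrite=*/1);

        enum ggml_status status = ggml_backend_graph_compute(backend, graph);
        if (status == GGML_STATUS_ALLOC_FAILED) {
            // e.g. retry with a smaller batch or fall back to the CPU backend
            fprintf(stderr, "graph compute failed: out of device memory\n");
            return false;
        }
        return status == GGML_STATUS_SUCCESS;
    }
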
2 changes: 2 additions & 0 deletions .gitignore
@@ -1,4 +1,6 @@
build/
release/
debug/
build-*/
out/
tmp/
4 changes: 2 additions & 2 deletions include/ggml-alloc.h
@@ -53,8 +53,8 @@ GGML_API void ggml_gallocr_free(ggml_gallocr_t galloc);
// call with a worst-case graph to avoid buffer reallocations
// not strictly required for single buffer usage: ggml_gallocr_alloc_graph will reallocate the buffers automatically if needed
// returns false if the buffer allocation failed
GGML_API bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph * graph);
GGML_API bool ggml_gallocr_reserve_n(
GGML_API enum ggml_status ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph * graph);
GGML_API enum ggml_status ggml_gallocr_reserve_n(
ggml_gallocr_t galloc,
struct ggml_cgraph * graph,
const int * node_buffer_ids,
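Call sites that previously tested the bool result of these functions now need to check the ggml_status instead. A small hypothetical helper illustrating the updated usage (names are illustrative, not from the PR):

    // Hypothetical call-site update for the new return type of ggml_gallocr_reserve.
    #include "ggml.h"
    #include "ggml-alloc.h"

    #include <cstdio>

    static bool reserve_or_report(ggml_gallocr_t galloc, struct ggml_cgraph * graph) {
        enum ggml_status status = ggml_gallocr_reserve(galloc, graph);
        if (status != GGML_STATUS_SUCCESS) {
            // GGML_STATUS_ALLOC_FAILED means a backend buffer could not be allocated
            fprintf(stderr, "ggml_gallocr_reserve failed: %s\n", ggml_status_to_string(status));
            return false;
        }
        return true;
    }
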
17 changes: 11 additions & 6 deletions src/ggml-alloc.c
@@ -94,7 +94,7 @@ void ggml_tallocr_alloc(struct ggml_tallocr * talloc, struct ggml_tensor * tenso
size = GGML_PAD(size, talloc->alignment);

if (talloc->offset + size > ggml_backend_buffer_get_size(talloc->buffer)) {
GGML_LOG_ERROR("%s: not enough space in the buffer to allocate %s (needed %zu, available %zu)\n",
GGML_LOG_ERROR("%s: not enough space in the buffer to allocate tensor '%s' (needed %zu, available %zu)\n",
__func__, tensor->name, size, ggml_backend_buffer_get_size(talloc->buffer) - talloc->offset);
GGML_ABORT("not enough space in the buffer");
}
@@ -378,6 +378,7 @@ struct ggml_gallocr {
};

ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs) {
//GGML_LOG_TRACE("%s: nbufs=%d\n", __func__, n_bufs);
ggml_gallocr_t galloc = (ggml_gallocr_t)calloc(1, sizeof(struct ggml_gallocr));
GGML_ASSERT(galloc != NULL);

@@ -670,7 +671,8 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr
}
}

bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids, const int * leaf_buffer_ids) {
enum ggml_status ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids, const int * leaf_buffer_ids) {
//GGML_LOG_DEBUG("%s: \n", __func__);
size_t min_hash_size = graph->n_nodes + graph->n_leafs;
// add 25% margin to avoid hash collisions
min_hash_size += min_hash_size / 4;
@@ -771,16 +773,16 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
galloc->buffers[i] = ggml_backend_buft_alloc_buffer(galloc->bufts[i], new_size);
if (galloc->buffers[i] == NULL) {
GGML_LOG_ERROR("%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), new_size);
return false;
return GGML_STATUS_ALLOC_FAILED;
}
ggml_backend_buffer_set_usage(galloc->buffers[i], GGML_BACKEND_BUFFER_USAGE_COMPUTE);
}
}

return true;
return GGML_STATUS_SUCCESS;
}

bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph *graph) {
enum ggml_status ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph *graph) {
return ggml_gallocr_reserve_n(galloc, graph, NULL, NULL);
}

@@ -865,13 +867,16 @@ static bool ggml_gallocr_needs_realloc(ggml_gallocr_t galloc, struct ggml_cgraph
return false;
}

// Check with reviewers: any cons to return a ggml_status here?
Inline review comment (Contributor Author): @slaren and here?

bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph * graph) {
if (ggml_gallocr_needs_realloc(galloc, graph)) {
if (galloc->n_buffers == 1) {
#ifndef NDEBUG
GGML_LOG_DEBUG("%s: reallocating buffers automatically\n", __func__);
#endif
if (!ggml_gallocr_reserve(galloc, graph)) {
enum ggml_status s = ggml_gallocr_reserve(galloc, graph);
if (s != GGML_STATUS_SUCCESS) {
GGML_LOG_ERROR("%s: ggml_gallocr_reserve failed (status=%d)\n", __func__, s);
return false;
}
} else {
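The inline comments above ask reviewers whether ggml_gallocr_alloc_graph should return a ggml_status as well. For reference, the declaration in include/ggml-alloc.h would then look roughly like this (a sketch of the discussed follow-up, not part of this diff):

    // Hypothetical follow-up discussed in the review comments: report why graph
    // allocation failed instead of collapsing everything to a bool.
    GGML_API enum ggml_status ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph * graph);
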
22 changes: 19 additions & 3 deletions src/ggml-backend.cpp
@@ -39,8 +39,14 @@ ggml_backend_buffer_t ggml_backend_buft_alloc_buffer(ggml_backend_buffer_type_t
// return a dummy buffer for zero-sized allocations
return ggml_backend_buffer_init(buft, {}, NULL, 0);
}

return buft->iface.alloc_buffer(buft, size);
ggml_backend_buffer_t b = NULL;
try {
b = buft->iface.alloc_buffer(buft, size);
} catch (const std::exception &e) {
GGML_LOG_ERROR("%s: iface.alloc_buffer failed: %s\n", __func__, e.what());
return NULL;
}
return b;
}

size_t ggml_backend_buft_get_alignment(ggml_backend_buffer_type_t buft) {
Expand Down Expand Up @@ -172,6 +178,7 @@ enum ggml_backend_buffer_usage ggml_backend_buffer_get_usage(ggml_backend_buffer
}

ggml_backend_buffer_type_t ggml_backend_buffer_get_type(ggml_backend_buffer_t buffer) {
assert(buffer);
return buffer->buft;
}

@@ -329,7 +336,16 @@ enum ggml_status ggml_backend_graph_compute(ggml_backend_t backend, struct ggml_
}

enum ggml_status ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
return backend->iface.graph_compute(backend, cgraph);
ggml_status s;
try {
s = backend->iface.graph_compute(backend, cgraph);
} catch (const std::bad_alloc &) {
return GGML_STATUS_ALLOC_FAILED;
} catch (const std::exception &e) {
GGML_LOG_ERROR("%s: graph_compute threw: %s\n", __func__, e.what());
return GGML_STATUS_FAILED;
}
return s;
}

bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
1 change: 1 addition & 0 deletions src/ggml-cpu/amx/amx.cpp
@@ -54,6 +54,7 @@ static void ggml_backend_amx_buffer_init_tensor(ggml_backend_buffer_t buffer, st
tensor->extra = (void *) ggml::cpu::amx::get_tensor_traits(buffer, tensor);

GGML_UNUSED(buffer);
return GGML_STATUS_SUCCESS;
}

static void ggml_backend_amx_buffer_memset_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor,
5 changes: 3 additions & 2 deletions src/ggml-cuda/common.cuh
@@ -120,8 +120,8 @@ static int ggml_cuda_highest_compiled_arch(const int arch) {

#define GGML_CUDA_MAX_STREAMS 8

[[noreturn]]
void ggml_cuda_error(const char * stmt, const char * func, const char * file, int line, const char * msg);
// Prints the error, then either aborts (the default) or throws an exception (when GGML_CUDA_NO_ABORT is set).
[[noreturn]] void ggml_cuda_error(const char * stmt, const char * func, const char * file, int line, const char * msg);

#define CUDA_CHECK_GEN(err, success, error_fn) \
do { \
Expand Down Expand Up @@ -162,6 +162,7 @@ static const char * cu_get_error_str(CUresult err) {
cuGetErrorString(err, &err_str);
return err_str;
}
// Will print error and abort/throw
#define CU_CHECK(err) CUDA_CHECK_GEN(err, CUDA_SUCCESS, cu_get_error_str)
#endif

22 changes: 18 additions & 4 deletions src/ggml-cuda/ggml-cuda.cu
@@ -70,7 +70,13 @@ void ggml_cuda_error(const char * stmt, const char * func, const char * file, in
GGML_LOG_ERROR(" current device: %d, in function %s at %s:%d\n", id, func, file, line);
GGML_LOG_ERROR(" %s\n", stmt);
// abort with GGML_ABORT to get a stack trace
GGML_ABORT(GGML_CUDA_NAME " error");
static const char* GGML_CUDA_NO_ABORT = getenv("GGML_CUDA_NO_ABORT");
if (!GGML_CUDA_NO_ABORT) {
GGML_ABORT(GGML_CUDA_NAME " error");
}
#ifndef __CUDA_ARCH__
throw std::runtime_error(msg);
#endif
}
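
Because the environment variable is captured in a function-local static, it is read only once; it has to be set before the first CUDA error is reported. A hypothetical way to enable the non-aborting mode from code rather than from the shell:

    // Hypothetical helper (not part of the PR): enable the non-aborting path early,
    // before any CUDA call that might fail.
    #include <cstdlib>

    static void enable_cuda_no_abort() {
    #ifdef _WIN32
        _putenv_s("GGML_CUDA_NO_ABORT", "1");
    #else
        setenv("GGML_CUDA_NO_ABORT", "1", /*overwrite=*/1);
    #endif
    }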

// this is faster on Windows
@@ -92,6 +98,7 @@ int ggml_cuda_get_device() {
return id;
}

// Note: does not abort or throw, because it does not use CUDA_CHECK
static cudaError_t ggml_cuda_device_malloc(void ** ptr, size_t size, int device) {
ggml_cuda_set_device(device);
#if defined(GGML_USE_HIP) && defined(GGML_HIP_UMA)
@@ -536,7 +543,8 @@ static void * ggml_backend_cuda_buffer_get_base(ggml_backend_buffer_t buffer) {

static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;

GGML_ASSERT(tensor);
GGML_LOG_DEBUG("%s: t=%p %s\n", __func__, (void *) tensor, tensor->name);
if (tensor->view_src != NULL) {
assert(tensor->view_src->buffer->buft == buffer->buft);
return;
@@ -945,8 +953,14 @@ static ggml_backend_buffer_t ggml_backend_cuda_split_buffer_type_alloc_buffer(gg
// however, the size still represents the maximum cumulative size of all the device buffers after the tensors are allocated,
// as returned by get_alloc_size. this limit is enforced during tensor allocation by ggml-alloc, so it must be correct.
ggml_backend_cuda_split_buffer_context * ctx = new ggml_backend_cuda_split_buffer_context();

return ggml_backend_buffer_init(buft, ggml_backend_cuda_split_buffer_interface, ctx, size);
ggml_backend_buffer_t b = NULL;
try {
b = ggml_backend_buffer_init(buft, ggml_backend_cuda_split_buffer_interface, ctx, size);
} catch (const std::exception &e) {
GGML_LOG_ERROR("%s: ggml_backend_buffer_init threw: %s\n", __func__, e.what());
delete ctx; // avoid leaking the split buffer context on failure
return NULL;
}
return b;
}

static size_t ggml_backend_cuda_split_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
5 changes: 5 additions & 0 deletions src/ggml.c
@@ -1681,6 +1681,7 @@ void * ggml_new_buffer(struct ggml_context * ctx, size_t nbytes) {
}

struct ggml_tensor * ggml_dup_tensor(struct ggml_context * ctx, const struct ggml_tensor * src) {
assert(src);
return ggml_new_tensor(ctx, src->type, GGML_MAX_DIMS, src->ne);
}

@@ -2328,6 +2329,8 @@ struct ggml_tensor * ggml_concat(
struct ggml_tensor * b,
int dim) {
GGML_ASSERT(dim >= 0 && dim < GGML_MAX_DIMS);
assert(a);
assert(b);

int64_t ne[GGML_MAX_DIMS];
for (int d = 0; d < GGML_MAX_DIMS; ++d) {
@@ -2695,6 +2698,8 @@ struct ggml_tensor * ggml_mul_mat(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b) {
assert(a);
assert(b);
GGML_ASSERT(ggml_can_mul_mat(a, b));
GGML_ASSERT(!ggml_is_transposed(a));

11 changes: 11 additions & 0 deletions tests/CMakeLists.txt
@@ -412,3 +412,14 @@ add_executable(${TEST_TARGET} ${TEST_TARGET}.c)
target_link_libraries(${TEST_TARGET} PRIVATE ggml)
add_test(NAME ${TEST_TARGET} COMMAND $<TARGET_FILE:${TEST_TARGET}>)
set_property(TEST ${TEST_TARGET} PROPERTY ENVIRONMENT "LLVM_PROFILE_FILE=${TEST_TARGET}.profraw")

#
# test-oom (currently CUDA only)

if(GGML_CUDA)
set(TEST_TARGET test-oom)
add_executable(${TEST_TARGET} ${TEST_TARGET}.cpp)
target_link_libraries(${TEST_TARGET} PRIVATE ggml)
add_test(NAME ${TEST_TARGET} COMMAND $<TARGET_FILE:${TEST_TARGET}>)
set_property(TEST ${TEST_TARGET} PROPERTY ENVIRONMENT "LLVM_PROFILE_FILE=${TEST_TARGET}.profraw")
endif()
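
The new test-oom target is registered here, but the test source itself is not shown in this excerpt. A rough sketch of what such a test might look like (hypothetical code; it assumes the non-aborting path added in this PR, under which an oversized CUDA allocation returns a NULL buffer instead of aborting):

    // Hypothetical sketch of tests/test-oom.cpp (the real file is not shown in this diff).
    #include "ggml.h"
    #include "ggml-backend.h"
    #include "ggml-cuda.h"

    #include <cstdio>
    #include <cstdlib>

    int main() {
        // must be set before the first CUDA error is reported (use _putenv_s on Windows)
        setenv("GGML_CUDA_NO_ABORT", "1", 1);

        ggml_backend_t backend = ggml_backend_cuda_init(0);
        if (backend == NULL) {
            printf("CUDA backend not available, skipping\n");
            return 0;
        }

        // an absurdly large allocation (~256 TiB) should fail gracefully
        ggml_backend_buffer_type_t buft = ggml_backend_get_default_buffer_type(backend);
        ggml_backend_buffer_t buf = ggml_backend_buft_alloc_buffer(buft, (size_t)1 << 48);

        int ret;
        if (buf != NULL) {
            printf("unexpected: oversized allocation succeeded\n");
            ggml_backend_buffer_free(buf);
            ret = 1;
        } else {
            printf("oversized allocation failed without aborting, as expected\n");
            ret = 0;
        }

        ggml_backend_free(backend);
        return ret;
    }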
2 changes: 1 addition & 1 deletion tests/test-arange.cpp
@@ -76,7 +76,7 @@ int main(int /*argc*/, const char** /*argv*/) {
ggml_backend_cpu_set_n_threads(backend, n_threads);
}

ggml_backend_graph_compute(backend, graph);
GGML_ASSERT(ggml_backend_graph_compute(backend, graph) == GGML_STATUS_SUCCESS);

float * output = new float[ggml_nelements(t)];
ggml_backend_tensor_get(t, output, 0, ggml_nbytes(t));
4 changes: 3 additions & 1 deletion tests/test-backend-ops.cpp
@@ -633,7 +633,9 @@ struct test_case {
ggml_build_forward_expand(gf, out);

// warmup run
ggml_backend_graph_compute(backend, gf);
ggml_status status = ggml_backend_graph_compute(backend, gf);
if (status != GGML_STATUS_SUCCESS) {
printf("Warning: ggml_backend_graph_compute warmup failed (status=%d)\n", status);
}

// determine number of runs
int n_runs;
9 changes: 7 additions & 2 deletions tests/test-mul-mat.cpp
@@ -151,8 +151,9 @@ struct ggml_tensor* compute(const test_model & model, ggml_gallocr_t allocr) {
ggml_backend_cpu_set_n_threads(model.backend, n_threads);
}


ggml_backend_graph_compute(model.backend, gf);
ggml_status status = ggml_backend_graph_compute(model.backend, gf);
if (status != GGML_STATUS_SUCCESS) {
return nullptr;
}

//ggml_graph_print(gf);

@@ -313,6 +314,10 @@ int main(void)
}

struct ggml_tensor * result = compute(model, allocr);
if (!result) {
printf("ggml_mul_mat: failed to compute graph");
return EXIT_FAILURE;
}

std::vector<float> out_data(ggml_nelements(result));
