ggml/src/ggml-metal/ggml-metal-common.cpp (67 changes: 40 additions & 27 deletions)
@@ -1,9 +1,12 @@
#include "ggml-metal-common.h"

#include "ggml-impl.h"
#include "ggml-backend-impl.h"

#include <vector>

+// represents a memory range (i.e. an interval from a starting address p0 to an ending address p1 in a given buffer pb)
+// the type indicates whether it is a source range (i.e. ops read data from it) or a destination range (i.e. ops write data to it)
struct ggml_mem_range {
uint64_t pb; // buffer id

@@ -36,8 +39,8 @@ void ggml_mem_ranges_reset(ggml_mem_ranges * mrs) {
mrs->ranges.clear();
}

-static bool ggml_mem_ranges_add(ggml_mem_ranges * mrs, ggml_mem_range mrp) {
-mrs->ranges.push_back(mrp);
+static bool ggml_mem_ranges_add(ggml_mem_ranges * mrs, ggml_mem_range mr) {
+mrs->ranges.push_back(mr);

return true;
}
@@ -48,28 +51,32 @@ static ggml_mem_range ggml_mem_range_from_tensor(const ggml_tensor * tensor, ggm

GGML_ASSERT(!tensor->view_src);

-ggml_mem_range mrp;
+ggml_mem_range mr;

if (tensor->buffer) {
-// when the tensor is allocated, use the actual memory address range of the buffer
-mrp = {
+// when the tensor is allocated, use the actual memory address range in the buffer
+//
+// take the actual allocated size with ggml_backend_buft_get_alloc_size()
+// this can be larger than the tensor size if the buffer type allocates extra memory
+// ref: https://github.com/ggml-org/llama.cpp/pull/15966
+mr = {
/*.pb =*/ (uint64_t) tensor->buffer,
/*.p0 =*/ (uint64_t) tensor->data,
-/*.p1 =*/ (uint64_t) tensor->data + ggml_nbytes(tensor),
+/*.p1 =*/ (uint64_t) tensor->data + ggml_backend_buft_get_alloc_size(tensor->buffer->buft, tensor),
/*.pt =*/ pt,
};
} else {
-// otherwise, the tensor ptr is used as an unique id of the memory ranges
+// otherwise, the pointer address is used as an unique id of the memory ranges
// that the tensor will be using when it is allocated
-mrp = {
+mr = {
/*.pb =*/ (uint64_t) tensor,
/*.p0 =*/ 0, //
/*.p1 =*/ 1024, // [0, 1024) is a dummy range, not used
/*.pt =*/ pt,
};
};

-return mrp;
+return mr;
}
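Aside (not part of the diff): the branch above is the heart of this change. For allocated tensors the tracked interval now ends at the padded allocation size instead of at ggml_nbytes(); unallocated tensors keep using their pointer as a pseudo buffer id with the dummy [0, 1024) interval. A small illustrative helper using only public ggml / ggml-backend calls; the helper name is made up:

#include <stdio.h>

#include "ggml.h"
#include "ggml-backend.h"

// print the byte range that dependency tracking would consider for a tensor
static void print_tracked_range(const struct ggml_tensor * t) {
    if (t->buffer) {
        // allocated: the interval covers the whole (possibly padded) allocation,
        // which can be larger than ggml_nbytes(t) for some buffer types
        const size_t nbytes = ggml_nbytes(t);
        const size_t alloc  = ggml_backend_buft_get_alloc_size(ggml_backend_buffer_get_type(t->buffer), t);

        printf("buf=%p range=[%p, %p) nbytes=%zu alloc=%zu\n",
                (void *) t->buffer, t->data, (void *) ((char *) t->data + alloc), nbytes, alloc);
    } else {
        // not allocated yet: the tensor pointer acts as a pseudo buffer id and the
        // interval is a fixed placeholder
        printf("pseudo-buf=%p range=[0, 1024)\n", (const void *) t);
    }
}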

static ggml_mem_range ggml_mem_range_from_tensor_src(const ggml_tensor * tensor) {
@@ -83,25 +90,25 @@ static ggml_mem_range ggml_mem_range_from_tensor_dst(const ggml_tensor * tensor)
static bool ggml_mem_ranges_add_src(ggml_mem_ranges * mrs, const ggml_tensor * tensor) {
GGML_ASSERT(tensor);

-ggml_mem_range mrp = ggml_mem_range_from_tensor_src(tensor);
+ggml_mem_range mr = ggml_mem_range_from_tensor_src(tensor);

if (mrs->debug > 2) {
GGML_LOG_DEBUG("%s: add src range buf=%lld, [%lld, %lld)\n", __func__, mrp.pb, mrp.p0, mrp.p1);
GGML_LOG_DEBUG("%s: add src range buf=%lld, [%lld, %lld)\n", __func__, mr.pb, mr.p0, mr.p1);
}

-return ggml_mem_ranges_add(mrs, mrp);
+return ggml_mem_ranges_add(mrs, mr);
}

static bool ggml_mem_ranges_add_dst(ggml_mem_ranges * mrs, const ggml_tensor * tensor) {
GGML_ASSERT(tensor);

-ggml_mem_range mrp = ggml_mem_range_from_tensor_dst(tensor);
+ggml_mem_range mr = ggml_mem_range_from_tensor_dst(tensor);

if (mrs->debug > 2) {
GGML_LOG_DEBUG("%s: add dst range buf=%lld, [%lld, %lld)\n", __func__, mrp.pb, mrp.p0, mrp.p1);
GGML_LOG_DEBUG("%s: add dst range buf=%lld, [%lld, %lld)\n", __func__, mr.pb, mr.p0, mr.p1);
}

-return ggml_mem_ranges_add(mrs, mrp);
+return ggml_mem_ranges_add(mrs, mr);
}

bool ggml_mem_ranges_add(ggml_mem_ranges * mrs, const ggml_tensor * tensor) {
@@ -114,24 +121,26 @@ bool ggml_mem_ranges_add(ggml_mem_ranges * mrs, const ggml_tensor * tensor) {
return ggml_mem_ranges_add_dst(mrs, tensor);
}

-static bool ggml_mem_ranges_check(const ggml_mem_ranges * mrs, ggml_mem_range mrp) {
+static bool ggml_mem_ranges_check(const ggml_mem_ranges * mrs, ggml_mem_range mr) {
for (size_t i = 0; i < mrs->ranges.size(); i++) {
const auto & cmp = mrs->ranges[i];

-if (mrp.pb != cmp.pb) {
+// two memory ranges cannot intersect if they are in different buffers
+if (mr.pb != cmp.pb) {
continue;
}

-if (mrp.pt == MEM_RANGE_TYPE_SRC && cmp.pt == MEM_RANGE_TYPE_SRC) {
+// intersecting source ranges are allowed
+if (mr.pt == MEM_RANGE_TYPE_SRC && cmp.pt == MEM_RANGE_TYPE_SRC) {
continue;
}

-if (mrp.p0 < cmp.p1 && mrp.p1 >= cmp.p0) {
+if (mr.p0 < cmp.p1 && mr.p1 >= cmp.p0) {
if (mrs->debug > 2) {
GGML_LOG_DEBUG("%s: the %s range buf=%lld, [%lld, %lld) overlaps with a previous %s range buf=%lld, [%lld, %lld)\n",
__func__,
-mrp.pt == MEM_RANGE_TYPE_SRC ? "src" : "dst",
-mrp.pb, mrp.p0, mrp.p1,
+mr.pt == MEM_RANGE_TYPE_SRC ? "src" : "dst",
+mr.pb, mr.p0, mr.p1,
cmp.pt == MEM_RANGE_TYPE_SRC ? "src" : "dst",
cmp.pb, cmp.p0, cmp.p1);
}
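Aside (not part of the diff): the three rules above (different buffers never conflict, overlapping reads are fine, everything else goes to the interval test) can be restated as one standalone predicate. The struct and enum below only mirror ggml_mem_range for illustration:

#include <cstdint>
#include <cstdio>

enum range_type { RANGE_SRC, RANGE_DST };

// locally mirrors ggml_mem_range: buffer id + half-open byte interval + src/dst tag
struct range {
    uint64_t pb;
    uint64_t p0;
    uint64_t p1;
    range_type pt;
};

static bool ranges_conflict(const range & a, const range & b) {
    if (a.pb != b.pb) {
        return false; // different buffers never alias
    }
    if (a.pt == RANGE_SRC && b.pt == RANGE_SRC) {
        return false; // two reads may overlap freely
    }
    // half-open interval test; the >= mirrors the patch and also serializes
    // back-to-back ranges (a.p1 == b.p0), which errs on the safe side
    return a.p0 < b.p1 && a.p1 >= b.p0;
}

int main() {
    const range w = { 1, 0x1000, 0x1400, RANGE_DST }; // write to [0x1000, 0x1400) in buffer 1
    const range r = { 1, 0x1200, 0x1600, RANGE_SRC }; // read of  [0x1200, 0x1600) in buffer 1

    printf("conflict: %d\n", ranges_conflict(r, w) ? 1 : 0); // prints 1
}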
@@ -146,19 +155,19 @@ static bool ggml_mem_ranges_check(const ggml_mem_ranges * mrs, ggml_mem_range mr
static bool ggml_mem_ranges_check_src(const ggml_mem_ranges * mrs, const ggml_tensor * tensor) {
GGML_ASSERT(tensor);

-ggml_mem_range mrp = ggml_mem_range_from_tensor_src(tensor);
+ggml_mem_range mr = ggml_mem_range_from_tensor_src(tensor);

-const bool res = ggml_mem_ranges_check(mrs, mrp);
+const bool res = ggml_mem_ranges_check(mrs, mr);

return res;
}

static bool ggml_mem_ranges_check_dst(const ggml_mem_ranges * mrs, const ggml_tensor * tensor) {
GGML_ASSERT(tensor);

-ggml_mem_range mrp = ggml_mem_range_from_tensor_dst(tensor);
+ggml_mem_range mr = ggml_mem_range_from_tensor_dst(tensor);

-const bool res = ggml_mem_ranges_check(mrs, mrp);
+const bool res = ggml_mem_ranges_check(mrs, mr);

return res;
}
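Aside (not part of the diff): a plausible way the public entry points declared in ggml-metal-common.h (ggml_mem_ranges_init/add/check/reset and the matching free) fit together when encoding a graph: keep adding nodes to the current batch while check() passes, start a new batch when it fails. This is a sketch with a made-up function name, not the actual Metal encoder logic:

#include "ggml.h"
#include "ggml-metal-common.h"

// sketch: grow a batch of concurrently-encodable nodes, and start a new batch
// (in the real backend: emit a barrier / split the encoder) whenever a hazard appears
static void encode_with_barriers(ggml_cgraph * gf) {
    ggml_mem_ranges * mrs = ggml_mem_ranges_init(/*debug =*/ 0);

    for (int i = 0; i < ggml_graph_n_nodes(gf); i++) {
        ggml_tensor * node = ggml_graph_node(gf, i);

        if (!ggml_mem_ranges_check(mrs, node)) {
            // the node reads or writes memory touched by the current batch
            ggml_mem_ranges_reset(mrs);
        }

        // record the node's src and dst ranges in the current batch
        ggml_mem_ranges_add(mrs, node);

        // ... encode the node here ...
    }

    ggml_mem_ranges_free(mrs);
}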
@@ -222,6 +231,7 @@ static std::vector<int> ggml_metal_graph_optimize_reorder(const std::vector<node
}
}

+// keep track of the sources of the fused nodes as well
for (const auto * fused : node.fused) {
for (int i = 0; i < GGML_MAX_SRC; i++) {
if (fused->src[i]) {
@@ -290,7 +300,10 @@ static std::vector<int> ggml_metal_graph_optimize_reorder(const std::vector<node

std::vector<bool> used(n, false);

+// the memory ranges for the set of currently concurrent nodes
ggml_mem_ranges * mrs0 = ggml_mem_ranges_init(0);

+// the memory ranges for the set of nodes that haven't been processed yet, when looking forward for a node to reorder
ggml_mem_ranges * mrs1 = ggml_mem_ranges_init(0);

for (int i0 = 0; i0 < n; i0++) {
@@ -329,7 +342,7 @@ static std::vector<int> ggml_metal_graph_optimize_reorder(const std::vector<node

const bool is_empty = node1.is_empty();

-// to add a concurrent node, it has to be:
+// to reorder a node and add it to the concurrent set, it has to be:
// + empty or concurrent with all nodes in the existing concurrent set (mrs0)
// + concurrent with all nodes prior to it that haven't been processed yet (mrs1)
if ((is_empty || h_check(mrs0, node1)) && h_check(mrs1, node1)) {
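Aside (not part of the diff): the two range sets drive a greedy look-ahead pass; a node is pulled forward into the current concurrent set only if it conflicts neither with that set (mrs0) nor with the unprocessed nodes it would jump over (mrs1). A self-contained toy version, using whole buffer ids instead of byte ranges and plain vectors instead of ggml_mem_ranges, purely for illustration:

#include <cstdio>
#include <set>
#include <vector>

struct toy_node {
    std::set<int> src; // buffer ids this node reads
    std::set<int> dst; // buffer ids this node writes
};

// hazard if either node writes a buffer that the other one touches
static bool conflict(const toy_node & a, const toy_node & b) {
    for (int d : a.dst) { if (b.src.count(d) || b.dst.count(d)) { return true; } }
    for (int d : b.dst) { if (a.src.count(d) || a.dst.count(d)) { return true; } }
    return false;
}

static bool conflict_with_any(const std::vector<toy_node> & group, const toy_node & x) {
    for (const auto & g : group) { if (conflict(g, x)) { return true; } }
    return false;
}

int main() {
    // node 1 reads what node 0 writes; node 2 is independent of both
    std::vector<toy_node> nodes = {
        { {10}, {0} },
        { { 0}, {1} },
        { {11}, {2} },
    };

    const int n = (int) nodes.size();

    std::vector<bool> used(n, false);
    std::vector<int>  order;

    for (int i0 = 0; i0 < n; i0++) {
        if (used[i0]) { continue; }

        std::vector<toy_node> concurrent = { nodes[i0] }; // plays the role of mrs0
        std::vector<toy_node> skipped;                    // plays the role of mrs1

        order.push_back(i0);
        used[i0] = true;

        for (int i1 = i0 + 1; i1 < n; i1++) {
            if (used[i1]) { continue; }

            if (!conflict_with_any(concurrent, nodes[i1]) &&
                !conflict_with_any(skipped,    nodes[i1])) {
                // safe to move forward: it joins the concurrent set
                order.push_back(i1);
                used[i1] = true;
                concurrent.push_back(nodes[i1]);
            } else {
                // stays where it is; later candidates must not conflict with it either
                skipped.push_back(nodes[i1]);
            }
        }
    }

    for (int i : order) { printf("%d ", i); } // prints: 0 2 1
    printf("\n");
}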
@@ -419,8 +432,8 @@ void ggml_metal_graph_optimize(ggml_cgraph * gf) {
nodes.push_back(std::move(node));
}

-// reorder to improve concurrency
#if 1
+// reorder to improve concurrency
const auto order = ggml_metal_graph_optimize_reorder(nodes);
#else
std::vector<int> order(nodes.size());