ggml/src/ggml-metal/ggml-metal-common.cpp (67 changes: 40 additions & 27 deletions)
@@ -1,9 +1,12 @@
#include "ggml-metal-common.h"

#include "ggml-impl.h"
#include "ggml-backend-impl.h"

#include <vector>

+// represents a memory range (i.e. an interval from a starting address p0 to an ending address p1 in a given buffer pb)
+// the type indicates whether it is a source range (i.e. ops read data from it) or a destination range (i.e. ops write data to it)
struct ggml_mem_range {
uint64_t pb; // buffer id

@@ -36,8 +39,8 @@ void ggml_mem_ranges_reset(ggml_mem_ranges * mrs) {
mrs->ranges.clear();
}

-static bool ggml_mem_ranges_add(ggml_mem_ranges * mrs, ggml_mem_range mrp) {
-mrs->ranges.push_back(mrp);
+static bool ggml_mem_ranges_add(ggml_mem_ranges * mrs, ggml_mem_range mr) {
+mrs->ranges.push_back(mr);

return true;
}
@@ -48,28 +51,32 @@ static ggml_mem_range ggml_mem_range_from_tensor(const ggml_tensor * tensor, ggm

GGML_ASSERT(!tensor->view_src);

-ggml_mem_range mrp;
+ggml_mem_range mr;

if (tensor->buffer) {
-// when the tensor is allocated, use the actual memory address range of the buffer
-mrp = {
+// when the tensor is allocated, use the actual memory address range in the buffer
+//
+// take the actual allocated size with ggml_backend_buft_get_alloc_size()
+// this can be larger than the tensor size if the buffer type allocates extra memory
+// ref: https://github.com/ggml-org/llama.cpp/pull/15966
+mr = {
/*.pb =*/ (uint64_t) tensor->buffer,
/*.p0 =*/ (uint64_t) tensor->data,
-/*.p1 =*/ (uint64_t) tensor->data + ggml_nbytes(tensor),
+/*.p1 =*/ (uint64_t) tensor->data + ggml_backend_buft_get_alloc_size(tensor->buffer->buft, tensor),
/*.pt =*/ pt,
};
} else {
-// otherwise, the tensor ptr is used as an unique id of the memory ranges
+// otherwise, the pointer address is used as an unique id of the memory ranges
// that the tensor will be using when it is allocated
-mrp = {
+mr = {
/*.pb =*/ (uint64_t) tensor,
/*.p0 =*/ 0, //
/*.p1 =*/ 1024, // [0, 1024) is a dummy range, not used
/*.pt =*/ pt,
};
};

-return mrp;
+return mr;
}
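Aside (not part of the diff): the branch above is the heart of this change. For allocated tensors the tracked interval now ends at the padded allocation size instead of at ggml_nbytes(); unallocated tensors keep using their pointer as a pseudo buffer id with the dummy [0, 1024) interval. A small illustrative helper using only public ggml / ggml-backend calls; the helper name is made up:

#include <stdio.h>

#include "ggml.h"
#include "ggml-backend.h"

// print the byte range that dependency tracking would consider for a tensor
static void print_tracked_range(const struct ggml_tensor * t) {
    if (t->buffer) {
        // allocated: the interval covers the whole (possibly padded) allocation,
        // which can be larger than ggml_nbytes(t) for some buffer types
        const size_t nbytes = ggml_nbytes(t);
        const size_t alloc  = ggml_backend_buft_get_alloc_size(ggml_backend_buffer_get_type(t->buffer), t);

        printf("buf=%p range=[%p, %p) nbytes=%zu alloc=%zu\n",
                (void *) t->buffer, t->data, (void *) ((char *) t->data + alloc), nbytes, alloc);
    } else {
        // not allocated yet: the tensor pointer acts as a pseudo buffer id and the
        // interval is a fixed placeholder
        printf("pseudo-buf=%p range=[0, 1024)\n", (const void *) t);
    }
}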

static ggml_mem_range ggml_mem_range_from_tensor_src(const ggml_tensor * tensor) {
@@ -83,25 +90,25 @@ static ggml_mem_range ggml_mem_range_from_tensor_dst(const ggml_tensor * tensor)
static bool ggml_mem_ranges_add_src(ggml_mem_ranges * mrs, const ggml_tensor * tensor) {
GGML_ASSERT(tensor);

-ggml_mem_range mrp = ggml_mem_range_from_tensor_src(tensor);
+ggml_mem_range mr = ggml_mem_range_from_tensor_src(tensor);

if (mrs->debug > 2) {
GGML_LOG_DEBUG("%s: add src range buf=%lld, [%lld, %lld)\n", __func__, mrp.pb, mrp.p0, mrp.p1);
GGML_LOG_DEBUG("%s: add src range buf=%lld, [%lld, %lld)\n", __func__, mr.pb, mr.p0, mr.p1);
}

-return ggml_mem_ranges_add(mrs, mrp);
+return ggml_mem_ranges_add(mrs, mr);
}

static bool ggml_mem_ranges_add_dst(ggml_mem_ranges * mrs, const ggml_tensor * tensor) {
GGML_ASSERT(tensor);

-ggml_mem_range mrp = ggml_mem_range_from_tensor_dst(tensor);
+ggml_mem_range mr = ggml_mem_range_from_tensor_dst(tensor);

if (mrs->debug > 2) {
GGML_LOG_DEBUG("%s: add dst range buf=%lld, [%lld, %lld)\n", __func__, mrp.pb, mrp.p0, mrp.p1);
GGML_LOG_DEBUG("%s: add dst range buf=%lld, [%lld, %lld)\n", __func__, mr.pb, mr.p0, mr.p1);
}

-return ggml_mem_ranges_add(mrs, mrp);
+return ggml_mem_ranges_add(mrs, mr);
}

bool ggml_mem_ranges_add(ggml_mem_ranges * mrs, const ggml_tensor * tensor) {
@@ -114,24 +121,26 @@ bool ggml_mem_ranges_add(ggml_mem_ranges * mrs, const ggml_tensor * tensor) {
return ggml_mem_ranges_add_dst(mrs, tensor);
}

-static bool ggml_mem_ranges_check(const ggml_mem_ranges * mrs, ggml_mem_range mrp) {
+static bool ggml_mem_ranges_check(const ggml_mem_ranges * mrs, ggml_mem_range mr) {
for (size_t i = 0; i < mrs->ranges.size(); i++) {
const auto & cmp = mrs->ranges[i];

-if (mrp.pb != cmp.pb) {
+// two memory ranges cannot intersect if they are in different buffers
+if (mr.pb != cmp.pb) {
continue;
}

-if (mrp.pt == MEM_RANGE_TYPE_SRC && cmp.pt == MEM_RANGE_TYPE_SRC) {
+// intersecting source ranges are allowed
+if (mr.pt == MEM_RANGE_TYPE_SRC && cmp.pt == MEM_RANGE_TYPE_SRC) {
continue;
}

-if (mrp.p0 < cmp.p1 && mrp.p1 >= cmp.p0) {
+if (mr.p0 < cmp.p1 && mr.p1 >= cmp.p0) {
if (mrs->debug > 2) {
GGML_LOG_DEBUG("%s: the %s range buf=%lld, [%lld, %lld) overlaps with a previous %s range buf=%lld, [%lld, %lld)\n",
__func__,
-mrp.pt == MEM_RANGE_TYPE_SRC ? "src" : "dst",
-mrp.pb, mrp.p0, mrp.p1,
+mr.pt == MEM_RANGE_TYPE_SRC ? "src" : "dst",
+mr.pb, mr.p0, mr.p1,
cmp.pt == MEM_RANGE_TYPE_SRC ? "src" : "dst",
cmp.pb, cmp.p0, cmp.p1);
}
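Aside (not part of the diff): the three rules above (different buffers never conflict, overlapping reads are fine, everything else goes to the interval test) can be restated as one standalone predicate. The struct and enum below only mirror ggml_mem_range for illustration:

#include <cstdint>
#include <cstdio>

enum range_type { RANGE_SRC, RANGE_DST };

// locally mirrors ggml_mem_range: buffer id + half-open byte interval + src/dst tag
struct range {
    uint64_t pb;
    uint64_t p0;
    uint64_t p1;
    range_type pt;
};

static bool ranges_conflict(const range & a, const range & b) {
    if (a.pb != b.pb) {
        return false; // different buffers never alias
    }
    if (a.pt == RANGE_SRC && b.pt == RANGE_SRC) {
        return false; // two reads may overlap freely
    }
    // half-open interval test; the >= mirrors the patch and also serializes
    // back-to-back ranges (a.p1 == b.p0), which errs on the safe side
    return a.p0 < b.p1 && a.p1 >= b.p0;
}

int main() {
    const range w = { 1, 0x1000, 0x1400, RANGE_DST }; // write to [0x1000, 0x1400) in buffer 1
    const range r = { 1, 0x1200, 0x1600, RANGE_SRC }; // read of  [0x1200, 0x1600) in buffer 1

    printf("conflict: %d\n", ranges_conflict(r, w) ? 1 : 0); // prints 1
}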
@@ -146,19 +155,19 @@ static bool ggml_mem_ranges_check(const ggml_mem_ranges * mrs, ggml_mem_range mr
static bool ggml_mem_ranges_check_src(const ggml_mem_ranges * mrs, const ggml_tensor * tensor) {
GGML_ASSERT(tensor);

-ggml_mem_range mrp = ggml_mem_range_from_tensor_src(tensor);
+ggml_mem_range mr = ggml_mem_range_from_tensor_src(tensor);

-const bool res = ggml_mem_ranges_check(mrs, mrp);
+const bool res = ggml_mem_ranges_check(mrs, mr);

return res;
}

static bool ggml_mem_ranges_check_dst(const ggml_mem_ranges * mrs, const ggml_tensor * tensor) {
GGML_ASSERT(tensor);

-ggml_mem_range mrp = ggml_mem_range_from_tensor_dst(tensor);
+ggml_mem_range mr = ggml_mem_range_from_tensor_dst(tensor);

-const bool res = ggml_mem_ranges_check(mrs, mrp);
+const bool res = ggml_mem_ranges_check(mrs, mr);

return res;
}
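Aside (not part of the diff): a plausible way the public entry points declared in ggml-metal-common.h (ggml_mem_ranges_init/add/check/reset and the matching free) fit together when encoding a graph: keep adding nodes to the current batch while check() passes, start a new batch when it fails. This is a sketch with a made-up function name, not the actual Metal encoder logic:

#include "ggml.h"
#include "ggml-metal-common.h"

// sketch: grow a batch of concurrently-encodable nodes, and start a new batch
// (in the real backend: emit a barrier / split the encoder) whenever a hazard appears
static void encode_with_barriers(ggml_cgraph * gf) {
    ggml_mem_ranges * mrs = ggml_mem_ranges_init(/*debug =*/ 0);

    for (int i = 0; i < ggml_graph_n_nodes(gf); i++) {
        ggml_tensor * node = ggml_graph_node(gf, i);

        if (!ggml_mem_ranges_check(mrs, node)) {
            // the node reads or writes memory touched by the current batch
            ggml_mem_ranges_reset(mrs);
        }

        // record the node's src and dst ranges in the current batch
        ggml_mem_ranges_add(mrs, node);

        // ... encode the node here ...
    }

    ggml_mem_ranges_free(mrs);
}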
@@ -222,6 +231,7 @@ static std::vector<int> ggml_metal_graph_optimize_reorder(const std::vector<node
}
}

+// keep track of the sources of the fused nodes as well
for (const auto * fused : node.fused) {
for (int i = 0; i < GGML_MAX_SRC; i++) {
if (fused->src[i]) {
@@ -290,7 +300,10 @@ static std::vector<int> ggml_metal_graph_optimize_reorder(const std::vector<node

std::vector<bool> used(n, false);

+// the memory ranges for the set of currently concurrent nodes
ggml_mem_ranges * mrs0 = ggml_mem_ranges_init(0);

+// the memory ranges for the set of nodes that haven't been processed yet, when looking forward for a node to reorder
ggml_mem_ranges * mrs1 = ggml_mem_ranges_init(0);

for (int i0 = 0; i0 < n; i0++) {
@@ -329,7 +342,7 @@ static std::vector<int> ggml_metal_graph_optimize_reorder(const std::vector<node

const bool is_empty = node1.is_empty();

-// to add a concurrent node, it has to be:
+// to reorder a node and add it to the concurrent set, it has to be:
// + empty or concurrent with all nodes in the existing concurrent set (mrs0)
// + concurrent with all nodes prior to it that haven't been processed yet (mrs1)
if ((is_empty || h_check(mrs0, node1)) && h_check(mrs1, node1)) {
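Aside (not part of the diff): the two range sets drive a greedy look-ahead pass; a node is pulled forward into the current concurrent set only if it conflicts neither with that set (mrs0) nor with the unprocessed nodes it would jump over (mrs1). A self-contained toy version, using whole buffer ids instead of byte ranges and plain vectors instead of ggml_mem_ranges, purely for illustration:

#include <cstdio>
#include <set>
#include <vector>

struct toy_node {
    std::set<int> src; // buffer ids this node reads
    std::set<int> dst; // buffer ids this node writes
};

// hazard if either node writes a buffer that the other one touches
static bool conflict(const toy_node & a, const toy_node & b) {
    for (int d : a.dst) { if (b.src.count(d) || b.dst.count(d)) { return true; } }
    for (int d : b.dst) { if (a.src.count(d) || a.dst.count(d)) { return true; } }
    return false;
}

static bool conflict_with_any(const std::vector<toy_node> & group, const toy_node & x) {
    for (const auto & g : group) { if (conflict(g, x)) { return true; } }
    return false;
}

int main() {
    // node 1 reads what node 0 writes; node 2 is independent of both
    std::vector<toy_node> nodes = {
        { {10}, {0} },
        { { 0}, {1} },
        { {11}, {2} },
    };

    const int n = (int) nodes.size();

    std::vector<bool> used(n, false);
    std::vector<int>  order;

    for (int i0 = 0; i0 < n; i0++) {
        if (used[i0]) { continue; }

        std::vector<toy_node> concurrent = { nodes[i0] }; // plays the role of mrs0
        std::vector<toy_node> skipped;                    // plays the role of mrs1

        order.push_back(i0);
        used[i0] = true;

        for (int i1 = i0 + 1; i1 < n; i1++) {
            if (used[i1]) { continue; }

            if (!conflict_with_any(concurrent, nodes[i1]) &&
                !conflict_with_any(skipped,    nodes[i1])) {
                // safe to move forward: it joins the concurrent set
                order.push_back(i1);
                used[i1] = true;
                concurrent.push_back(nodes[i1]);
            } else {
                // stays where it is; later candidates must not conflict with it either
                skipped.push_back(nodes[i1]);
            }
        }
    }

    for (int i : order) { printf("%d ", i); } // prints: 0 2 1
    printf("\n");
}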
@@ -419,8 +432,8 @@ void ggml_metal_graph_optimize(ggml_cgraph * gf) {
nodes.push_back(std::move(node));
}

-// reorder to improve concurrency
#if 1
+// reorder to improve concurrency
const auto order = ggml_metal_graph_optimize_reorder(nodes);
#else
std::vector<int> order(nodes.size());