From 76681e3c735f8ac45eccf51368bb562a9ab27784 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 1 Jul 2025 15:59:43 +0300 Subject: [PATCH 1/6] llama : reuse compute graphs ggml-ci --- common/arg.cpp | 8 ++ common/common.cpp | 1 + common/common.h | 1 + include/llama.h | 3 + src/llama-batch.h | 25 +++++ src/llama-context.cpp | 170 +++++++++++++++--------------- src/llama-context.h | 28 ++--- src/llama-cparams.cpp | 29 +++++ src/llama-cparams.h | 3 + src/llama-graph.cpp | 71 ++++++++++++- src/llama-graph.h | 187 ++++++++++++++++++++++++++------- src/llama-hparams.cpp | 9 ++ src/llama-hparams.h | 2 + src/llama-kv-cache-unified.cpp | 60 ++++++----- src/llama-kv-cache-unified.h | 24 +++-- src/llama-model.cpp | 15 +-- src/llama-model.h | 5 +- 17 files changed, 456 insertions(+), 185 deletions(-) diff --git a/common/arg.cpp b/common/arg.cpp index 40af7e574830f..d4a3f5b7f21ec 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -1464,6 +1464,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex params.swa_full = true; } ).set_env("LLAMA_ARG_SWA_FULL")); + add_opt(common_arg( + {"--graph-reuse", "-gr"}, + string_format("reuse previous compute graphs when possible (default: %s)" + "[(more info)](https://github.com/ggml-org/llama.cpp/pull/14482)", params.graph_reuse ? "true" : "false"), + [](common_params & params) { + params.graph_reuse = true; + } + ).set_env("LLAMA_ARG_GRAPH_REUSE")); add_opt(common_arg( {"--no-context-shift"}, string_format("disables context shift on infinite text generation (default: %s)", params.ctx_shift ? "disabled" : "enabled"), diff --git a/common/common.cpp b/common/common.cpp index e4e71ad13fb59..c7c163404a2af 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -1157,6 +1157,7 @@ struct llama_context_params common_context_params_to_llama(const common_params & cparams.no_perf = params.no_perf; cparams.op_offload = !params.no_op_offload; cparams.swa_full = params.swa_full; + cparams.graph_reuse = params.graph_reuse; cparams.type_k = params.cache_type_k; cparams.type_v = params.cache_type_v; diff --git a/common/common.h b/common/common.h index 8922090e7b10d..465d05672d2a6 100644 --- a/common/common.h +++ b/common/common.h @@ -330,6 +330,7 @@ struct common_params { bool no_perf = false; // disable performance metrics bool ctx_shift = true; // context shift on inifinite text generation bool swa_full = false; // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055) + bool graph_reuse = false; // reuse previous compute graphs when possible bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix bool use_mmap = true; // use mmap for faster loads diff --git a/include/llama.h b/include/llama.h index 3eda9bc68608c..1e8228997f1d8 100644 --- a/include/llama.h +++ b/include/llama.h @@ -374,6 +374,8 @@ extern "C" { bool swa_full; // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055) // NOTE: setting to false when n_seq_max > 1 can cause bad performance in some cases // ref: https://github.com/ggml-org/llama.cpp/pull/13845#issuecomment-2924800573 + + bool graph_reuse; // reuse previous compute graphs when possible }; // model quantization parameters @@ -1429,6 +1431,7 @@ extern "C" { int32_t n_p_eval; int32_t n_eval; + int32_t n_reused; }; struct llama_perf_sampler_data { diff --git a/src/llama-batch.h b/src/llama-batch.h index 3420803ff9469..b27134c9331a9 100644 --- a/src/llama-batch.h +++ b/src/llama-batch.h @@ -34,6 
+34,31 @@ struct llama_ubatch { llama_seq_id * seq_id_unq; // [n_seqs_unq] | s | seq_id int32_t * seq_idx; // [LLAMA_MAX_SEQ] | - | seq_idx int8_t * output; // [n_tokens] | i | - + + bool is_same(const llama_ubatch & other) const { + bool res = + equal_seqs == other.equal_seqs && + n_tokens == other.n_tokens && + n_seq_tokens == other.n_seq_tokens && + n_seqs == other.n_seqs && + n_seqs_unq == other.n_seqs_unq && + ( + (!token && !other.token) || + (!embd && !other.embd) + ); + + if (!res) { + return false; + } + + // TODO: this won't work because seq_id_unq ptr can point to an old balloc that has + // been freed by this point. find a way to fix this + //for (uint32_t s = 0; s < n_seqs_unq; ++s) { + // res &= seq_id_unq[s] == other.seq_id_unq[s]; + //} + + return res; + } }; // a helper for sanitizing, fulfilling and splitting a batch diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 06e93b19cbf40..8a1e2a86f9709 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -101,7 +101,8 @@ llama_context::llama_context( cparams.n_ubatch = std::min(cparams.n_batch, params.n_ubatch == 0 ? params.n_batch : params.n_ubatch); - cparams.op_offload = params.op_offload; + cparams.op_offload = params.op_offload; + cparams.graph_reuse = params.graph_reuse; const uint32_t n_ctx_per_seq = cparams.n_ctx / cparams.n_seq_max; @@ -227,8 +228,8 @@ llama_context::llama_context( LLAMA_LOG_DEBUG("%s: max_nodes = %zu\n", __func__, max_nodes); - // buffer used to store the computation graph and the tensor meta data - buf_compute_meta.resize(ggml_tensor_overhead()*max_nodes + ggml_graph_overhead_custom(max_nodes, false)); + gf_res_prev.reset(new llm_graph_result(max_nodes)); + gf_res_reserve.reset(new llm_graph_result(max_nodes)); // TODO: move these checks to ggml_backend_sched // enabling pipeline parallelism in the scheduler increases memory usage, so it is only done when necessary @@ -388,10 +389,6 @@ ggml_backend_sched_t llama_context::get_sched() const { return sched.get(); } -ggml_context * llama_context::get_ctx_compute() const { - return ctx_compute.get(); -} - uint32_t llama_context::n_ctx() const { return cparams.n_ctx; } @@ -678,38 +675,52 @@ bool llama_context::apply_adapter_cvec( return cvec.apply(model, data, len, n_embd, il_start, il_end); } -llm_graph_result_ptr llama_context::process_ubatch(const llama_ubatch & ubatch, llm_graph_type gtype, llama_memory_context_i * mctx, ggml_status & ret) { +llm_graph_result_i * llama_context::process_ubatch(const llama_ubatch & ubatch, llm_graph_type gtype, llama_memory_context_i * mctx, ggml_status & ret) { if (mctx && !mctx->apply()) { LLAMA_LOG_ERROR("%s: failed to apply memory context\n", __func__); ret = GGML_STATUS_FAILED; return nullptr; } - auto * gf = graph_init(); - if (!gf) { - LLAMA_LOG_ERROR("%s: failed to initialize graph\n", __func__); - ret = GGML_STATUS_FAILED; - return nullptr; - } + auto * res = gf_res_prev.get(); + auto * gf = res->get_gf(); - auto res = graph_build(ctx_compute.get(), gf, ubatch, gtype, mctx); - if (!res) { - LLAMA_LOG_ERROR("%s: failed to build graph\n", __func__); - ret = GGML_STATUS_FAILED; - return nullptr; - } + // the new graph parameters + // in order to correctly reuse a graph, it's full topology has to be uniquely determined by these parameters + const auto gparams = graph_params(res, ubatch, mctx, gtype); - // LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs); + const bool can_reuse = cparams.graph_reuse && 
res->update(gparams); + if (can_reuse) { + LLAMA_LOG_DEBUG("%s: reusing previous graph\n", __func__); + n_reused++; + } else { + res->reset(); - if (!ggml_backend_sched_alloc_graph(sched.get(), gf)) { - LLAMA_LOG_ERROR("%s: failed to allocate graph\n", __func__); - ret = GGML_STATUS_ALLOC_FAILED; - return nullptr; + ggml_backend_sched_reset(sched.get()); + ggml_backend_sched_set_eval_callback(sched.get(), cparams.cb_eval, cparams.cb_eval_user_data); + + //const auto t_start_us = ggml_time_us(); + + gf = model.build_graph(gparams); + + //LLAMA_LOG_INFO("graph build time: %.3f ms\n", (ggml_time_us() - t_start_us)/1000.0); + + if (!gf) { + LLAMA_LOG_ERROR("%s: failed to initialize graph\n", __func__); + ret = GGML_STATUS_FAILED; + return nullptr; + } + + if (!ggml_backend_sched_alloc_graph(sched.get(), gf)) { + LLAMA_LOG_ERROR("%s: failed to allocate graph\n", __func__); + ret = GGML_STATUS_ALLOC_FAILED; + return nullptr; + } } res->set_inputs(&ubatch); - const auto status = graph_compute(gf, ubatch.n_tokens > 1); + const auto status = graph_compute(res->get_gf(), ubatch.n_tokens > 1); if (status != GGML_STATUS_SUCCESS) { LLAMA_LOG_ERROR("%s: failed to compute graph, compute status: %d\n", __func__, status); ret = status; @@ -767,9 +778,6 @@ int llama_context::encode(const llama_batch & batch_inp) { n_outputs = n_tokens; - ggml_backend_sched_reset(sched.get()); - ggml_backend_sched_set_eval_callback(sched.get(), cparams.cb_eval, cparams.cb_eval_user_data); - const auto causal_attn_org = cparams.causal_attn; // always use non-causal attention for encoder graphs @@ -778,7 +786,7 @@ int llama_context::encode(const llama_batch & batch_inp) { cparams.causal_attn = false; ggml_status status; - const auto res = process_ubatch(ubatch, LLM_GRAPH_TYPE_ENCODER, nullptr, status); + const auto * res = process_ubatch(ubatch, LLM_GRAPH_TYPE_ENCODER, nullptr, status); cparams.causal_attn = causal_attn_org; @@ -846,7 +854,9 @@ int llama_context::encode(const llama_batch & batch_inp) { // Reset state for the next token before backend sync, to allow the CPU activities in the reset to // overlap with device computation. - ggml_backend_sched_reset(sched.get()); + if (!cparams.graph_reuse) { + ggml_backend_sched_reset(sched.get()); + } // TODO: hacky solution if (model.arch == LLM_ARCH_T5 && t_embd) { @@ -1005,11 +1015,8 @@ int llama_context::decode(const llama_batch & batch_inp) { n_outputs = n_outputs_new; } - ggml_backend_sched_reset(sched.get()); - ggml_backend_sched_set_eval_callback(sched.get(), cparams.cb_eval, cparams.cb_eval_user_data); - ggml_status status; - const auto res = process_ubatch(ubatch, LLM_GRAPH_TYPE_DECODER, mctx.get(), status); + const auto * res = process_ubatch(ubatch, LLM_GRAPH_TYPE_DECODER, mctx.get(), status); if (!res) { // the last ubatch failed or was aborted -> remove all positions of that ubatch from the KV cache @@ -1192,7 +1199,9 @@ int llama_context::decode(const llama_batch & batch_inp) { // Reset state for the next token before backend sync, to allow the CPU activities in the reset to // overlap with device computation. 
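Note: the n_reused counter incremented above is exported through the public perf API (see the llama_perf_context_data change earlier in this patch). A minimal sketch of how a caller could observe it — it assumes an already-initialized llama_context `ctx` created with graph_reuse enabled and that has decoded a few batches, and it only uses the existing llama_perf_context() entry point:

    #include "llama.h"
    #include <cstdio>

    // Report how many times the previous compute graph was reused.
    // `ctx` is assumed to be a valid context that has already decoded a few
    // batches of identical shape (e.g. single-token generation steps).
    static void print_graph_reuse_stats(const llama_context * ctx) {
        const llama_perf_context_data data = llama_perf_context(ctx);

        // n_reused is bumped in llama_context::process_ubatch() whenever the
        // previous graph topology matches the new graph parameters
        printf("eval calls: %d, graphs reused: %d\n", data.n_eval, data.n_reused);
    }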
- ggml_backend_sched_reset(sched.get()); + if (!cparams.graph_reuse) { + ggml_backend_sched_reset(sched.get()); + } return 0; } @@ -1275,20 +1284,8 @@ uint32_t llama_context::output_reserve(int32_t n_outputs) { // graph // -int32_t llama_context::graph_max_nodes() const { - return std::max(65536, 5*model.n_tensors()); -} - -ggml_cgraph * llama_context::graph_init() { - ggml_init_params params = { - /*.mem_size =*/ buf_compute_meta.size(), - /*.mem_buffer =*/ buf_compute_meta.data(), - /*.no_alloc =*/ true, - }; - - ctx_compute.reset(ggml_init(params)); - - return ggml_new_graph_custom(ctx_compute.get(), graph_max_nodes(), false); +uint32_t llama_context::graph_max_nodes() const { + return std::max(65536u, 5u*model.n_tensors()); } ggml_cgraph * llama_context::graph_reserve(uint32_t n_tokens, uint32_t n_seqs, uint32_t n_outputs, const llama_memory_context_i * mctx) { @@ -1301,6 +1298,9 @@ ggml_cgraph * llama_context::graph_reserve(uint32_t n_tokens, uint32_t n_seqs, u LLAMA_LOG_DEBUG("%s: making n_tokens a multiple of n_seqs - n_tokens = %u, n_seqs = %u, n_outputs = %u\n", __func__, n_tokens, n_seqs, n_outputs); } + gf_res_prev->reset(); + ggml_backend_sched_reset(sched.get()); + // store the n_outputs as it is, and restore it afterwards // TODO: not sure if needed, might simplify in the future by removing this const auto save_n_outputs = this->n_outputs; @@ -1310,17 +1310,15 @@ ggml_cgraph * llama_context::graph_reserve(uint32_t n_tokens, uint32_t n_seqs, u llama_batch_allocr balloc(model.hparams.n_pos_per_embd()); llama_ubatch ubatch = balloc.ubatch_reserve(n_tokens/n_seqs, n_seqs); - auto * gf = graph_init(); - auto res = graph_build(ctx_compute.get(), gf, ubatch, LLM_GRAPH_TYPE_DEFAULT, mctx); + auto * res = gf_res_reserve.get(); - this->n_outputs = save_n_outputs; + const auto gparams = graph_params(res, ubatch, mctx, LLM_GRAPH_TYPE_DEFAULT); - if (!res) { - LLAMA_LOG_ERROR("%s: failed to build worst-case graph\n", __func__); - return nullptr; - } + res->reset(); - ggml_backend_sched_reset(sched.get()); + auto * gf = model.build_graph(gparams); + + this->n_outputs = save_n_outputs; // initialize scheduler with the specified graph if (!ggml_backend_sched_reserve(sched.get(), gf)) { @@ -1331,28 +1329,27 @@ ggml_cgraph * llama_context::graph_reserve(uint32_t n_tokens, uint32_t n_seqs, u return gf; } -llm_graph_result_ptr llama_context::graph_build( - ggml_context * ctx, - ggml_cgraph * gf, - const llama_ubatch & ubatch, - llm_graph_type gtype, - const llama_memory_context_i * mctx) { - return model.build_graph( - { - /*.ctx =*/ ctx, - /*.arch =*/ model.arch, - /*.hparams =*/ model.hparams, - /*.cparams =*/ cparams, - /*.ubatch =*/ ubatch, - /*.sched =*/ sched.get(), - /*.backend_cpu =*/ backend_cpu, - /*.cvec =*/ &cvec, - /*.loras =*/ &loras, - /*.mctx =*/ mctx, - /*.cross =*/ &cross, - /*.n_outputs =*/ n_outputs, - /*.cb =*/ graph_get_cb(), - }, gf, gtype); +llm_graph_params llama_context::graph_params( + llm_graph_result_i * res, + const llama_ubatch & ubatch, + const llama_memory_context_i * mctx, + llm_graph_type gtype) const { + return { + /*.arch =*/ model.arch, + /*.hparams =*/ model.hparams, + /*.cparams =*/ cparams, + /*.ubatch =*/ ubatch, + /*.gtype =*/ gtype, + /*.sched =*/ sched.get(), + /*.backend_cpu =*/ backend_cpu, + /*.cvec =*/ &cvec, + /*.loras =*/ &loras, + /*.mctx =*/ mctx, + /*.cross =*/ &cross, + /*.n_outputs =*/ n_outputs, + /*.cb =*/ graph_get_cb(), + /*.res =*/ res, + }; } ggml_status llama_context::graph_compute( @@ -1930,6 +1927,7 @@ llama_perf_context_data 
llama_context::perf_get_data() const { data.t_eval_ms = 1e-3 * t_eval_us; data.n_p_eval = std::max(1, n_p_eval); data.n_eval = std::max(1, n_eval); + data.n_reused = std::max(0, n_reused); return data; } @@ -1938,6 +1936,7 @@ void llama_context::perf_reset() { t_start_us = ggml_time_us(); t_eval_us = n_eval = 0; t_p_eval_us = n_p_eval = 0; + n_reused = 0; } // @@ -2064,8 +2063,13 @@ void llama_context::opt_epoch_iter( break; } - auto * gf = graph_init(); - auto res = graph_build(ctx_compute.get(), gf, ubatch, LLM_GRAPH_TYPE_DEFAULT, mctx.get()); + auto * res = gf_res_prev.get(); + + const auto gparams = graph_params(res, ubatch, mctx.get(), LLM_GRAPH_TYPE_DEFAULT); + + res->reset(); + + auto * gf = model.build_graph(gparams); struct ggml_context * ctx_compute_opt; { @@ -2187,6 +2191,7 @@ llama_context_params llama_context_default_params() { /*.no_perf =*/ true, /*.op_offload =*/ true, /*.swa_full =*/ true, + /*.graph_reuse =*/ false, }; return result; @@ -2807,6 +2812,7 @@ void llama_perf_context_print(const llama_context * ctx) { LLAMA_LOG_INFO("%s: eval time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n", __func__, data.t_eval_ms, data.n_eval, data.t_eval_ms / data.n_eval, 1e3 / data.t_eval_ms * data.n_eval); LLAMA_LOG_INFO("%s: total time = %10.2f ms / %5d tokens\n", __func__, (t_end_ms - data.t_start_ms), (data.n_p_eval + data.n_eval)); + LLAMA_LOG_INFO("%s: graphs reused = %10d\n", __func__, data.n_reused); } void llama_perf_context_reset(llama_context * ctx) { diff --git a/src/llama-context.h b/src/llama-context.h index 9ce05715a8c03..21d23d83851d1 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -35,8 +35,6 @@ struct llama_context { ggml_backend_sched_t get_sched() const; - ggml_context * get_ctx_compute() const; - uint32_t n_ctx() const; uint32_t n_ctx_per_seq() const; uint32_t n_batch() const; @@ -96,7 +94,7 @@ struct llama_context { // if memory_context is provided, it will be applied first to the context's memory // ret contains the status of the graph computation // returns nullptr only if ret != GGML_STATUS_SUCCESS - llm_graph_result_ptr process_ubatch( + llm_graph_result_i * process_ubatch( const llama_ubatch & ubatch, llm_graph_type gtype, llama_memory_context_i * mctx, @@ -188,10 +186,7 @@ struct llama_context { // public: - int32_t graph_max_nodes() const; - - // zero-out inputs and create the ctx_compute for the compute graph - ggml_cgraph * graph_init(); + uint32_t graph_max_nodes() const; // returns the result of ggml_backend_sched_graph_compute_async execution ggml_status graph_compute(ggml_cgraph * gf, bool batched); @@ -200,12 +195,11 @@ struct llama_context { ggml_cgraph * graph_reserve(uint32_t n_tokens, uint32_t n_seqs, uint32_t n_outputs, const llama_memory_context_i * mctx); private: - llm_graph_result_ptr graph_build( - ggml_context * ctx, - ggml_cgraph * gf, - const llama_ubatch & ubatch, - llm_graph_type gtype, - const llama_memory_context_i * mctx); + llm_graph_params graph_params( + llm_graph_result_i * res, + const llama_ubatch & ubatch, + const llama_memory_context_i * mctx, + llm_graph_type gtype) const; llm_graph_cb graph_get_cb() const; @@ -258,8 +252,6 @@ struct llama_context { ggml_backend_t backend_cpu = nullptr; std::vector backends; - ggml_context_ptr ctx_compute; - // training ggml_opt_context_t opt_ctx = nullptr; @@ -275,8 +267,8 @@ struct llama_context { std::vector backend_ptrs; std::vector backend_buft; - // memory buffers used to evaluate the model - std::vector buf_compute_meta; + llm_graph_result_ptr 
gf_res_prev; + llm_graph_result_ptr gf_res_reserve; // host buffer for the model output (logits and embeddings) ggml_backend_buffer_ptr buf_output; @@ -294,4 +286,6 @@ struct llama_context { mutable int32_t n_p_eval = 0; // number of tokens in eval calls for the prompt (with batch size > 1) mutable int32_t n_eval = 0; // number of eval calls + + mutable int32_t n_reused = 0; // number of times the previous graph was reused }; diff --git a/src/llama-cparams.cpp b/src/llama-cparams.cpp index a3e7a37ee36d7..f81cd10823d3b 100644 --- a/src/llama-cparams.cpp +++ b/src/llama-cparams.cpp @@ -3,3 +3,32 @@ size_t llama_max_parallel_sequences(void) { return LLAMA_MAX_SEQ; } + +bool llama_cparams::is_same(const llama_cparams & other) const { + return + n_ctx == other.n_ctx && + n_batch == other.n_batch && + n_ubatch == other.n_ubatch && + n_seq_max == other.n_seq_max && + n_threads == other.n_threads && + n_threads_batch == other.n_threads_batch && + rope_freq_base == other.rope_freq_base && + rope_freq_scale == other.rope_freq_scale && + n_ctx_orig_yarn == other.n_ctx_orig_yarn && + yarn_ext_factor == other.yarn_ext_factor && + yarn_attn_factor == other.yarn_attn_factor && + yarn_beta_fast == other.yarn_beta_fast && + yarn_beta_slow == other.yarn_beta_slow && + defrag_thold == other.defrag_thold && + embeddings == other.embeddings && + causal_attn == other.causal_attn && + offload_kqv == other.offload_kqv && + flash_attn == other.flash_attn && + no_perf == other.no_perf && + warmup == other.warmup && + op_offload == other.op_offload && + graph_reuse == other.graph_reuse && + pooling_type == other.pooling_type && + cb_eval == other.cb_eval && + cb_eval_user_data == other.cb_eval_user_data; +} diff --git a/src/llama-cparams.h b/src/llama-cparams.h index 118615d5bd2d5..064767d51d92c 100644 --- a/src/llama-cparams.h +++ b/src/llama-cparams.h @@ -33,9 +33,12 @@ struct llama_cparams { bool no_perf; bool warmup; bool op_offload; + bool graph_reuse; enum llama_pooling_type pooling_type; ggml_backend_sched_eval_callback cb_eval; void * cb_eval_user_data; + + bool is_same(const llama_cparams & other) const; }; diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp index 7f0e8c67f1325..264491b4de83a 100644 --- a/src/llama-graph.cpp +++ b/src/llama-graph.cpp @@ -28,6 +28,15 @@ void llm_graph_input_embd::set_input(const llama_ubatch * ubatch) { } } +bool llm_graph_input_embd::update(const llm_graph_params & params) { + bool res = true; + + res &= (!tokens && !params.ubatch.token) || (tokens && tokens->ne[0] == params.ubatch.n_tokens); + res &= (!embd && !params.ubatch.embd) || (embd && embd->ne[0] == params.ubatch.n_tokens); + + return res; +} + void llm_graph_input_pos::set_input(const llama_ubatch * ubatch) { if (ubatch->pos && pos) { const int64_t n_tokens = ubatch->n_tokens; @@ -50,6 +59,14 @@ void llm_graph_input_pos::set_input(const llama_ubatch * ubatch) { } } +bool llm_graph_input_pos::update(const llm_graph_params & params) { + bool res = true; + + res &= pos->ne[0] == params.ubatch.n_tokens; + + return res; +} + void llm_graph_input_attn_temp::set_input(const llama_ubatch * ubatch) { if (ubatch->pos && attn_scale) { const int64_t n_tokens = ubatch->n_tokens; @@ -118,6 +135,14 @@ void llm_graph_input_out_ids::set_input(const llama_ubatch * ubatch) { } } +bool llm_graph_input_out_ids::update(const llm_graph_params & params) { + bool res = true; + + res &= n_outputs == params.n_outputs; + + return res; +} + void llm_graph_input_mean::set_input(const llama_ubatch * ubatch) { if (cparams.embeddings && 
cparams.pooling_type == LLAMA_POOLING_TYPE_MEAN) { const int64_t n_tokens = ubatch->n_tokens; @@ -287,6 +312,24 @@ void llm_graph_input_attn_kv_unified::set_input(const llama_ubatch * ubatch) { mctx->set_input_kq_mask(self_kq_mask, ubatch, cparams.causal_attn); } +bool llm_graph_input_attn_kv_unified::update(const llm_graph_params & params) { + const auto * mctx = static_cast(params.mctx); + + this->mctx = mctx; + + bool res = true; + + res &= self_k_idxs->ne[0] == params.ubatch.n_tokens; + res &= self_v_idxs->ne[0] == params.ubatch.n_tokens; + + res &= self_kq_mask->ne[0] == mctx->get_n_kv(); + res &= self_kq_mask->ne[1] == GGML_PAD(params.ubatch.n_tokens, GGML_KQ_MASK_PAD); + + res &= mctx->get_supports_set_rows(); // TODO: tmp + + return res; +} + void llm_graph_input_attn_kv_unified_iswa::set_input(const llama_ubatch * ubatch) { mctx->get_base()->set_input_k_idxs(self_k_idxs, ubatch); mctx->get_base()->set_input_v_idxs(self_v_idxs, ubatch); @@ -299,6 +342,30 @@ void llm_graph_input_attn_kv_unified_iswa::set_input(const llama_ubatch * ubatch mctx->get_swa()->set_input_kq_mask(self_kq_mask_swa, ubatch, cparams.causal_attn); } +bool llm_graph_input_attn_kv_unified_iswa::update(const llm_graph_params & params) { + const auto * mctx = static_cast(params.mctx); + + this->mctx = mctx; + + bool res = true; + + res &= self_k_idxs->ne[0] == params.ubatch.n_tokens; + res &= self_v_idxs->ne[0] == params.ubatch.n_tokens; + + res &= self_k_idxs_swa->ne[0] == params.ubatch.n_tokens; + res &= self_v_idxs_swa->ne[0] == params.ubatch.n_tokens; + + res &= self_kq_mask->ne[0] == mctx->get_base()->get_n_kv(); + res &= self_kq_mask->ne[1] == GGML_PAD(params.ubatch.n_tokens, GGML_KQ_MASK_PAD); + + res &= self_kq_mask_swa->ne[0] == mctx->get_swa()->get_n_kv(); + res &= self_kq_mask_swa->ne[1] == GGML_PAD(params.ubatch.n_tokens, GGML_KQ_MASK_PAD); + + res &= mctx->get_base()->get_supports_set_rows(); // TODO: tmp + + return res; +} + void llm_graph_input_attn_cross::set_input(const llama_ubatch * ubatch) { GGML_ASSERT(cross_kq_mask); @@ -395,7 +462,6 @@ llm_graph_context::llm_graph_context(const llm_graph_params & params) : n_ctx_orig (cparams.n_ctx_orig_yarn), pooling_type (cparams.pooling_type), rope_type (hparams.rope_type), - ctx0 (params.ctx), sched (params.sched), backend_cpu (params.backend_cpu), cvec (params.cvec), @@ -403,7 +469,8 @@ llm_graph_context::llm_graph_context(const llm_graph_params & params) : mctx (params.mctx), cross (params.cross), cb_func (params.cb), - res (std::make_unique()) { + res (static_cast(params.res)), + ctx0 (res->get_ctx()) { } void llm_graph_context::cb(ggml_tensor * cur, const char * name, int il) const { diff --git a/src/llama-graph.h b/src/llama-graph.h index 7bdf656768a0c..ae2fd6481b609 100644 --- a/src/llama-graph.h +++ b/src/llama-graph.h @@ -1,6 +1,7 @@ #pragma once #include "llama-arch.h" +#include "llama-batch.h" #include "llama-hparams.h" #include "llama-adapter.h" @@ -14,7 +15,6 @@ struct ggml_cgraph; struct ggml_context; struct ggml_tensor; -struct llama_ubatch; struct llama_cparams; struct llama_memory_context_i; @@ -69,6 +69,8 @@ struct llama_cross { std::vector> seq_ids_enc; }; +struct llm_graph_params; + // // llm_graph_input // @@ -78,11 +80,19 @@ class llm_graph_input_i { virtual ~llm_graph_input_i() = default; virtual void set_input(const llama_ubatch * ubatch) = 0; + + // return true if the resulting input tensors using the provided graph parameters would be + // the same as the previous input tensors that we have currently stored in the object + virtual 
bool update(const llm_graph_params & params) { + // returning false here by default will prevent from reusing the graph if the check + // for the input type has not been implemented yet + GGML_UNUSED(params); + return false; + } }; using llm_graph_input_ptr = std::unique_ptr; - class llm_graph_input_embd : public llm_graph_input_i { public: llm_graph_input_embd() = default; @@ -90,6 +100,8 @@ class llm_graph_input_embd : public llm_graph_input_i { void set_input(const llama_ubatch * ubatch) override; + bool update(const llm_graph_params & params) override; + ggml_tensor * tokens = nullptr; // I32 [n_batch] ggml_tensor * embd = nullptr; // F32 [n_embd, n_batch] }; @@ -101,6 +113,8 @@ class llm_graph_input_pos : public llm_graph_input_i { void set_input(const llama_ubatch * ubatch) override; + bool update(const llm_graph_params & params) override; + ggml_tensor * pos = nullptr; // I32 [n_batch] const uint32_t n_pos_per_embd = 1; @@ -154,17 +168,19 @@ class llm_graph_input_out_ids : public llm_graph_input_i { llm_graph_input_out_ids( const llama_hparams & hparams, const llama_cparams & cparams, - int32_t n_outputs) : hparams(hparams), cparams(cparams), n_outputs(n_outputs) {} + uint32_t n_outputs) : hparams(hparams), cparams(cparams), n_outputs(n_outputs) {} virtual ~llm_graph_input_out_ids() = default; void set_input(const llama_ubatch * ubatch) override; + bool update(const llm_graph_params & params) override; + ggml_tensor * out_ids; // I32 [n_outputs] const llama_hparams & hparams; const llama_cparams & cparams; - const int32_t n_outputs; + const uint32_t n_outputs; }; class llm_graph_input_mean : public llm_graph_input_i { @@ -249,6 +265,8 @@ class llm_graph_input_attn_kv_unified : public llm_graph_input_i { void set_input(const llama_ubatch * ubatch) override; + bool update(const llm_graph_params & params) override; + ggml_tensor * get_k_idxs() const { return self_k_idxs; } ggml_tensor * get_v_idxs() const { return self_v_idxs; } @@ -280,6 +298,8 @@ class llm_graph_input_attn_kv_unified_iswa : public llm_graph_input_i { void set_input(const llama_ubatch * ubatch) override; + bool update(const llm_graph_params & params) override; + ggml_tensor * get_k_idxs() const { return self_k_idxs; } ggml_tensor * get_v_idxs() const { return self_v_idxs; } ggml_tensor * get_k_idxs_swa() const { return self_k_idxs_swa; } @@ -373,29 +393,110 @@ class llm_graph_input_one : public llm_graph_input_i { // along with the input tensors, the object also provides commonly used outputs tensors, such as logits, embeddings, etc. // these are used by the llama_context to extact the relevant data, based on the compute parameters +// TODO: this interface seems redundant - remove it class llm_graph_result_i { public: virtual ~llm_graph_result_i() = default; - virtual ggml_tensor * get_tokens() = 0; - virtual ggml_tensor * get_logits() = 0; - virtual ggml_tensor * get_embd() = 0; - virtual ggml_tensor * get_embd_pooled() = 0; + virtual ggml_tensor * get_tokens() const = 0; + virtual ggml_tensor * get_logits() const = 0; + virtual ggml_tensor * get_embd() const = 0; + virtual ggml_tensor * get_embd_pooled() const = 0; + + virtual ggml_cgraph * get_gf() = 0; + virtual ggml_context * get_ctx() = 0; + + virtual void reset() = 0; virtual void set_inputs(const llama_ubatch * ubatch) = 0; + + virtual bool update(const llm_graph_params & params) = 0; }; using llm_graph_result_ptr = std::unique_ptr; +// callback that allows us to apply custom logic to each tensor (e.g. ggml-alloc, offloading, etc.) 
+using llm_graph_cb = std::function; + +struct llm_graph_params { + llm_arch arch = LLM_ARCH_UNKNOWN; + + llama_hparams hparams; + llama_cparams cparams; + + llama_ubatch ubatch; // note: intentionally make a copy + + llm_graph_type gtype; + + ggml_backend_sched_t sched; + ggml_backend_t backend_cpu; + + const llama_adapter_cvec * cvec; + const llama_adapter_loras * loras; + const llama_memory_context_i * mctx; + const llama_cross * cross; + + uint32_t n_outputs; + + llm_graph_cb cb; + + // TODO: temporary + llm_graph_result_i * res; + + bool is_same(const llm_graph_params & other) const { + return + hparams.is_same(other.hparams) && + cparams.is_same(other.cparams) && + ubatch .is_same(other.ubatch) && + arch == other.arch && + gtype == other.gtype && + cvec == other.cvec && + loras == other.loras && + cross == other.cross && + n_outputs == other.n_outputs; + } +}; class llm_graph_result : public llm_graph_result_i { public: + llm_graph_result(int64_t max_nodes) : max_nodes(max_nodes) { + reset(); + } + virtual ~llm_graph_result() = default; - ggml_tensor * get_tokens() override { return t_tokens; } - ggml_tensor * get_logits() override { return t_logits; } - ggml_tensor * get_embd() override { return t_embd; } - ggml_tensor * get_embd_pooled() override { return t_embd_pooled; } + ggml_tensor * get_tokens() const override { return t_tokens; } + ggml_tensor * get_logits() const override { return t_logits; } + ggml_tensor * get_embd() const override { return t_embd; } + ggml_tensor * get_embd_pooled() const override { return t_embd_pooled; } + + ggml_cgraph * get_gf() override { return gf; } + ggml_context * get_ctx() override { return ctx_compute.get(); } + + void set_max_nodes(int64_t max_nodes) { + this->max_nodes = max_nodes; + } + + void reset() override { + t_tokens = nullptr; + t_logits = nullptr; + t_embd = nullptr; + t_embd_pooled = nullptr; + + inputs.clear(); + + buf_compute_meta.resize(ggml_tensor_overhead()*max_nodes + ggml_graph_overhead_custom(max_nodes, false)); + + ggml_init_params params = { + /*.mem_size =*/ buf_compute_meta.size(), + /*.mem_buffer =*/ buf_compute_meta.data(), + /*.no_alloc =*/ true, + }; + + ctx_compute.reset(ggml_init(params)); + + gf = ggml_new_graph_custom(ctx_compute.get(), max_nodes, false); + } void set_inputs(const llama_ubatch * ubatch) override { for (auto & input : inputs) { @@ -403,6 +504,25 @@ class llm_graph_result : public llm_graph_result_i { } } + // try to update the existing graph result using the new graph parameters + // this can only be done if we determine that the resulting graph using the new graph parameters + // would be identical to the existing graph. in that case, we simply have to update the memory + // contexts of the input tensors of the graph and we can reuse it for another computation + // return true if the graph was updated and can be reused + bool update(const llm_graph_params & params) override { + if (!this->params.is_same(params)) { + return false; + } + + bool res = true; + + for (auto & input : inputs) { + res &= input->update(params); + } + + return res; + } + llm_graph_input_i * add_input(llm_graph_input_ptr input) { inputs.emplace_back(std::move(input)); return inputs.back().get(); @@ -415,37 +535,26 @@ class llm_graph_result : public llm_graph_result_i { ggml_tensor * t_embd_pooled = nullptr; std::vector inputs; -}; -// -// llm_graph_context -// + ggml_context_ptr ctx_compute; -// callback that allows us to apply custom logic to each tensor (e.g. ggml-alloc, offloading, etc.) 
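Note: since the llm_graph_input_i default above returns false, a new input type only participates in graph reuse once it implements its own check, following the pattern of llm_graph_input_pos. A sketch of such an override for a hypothetical per-token I32 input (the class name and constructor are illustrative, not part of this patch; the method is renamed to can_reuse() later in the series):

    // Hypothetical input: one I32 value per token, reusable as long as the
    // pre-allocated tensor still matches the number of tokens in the new ubatch.
    class llm_graph_input_example : public llm_graph_input_i {
    public:
        llm_graph_input_example(ggml_context * ctx, int64_t n_tokens) {
            vals = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n_tokens);
            ggml_set_input(vals);
        }

        void set_input(const llama_ubatch * ubatch) override {
            // copy the per-token data into the input tensor
            ggml_backend_tensor_set(vals, ubatch->pos, 0, ubatch->n_tokens*ggml_element_size(vals));
        }

        bool update(const llm_graph_params & params) override {
            // same shape -> the resulting input tensor would be identical
            return vals && vals->ne[0] == (int64_t) params.ubatch.n_tokens;
        }

        ggml_tensor * vals = nullptr; // I32 [n_batch]
    };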
-using llm_graph_cb = std::function; + // memory buffers used to evaluate the model + std::vector buf_compute_meta; -struct llm_graph_params { - ggml_context * ctx; + ggml_cgraph * gf; - const llm_arch arch; + int64_t max_nodes; - const llama_hparams & hparams; - const llama_cparams & cparams; - const llama_ubatch & ubatch; - - ggml_backend_sched_t sched; - ggml_backend_t backend_cpu; - - const llama_adapter_cvec * cvec; - const llama_adapter_loras * loras; - const llama_memory_context_i * mctx; - const llama_cross * cross; - - uint32_t n_outputs; - - const llm_graph_cb & cb; + // keep a copy of the previous graph parameters + // we will use this to determine whether the graph can be reused by comparing them with the new parameters + // note: these are updated after constructing the new graph + llm_graph_params params; }; +// +// llm_graph_context +// + // used in build_rs to properly order writes and avoid unnecessary copies using llm_graph_get_rows_fn = std::function; @@ -485,8 +594,6 @@ struct llm_graph_context { const enum llama_pooling_type pooling_type; const enum llama_rope_type rope_type; - ggml_context * ctx0 = nullptr; - ggml_backend_sched_t sched; ggml_backend_t backend_cpu; // TODO: needed by build_attn_mha, figure out a way to remove? @@ -498,7 +605,9 @@ struct llm_graph_context { const llm_graph_cb & cb_func; - std::unique_ptr res; + llm_graph_result * res; + + ggml_context * ctx0 = nullptr; llm_graph_context(const llm_graph_params & params); virtual ~llm_graph_context() = default; diff --git a/src/llama-hparams.cpp b/src/llama-hparams.cpp index 86c814d51b901..a59e96a92fe7e 100644 --- a/src/llama-hparams.cpp +++ b/src/llama-hparams.cpp @@ -102,3 +102,12 @@ bool llama_hparams::is_swa(uint32_t il) const { GGML_ABORT("fatal error"); } + +bool llama_hparams::is_same(const llama_hparams & other) const { + return + n_ctx_train == other.n_ctx_train && + n_embd == other.n_embd && + n_layer == other.n_layer && + n_expert == other.n_expert && + n_expert_used == other.n_expert_used; +} diff --git a/src/llama-hparams.h b/src/llama-hparams.h index 476d0a5eade28..dfddb4f3d9534 100644 --- a/src/llama-hparams.h +++ b/src/llama-hparams.h @@ -202,6 +202,8 @@ struct llama_hparams { uint32_t n_pos_per_embd() const; bool is_swa(uint32_t il) const; + + bool is_same(const llama_hparams & other) const; }; static_assert(std::is_trivially_copyable::value, "llama_hparams must be trivially copyable"); diff --git a/src/llama-kv-cache-unified.cpp b/src/llama-kv-cache-unified.cpp index d3129cc53281e..e91b92b6095e1 100644 --- a/src/llama-kv-cache-unified.cpp +++ b/src/llama-kv-cache-unified.cpp @@ -68,6 +68,8 @@ llama_kv_cache_unified::llama_kv_cache_unified( cells.resize(kv_size); + gf_res.reset(new llm_graph_result(32768)); // note: the max nodes will be updated later + for (uint32_t il = 0; il < n_layer_cache; il++) { if (filter && !filter(il)) { LLAMA_LOG_DEBUG("%s: layer %3d: skipped\n", __func__, il); @@ -158,7 +160,7 @@ llama_kv_cache_unified::llama_kv_cache_unified( debug = LLAMA_KV_CACHE_DEBUG ? atoi(LLAMA_KV_CACHE_DEBUG) : 0; const char * LLAMA_SET_ROWS = getenv("LLAMA_SET_ROWS"); - supports_set_rows = LLAMA_SET_ROWS ? atoi(LLAMA_SET_ROWS) : 0; + supports_set_rows = LLAMA_SET_ROWS ? 
atoi(LLAMA_SET_ROWS) != 0 : 0; if (!supports_set_rows) { LLAMA_LOG_WARN("%s: LLAMA_SET_ROWS=0, using old ggml_cpy() method for backwards compatibility\n", __func__); @@ -480,14 +482,12 @@ bool llama_kv_cache_unified::update(llama_context * lctx, bool do_shift, const d if (hparams.rope_type != LLAMA_ROPE_TYPE_NONE) { ggml_backend_sched_reset(sched); - auto * gf = lctx->graph_init(); + auto * res = gf_res.get(); - auto res = build_graph_shift(lctx->get_cparams(), lctx->get_ctx_compute(), gf); - if (!res) { - LLAMA_LOG_ERROR("%s: failed to build graph for K-shift\n", __func__); - return updated; - } + res->set_max_nodes(lctx->graph_max_nodes()); + res->reset(); + auto * gf = build_graph_shift(res, lctx); if (!ggml_backend_sched_alloc_graph(sched, gf)) { LLAMA_LOG_ERROR("%s: failed to allocate compute graph for K-shift\n", __func__); return updated; @@ -529,14 +529,12 @@ bool llama_kv_cache_unified::update(llama_context * lctx, bool do_shift, const d ggml_backend_sched_reset(sched); - auto * gf = lctx->graph_init(); + auto * res = gf_res.get(); - auto res = build_graph_defrag(lctx->get_cparams(), lctx->get_ctx_compute(), gf, dinfo); - if (!res) { - LLAMA_LOG_ERROR("%s: failed to build graph for defrag\n", __func__); - return updated; - } + res->set_max_nodes(lctx->graph_max_nodes()); + res->reset(); + auto * gf = build_graph_defrag(res, lctx, dinfo); if (!ggml_backend_sched_alloc_graph(sched, gf)) { LLAMA_LOG_ERROR("%s: failed to allocate compute graph for defrag\n", __func__); return updated; @@ -780,6 +778,10 @@ uint32_t llama_kv_cache_unified::get_n_kv() const { return std::min(cells.size(), std::max(n_pad, GGML_PAD(cells.used_max_p1(), n_pad))); } +bool llama_kv_cache_unified::get_supports_set_rows() const { + return supports_set_rows; +} + ggml_tensor * llama_kv_cache_unified::get_k(ggml_context * ctx, int32_t il, uint32_t n_kv) const { const int32_t ikv = map_layer_ids.at(il); @@ -1142,11 +1144,9 @@ void llm_graph_input_k_shift::set_input(const llama_ubatch * ubatch) { } } -llm_graph_result_ptr llama_kv_cache_unified::build_graph_shift( - const llama_cparams & cparams, - ggml_context * ctx, - ggml_cgraph * gf) const { - auto res = std::make_unique(); +ggml_cgraph * llama_kv_cache_unified::build_graph_shift(llm_graph_result * res, llama_context * lctx) const { + auto * ctx = res->get_ctx(); + auto * gf = res->get_gf(); const auto & n_embd_head_k = hparams.n_embd_head_k; //const auto & n_embd_head_v = hparams.n_embd_head_v; @@ -1156,6 +1156,8 @@ llm_graph_result_ptr llama_kv_cache_unified::build_graph_shift( inp->k_shift = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, cells.size()); ggml_set_input(inp->k_shift); + const auto & cparams = lctx->get_cparams(); + for (const auto & layer : layers) { const uint32_t il = layer.il; @@ -1181,18 +1183,20 @@ llm_graph_result_ptr llama_kv_cache_unified::build_graph_shift( res->add_input(std::move(inp)); - return res; + return gf; } -llm_graph_result_ptr llama_kv_cache_unified::build_graph_defrag( - const llama_cparams & cparams, - ggml_context * ctx, - ggml_cgraph * gf, - const defrag_info & dinfo) const { - auto res = std::make_unique(); +ggml_cgraph * llama_kv_cache_unified::build_graph_defrag( + llm_graph_result * res, + llama_context * lctx, + const defrag_info & dinfo) const { + auto * ctx = res->get_ctx(); + auto * gf = res->get_gf(); const auto & ids = dinfo.ids; + const auto & cparams = lctx->get_cparams(); + #if 0 // CPU defrag // @@ -1329,7 +1333,7 @@ llm_graph_result_ptr llama_kv_cache_unified::build_graph_defrag( //LLAMA_LOG_INFO("gf->n_nodes = 
%d\n", gf->n_nodes); #endif - return res; + return gf; } llama_kv_cache_unified::defrag_info llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) const { @@ -1940,6 +1944,10 @@ uint32_t llama_kv_cache_unified_context::get_n_kv() const { return n_kv; } +bool llama_kv_cache_unified_context::get_supports_set_rows() const { + return kv->get_supports_set_rows(); +} + ggml_tensor * llama_kv_cache_unified_context::get_k(ggml_context * ctx, int32_t il) const { return kv->get_k(ctx, il, n_kv); } diff --git a/src/llama-kv-cache-unified.h b/src/llama-kv-cache-unified.h index b8b0356e830c8..3eff137fee953 100644 --- a/src/llama-kv-cache-unified.h +++ b/src/llama-kv-cache-unified.h @@ -121,6 +121,9 @@ class llama_kv_cache_unified : public llama_memory_i { uint32_t get_n_kv() const; + // TODO: temporary + bool get_supports_set_rows() const; + // get views of the current state of the cache ggml_tensor * get_k(ggml_context * ctx, int32_t il, uint32_t n_kv) const; ggml_tensor * get_v(ggml_context * ctx, int32_t il, uint32_t n_kv) const; @@ -193,13 +196,15 @@ class llama_kv_cache_unified : public llama_memory_i { // env: LLAMA_SET_ROWS (temporary) // ref: https://github.com/ggml-org/llama.cpp/pull/14285 - int supports_set_rows = false; + bool supports_set_rows = false; const llama_swa_type swa_type = LLAMA_SWA_TYPE_NONE; std::vector ctxs; std::vector bufs; + std::unique_ptr gf_res; + llama_kv_cells_unified cells; std::vector layers; @@ -226,15 +231,13 @@ class llama_kv_cache_unified : public llama_memory_i { float freq_base, float freq_scale) const; - llm_graph_result_ptr build_graph_shift( - const llama_cparams & cparams, - ggml_context * ctx, - ggml_cgraph * gf) const; + ggml_cgraph * build_graph_shift( + llm_graph_result * res, + llama_context * lctx) const; - llm_graph_result_ptr build_graph_defrag( - const llama_cparams & cparams, - ggml_context * ctx, - ggml_cgraph * gf, + ggml_cgraph * build_graph_defrag( + llm_graph_result * res, + llama_context * lctx, const defrag_info & dinfo) const; void state_write_meta(llama_io_write_i & io, const std::vector> & cell_ranges, llama_seq_id seq_id = -1) const; @@ -288,6 +291,9 @@ class llama_kv_cache_unified_context : public llama_memory_context_i { uint32_t get_n_kv() const; + // TODO: temporary + bool get_supports_set_rows() const; + // get views of the current state of the cache ggml_tensor * get_k(ggml_context * ctx, int32_t il) const; ggml_tensor * get_v(ggml_context * ctx, int32_t il) const; diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 0573c5bcea0a4..caab6a0d270be 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -14751,10 +14751,10 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params, return res; } -llm_graph_result_ptr llama_model::build_graph( - const llm_graph_params & params, - ggml_cgraph * gf, - llm_graph_type type) const { +ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const { + // TODO: temporary - will refactor this to keep the "gf" instance in the llm_graph_context and avoid passing it everywhere + auto * gf = params.res->get_gf(); + std::unique_ptr llm; switch (arch) { @@ -14961,7 +14961,7 @@ llm_graph_result_ptr llama_model::build_graph( } break; case LLM_ARCH_T5: { - switch (type) { + switch (params.gtype) { case LLM_GRAPH_TYPE_ENCODER: llm = std::make_unique(*this, params, gf); break; @@ -15047,7 +15047,10 @@ llm_graph_result_ptr llama_model::build_graph( // add on pooling layer llm->build_pooling(gf, cls, cls_b, cls_out, cls_out_b); - return 
std::move(llm->res); + // TODO: updating the graph parameters here is a little bit obscure - figure out something better + llm->res->params = params; + + return llm->res->get_gf(); } // diff --git a/src/llama-model.h b/src/llama-model.h index 979fff62045f9..9bf9cf327dab9 100644 --- a/src/llama-model.h +++ b/src/llama-model.h @@ -436,10 +436,7 @@ struct llama_model { llama_memory_i * create_memory(const llama_memory_params & params, llama_cparams & cparams) const; // TODO: move this to new llm_arch_model_i interface - llm_graph_result_ptr build_graph( - const llm_graph_params & params, - ggml_cgraph * gf, - llm_graph_type type) const; + ggml_cgraph * build_graph(const llm_graph_params & params) const; private: struct impl; From 0d2038f90a7738ce4c0a86808d09800453251bf7 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 7 Jul 2025 09:07:15 +0300 Subject: [PATCH 2/6] llama-bench : add graph reuse parameter ggml-ci --- tools/llama-bench/llama-bench.cpp | 37 ++++++++++++++++++++++++++++--- 1 file changed, 34 insertions(+), 3 deletions(-) diff --git a/tools/llama-bench/llama-bench.cpp b/tools/llama-bench/llama-bench.cpp index b80e984d0245b..ad74dbaf27a11 100644 --- a/tools/llama-bench/llama-bench.cpp +++ b/tools/llama-bench/llama-bench.cpp @@ -261,6 +261,7 @@ struct cmd_params { std::vector use_mmap; std::vector embeddings; std::vector no_op_offload; + std::vector graph_reuse; ggml_numa_strategy numa; int reps; ggml_sched_priority prio; @@ -298,6 +299,7 @@ static const cmd_params cmd_params_defaults = { /* use_mmap */ { true }, /* embeddings */ { false }, /* no_op_offload */ { false }, + /* graph_reuse */ { false }, /* numa */ GGML_NUMA_STRATEGY_DISABLED, /* reps */ 5, /* prio */ GGML_SCHED_PRIO_NORMAL, @@ -377,6 +379,7 @@ static void print_usage(int /* argc */, char ** argv) { printf(" -ot --override-tensors =;...\n"); printf(" (default: disabled)\n"); printf(" -nopo, --no-op-offload <0|1> (default: 0)\n"); + printf(" -gr, --graph-reuse <0|1> (default: 0)\n"); printf("\n"); printf( "Multiple values can be given for each parameter by separating them with ','\n" @@ -620,6 +623,13 @@ static cmd_params parse_cmd_params(int argc, char ** argv) { } auto p = string_split(argv[i], split_delim); params.no_kv_offload.insert(params.no_kv_offload.end(), p.begin(), p.end()); + } else if (arg == "-gr" || arg == "--graph-reuse") { + if (++i >= argc) { + invalid_param = true; + break; + } + auto p = string_split(argv[i], split_delim); + params.graph_reuse.insert(params.graph_reuse.end(), p.begin(), p.end()); } else if (arg == "--numa") { if (++i >= argc) { invalid_param = true; @@ -885,6 +895,9 @@ static cmd_params parse_cmd_params(int argc, char ** argv) { if (params.no_op_offload.empty()) { params.no_op_offload = cmd_params_defaults.no_op_offload; } + if (params.graph_reuse.empty()) { + params.graph_reuse = cmd_params_defaults.graph_reuse; + } if (params.n_threads.empty()) { params.n_threads = cmd_params_defaults.n_threads; } @@ -926,6 +939,7 @@ struct cmd_params_instance { bool use_mmap; bool embeddings; bool no_op_offload; + bool graph_reuse; llama_model_params to_llama_mparams() const { llama_model_params mparams = llama_model_default_params(); @@ -998,6 +1012,7 @@ struct cmd_params_instance { cparams.embeddings = embeddings; cparams.op_offload = !no_op_offload; cparams.swa_full = false; + cparams.graph_reuse = graph_reuse; return cparams; } @@ -1018,6 +1033,7 @@ static std::vector get_cmd_params_instances(const cmd_param for (const auto & mmp : params.use_mmap) for (const auto & embd : 
params.embeddings) for (const auto & nopo : params.no_op_offload) + for (const auto & gr : params.graph_reuse) for (const auto & nb : params.n_batch) for (const auto & nub : params.n_ubatch) for (const auto & tk : params.type_k) @@ -1059,6 +1075,7 @@ static std::vector get_cmd_params_instances(const cmd_param /* .use_mmap = */ mmp, /* .embeddings = */ embd, /* .no_op_offload= */ nopo, + /* .graph_reuse = */ gr, }; instances.push_back(instance); } @@ -1092,6 +1109,7 @@ static std::vector get_cmd_params_instances(const cmd_param /* .use_mmap = */ mmp, /* .embeddings = */ embd, /* .no_op_offload= */ nopo, + /* .graph_reuse = */ gr, }; instances.push_back(instance); } @@ -1125,6 +1143,7 @@ static std::vector get_cmd_params_instances(const cmd_param /* .use_mmap = */ mmp, /* .embeddings = */ embd, /* .no_op_offload= */ nopo, + /* .graph_reuse = */ gr, }; instances.push_back(instance); } @@ -1162,6 +1181,7 @@ struct test { bool use_mmap; bool embeddings; bool no_op_offload; + bool graph_reuse; int n_prompt; int n_gen; int n_depth; @@ -1197,6 +1217,7 @@ struct test { use_mmap = inst.use_mmap; embeddings = inst.embeddings; no_op_offload = inst.no_op_offload; + graph_reuse = inst.graph_reuse; n_prompt = inst.n_prompt; n_gen = inst.n_gen; n_depth = inst.n_depth; @@ -1243,8 +1264,8 @@ struct test { "cpu_mask", "cpu_strict", "poll", "type_k", "type_v", "n_gpu_layers", "split_mode", "main_gpu", "no_kv_offload", "flash_attn", "tensor_split", "tensor_buft_overrides", "defrag_thold", - "use_mmap", "embeddings", "no_op_offload", "n_prompt", "n_gen", "n_depth", "test_time", - "avg_ns", "stddev_ns", "avg_ts", "stddev_ts", + "use_mmap", "embeddings", "no_op_offload", "graph_reuse", "n_prompt", "n_gen", "n_depth", + "test_time", "avg_ns", "stddev_ns", "avg_ts", "stddev_ts", }; return fields; } @@ -1259,7 +1280,7 @@ struct test { return INT; } if (field == "f16_kv" || field == "no_kv_offload" || field == "cpu_strict" || field == "flash_attn" || - field == "use_mmap" || field == "embeddings") { + field == "use_mmap" || field == "embeddings" || field == "graph_reuse") { return BOOL; } if (field == "avg_ts" || field == "stddev_ts" || field == "defrag_thold") { @@ -1333,6 +1354,7 @@ struct test { std::to_string(use_mmap), std::to_string(embeddings), std::to_string(no_op_offload), + std::to_string(graph_reuse), std::to_string(n_prompt), std::to_string(n_gen), std::to_string(n_depth), @@ -1518,6 +1540,9 @@ struct markdown_printer : public printer { if (field == "no_op_offload") { return 4; } + if (field == "graph_reuse") { + return 4; + } int width = std::max((int) field.length(), 10); @@ -1552,6 +1577,9 @@ struct markdown_printer : public printer { if (field == "no_op_offload") { return "nopo"; } + if (field == "graph_reuse") { + return "gr"; + } if (field == "tensor_split") { return "ts"; } @@ -1626,6 +1654,9 @@ struct markdown_printer : public printer { if (params.no_op_offload.size() > 1 || params.no_op_offload != cmd_params_defaults.no_op_offload) { fields.emplace_back("no_op_offload"); } + if (params.graph_reuse.size() > 1 || params.graph_reuse != cmd_params_defaults.graph_reuse) { + fields.emplace_back("graph_reuse"); + } fields.emplace_back("test"); fields.emplace_back("t/s"); From 5143876791d7ebec40b0e4df5cf61330415154c4 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 8 Jul 2025 20:26:57 +0300 Subject: [PATCH 3/6] cont : remove the parameter and the sched resets ggml-ci --- common/arg.cpp | 8 ------- common/common.cpp | 1 - common/common.h | 1 - src/llama-context.cpp | 21 +++++++----------- 
src/llama-kv-cache-unified.cpp | 1 + tools/llama-bench/llama-bench.cpp | 37 +++---------------------------- 6 files changed, 12 insertions(+), 57 deletions(-) diff --git a/common/arg.cpp b/common/arg.cpp index d4a3f5b7f21ec..40af7e574830f 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -1464,14 +1464,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex params.swa_full = true; } ).set_env("LLAMA_ARG_SWA_FULL")); - add_opt(common_arg( - {"--graph-reuse", "-gr"}, - string_format("reuse previous compute graphs when possible (default: %s)" - "[(more info)](https://github.com/ggml-org/llama.cpp/pull/14482)", params.graph_reuse ? "true" : "false"), - [](common_params & params) { - params.graph_reuse = true; - } - ).set_env("LLAMA_ARG_GRAPH_REUSE")); add_opt(common_arg( {"--no-context-shift"}, string_format("disables context shift on infinite text generation (default: %s)", params.ctx_shift ? "disabled" : "enabled"), diff --git a/common/common.cpp b/common/common.cpp index c7c163404a2af..e4e71ad13fb59 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -1157,7 +1157,6 @@ struct llama_context_params common_context_params_to_llama(const common_params & cparams.no_perf = params.no_perf; cparams.op_offload = !params.no_op_offload; cparams.swa_full = params.swa_full; - cparams.graph_reuse = params.graph_reuse; cparams.type_k = params.cache_type_k; cparams.type_v = params.cache_type_v; diff --git a/common/common.h b/common/common.h index 465d05672d2a6..8922090e7b10d 100644 --- a/common/common.h +++ b/common/common.h @@ -330,7 +330,6 @@ struct common_params { bool no_perf = false; // disable performance metrics bool ctx_shift = true; // context shift on inifinite text generation bool swa_full = false; // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055) - bool graph_reuse = false; // reuse previous compute graphs when possible bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix bool use_mmap = true; // use mmap for faster loads diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 8a1e2a86f9709..b3c94da597f2c 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -718,7 +718,14 @@ llm_graph_result_i * llama_context::process_ubatch(const llama_ubatch & ubatch, } } - res->set_inputs(&ubatch); + // set the input data for the input tensors + { + //const auto t_start_us = ggml_time_us(); + + res->set_inputs(&ubatch); + + //LLAMA_LOG_INFO("graph set inputs time: %.3f ms\n", (ggml_time_us() - t_start_us)/1000.0); + } const auto status = graph_compute(res->get_gf(), ubatch.n_tokens > 1); if (status != GGML_STATUS_SUCCESS) { @@ -852,12 +859,6 @@ int llama_context::encode(const llama_batch & batch_inp) { } } - // Reset state for the next token before backend sync, to allow the CPU activities in the reset to - // overlap with device computation. - if (!cparams.graph_reuse) { - ggml_backend_sched_reset(sched.get()); - } - // TODO: hacky solution if (model.arch == LLM_ARCH_T5 && t_embd) { //cross.t_embd = t_embd; @@ -1197,12 +1198,6 @@ int llama_context::decode(const llama_batch & batch_inp) { // wait for the computation to finish (automatically done when obtaining the model output) //synchronize(); - // Reset state for the next token before backend sync, to allow the CPU activities in the reset to - // overlap with device computation. 
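Note: even with the option removed, decode graphs that go through the unified KV cache are only reported as reusable while the temporary LLAMA_SET_ROWS path is enabled (see the get_supports_set_rows() checks added in the first patch). A sketch of guarding for that from application code, assuming a POSIX environment where setenv() is available:

    #include <cstdlib>
    #include <cstdio>

    // Must run before the llama_context (and its KV cache) is created, because
    // the unified KV cache reads LLAMA_SET_ROWS in its constructor.
    static void enable_set_rows_for_graph_reuse() {
        const char * v = std::getenv("LLAMA_SET_ROWS");
        if (!v || std::atoi(v) == 0) {
            setenv("LLAMA_SET_ROWS", "1", /*overwrite =*/ 1); // POSIX-only
            printf("LLAMA_SET_ROWS=1 set so that decode graphs can be reused\n");
        }
    }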
- if (!cparams.graph_reuse) { - ggml_backend_sched_reset(sched.get()); - } - return 0; } diff --git a/src/llama-kv-cache-unified.cpp b/src/llama-kv-cache-unified.cpp index e91b92b6095e1..246eaec164a11 100644 --- a/src/llama-kv-cache-unified.cpp +++ b/src/llama-kv-cache-unified.cpp @@ -963,6 +963,7 @@ void llama_kv_cache_unified::set_input_kq_mask(ggml_tensor * dst, const llama_ub // xxxxx----- // xxxxx----- // To visualize the mask, see https://github.com/ggml-org/llama.cpp/pull/12615 + // TODO: optimize this section for (uint32_t h = 0; h < 1; ++h) { for (uint32_t i = 0; i < n_tokens; ++i) { const llama_seq_id seq_id = ubatch->seq_id[i][0]; diff --git a/tools/llama-bench/llama-bench.cpp b/tools/llama-bench/llama-bench.cpp index ad74dbaf27a11..b80e984d0245b 100644 --- a/tools/llama-bench/llama-bench.cpp +++ b/tools/llama-bench/llama-bench.cpp @@ -261,7 +261,6 @@ struct cmd_params { std::vector use_mmap; std::vector embeddings; std::vector no_op_offload; - std::vector graph_reuse; ggml_numa_strategy numa; int reps; ggml_sched_priority prio; @@ -299,7 +298,6 @@ static const cmd_params cmd_params_defaults = { /* use_mmap */ { true }, /* embeddings */ { false }, /* no_op_offload */ { false }, - /* graph_reuse */ { false }, /* numa */ GGML_NUMA_STRATEGY_DISABLED, /* reps */ 5, /* prio */ GGML_SCHED_PRIO_NORMAL, @@ -379,7 +377,6 @@ static void print_usage(int /* argc */, char ** argv) { printf(" -ot --override-tensors =;...\n"); printf(" (default: disabled)\n"); printf(" -nopo, --no-op-offload <0|1> (default: 0)\n"); - printf(" -gr, --graph-reuse <0|1> (default: 0)\n"); printf("\n"); printf( "Multiple values can be given for each parameter by separating them with ','\n" @@ -623,13 +620,6 @@ static cmd_params parse_cmd_params(int argc, char ** argv) { } auto p = string_split(argv[i], split_delim); params.no_kv_offload.insert(params.no_kv_offload.end(), p.begin(), p.end()); - } else if (arg == "-gr" || arg == "--graph-reuse") { - if (++i >= argc) { - invalid_param = true; - break; - } - auto p = string_split(argv[i], split_delim); - params.graph_reuse.insert(params.graph_reuse.end(), p.begin(), p.end()); } else if (arg == "--numa") { if (++i >= argc) { invalid_param = true; @@ -895,9 +885,6 @@ static cmd_params parse_cmd_params(int argc, char ** argv) { if (params.no_op_offload.empty()) { params.no_op_offload = cmd_params_defaults.no_op_offload; } - if (params.graph_reuse.empty()) { - params.graph_reuse = cmd_params_defaults.graph_reuse; - } if (params.n_threads.empty()) { params.n_threads = cmd_params_defaults.n_threads; } @@ -939,7 +926,6 @@ struct cmd_params_instance { bool use_mmap; bool embeddings; bool no_op_offload; - bool graph_reuse; llama_model_params to_llama_mparams() const { llama_model_params mparams = llama_model_default_params(); @@ -1012,7 +998,6 @@ struct cmd_params_instance { cparams.embeddings = embeddings; cparams.op_offload = !no_op_offload; cparams.swa_full = false; - cparams.graph_reuse = graph_reuse; return cparams; } @@ -1033,7 +1018,6 @@ static std::vector get_cmd_params_instances(const cmd_param for (const auto & mmp : params.use_mmap) for (const auto & embd : params.embeddings) for (const auto & nopo : params.no_op_offload) - for (const auto & gr : params.graph_reuse) for (const auto & nb : params.n_batch) for (const auto & nub : params.n_ubatch) for (const auto & tk : params.type_k) @@ -1075,7 +1059,6 @@ static std::vector get_cmd_params_instances(const cmd_param /* .use_mmap = */ mmp, /* .embeddings = */ embd, /* .no_op_offload= */ nopo, - /* .graph_reuse = */ gr, }; 
instances.push_back(instance); } @@ -1109,7 +1092,6 @@ static std::vector get_cmd_params_instances(const cmd_param /* .use_mmap = */ mmp, /* .embeddings = */ embd, /* .no_op_offload= */ nopo, - /* .graph_reuse = */ gr, }; instances.push_back(instance); } @@ -1143,7 +1125,6 @@ static std::vector get_cmd_params_instances(const cmd_param /* .use_mmap = */ mmp, /* .embeddings = */ embd, /* .no_op_offload= */ nopo, - /* .graph_reuse = */ gr, }; instances.push_back(instance); } @@ -1181,7 +1162,6 @@ struct test { bool use_mmap; bool embeddings; bool no_op_offload; - bool graph_reuse; int n_prompt; int n_gen; int n_depth; @@ -1217,7 +1197,6 @@ struct test { use_mmap = inst.use_mmap; embeddings = inst.embeddings; no_op_offload = inst.no_op_offload; - graph_reuse = inst.graph_reuse; n_prompt = inst.n_prompt; n_gen = inst.n_gen; n_depth = inst.n_depth; @@ -1264,8 +1243,8 @@ struct test { "cpu_mask", "cpu_strict", "poll", "type_k", "type_v", "n_gpu_layers", "split_mode", "main_gpu", "no_kv_offload", "flash_attn", "tensor_split", "tensor_buft_overrides", "defrag_thold", - "use_mmap", "embeddings", "no_op_offload", "graph_reuse", "n_prompt", "n_gen", "n_depth", - "test_time", "avg_ns", "stddev_ns", "avg_ts", "stddev_ts", + "use_mmap", "embeddings", "no_op_offload", "n_prompt", "n_gen", "n_depth", "test_time", + "avg_ns", "stddev_ns", "avg_ts", "stddev_ts", }; return fields; } @@ -1280,7 +1259,7 @@ struct test { return INT; } if (field == "f16_kv" || field == "no_kv_offload" || field == "cpu_strict" || field == "flash_attn" || - field == "use_mmap" || field == "embeddings" || field == "graph_reuse") { + field == "use_mmap" || field == "embeddings") { return BOOL; } if (field == "avg_ts" || field == "stddev_ts" || field == "defrag_thold") { @@ -1354,7 +1333,6 @@ struct test { std::to_string(use_mmap), std::to_string(embeddings), std::to_string(no_op_offload), - std::to_string(graph_reuse), std::to_string(n_prompt), std::to_string(n_gen), std::to_string(n_depth), @@ -1540,9 +1518,6 @@ struct markdown_printer : public printer { if (field == "no_op_offload") { return 4; } - if (field == "graph_reuse") { - return 4; - } int width = std::max((int) field.length(), 10); @@ -1577,9 +1552,6 @@ struct markdown_printer : public printer { if (field == "no_op_offload") { return "nopo"; } - if (field == "graph_reuse") { - return "gr"; - } if (field == "tensor_split") { return "ts"; } @@ -1654,9 +1626,6 @@ struct markdown_printer : public printer { if (params.no_op_offload.size() > 1 || params.no_op_offload != cmd_params_defaults.no_op_offload) { fields.emplace_back("no_op_offload"); } - if (params.graph_reuse.size() > 1 || params.graph_reuse != cmd_params_defaults.graph_reuse) { - fields.emplace_back("graph_reuse"); - } fields.emplace_back("test"); fields.emplace_back("t/s"); From ee7c644f82d4cc327d184ddf898b0da92b1ef05d Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 8 Jul 2025 20:30:32 +0300 Subject: [PATCH 4/6] graph : rename update() to can_reuse() ggml-ci --- include/llama.h | 2 -- src/llama-context.cpp | 10 ++++------ src/llama-cparams.cpp | 1 - src/llama-cparams.h | 1 - src/llama-graph.cpp | 10 +++++----- src/llama-graph.h | 20 ++++++++++---------- 6 files changed, 19 insertions(+), 25 deletions(-) diff --git a/include/llama.h b/include/llama.h index 1e8228997f1d8..af1765b53b7e2 100644 --- a/include/llama.h +++ b/include/llama.h @@ -374,8 +374,6 @@ extern "C" { bool swa_full; // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055) // NOTE: setting to false 
when n_seq_max > 1 can cause bad performance in some cases // ref: https://github.com/ggml-org/llama.cpp/pull/13845#issuecomment-2924800573 - - bool graph_reuse; // reuse previous compute graphs when possible }; // model quantization parameters diff --git a/src/llama-context.cpp b/src/llama-context.cpp index b3c94da597f2c..7ec6e24a94ada 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -101,8 +101,7 @@ llama_context::llama_context( cparams.n_ubatch = std::min(cparams.n_batch, params.n_ubatch == 0 ? params.n_batch : params.n_ubatch); - cparams.op_offload = params.op_offload; - cparams.graph_reuse = params.graph_reuse; + cparams.op_offload = params.op_offload; const uint32_t n_ctx_per_seq = cparams.n_ctx / cparams.n_seq_max; @@ -689,9 +688,9 @@ llm_graph_result_i * llama_context::process_ubatch(const llama_ubatch & ubatch, // in order to correctly reuse a graph, it's full topology has to be uniquely determined by these parameters const auto gparams = graph_params(res, ubatch, mctx, gtype); - const bool can_reuse = cparams.graph_reuse && res->update(gparams); - if (can_reuse) { - LLAMA_LOG_DEBUG("%s: reusing previous graph\n", __func__); + if (res->can_reuse(gparams)) { + //LLAMA_LOG_DEBUG("%s: reusing previous graph\n", __func__); + n_reused++; } else { res->reset(); @@ -2186,7 +2185,6 @@ llama_context_params llama_context_default_params() { /*.no_perf =*/ true, /*.op_offload =*/ true, /*.swa_full =*/ true, - /*.graph_reuse =*/ false, }; return result; diff --git a/src/llama-cparams.cpp b/src/llama-cparams.cpp index f81cd10823d3b..2abe8feb0cd99 100644 --- a/src/llama-cparams.cpp +++ b/src/llama-cparams.cpp @@ -27,7 +27,6 @@ bool llama_cparams::is_same(const llama_cparams & other) const { no_perf == other.no_perf && warmup == other.warmup && op_offload == other.op_offload && - graph_reuse == other.graph_reuse && pooling_type == other.pooling_type && cb_eval == other.cb_eval && cb_eval_user_data == other.cb_eval_user_data; diff --git a/src/llama-cparams.h b/src/llama-cparams.h index 064767d51d92c..2c38f92e423a3 100644 --- a/src/llama-cparams.h +++ b/src/llama-cparams.h @@ -33,7 +33,6 @@ struct llama_cparams { bool no_perf; bool warmup; bool op_offload; - bool graph_reuse; enum llama_pooling_type pooling_type; diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp index 264491b4de83a..99c6fe0e0353a 100644 --- a/src/llama-graph.cpp +++ b/src/llama-graph.cpp @@ -28,7 +28,7 @@ void llm_graph_input_embd::set_input(const llama_ubatch * ubatch) { } } -bool llm_graph_input_embd::update(const llm_graph_params & params) { +bool llm_graph_input_embd::can_reuse(const llm_graph_params & params) { bool res = true; res &= (!tokens && !params.ubatch.token) || (tokens && tokens->ne[0] == params.ubatch.n_tokens); @@ -59,7 +59,7 @@ void llm_graph_input_pos::set_input(const llama_ubatch * ubatch) { } } -bool llm_graph_input_pos::update(const llm_graph_params & params) { +bool llm_graph_input_pos::can_reuse(const llm_graph_params & params) { bool res = true; res &= pos->ne[0] == params.ubatch.n_tokens; @@ -135,7 +135,7 @@ void llm_graph_input_out_ids::set_input(const llama_ubatch * ubatch) { } } -bool llm_graph_input_out_ids::update(const llm_graph_params & params) { +bool llm_graph_input_out_ids::can_reuse(const llm_graph_params & params) { bool res = true; res &= n_outputs == params.n_outputs; @@ -312,7 +312,7 @@ void llm_graph_input_attn_kv_unified::set_input(const llama_ubatch * ubatch) { mctx->set_input_kq_mask(self_kq_mask, ubatch, cparams.causal_attn); } -bool 
llm_graph_input_attn_kv_unified::update(const llm_graph_params & params) { +bool llm_graph_input_attn_kv_unified::can_reuse(const llm_graph_params & params) { const auto * mctx = static_cast(params.mctx); this->mctx = mctx; @@ -342,7 +342,7 @@ void llm_graph_input_attn_kv_unified_iswa::set_input(const llama_ubatch * ubatch mctx->get_swa()->set_input_kq_mask(self_kq_mask_swa, ubatch, cparams.causal_attn); } -bool llm_graph_input_attn_kv_unified_iswa::update(const llm_graph_params & params) { +bool llm_graph_input_attn_kv_unified_iswa::can_reuse(const llm_graph_params & params) { const auto * mctx = static_cast(params.mctx); this->mctx = mctx; diff --git a/src/llama-graph.h b/src/llama-graph.h index ae2fd6481b609..49a3835e90492 100644 --- a/src/llama-graph.h +++ b/src/llama-graph.h @@ -83,7 +83,7 @@ class llm_graph_input_i { // return true if the resulting input tensors using the provided graph parameters would be // the same as the previous input tensors that we have currently stored in the object - virtual bool update(const llm_graph_params & params) { + virtual bool can_reuse(const llm_graph_params & params) { // returning false here by default will prevent from reusing the graph if the check // for the input type has not been implemented yet GGML_UNUSED(params); @@ -100,7 +100,7 @@ class llm_graph_input_embd : public llm_graph_input_i { void set_input(const llama_ubatch * ubatch) override; - bool update(const llm_graph_params & params) override; + bool can_reuse(const llm_graph_params & params) override; ggml_tensor * tokens = nullptr; // I32 [n_batch] ggml_tensor * embd = nullptr; // F32 [n_embd, n_batch] @@ -113,7 +113,7 @@ class llm_graph_input_pos : public llm_graph_input_i { void set_input(const llama_ubatch * ubatch) override; - bool update(const llm_graph_params & params) override; + bool can_reuse(const llm_graph_params & params) override; ggml_tensor * pos = nullptr; // I32 [n_batch] @@ -173,7 +173,7 @@ class llm_graph_input_out_ids : public llm_graph_input_i { void set_input(const llama_ubatch * ubatch) override; - bool update(const llm_graph_params & params) override; + bool can_reuse(const llm_graph_params & params) override; ggml_tensor * out_ids; // I32 [n_outputs] @@ -265,7 +265,7 @@ class llm_graph_input_attn_kv_unified : public llm_graph_input_i { void set_input(const llama_ubatch * ubatch) override; - bool update(const llm_graph_params & params) override; + bool can_reuse(const llm_graph_params & params) override; ggml_tensor * get_k_idxs() const { return self_k_idxs; } ggml_tensor * get_v_idxs() const { return self_v_idxs; } @@ -298,7 +298,7 @@ class llm_graph_input_attn_kv_unified_iswa : public llm_graph_input_i { void set_input(const llama_ubatch * ubatch) override; - bool update(const llm_graph_params & params) override; + bool can_reuse(const llm_graph_params & params) override; ggml_tensor * get_k_idxs() const { return self_k_idxs; } ggml_tensor * get_v_idxs() const { return self_v_idxs; } @@ -410,7 +410,7 @@ class llm_graph_result_i { virtual void set_inputs(const llama_ubatch * ubatch) = 0; - virtual bool update(const llm_graph_params & params) = 0; + virtual bool can_reuse(const llm_graph_params & params) = 0; }; using llm_graph_result_ptr = std::unique_ptr; @@ -504,12 +504,12 @@ class llm_graph_result : public llm_graph_result_i { } } - // try to update the existing graph result using the new graph parameters + // try to update the existing graph result using the new graph parameters in order to reuse it // this can only be done if we determine that the 
resulting graph using the new graph parameters // would be identical to the existing graph. in that case, we simply have to update the memory // contexts of the input tensors of the graph and we can reuse it for another computation // return true if the graph was updated and can be reused - bool update(const llm_graph_params & params) override { + bool can_reuse(const llm_graph_params & params) override { if (!this->params.is_same(params)) { return false; } @@ -517,7 +517,7 @@ class llm_graph_result : public llm_graph_result_i { bool res = true; for (auto & input : inputs) { - res &= input->update(params); + res &= input->can_reuse(params); } return res; From 600e69fdbb921b8f84c718e39d5c578f536cd638 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 8 Jul 2025 20:58:08 +0300 Subject: [PATCH 5/6] params : remove is_same() ggml-ci --- src/llama-batch.h | 25 ------------------------- src/llama-cparams.cpp | 28 ---------------------------- src/llama-cparams.h | 2 -- src/llama-graph.h | 37 ++++++++++++++++++++++++++++++++----- src/llama-hparams.cpp | 9 --------- src/llama-hparams.h | 2 -- 6 files changed, 32 insertions(+), 71 deletions(-) diff --git a/src/llama-batch.h b/src/llama-batch.h index b27134c9331a9..3420803ff9469 100644 --- a/src/llama-batch.h +++ b/src/llama-batch.h @@ -34,31 +34,6 @@ struct llama_ubatch { llama_seq_id * seq_id_unq; // [n_seqs_unq] | s | seq_id int32_t * seq_idx; // [LLAMA_MAX_SEQ] | - | seq_idx int8_t * output; // [n_tokens] | i | - - - bool is_same(const llama_ubatch & other) const { - bool res = - equal_seqs == other.equal_seqs && - n_tokens == other.n_tokens && - n_seq_tokens == other.n_seq_tokens && - n_seqs == other.n_seqs && - n_seqs_unq == other.n_seqs_unq && - ( - (!token && !other.token) || - (!embd && !other.embd) - ); - - if (!res) { - return false; - } - - // TODO: this won't work because seq_id_unq ptr can point to an old balloc that has - // been freed by this point. 
find a way to fix this - //for (uint32_t s = 0; s < n_seqs_unq; ++s) { - // res &= seq_id_unq[s] == other.seq_id_unq[s]; - //} - - return res; - } }; // a helper for sanitizing, fulfilling and splitting a batch diff --git a/src/llama-cparams.cpp b/src/llama-cparams.cpp index 2abe8feb0cd99..a3e7a37ee36d7 100644 --- a/src/llama-cparams.cpp +++ b/src/llama-cparams.cpp @@ -3,31 +3,3 @@ size_t llama_max_parallel_sequences(void) { return LLAMA_MAX_SEQ; } - -bool llama_cparams::is_same(const llama_cparams & other) const { - return - n_ctx == other.n_ctx && - n_batch == other.n_batch && - n_ubatch == other.n_ubatch && - n_seq_max == other.n_seq_max && - n_threads == other.n_threads && - n_threads_batch == other.n_threads_batch && - rope_freq_base == other.rope_freq_base && - rope_freq_scale == other.rope_freq_scale && - n_ctx_orig_yarn == other.n_ctx_orig_yarn && - yarn_ext_factor == other.yarn_ext_factor && - yarn_attn_factor == other.yarn_attn_factor && - yarn_beta_fast == other.yarn_beta_fast && - yarn_beta_slow == other.yarn_beta_slow && - defrag_thold == other.defrag_thold && - embeddings == other.embeddings && - causal_attn == other.causal_attn && - offload_kqv == other.offload_kqv && - flash_attn == other.flash_attn && - no_perf == other.no_perf && - warmup == other.warmup && - op_offload == other.op_offload && - pooling_type == other.pooling_type && - cb_eval == other.cb_eval && - cb_eval_user_data == other.cb_eval_user_data; -} diff --git a/src/llama-cparams.h b/src/llama-cparams.h index 2c38f92e423a3..118615d5bd2d5 100644 --- a/src/llama-cparams.h +++ b/src/llama-cparams.h @@ -38,6 +38,4 @@ struct llama_cparams { ggml_backend_sched_eval_callback cb_eval; void * cb_eval_user_data; - - bool is_same(const llama_cparams & other) const; }; diff --git a/src/llama-graph.h b/src/llama-graph.h index 49a3835e90492..2e73681b90e44 100644 --- a/src/llama-graph.h +++ b/src/llama-graph.h @@ -443,11 +443,38 @@ struct llm_graph_params { // TODO: temporary llm_graph_result_i * res; - bool is_same(const llm_graph_params & other) const { + // return true if the "other" params would result in a graph with the same topology as with the current params + // having the same topology allows us to reuse the graph in some cases + bool allow_reuse(const llm_graph_params & other) const { + // first check the ubatch + bool can_reuse_ubatch = + ubatch.equal_seqs == other.ubatch.equal_seqs && + ubatch.n_tokens == other.ubatch.n_tokens && + ubatch.n_seq_tokens == other.ubatch.n_seq_tokens && + ubatch.n_seqs == other.ubatch.n_seqs && + ubatch.n_seqs_unq == other.ubatch.n_seqs_unq && + ( + (!ubatch.token && !other.ubatch.token) || + (!ubatch.embd && !other.ubatch.embd) + ); + + // TODO: this won't work because seq_id_unq ptr can point to an old balloc that has + // been freed by this point. 
find a way to fix this + //for (uint32_t s = 0; s < n_seqs_unq; ++s) { + // can_reuse_ubatch &= seq_id_unq[s] == other.seq_id_unq[s]; + //} + + // for now conservatively disallow, until the issue above is resolved + // ref: https://github.com/ggml-org/llama.cpp/pull/14363 + can_reuse_ubatch = can_reuse_ubatch && !ubatch.equal_seqs; + + if (!can_reuse_ubatch) { + return false; + } + return - hparams.is_same(other.hparams) && - cparams.is_same(other.cparams) && - ubatch .is_same(other.ubatch) && + cparams.embeddings == other.cparams.embeddings && + cparams.causal_attn == other.cparams.causal_attn && arch == other.arch && gtype == other.gtype && cvec == other.cvec && @@ -510,7 +537,7 @@ class llm_graph_result : public llm_graph_result_i { // contexts of the input tensors of the graph and we can reuse it for another computation // return true if the graph was updated and can be reused bool can_reuse(const llm_graph_params & params) override { - if (!this->params.is_same(params)) { + if (!this->params.allow_reuse(params)) { return false; } diff --git a/src/llama-hparams.cpp b/src/llama-hparams.cpp index a59e96a92fe7e..86c814d51b901 100644 --- a/src/llama-hparams.cpp +++ b/src/llama-hparams.cpp @@ -102,12 +102,3 @@ bool llama_hparams::is_swa(uint32_t il) const { GGML_ABORT("fatal error"); } - -bool llama_hparams::is_same(const llama_hparams & other) const { - return - n_ctx_train == other.n_ctx_train && - n_embd == other.n_embd && - n_layer == other.n_layer && - n_expert == other.n_expert && - n_expert_used == other.n_expert_used; -} diff --git a/src/llama-hparams.h b/src/llama-hparams.h index dfddb4f3d9534..476d0a5eade28 100644 --- a/src/llama-hparams.h +++ b/src/llama-hparams.h @@ -202,8 +202,6 @@ struct llama_hparams { uint32_t n_pos_per_embd() const; bool is_swa(uint32_t il) const; - - bool is_same(const llama_hparams & other) const; }; static_assert(std::is_trivially_copyable::value, "llama_hparams must be trivially copyable"); From a28677d170f10ef713c6f8f8b416606e023536da Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 9 Jul 2025 17:08:32 +0300 Subject: [PATCH 6/6] graph : set res->params in llm_graph_context constructor ggml-ci --- src/llama-graph.cpp | 1 + src/llama-model.cpp | 3 --- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp index 99c6fe0e0353a..5caa24c4e50a1 100644 --- a/src/llama-graph.cpp +++ b/src/llama-graph.cpp @@ -471,6 +471,7 @@ llm_graph_context::llm_graph_context(const llm_graph_params & params) : cb_func (params.cb), res (static_cast(params.res)), ctx0 (res->get_ctx()) { + res->params = params; } void llm_graph_context::cb(ggml_tensor * cur, const char * name, int il) const { diff --git a/src/llama-model.cpp b/src/llama-model.cpp index caab6a0d270be..bdff24c3a4cfc 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -15047,9 +15047,6 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const { // add on pooling layer llm->build_pooling(gf, cls, cls_b, cls_out, cls_out_b); - // TODO: updating the graph parameters here is a little bit obscure - figure out something better - llm->res->params = params; - return llm->res->get_gf(); }
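To see how the pieces of this series fit together: the context keeps the previously built llm_graph_result, derives the new graph parameters for the incoming ubatch, and rebuilds the graph only when those parameters no longer allow reuse; once the later patches drop the public graph_reuse flag, this path is taken unconditionally. A simplified sketch of the decision in llama_context::process_ubatch() (names follow the patch; error handling and backend scheduling are omitted):

    // simplified sketch of the reuse decision in llama_context::process_ubatch()
    auto * res = gf_res_prev.get();

    // the full graph topology must be uniquely determined by these parameters
    const auto gparams = graph_params(res, ubatch, mctx, gtype);

    if (res->can_reuse(gparams)) {
        // same topology as before: keep the graph, only its inputs are refreshed
        n_reused++;
    } else {
        // topology changed: rebuild the graph from scratch
        res->reset();
        // ... graph construction and scheduler allocation follow ...
    }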
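llm_graph_result::can_reuse() first compares the stored graph parameters against the new ones and then delegates to every stored input via llm_graph_input_i::can_reuse(); the base class returns false by default, so reuse is enabled per input type only once its check has been implemented. The position input from the patch is the smallest example of such an override (comment added here for illustration):

    // per-input reuse check for the position input (src/llama-graph.cpp)
    bool llm_graph_input_pos::can_reuse(const llm_graph_params & params) {
        bool res = true;

        // the stored tensor must match the token count of the new ubatch
        res &= pos->ne[0] == params.ubatch.n_tokens;

        return res;
    }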
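The graph-level check itself lives in llm_graph_params::allow_reuse(), which replaces the earlier is_same() helpers on hparams/cparams/ubatch with a comparison of only the fields that affect topology, and conservatively refuses to reuse when ubatch.equal_seqs is set, because the per-sequence ids cannot be compared safely yet (ref: https://github.com/ggml-org/llama.cpp/pull/14363). A condensed version of that check, omitting several of the compared members (cvec, loras, memory context, and others):

    // condensed from llm_graph_params::allow_reuse() in src/llama-graph.h
    bool can_reuse_ubatch =
        ubatch.n_tokens     == other.ubatch.n_tokens     &&
        ubatch.n_seq_tokens == other.ubatch.n_seq_tokens &&
        ubatch.n_seqs       == other.ubatch.n_seqs       &&
        ((!ubatch.token && !other.ubatch.token) ||
         (!ubatch.embd  && !other.ubatch.embd));

    // conservative: seq_id_unq may point into a freed balloc, so equal_seqs batches are not reused
    can_reuse_ubatch = can_reuse_ubatch && !ubatch.equal_seqs;

    return can_reuse_ubatch                                &&
           cparams.embeddings  == other.cparams.embeddings &&
           cparams.causal_attn == other.cparams.causal_attn &&
           arch  == other.arch                             &&
           gtype == other.gtype;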