Commit f61b0f7

llama : reuse compute graphs
ggml-ci
1 parent 4534123 commit f61b0f7

File tree: 8 files changed, +296 -93 lines
src/llama-context.cpp

Lines changed: 56 additions & 58 deletions
@@ -227,8 +227,14 @@ llama_context::llama_context(
 
     LLAMA_LOG_DEBUG("%s: max_nodes = %zu\n", __func__, max_nodes);
 
-    // buffer used to store the computation graph and the tensor meta data
-    buf_compute_meta.resize(ggml_tensor_overhead()*max_nodes + ggml_graph_overhead_custom(max_nodes, false));
+    // buffers used to store the computation graph and the tensor meta data
+    for (auto & res : gf_res) {
+        res.reset(new llm_graph_result());
+        res->reserve(max_nodes);
+    };
+
+    gf_res_reserve.reset(new llm_graph_result());
+    gf_res_reserve->reserve(max_nodes);
 
     // TODO: move these checks to ggml_backend_sched
     // enabling pipeline parallelism in the scheduler increases memory usage, so it is only done when necessary
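Aside, not part of the diff: the removed buf_compute_meta line sized one metadata buffer for the whole graph using the real ggml helpers ggml_tensor_overhead() and ggml_graph_overhead_custom(). A minimal sketch of that sizing is below; whether llm_graph_result::reserve() computes the size the same way internally is an assumption.

#include "ggml.h"

#include <cstddef>
#include <cstdint>
#include <vector>

// enough meta memory for up to max_nodes tensor structs plus the cgraph bookkeeping
static std::vector<uint8_t> make_meta_buffer(size_t max_nodes) {
    const size_t size = ggml_tensor_overhead()*max_nodes
                      + ggml_graph_overhead_custom(max_nodes, /*grads =*/ false);
    return std::vector<uint8_t>(size);
}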
@@ -388,10 +394,6 @@ ggml_backend_sched_t llama_context::get_sched() const {
     return sched.get();
 }
 
-ggml_context * llama_context::get_ctx_compute() const {
-    return ctx_compute.get();
-}
-
 uint32_t llama_context::n_ctx() const {
     return cparams.n_ctx;
 }
@@ -678,36 +680,40 @@ bool llama_context::apply_adapter_cvec(
     return cvec.apply(model, data, len, n_embd, il_start, il_end);
 }
 
-llm_graph_result_ptr llama_context::process_ubatch(const llama_ubatch & ubatch, llm_graph_type gtype, llama_memory_context_i * mctx, ggml_status & ret) {
+llm_graph_result_i * llama_context::process_ubatch(const llama_ubatch & ubatch, llm_graph_type gtype, llama_memory_context_i * mctx, ggml_status & ret) {
     if (mctx && !mctx->apply()) {
         LLAMA_LOG_ERROR("%s: failed to apply memory context\n", __func__);
         ret = GGML_STATUS_FAILED;
         return nullptr;
     }
 
-    auto * gf = graph_init();
+    gf_res_next()->init();
+
+    auto * gf = gf_res_cur()->get_gf();
     if (!gf) {
         LLAMA_LOG_ERROR("%s: failed to initialize graph\n", __func__);
         ret = GGML_STATUS_FAILED;
         return nullptr;
     }
 
-    auto res = graph_build(ctx_compute.get(), gf, ubatch, gtype, mctx);
-    if (!res) {
-        LLAMA_LOG_ERROR("%s: failed to build graph\n", __func__);
-        ret = GGML_STATUS_FAILED;
-        return nullptr;
-    }
+    const bool can_reuse = graph_build(gf_res_cur(), gf_res_prv(), ubatch, gtype, mctx);
+    if (can_reuse) {
+        LLAMA_LOG_DEBUG("%s: reusing previous graph\n", __func__);
+        gf_res_next()->update(mctx);
+    } else {
+        // LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs);
 
-    // LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs);
+        ggml_backend_sched_reset(sched.get());
+        ggml_backend_sched_set_eval_callback(sched.get(), cparams.cb_eval, cparams.cb_eval_user_data);
 
-    if (!ggml_backend_sched_alloc_graph(sched.get(), gf)) {
-        LLAMA_LOG_ERROR("%s: failed to allocate graph\n", __func__);
-        ret = GGML_STATUS_ALLOC_FAILED;
-        return nullptr;
+        if (!ggml_backend_sched_alloc_graph(sched.get(), gf)) {
+            LLAMA_LOG_ERROR("%s: failed to allocate graph\n", __func__);
+            ret = GGML_STATUS_ALLOC_FAILED;
+            return nullptr;
+        }
     }
 
-    res->set_inputs(&ubatch);
+    gf_res_cur()->set_inputs(&ubatch);
 
     const auto status = graph_compute(gf, ubatch.n_tokens > 1);
     if (status != GGML_STATUS_SUCCESS) {
@@ -718,7 +724,7 @@ llm_graph_result_ptr llama_context::process_ubatch(const llama_ubatch & ubatch,
 
     ret = GGML_STATUS_SUCCESS;
 
-    return res;
+    return gf_res_cur();
 }
 
 int llama_context::encode(const llama_batch & batch_inp) {
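Aside, not part of the diff: a minimal, self-contained sketch of the control flow that process_ubatch() now follows. All names below (GraphSlot, Context, build(), process()) are hypothetical stand-ins, not llama.cpp API; the point is only the shape of the logic: rotate to the next slot, build while comparing against the previous slot, and go through the scheduler reset/alloc path only when the graph could not be reused.

#include <array>
#include <cstdio>
#include <string>

// hypothetical stand-in for llm_graph_result: remembers what was built and
// whether backend buffers are currently valid for it
struct GraphSlot {
    std::string topology;
    bool        allocated = false;
};

struct Context {
    std::array<GraphSlot, 2> slots; // ping-pong pair, like gf_res in the diff
    int cur = 0;

    GraphSlot & next()     { cur = cur == 0 ? 1 : 0; return slots[cur]; }
    GraphSlot & current()  { return slots[cur]; }
    GraphSlot & previous() { return slots[(cur + 1) % 2]; }

    // build into the current slot; report whether the previous slot already
    // holds an identical, allocated graph that can simply be reused
    bool build(const std::string & topo) {
        current().topology = topo;
        return previous().allocated && previous().topology == topo;
    }

    void process(const std::string & topo) {
        next(); // pick the slot we will build into

        const bool can_reuse = build(topo);
        if (can_reuse) {
            std::printf("reusing previous graph (%s)\n", topo.c_str());
        } else {
            std::printf("rebuilding and reallocating graph (%s)\n", topo.c_str());
            // stands in for ggml_backend_sched_reset() + ggml_backend_sched_alloc_graph()
        }
        current().allocated = true; // fresh allocation, or inherited from the previous slot
    }
};

int main() {
    Context ctx;
    ctx.process("decode: n_tokens=1"); // first ubatch: must allocate
    ctx.process("decode: n_tokens=1"); // same shape:   reused
    ctx.process("decode: n_tokens=8"); // shape change: rebuilt
}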
@@ -767,6 +773,8 @@ int llama_context::encode(const llama_batch & batch_inp) {
 
     n_outputs = n_tokens;
 
+    // TODO: when resetting the scheduler, clear prev graph buffers
+    gf_res_next()->init();
     ggml_backend_sched_reset(sched.get());
     ggml_backend_sched_set_eval_callback(sched.get(), cparams.cb_eval, cparams.cb_eval_user_data);
 
@@ -778,7 +786,7 @@ int llama_context::encode(const llama_batch & batch_inp) {
     cparams.causal_attn = false;
 
     ggml_status status;
-    const auto res = process_ubatch(ubatch, LLM_GRAPH_TYPE_ENCODER, nullptr, status);
+    const auto * res = process_ubatch(ubatch, LLM_GRAPH_TYPE_ENCODER, nullptr, status);
 
     cparams.causal_attn = causal_attn_org;
 
@@ -846,7 +854,7 @@ int llama_context::encode(const llama_batch & batch_inp) {
 
     // Reset state for the next token before backend sync, to allow the CPU activities in the reset to
     // overlap with device computation.
-    ggml_backend_sched_reset(sched.get());
+    //ggml_backend_sched_reset(sched.get());
 
     // TODO: hacky solution
     if (model.arch == LLM_ARCH_T5 && t_embd) {
@@ -1005,11 +1013,8 @@ int llama_context::decode(const llama_batch & batch_inp) {
         n_outputs = n_outputs_new;
     }
 
-    ggml_backend_sched_reset(sched.get());
-    ggml_backend_sched_set_eval_callback(sched.get(), cparams.cb_eval, cparams.cb_eval_user_data);
-
     ggml_status status;
-    const auto res = process_ubatch(ubatch, LLM_GRAPH_TYPE_DECODER, mctx.get(), status);
+    const auto * res = process_ubatch(ubatch, LLM_GRAPH_TYPE_DECODER, mctx.get(), status);
 
     if (!res) {
         // the last ubatch failed or was aborted -> remove all positions of that ubatch from the KV cache
@@ -1192,7 +1197,7 @@ int llama_context::decode(const llama_batch & batch_inp) {
 
     // Reset state for the next token before backend sync, to allow the CPU activities in the reset to
     // overlap with device computation.
-    ggml_backend_sched_reset(sched.get());
+    //ggml_backend_sched_reset(sched.get());
 
     return 0;
 }
@@ -1279,18 +1284,6 @@ int32_t llama_context::graph_max_nodes() const {
     return std::max<int32_t>(65536, 5*model.n_tensors());
 }
 
-ggml_cgraph * llama_context::graph_init() {
-    ggml_init_params params = {
-        /*.mem_size   =*/ buf_compute_meta.size(),
-        /*.mem_buffer =*/ buf_compute_meta.data(),
-        /*.no_alloc   =*/ true,
-    };
-
-    ctx_compute.reset(ggml_init(params));
-
-    return ggml_new_graph_custom(ctx_compute.get(), graph_max_nodes(), false);
-}
-
 ggml_cgraph * llama_context::graph_reserve(uint32_t n_tokens, uint32_t n_seqs, uint32_t n_outputs, const llama_memory_context_i * mctx) {
     LLAMA_LOG_DEBUG("%s: reserving a graph for ubatch with n_tokens = %4u, n_seqs = %2u, n_outputs = %4u\n", __func__, n_tokens, n_seqs, n_outputs);
 
@@ -1301,6 +1294,10 @@ ggml_cgraph * llama_context::graph_reserve(uint32_t n_tokens, uint32_t n_seqs, u
         LLAMA_LOG_DEBUG("%s: making n_tokens a multiple of n_seqs - n_tokens = %u, n_seqs = %u, n_outputs = %u\n", __func__, n_tokens, n_seqs, n_outputs);
     }
 
+    // TODO: when resetting the scheduler, clear prev graph buffers
+    gf_res_next()->init();
+    ggml_backend_sched_reset(sched.get());
+
     // store the n_outputs as it is, and restore it afterwards
     // TODO: not sure if needed, might simplify in the future by removing this
     const auto save_n_outputs = this->n_outputs;
@@ -1310,17 +1307,13 @@ ggml_cgraph * llama_context::graph_reserve(uint32_t n_tokens, uint32_t n_seqs, u
     llama_batch_allocr balloc(model.hparams.n_pos_per_embd());
     llama_ubatch ubatch = balloc.ubatch_reserve(n_tokens/n_seqs, n_seqs);
 
-    auto * gf = graph_init();
-    auto res = graph_build(ctx_compute.get(), gf, ubatch, LLM_GRAPH_TYPE_DEFAULT, mctx);
+    gf_res_reserve->init();
+    auto * gf = gf_res_reserve->get_gf();
 
-    this->n_outputs = save_n_outputs;
-
-    if (!res) {
-        LLAMA_LOG_ERROR("%s: failed to build worst-case graph\n", __func__);
-        return nullptr;
-    }
+    const bool can_reuse = graph_build(gf_res_reserve.get(), nullptr, ubatch, LLM_GRAPH_TYPE_DEFAULT, mctx);
+    GGML_ASSERT(!can_reuse); // cannot reuse reserve graphs
 
-    ggml_backend_sched_reset(sched.get());
+    this->n_outputs = save_n_outputs;
 
     // initialize scheduler with the specified graph
     if (!ggml_backend_sched_reserve(sched.get(), gf)) {
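Aside, not part of the diff: the GGML_ASSERT(!can_reuse) above holds because the reserve path passes nullptr for the previous result. A tiny hypothetical sketch of that contract (ResultSketch and build_sketch are illustration names, not llama.cpp API):

#include <cassert>
#include <string>

struct ResultSketch {
    std::string topology;
};

// reuse is only possible when a previous result exists and matches the new topology
static bool build_sketch(ResultSketch & cur, const ResultSketch * prv, const std::string & topo) {
    cur.topology = topo;
    return prv != nullptr && prv->topology == topo;
}

int main() {
    ResultSketch reserve_res;
    const bool can_reuse = build_sketch(reserve_res, /*prv =*/ nullptr, "worst-case ubatch");
    assert(!can_reuse); // mirrors GGML_ASSERT(!can_reuse) in graph_reserve()
    return 0;
}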
@@ -1331,15 +1324,17 @@ ggml_cgraph * llama_context::graph_reserve(uint32_t n_tokens, uint32_t n_seqs, u
     return gf;
 }
 
-llm_graph_result_ptr llama_context::graph_build(
-        ggml_context * ctx,
-        ggml_cgraph * gf,
+bool llama_context::graph_build(
+        llm_graph_result_i * gf_res_cur,
+        llm_graph_result_i * gf_res_prv,
         const llama_ubatch & ubatch,
         llm_graph_type gtype,
         const llama_memory_context_i * mctx) {
     return model.build_graph(
         {
-            /*.ctx         =*/ ctx,
+            /*.ctx         =*/ gf_res_cur->get_ctx(),
+            /*.gf_res_cur  =*/ static_cast<llm_graph_result *>(gf_res_cur),
+            /*.gf_res_prv  =*/ static_cast<llm_graph_result *>(gf_res_prv),
             /*.arch        =*/ model.arch,
             /*.hparams     =*/ model.hparams,
             /*.cparams     =*/ cparams,
@@ -1352,7 +1347,7 @@ llm_graph_result_ptr llama_context::graph_build(
             /*.cross       =*/ &cross,
             /*.n_outputs   =*/ n_outputs,
             /*.cb          =*/ graph_get_cb(),
-        }, gf, gtype);
+        }, gtype);
 }
 
 ggml_status llama_context::graph_compute(
@@ -2064,8 +2059,11 @@ void llama_context::opt_epoch_iter(
             break;
         }
 
-        auto * gf = graph_init();
-        auto res = graph_build(ctx_compute.get(), gf, ubatch, LLM_GRAPH_TYPE_DEFAULT, mctx.get());
+        gf_res_cur()->init();
+        auto * gf = gf_res_cur()->get_gf();
+
+        const bool can_reuse = graph_build(gf_res_cur(), nullptr, ubatch, LLM_GRAPH_TYPE_DEFAULT, mctx.get());
+        GGML_ASSERT(!can_reuse); // cannot reuse optimization graphs
 
         struct ggml_context * ctx_compute_opt;
         {
@@ -2078,10 +2076,10 @@ void llama_context::opt_epoch_iter(
            };
            ctx_compute_opt = ggml_init(params);
        }
-        ggml_opt_prepare_alloc(opt_ctx, ctx_compute_opt, gf, res->get_tokens(), res->get_logits());
+        ggml_opt_prepare_alloc(opt_ctx, ctx_compute_opt, gf, gf_res_cur()->get_tokens(), gf_res_cur()->get_logits());
         ggml_opt_alloc(opt_ctx, train);
 
-        res->set_inputs(&ubatch);
+        gf_res_cur()->set_inputs(&ubatch);
         {
             struct ggml_tensor * labels = ggml_opt_labels(opt_ctx);
             GGML_ASSERT(labels->ne[1] == n_ubatch);

src/llama-context.h

Lines changed: 28 additions & 13 deletions
@@ -35,8 +35,6 @@ struct llama_context {
 
     ggml_backend_sched_t get_sched() const;
 
-    ggml_context * get_ctx_compute() const;
-
     uint32_t n_ctx() const;
     uint32_t n_ctx_per_seq() const;
     uint32_t n_batch() const;
@@ -96,7 +94,7 @@ struct llama_context {
     // if memory_context is provided, it will be applied first to the context's memory
     // ret contains the status of the graph computation
     // returns nullptr only if ret != GGML_STATUS_SUCCESS
-    llm_graph_result_ptr process_ubatch(
+    llm_graph_result_i * process_ubatch(
            const llama_ubatch & ubatch,
            llm_graph_type gtype,
            llama_memory_context_i * mctx,
@@ -190,19 +188,17 @@ struct llama_context {
 public:
     int32_t graph_max_nodes() const;
 
-    // zero-out inputs and create the ctx_compute for the compute graph
-    ggml_cgraph * graph_init();
-
     // returns the result of ggml_backend_sched_graph_compute_async execution
     ggml_status graph_compute(ggml_cgraph * gf, bool batched);
 
     // reserve a graph with a dummy ubatch of the specified size
     ggml_cgraph * graph_reserve(uint32_t n_tokens, uint32_t n_seqs, uint32_t n_outputs, const llama_memory_context_i * mctx);
 
 private:
-    llm_graph_result_ptr graph_build(
-            ggml_context * ctx,
-            ggml_cgraph * gf,
+    // true - can reuse prev graph
+    bool graph_build(
+            llm_graph_result_i * gf_res_cur,
+            llm_graph_result_i * gf_res_prv,
             const llama_ubatch & ubatch,
             llm_graph_type gtype,
             const llama_memory_context_i * mctx);
@@ -258,8 +254,6 @@ struct llama_context {
     ggml_backend_t backend_cpu = nullptr;
     std::vector<ggml_backend_ptr> backends;
 
-    ggml_context_ptr ctx_compute;
-
     // training
     ggml_opt_context_t opt_ctx = nullptr;
 
@@ -275,8 +269,29 @@ struct llama_context {
     std::vector<ggml_backend_t> backend_ptrs;
     std::vector<ggml_backend_buffer_type_t> backend_buft;
 
-    // memory buffers used to evaluate the model
-    std::vector<uint8_t> buf_compute_meta;
+    // ==================================
+    // double-buffer for compute graphs
+    // TODO: polish this rough first iteration
+    //
+    std::array<llm_graph_result_ptr, 2> gf_res;
+
+    int gf_res_i = 0;
+
+    llm_graph_result_i * gf_res_next() {
+        gf_res_i = gf_res_i == 0 ? 1 : 0;
+        return gf_res[gf_res_i].get();
+    }
+
+    llm_graph_result_i * gf_res_cur() const {
+        return gf_res[gf_res_i].get();
+    }
+
+    llm_graph_result_i * gf_res_prv() const {
+        return gf_res[(gf_res_i + 1) % 2].get();
+    }
+
+    llm_graph_result_ptr gf_res_reserve;
+    // ==================================
 
     // host buffer for the model output (logits and embeddings)
     ggml_backend_buffer_ptr buf_output;
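Aside, not part of the diff: a minimal, compilable illustration of the two-slot rotation that gf_res_next()/gf_res_cur()/gf_res_prv() implement above. PingPong and its int slots are stand-ins for the two llm_graph_result buffers; the point is that after each next() the previous result stays addressable for comparison.

#include <array>
#include <cassert>

struct PingPong {
    std::array<int, 2> slot{};   // stands in for the two graph-result buffers
    int i = 0;

    int & next() { i = i == 0 ? 1 : 0; return slot[i]; } // advance, then return the new current
    int & cur()  { return slot[i]; }
    int & prv()  { return slot[(i + 1) % 2]; }
};

int main() {
    PingPong pp;
    pp.next() = 1;          // first graph lands in slot 1
    pp.next() = 2;          // second graph lands in slot 0
    assert(pp.cur() == 2);
    assert(pp.prv() == 1);  // the previous graph is still addressable for comparison
    pp.next() = 3;          // third graph overwrites slot 1
    assert(pp.prv() == 2);
    return 0;
}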
