From 76681e3c735f8ac45eccf51368bb562a9ab27784 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 1 Jul 2025 15:59:43 +0300 Subject: [PATCH 1/6] llama : reuse compute graphs ggml-ci --- common/arg.cpp | 8 ++ common/common.cpp | 1 + common/common.h | 1 + include/llama.h | 3 + src/llama-batch.h | 25 +++++ src/llama-context.cpp | 170 +++++++++++++++--------------- src/llama-context.h | 28 ++--- src/llama-cparams.cpp | 29 +++++ src/llama-cparams.h | 3 + src/llama-graph.cpp | 71 ++++++++++++- src/llama-graph.h | 187 ++++++++++++++++++++++++++------- src/llama-hparams.cpp | 9 ++ src/llama-hparams.h | 2 + src/llama-kv-cache-unified.cpp | 60 ++++++----- src/llama-kv-cache-unified.h | 24 +++-- src/llama-model.cpp | 15 +-- src/llama-model.h | 5 +- 17 files changed, 456 insertions(+), 185 deletions(-) diff --git a/common/arg.cpp b/common/arg.cpp index 40af7e574830f..d4a3f5b7f21ec 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -1464,6 +1464,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex params.swa_full = true; } ).set_env("LLAMA_ARG_SWA_FULL")); + add_opt(common_arg( + {"--graph-reuse", "-gr"}, + string_format("reuse previous compute graphs when possible (default: %s)" + "[(more info)](https://github.com/ggml-org/llama.cpp/pull/14482)", params.graph_reuse ? "true" : "false"), + [](common_params & params) { + params.graph_reuse = true; + } + ).set_env("LLAMA_ARG_GRAPH_REUSE")); add_opt(common_arg( {"--no-context-shift"}, string_format("disables context shift on infinite text generation (default: %s)", params.ctx_shift ? "disabled" : "enabled"), diff --git a/common/common.cpp b/common/common.cpp index e4e71ad13fb59..c7c163404a2af 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -1157,6 +1157,7 @@ struct llama_context_params common_context_params_to_llama(const common_params & cparams.no_perf = params.no_perf; cparams.op_offload = !params.no_op_offload; cparams.swa_full = params.swa_full; + cparams.graph_reuse = params.graph_reuse; cparams.type_k = params.cache_type_k; cparams.type_v = params.cache_type_v; diff --git a/common/common.h b/common/common.h index 8922090e7b10d..465d05672d2a6 100644 --- a/common/common.h +++ b/common/common.h @@ -330,6 +330,7 @@ struct common_params { bool no_perf = false; // disable performance metrics bool ctx_shift = true; // context shift on inifinite text generation bool swa_full = false; // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055) + bool graph_reuse = false; // reuse previous compute graphs when possible bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix bool use_mmap = true; // use mmap for faster loads diff --git a/include/llama.h b/include/llama.h index 3eda9bc68608c..1e8228997f1d8 100644 --- a/include/llama.h +++ b/include/llama.h @@ -374,6 +374,8 @@ extern "C" { bool swa_full; // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055) // NOTE: setting to false when n_seq_max > 1 can cause bad performance in some cases // ref: https://github.com/ggml-org/llama.cpp/pull/13845#issuecomment-2924800573 + + bool graph_reuse; // reuse previous compute graphs when possible }; // model quantization parameters @@ -1429,6 +1431,7 @@ extern "C" { int32_t n_p_eval; int32_t n_eval; + int32_t n_reused; }; struct llama_perf_sampler_data { diff --git a/src/llama-batch.h b/src/llama-batch.h index 3420803ff9469..b27134c9331a9 100644 --- a/src/llama-batch.h +++ b/src/llama-batch.h @@ -34,6 
+34,31 @@ struct llama_ubatch { llama_seq_id * seq_id_unq; // [n_seqs_unq] | s | seq_id int32_t * seq_idx; // [LLAMA_MAX_SEQ] | - | seq_idx int8_t * output; // [n_tokens] | i | - + + bool is_same(const llama_ubatch & other) const { + bool res = + equal_seqs == other.equal_seqs && + n_tokens == other.n_tokens && + n_seq_tokens == other.n_seq_tokens && + n_seqs == other.n_seqs && + n_seqs_unq == other.n_seqs_unq && + ( + (!token && !other.token) || + (!embd && !other.embd) + ); + + if (!res) { + return false; + } + + // TODO: this won't work because seq_id_unq ptr can point to an old balloc that has + // been freed by this point. find a way to fix this + //for (uint32_t s = 0; s < n_seqs_unq; ++s) { + // res &= seq_id_unq[s] == other.seq_id_unq[s]; + //} + + return res; + } }; // a helper for sanitizing, fulfilling and splitting a batch diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 06e93b19cbf40..8a1e2a86f9709 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -101,7 +101,8 @@ llama_context::llama_context( cparams.n_ubatch = std::min(cparams.n_batch, params.n_ubatch == 0 ? params.n_batch : params.n_ubatch); - cparams.op_offload = params.op_offload; + cparams.op_offload = params.op_offload; + cparams.graph_reuse = params.graph_reuse; const uint32_t n_ctx_per_seq = cparams.n_ctx / cparams.n_seq_max; @@ -227,8 +228,8 @@ llama_context::llama_context( LLAMA_LOG_DEBUG("%s: max_nodes = %zu\n", __func__, max_nodes); - // buffer used to store the computation graph and the tensor meta data - buf_compute_meta.resize(ggml_tensor_overhead()*max_nodes + ggml_graph_overhead_custom(max_nodes, false)); + gf_res_prev.reset(new llm_graph_result(max_nodes)); + gf_res_reserve.reset(new llm_graph_result(max_nodes)); // TODO: move these checks to ggml_backend_sched // enabling pipeline parallelism in the scheduler increases memory usage, so it is only done when necessary @@ -388,10 +389,6 @@ ggml_backend_sched_t llama_context::get_sched() const { return sched.get(); } -ggml_context * llama_context::get_ctx_compute() const { - return ctx_compute.get(); -} - uint32_t llama_context::n_ctx() const { return cparams.n_ctx; } @@ -678,38 +675,52 @@ bool llama_context::apply_adapter_cvec( return cvec.apply(model, data, len, n_embd, il_start, il_end); } -llm_graph_result_ptr llama_context::process_ubatch(const llama_ubatch & ubatch, llm_graph_type gtype, llama_memory_context_i * mctx, ggml_status & ret) { +llm_graph_result_i * llama_context::process_ubatch(const llama_ubatch & ubatch, llm_graph_type gtype, llama_memory_context_i * mctx, ggml_status & ret) { if (mctx && !mctx->apply()) { LLAMA_LOG_ERROR("%s: failed to apply memory context\n", __func__); ret = GGML_STATUS_FAILED; return nullptr; } - auto * gf = graph_init(); - if (!gf) { - LLAMA_LOG_ERROR("%s: failed to initialize graph\n", __func__); - ret = GGML_STATUS_FAILED; - return nullptr; - } + auto * res = gf_res_prev.get(); + auto * gf = res->get_gf(); - auto res = graph_build(ctx_compute.get(), gf, ubatch, gtype, mctx); - if (!res) { - LLAMA_LOG_ERROR("%s: failed to build graph\n", __func__); - ret = GGML_STATUS_FAILED; - return nullptr; - } + // the new graph parameters + // in order to correctly reuse a graph, it's full topology has to be uniquely determined by these parameters + const auto gparams = graph_params(res, ubatch, mctx, gtype); - // LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs); + const bool can_reuse = cparams.graph_reuse && 
res->update(gparams); + if (can_reuse) { + LLAMA_LOG_DEBUG("%s: reusing previous graph\n", __func__); + n_reused++; + } else { + res->reset(); - if (!ggml_backend_sched_alloc_graph(sched.get(), gf)) { - LLAMA_LOG_ERROR("%s: failed to allocate graph\n", __func__); - ret = GGML_STATUS_ALLOC_FAILED; - return nullptr; + ggml_backend_sched_reset(sched.get()); + ggml_backend_sched_set_eval_callback(sched.get(), cparams.cb_eval, cparams.cb_eval_user_data); + + //const auto t_start_us = ggml_time_us(); + + gf = model.build_graph(gparams); + + //LLAMA_LOG_INFO("graph build time: %.3f ms\n", (ggml_time_us() - t_start_us)/1000.0); + + if (!gf) { + LLAMA_LOG_ERROR("%s: failed to initialize graph\n", __func__); + ret = GGML_STATUS_FAILED; + return nullptr; + } + + if (!ggml_backend_sched_alloc_graph(sched.get(), gf)) { + LLAMA_LOG_ERROR("%s: failed to allocate graph\n", __func__); + ret = GGML_STATUS_ALLOC_FAILED; + return nullptr; + } } res->set_inputs(&ubatch); - const auto status = graph_compute(gf, ubatch.n_tokens > 1); + const auto status = graph_compute(res->get_gf(), ubatch.n_tokens > 1); if (status != GGML_STATUS_SUCCESS) { LLAMA_LOG_ERROR("%s: failed to compute graph, compute status: %d\n", __func__, status); ret = status; @@ -767,9 +778,6 @@ int llama_context::encode(const llama_batch & batch_inp) { n_outputs = n_tokens; - ggml_backend_sched_reset(sched.get()); - ggml_backend_sched_set_eval_callback(sched.get(), cparams.cb_eval, cparams.cb_eval_user_data); - const auto causal_attn_org = cparams.causal_attn; // always use non-causal attention for encoder graphs @@ -778,7 +786,7 @@ int llama_context::encode(const llama_batch & batch_inp) { cparams.causal_attn = false; ggml_status status; - const auto res = process_ubatch(ubatch, LLM_GRAPH_TYPE_ENCODER, nullptr, status); + const auto * res = process_ubatch(ubatch, LLM_GRAPH_TYPE_ENCODER, nullptr, status); cparams.causal_attn = causal_attn_org; @@ -846,7 +854,9 @@ int llama_context::encode(const llama_batch & batch_inp) { // Reset state for the next token before backend sync, to allow the CPU activities in the reset to // overlap with device computation. - ggml_backend_sched_reset(sched.get()); + if (!cparams.graph_reuse) { + ggml_backend_sched_reset(sched.get()); + } // TODO: hacky solution if (model.arch == LLM_ARCH_T5 && t_embd) { @@ -1005,11 +1015,8 @@ int llama_context::decode(const llama_batch & batch_inp) { n_outputs = n_outputs_new; } - ggml_backend_sched_reset(sched.get()); - ggml_backend_sched_set_eval_callback(sched.get(), cparams.cb_eval, cparams.cb_eval_user_data); - ggml_status status; - const auto res = process_ubatch(ubatch, LLM_GRAPH_TYPE_DECODER, mctx.get(), status); + const auto * res = process_ubatch(ubatch, LLM_GRAPH_TYPE_DECODER, mctx.get(), status); if (!res) { // the last ubatch failed or was aborted -> remove all positions of that ubatch from the KV cache @@ -1192,7 +1199,9 @@ int llama_context::decode(const llama_batch & batch_inp) { // Reset state for the next token before backend sync, to allow the CPU activities in the reset to // overlap with device computation. 
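Note: the n_reused counter incremented above is exported through the public perf API (see the llama_perf_context_data change earlier in this patch). A minimal sketch of how a caller could observe it — it assumes an already-initialized llama_context `ctx` created with graph_reuse enabled and that has decoded a few batches, and it only uses the existing llama_perf_context() entry point:

    #include "llama.h"
    #include <cstdio>

    // Report how many times the previous compute graph was reused.
    // `ctx` is assumed to be a valid context that has already decoded a few
    // batches of identical shape (e.g. single-token generation steps).
    static void print_graph_reuse_stats(const llama_context * ctx) {
        const llama_perf_context_data data = llama_perf_context(ctx);

        // n_reused is bumped in llama_context::process_ubatch() whenever the
        // previous graph topology matches the new graph parameters
        printf("eval calls: %d, graphs reused: %d\n", data.n_eval, data.n_reused);
    }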
- ggml_backend_sched_reset(sched.get()); + if (!cparams.graph_reuse) { + ggml_backend_sched_reset(sched.get()); + } return 0; } @@ -1275,20 +1284,8 @@ uint32_t llama_context::output_reserve(int32_t n_outputs) { // graph // -int32_t llama_context::graph_max_nodes() const { - return std::max(65536, 5*model.n_tensors()); -} - -ggml_cgraph * llama_context::graph_init() { - ggml_init_params params = { - /*.mem_size =*/ buf_compute_meta.size(), - /*.mem_buffer =*/ buf_compute_meta.data(), - /*.no_alloc =*/ true, - }; - - ctx_compute.reset(ggml_init(params)); - - return ggml_new_graph_custom(ctx_compute.get(), graph_max_nodes(), false); +uint32_t llama_context::graph_max_nodes() const { + return std::max(65536u, 5u*model.n_tensors()); } ggml_cgraph * llama_context::graph_reserve(uint32_t n_tokens, uint32_t n_seqs, uint32_t n_outputs, const llama_memory_context_i * mctx) { @@ -1301,6 +1298,9 @@ ggml_cgraph * llama_context::graph_reserve(uint32_t n_tokens, uint32_t n_seqs, u LLAMA_LOG_DEBUG("%s: making n_tokens a multiple of n_seqs - n_tokens = %u, n_seqs = %u, n_outputs = %u\n", __func__, n_tokens, n_seqs, n_outputs); } + gf_res_prev->reset(); + ggml_backend_sched_reset(sched.get()); + // store the n_outputs as it is, and restore it afterwards // TODO: not sure if needed, might simplify in the future by removing this const auto save_n_outputs = this->n_outputs; @@ -1310,17 +1310,15 @@ ggml_cgraph * llama_context::graph_reserve(uint32_t n_tokens, uint32_t n_seqs, u llama_batch_allocr balloc(model.hparams.n_pos_per_embd()); llama_ubatch ubatch = balloc.ubatch_reserve(n_tokens/n_seqs, n_seqs); - auto * gf = graph_init(); - auto res = graph_build(ctx_compute.get(), gf, ubatch, LLM_GRAPH_TYPE_DEFAULT, mctx); + auto * res = gf_res_reserve.get(); - this->n_outputs = save_n_outputs; + const auto gparams = graph_params(res, ubatch, mctx, LLM_GRAPH_TYPE_DEFAULT); - if (!res) { - LLAMA_LOG_ERROR("%s: failed to build worst-case graph\n", __func__); - return nullptr; - } + res->reset(); - ggml_backend_sched_reset(sched.get()); + auto * gf = model.build_graph(gparams); + + this->n_outputs = save_n_outputs; // initialize scheduler with the specified graph if (!ggml_backend_sched_reserve(sched.get(), gf)) { @@ -1331,28 +1329,27 @@ ggml_cgraph * llama_context::graph_reserve(uint32_t n_tokens, uint32_t n_seqs, u return gf; } -llm_graph_result_ptr llama_context::graph_build( - ggml_context * ctx, - ggml_cgraph * gf, - const llama_ubatch & ubatch, - llm_graph_type gtype, - const llama_memory_context_i * mctx) { - return model.build_graph( - { - /*.ctx =*/ ctx, - /*.arch =*/ model.arch, - /*.hparams =*/ model.hparams, - /*.cparams =*/ cparams, - /*.ubatch =*/ ubatch, - /*.sched =*/ sched.get(), - /*.backend_cpu =*/ backend_cpu, - /*.cvec =*/ &cvec, - /*.loras =*/ &loras, - /*.mctx =*/ mctx, - /*.cross =*/ &cross, - /*.n_outputs =*/ n_outputs, - /*.cb =*/ graph_get_cb(), - }, gf, gtype); +llm_graph_params llama_context::graph_params( + llm_graph_result_i * res, + const llama_ubatch & ubatch, + const llama_memory_context_i * mctx, + llm_graph_type gtype) const { + return { + /*.arch =*/ model.arch, + /*.hparams =*/ model.hparams, + /*.cparams =*/ cparams, + /*.ubatch =*/ ubatch, + /*.gtype =*/ gtype, + /*.sched =*/ sched.get(), + /*.backend_cpu =*/ backend_cpu, + /*.cvec =*/ &cvec, + /*.loras =*/ &loras, + /*.mctx =*/ mctx, + /*.cross =*/ &cross, + /*.n_outputs =*/ n_outputs, + /*.cb =*/ graph_get_cb(), + /*.res =*/ res, + }; } ggml_status llama_context::graph_compute( @@ -1930,6 +1927,7 @@ llama_perf_context_data 
llama_context::perf_get_data() const { data.t_eval_ms = 1e-3 * t_eval_us; data.n_p_eval = std::max(1, n_p_eval); data.n_eval = std::max(1, n_eval); + data.n_reused = std::max(0, n_reused); return data; } @@ -1938,6 +1936,7 @@ void llama_context::perf_reset() { t_start_us = ggml_time_us(); t_eval_us = n_eval = 0; t_p_eval_us = n_p_eval = 0; + n_reused = 0; } // @@ -2064,8 +2063,13 @@ void llama_context::opt_epoch_iter( break; } - auto * gf = graph_init(); - auto res = graph_build(ctx_compute.get(), gf, ubatch, LLM_GRAPH_TYPE_DEFAULT, mctx.get()); + auto * res = gf_res_prev.get(); + + const auto gparams = graph_params(res, ubatch, mctx.get(), LLM_GRAPH_TYPE_DEFAULT); + + res->reset(); + + auto * gf = model.build_graph(gparams); struct ggml_context * ctx_compute_opt; { @@ -2187,6 +2191,7 @@ llama_context_params llama_context_default_params() { /*.no_perf =*/ true, /*.op_offload =*/ true, /*.swa_full =*/ true, + /*.graph_reuse =*/ false, }; return result; @@ -2807,6 +2812,7 @@ void llama_perf_context_print(const llama_context * ctx) { LLAMA_LOG_INFO("%s: eval time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n", __func__, data.t_eval_ms, data.n_eval, data.t_eval_ms / data.n_eval, 1e3 / data.t_eval_ms * data.n_eval); LLAMA_LOG_INFO("%s: total time = %10.2f ms / %5d tokens\n", __func__, (t_end_ms - data.t_start_ms), (data.n_p_eval + data.n_eval)); + LLAMA_LOG_INFO("%s: graphs reused = %10d\n", __func__, data.n_reused); } void llama_perf_context_reset(llama_context * ctx) { diff --git a/src/llama-context.h b/src/llama-context.h index 9ce05715a8c03..21d23d83851d1 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -35,8 +35,6 @@ struct llama_context { ggml_backend_sched_t get_sched() const; - ggml_context * get_ctx_compute() const; - uint32_t n_ctx() const; uint32_t n_ctx_per_seq() const; uint32_t n_batch() const; @@ -96,7 +94,7 @@ struct llama_context { // if memory_context is provided, it will be applied first to the context's memory // ret contains the status of the graph computation // returns nullptr only if ret != GGML_STATUS_SUCCESS - llm_graph_result_ptr process_ubatch( + llm_graph_result_i * process_ubatch( const llama_ubatch & ubatch, llm_graph_type gtype, llama_memory_context_i * mctx, @@ -188,10 +186,7 @@ struct llama_context { // public: - int32_t graph_max_nodes() const; - - // zero-out inputs and create the ctx_compute for the compute graph - ggml_cgraph * graph_init(); + uint32_t graph_max_nodes() const; // returns the result of ggml_backend_sched_graph_compute_async execution ggml_status graph_compute(ggml_cgraph * gf, bool batched); @@ -200,12 +195,11 @@ struct llama_context { ggml_cgraph * graph_reserve(uint32_t n_tokens, uint32_t n_seqs, uint32_t n_outputs, const llama_memory_context_i * mctx); private: - llm_graph_result_ptr graph_build( - ggml_context * ctx, - ggml_cgraph * gf, - const llama_ubatch & ubatch, - llm_graph_type gtype, - const llama_memory_context_i * mctx); + llm_graph_params graph_params( + llm_graph_result_i * res, + const llama_ubatch & ubatch, + const llama_memory_context_i * mctx, + llm_graph_type gtype) const; llm_graph_cb graph_get_cb() const; @@ -258,8 +252,6 @@ struct llama_context { ggml_backend_t backend_cpu = nullptr; std::vector backends; - ggml_context_ptr ctx_compute; - // training ggml_opt_context_t opt_ctx = nullptr; @@ -275,8 +267,8 @@ struct llama_context { std::vector backend_ptrs; std::vector backend_buft; - // memory buffers used to evaluate the model - std::vector buf_compute_meta; + llm_graph_result_ptr 
gf_res_prev; + llm_graph_result_ptr gf_res_reserve; // host buffer for the model output (logits and embeddings) ggml_backend_buffer_ptr buf_output; @@ -294,4 +286,6 @@ struct llama_context { mutable int32_t n_p_eval = 0; // number of tokens in eval calls for the prompt (with batch size > 1) mutable int32_t n_eval = 0; // number of eval calls + + mutable int32_t n_reused = 0; // number of times the previous graph was reused }; diff --git a/src/llama-cparams.cpp b/src/llama-cparams.cpp index a3e7a37ee36d7..f81cd10823d3b 100644 --- a/src/llama-cparams.cpp +++ b/src/llama-cparams.cpp @@ -3,3 +3,32 @@ size_t llama_max_parallel_sequences(void) { return LLAMA_MAX_SEQ; } + +bool llama_cparams::is_same(const llama_cparams & other) const { + return + n_ctx == other.n_ctx && + n_batch == other.n_batch && + n_ubatch == other.n_ubatch && + n_seq_max == other.n_seq_max && + n_threads == other.n_threads && + n_threads_batch == other.n_threads_batch && + rope_freq_base == other.rope_freq_base && + rope_freq_scale == other.rope_freq_scale && + n_ctx_orig_yarn == other.n_ctx_orig_yarn && + yarn_ext_factor == other.yarn_ext_factor && + yarn_attn_factor == other.yarn_attn_factor && + yarn_beta_fast == other.yarn_beta_fast && + yarn_beta_slow == other.yarn_beta_slow && + defrag_thold == other.defrag_thold && + embeddings == other.embeddings && + causal_attn == other.causal_attn && + offload_kqv == other.offload_kqv && + flash_attn == other.flash_attn && + no_perf == other.no_perf && + warmup == other.warmup && + op_offload == other.op_offload && + graph_reuse == other.graph_reuse && + pooling_type == other.pooling_type && + cb_eval == other.cb_eval && + cb_eval_user_data == other.cb_eval_user_data; +} diff --git a/src/llama-cparams.h b/src/llama-cparams.h index 118615d5bd2d5..064767d51d92c 100644 --- a/src/llama-cparams.h +++ b/src/llama-cparams.h @@ -33,9 +33,12 @@ struct llama_cparams { bool no_perf; bool warmup; bool op_offload; + bool graph_reuse; enum llama_pooling_type pooling_type; ggml_backend_sched_eval_callback cb_eval; void * cb_eval_user_data; + + bool is_same(const llama_cparams & other) const; }; diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp index 7f0e8c67f1325..264491b4de83a 100644 --- a/src/llama-graph.cpp +++ b/src/llama-graph.cpp @@ -28,6 +28,15 @@ void llm_graph_input_embd::set_input(const llama_ubatch * ubatch) { } } +bool llm_graph_input_embd::update(const llm_graph_params & params) { + bool res = true; + + res &= (!tokens && !params.ubatch.token) || (tokens && tokens->ne[0] == params.ubatch.n_tokens); + res &= (!embd && !params.ubatch.embd) || (embd && embd->ne[0] == params.ubatch.n_tokens); + + return res; +} + void llm_graph_input_pos::set_input(const llama_ubatch * ubatch) { if (ubatch->pos && pos) { const int64_t n_tokens = ubatch->n_tokens; @@ -50,6 +59,14 @@ void llm_graph_input_pos::set_input(const llama_ubatch * ubatch) { } } +bool llm_graph_input_pos::update(const llm_graph_params & params) { + bool res = true; + + res &= pos->ne[0] == params.ubatch.n_tokens; + + return res; +} + void llm_graph_input_attn_temp::set_input(const llama_ubatch * ubatch) { if (ubatch->pos && attn_scale) { const int64_t n_tokens = ubatch->n_tokens; @@ -118,6 +135,14 @@ void llm_graph_input_out_ids::set_input(const llama_ubatch * ubatch) { } } +bool llm_graph_input_out_ids::update(const llm_graph_params & params) { + bool res = true; + + res &= n_outputs == params.n_outputs; + + return res; +} + void llm_graph_input_mean::set_input(const llama_ubatch * ubatch) { if (cparams.embeddings && 
cparams.pooling_type == LLAMA_POOLING_TYPE_MEAN) { const int64_t n_tokens = ubatch->n_tokens; @@ -287,6 +312,24 @@ void llm_graph_input_attn_kv_unified::set_input(const llama_ubatch * ubatch) { mctx->set_input_kq_mask(self_kq_mask, ubatch, cparams.causal_attn); } +bool llm_graph_input_attn_kv_unified::update(const llm_graph_params & params) { + const auto * mctx = static_cast(params.mctx); + + this->mctx = mctx; + + bool res = true; + + res &= self_k_idxs->ne[0] == params.ubatch.n_tokens; + res &= self_v_idxs->ne[0] == params.ubatch.n_tokens; + + res &= self_kq_mask->ne[0] == mctx->get_n_kv(); + res &= self_kq_mask->ne[1] == GGML_PAD(params.ubatch.n_tokens, GGML_KQ_MASK_PAD); + + res &= mctx->get_supports_set_rows(); // TODO: tmp + + return res; +} + void llm_graph_input_attn_kv_unified_iswa::set_input(const llama_ubatch * ubatch) { mctx->get_base()->set_input_k_idxs(self_k_idxs, ubatch); mctx->get_base()->set_input_v_idxs(self_v_idxs, ubatch); @@ -299,6 +342,30 @@ void llm_graph_input_attn_kv_unified_iswa::set_input(const llama_ubatch * ubatch mctx->get_swa()->set_input_kq_mask(self_kq_mask_swa, ubatch, cparams.causal_attn); } +bool llm_graph_input_attn_kv_unified_iswa::update(const llm_graph_params & params) { + const auto * mctx = static_cast(params.mctx); + + this->mctx = mctx; + + bool res = true; + + res &= self_k_idxs->ne[0] == params.ubatch.n_tokens; + res &= self_v_idxs->ne[0] == params.ubatch.n_tokens; + + res &= self_k_idxs_swa->ne[0] == params.ubatch.n_tokens; + res &= self_v_idxs_swa->ne[0] == params.ubatch.n_tokens; + + res &= self_kq_mask->ne[0] == mctx->get_base()->get_n_kv(); + res &= self_kq_mask->ne[1] == GGML_PAD(params.ubatch.n_tokens, GGML_KQ_MASK_PAD); + + res &= self_kq_mask_swa->ne[0] == mctx->get_swa()->get_n_kv(); + res &= self_kq_mask_swa->ne[1] == GGML_PAD(params.ubatch.n_tokens, GGML_KQ_MASK_PAD); + + res &= mctx->get_base()->get_supports_set_rows(); // TODO: tmp + + return res; +} + void llm_graph_input_attn_cross::set_input(const llama_ubatch * ubatch) { GGML_ASSERT(cross_kq_mask); @@ -395,7 +462,6 @@ llm_graph_context::llm_graph_context(const llm_graph_params & params) : n_ctx_orig (cparams.n_ctx_orig_yarn), pooling_type (cparams.pooling_type), rope_type (hparams.rope_type), - ctx0 (params.ctx), sched (params.sched), backend_cpu (params.backend_cpu), cvec (params.cvec), @@ -403,7 +469,8 @@ llm_graph_context::llm_graph_context(const llm_graph_params & params) : mctx (params.mctx), cross (params.cross), cb_func (params.cb), - res (std::make_unique()) { + res (static_cast(params.res)), + ctx0 (res->get_ctx()) { } void llm_graph_context::cb(ggml_tensor * cur, const char * name, int il) const { diff --git a/src/llama-graph.h b/src/llama-graph.h index 7bdf656768a0c..ae2fd6481b609 100644 --- a/src/llama-graph.h +++ b/src/llama-graph.h @@ -1,6 +1,7 @@ #pragma once #include "llama-arch.h" +#include "llama-batch.h" #include "llama-hparams.h" #include "llama-adapter.h" @@ -14,7 +15,6 @@ struct ggml_cgraph; struct ggml_context; struct ggml_tensor; -struct llama_ubatch; struct llama_cparams; struct llama_memory_context_i; @@ -69,6 +69,8 @@ struct llama_cross { std::vector> seq_ids_enc; }; +struct llm_graph_params; + // // llm_graph_input // @@ -78,11 +80,19 @@ class llm_graph_input_i { virtual ~llm_graph_input_i() = default; virtual void set_input(const llama_ubatch * ubatch) = 0; + + // return true if the resulting input tensors using the provided graph parameters would be + // the same as the previous input tensors that we have currently stored in the object + virtual 
bool update(const llm_graph_params & params) { + // returning false here by default will prevent from reusing the graph if the check + // for the input type has not been implemented yet + GGML_UNUSED(params); + return false; + } }; using llm_graph_input_ptr = std::unique_ptr; - class llm_graph_input_embd : public llm_graph_input_i { public: llm_graph_input_embd() = default; @@ -90,6 +100,8 @@ class llm_graph_input_embd : public llm_graph_input_i { void set_input(const llama_ubatch * ubatch) override; + bool update(const llm_graph_params & params) override; + ggml_tensor * tokens = nullptr; // I32 [n_batch] ggml_tensor * embd = nullptr; // F32 [n_embd, n_batch] }; @@ -101,6 +113,8 @@ class llm_graph_input_pos : public llm_graph_input_i { void set_input(const llama_ubatch * ubatch) override; + bool update(const llm_graph_params & params) override; + ggml_tensor * pos = nullptr; // I32 [n_batch] const uint32_t n_pos_per_embd = 1; @@ -154,17 +168,19 @@ class llm_graph_input_out_ids : public llm_graph_input_i { llm_graph_input_out_ids( const llama_hparams & hparams, const llama_cparams & cparams, - int32_t n_outputs) : hparams(hparams), cparams(cparams), n_outputs(n_outputs) {} + uint32_t n_outputs) : hparams(hparams), cparams(cparams), n_outputs(n_outputs) {} virtual ~llm_graph_input_out_ids() = default; void set_input(const llama_ubatch * ubatch) override; + bool update(const llm_graph_params & params) override; + ggml_tensor * out_ids; // I32 [n_outputs] const llama_hparams & hparams; const llama_cparams & cparams; - const int32_t n_outputs; + const uint32_t n_outputs; }; class llm_graph_input_mean : public llm_graph_input_i { @@ -249,6 +265,8 @@ class llm_graph_input_attn_kv_unified : public llm_graph_input_i { void set_input(const llama_ubatch * ubatch) override; + bool update(const llm_graph_params & params) override; + ggml_tensor * get_k_idxs() const { return self_k_idxs; } ggml_tensor * get_v_idxs() const { return self_v_idxs; } @@ -280,6 +298,8 @@ class llm_graph_input_attn_kv_unified_iswa : public llm_graph_input_i { void set_input(const llama_ubatch * ubatch) override; + bool update(const llm_graph_params & params) override; + ggml_tensor * get_k_idxs() const { return self_k_idxs; } ggml_tensor * get_v_idxs() const { return self_v_idxs; } ggml_tensor * get_k_idxs_swa() const { return self_k_idxs_swa; } @@ -373,29 +393,110 @@ class llm_graph_input_one : public llm_graph_input_i { // along with the input tensors, the object also provides commonly used outputs tensors, such as logits, embeddings, etc. // these are used by the llama_context to extact the relevant data, based on the compute parameters +// TODO: this interface seems redundant - remove it class llm_graph_result_i { public: virtual ~llm_graph_result_i() = default; - virtual ggml_tensor * get_tokens() = 0; - virtual ggml_tensor * get_logits() = 0; - virtual ggml_tensor * get_embd() = 0; - virtual ggml_tensor * get_embd_pooled() = 0; + virtual ggml_tensor * get_tokens() const = 0; + virtual ggml_tensor * get_logits() const = 0; + virtual ggml_tensor * get_embd() const = 0; + virtual ggml_tensor * get_embd_pooled() const = 0; + + virtual ggml_cgraph * get_gf() = 0; + virtual ggml_context * get_ctx() = 0; + + virtual void reset() = 0; virtual void set_inputs(const llama_ubatch * ubatch) = 0; + + virtual bool update(const llm_graph_params & params) = 0; }; using llm_graph_result_ptr = std::unique_ptr; +// callback that allows us to apply custom logic to each tensor (e.g. ggml-alloc, offloading, etc.) 
+using llm_graph_cb = std::function; + +struct llm_graph_params { + llm_arch arch = LLM_ARCH_UNKNOWN; + + llama_hparams hparams; + llama_cparams cparams; + + llama_ubatch ubatch; // note: intentionally make a copy + + llm_graph_type gtype; + + ggml_backend_sched_t sched; + ggml_backend_t backend_cpu; + + const llama_adapter_cvec * cvec; + const llama_adapter_loras * loras; + const llama_memory_context_i * mctx; + const llama_cross * cross; + + uint32_t n_outputs; + + llm_graph_cb cb; + + // TODO: temporary + llm_graph_result_i * res; + + bool is_same(const llm_graph_params & other) const { + return + hparams.is_same(other.hparams) && + cparams.is_same(other.cparams) && + ubatch .is_same(other.ubatch) && + arch == other.arch && + gtype == other.gtype && + cvec == other.cvec && + loras == other.loras && + cross == other.cross && + n_outputs == other.n_outputs; + } +}; class llm_graph_result : public llm_graph_result_i { public: + llm_graph_result(int64_t max_nodes) : max_nodes(max_nodes) { + reset(); + } + virtual ~llm_graph_result() = default; - ggml_tensor * get_tokens() override { return t_tokens; } - ggml_tensor * get_logits() override { return t_logits; } - ggml_tensor * get_embd() override { return t_embd; } - ggml_tensor * get_embd_pooled() override { return t_embd_pooled; } + ggml_tensor * get_tokens() const override { return t_tokens; } + ggml_tensor * get_logits() const override { return t_logits; } + ggml_tensor * get_embd() const override { return t_embd; } + ggml_tensor * get_embd_pooled() const override { return t_embd_pooled; } + + ggml_cgraph * get_gf() override { return gf; } + ggml_context * get_ctx() override { return ctx_compute.get(); } + + void set_max_nodes(int64_t max_nodes) { + this->max_nodes = max_nodes; + } + + void reset() override { + t_tokens = nullptr; + t_logits = nullptr; + t_embd = nullptr; + t_embd_pooled = nullptr; + + inputs.clear(); + + buf_compute_meta.resize(ggml_tensor_overhead()*max_nodes + ggml_graph_overhead_custom(max_nodes, false)); + + ggml_init_params params = { + /*.mem_size =*/ buf_compute_meta.size(), + /*.mem_buffer =*/ buf_compute_meta.data(), + /*.no_alloc =*/ true, + }; + + ctx_compute.reset(ggml_init(params)); + + gf = ggml_new_graph_custom(ctx_compute.get(), max_nodes, false); + } void set_inputs(const llama_ubatch * ubatch) override { for (auto & input : inputs) { @@ -403,6 +504,25 @@ class llm_graph_result : public llm_graph_result_i { } } + // try to update the existing graph result using the new graph parameters + // this can only be done if we determine that the resulting graph using the new graph parameters + // would be identical to the existing graph. in that case, we simply have to update the memory + // contexts of the input tensors of the graph and we can reuse it for another computation + // return true if the graph was updated and can be reused + bool update(const llm_graph_params & params) override { + if (!this->params.is_same(params)) { + return false; + } + + bool res = true; + + for (auto & input : inputs) { + res &= input->update(params); + } + + return res; + } + llm_graph_input_i * add_input(llm_graph_input_ptr input) { inputs.emplace_back(std::move(input)); return inputs.back().get(); @@ -415,37 +535,26 @@ class llm_graph_result : public llm_graph_result_i { ggml_tensor * t_embd_pooled = nullptr; std::vector inputs; -}; -// -// llm_graph_context -// + ggml_context_ptr ctx_compute; -// callback that allows us to apply custom logic to each tensor (e.g. ggml-alloc, offloading, etc.) 
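Note: since the llm_graph_input_i default above returns false, a new input type only participates in graph reuse once it implements its own check, following the pattern of llm_graph_input_pos. A sketch of such an override for a hypothetical per-token I32 input (the class name and constructor are illustrative, not part of this patch; the method is renamed to can_reuse() later in the series):

    // Hypothetical input: one I32 value per token, reusable as long as the
    // pre-allocated tensor still matches the number of tokens in the new ubatch.
    class llm_graph_input_example : public llm_graph_input_i {
    public:
        llm_graph_input_example(ggml_context * ctx, int64_t n_tokens) {
            vals = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n_tokens);
            ggml_set_input(vals);
        }

        void set_input(const llama_ubatch * ubatch) override {
            // copy the per-token data into the input tensor
            ggml_backend_tensor_set(vals, ubatch->pos, 0, ubatch->n_tokens*ggml_element_size(vals));
        }

        bool update(const llm_graph_params & params) override {
            // same shape -> the resulting input tensor would be identical
            return vals && vals->ne[0] == (int64_t) params.ubatch.n_tokens;
        }

        ggml_tensor * vals = nullptr; // I32 [n_batch]
    };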
-using llm_graph_cb = std::function; + // memory buffers used to evaluate the model + std::vector buf_compute_meta; -struct llm_graph_params { - ggml_context * ctx; + ggml_cgraph * gf; - const llm_arch arch; + int64_t max_nodes; - const llama_hparams & hparams; - const llama_cparams & cparams; - const llama_ubatch & ubatch; - - ggml_backend_sched_t sched; - ggml_backend_t backend_cpu; - - const llama_adapter_cvec * cvec; - const llama_adapter_loras * loras; - const llama_memory_context_i * mctx; - const llama_cross * cross; - - uint32_t n_outputs; - - const llm_graph_cb & cb; + // keep a copy of the previous graph parameters + // we will use this to determine whether the graph can be reused by comparing them with the new parameters + // note: these are updated after constructing the new graph + llm_graph_params params; }; +// +// llm_graph_context +// + // used in build_rs to properly order writes and avoid unnecessary copies using llm_graph_get_rows_fn = std::function; @@ -485,8 +594,6 @@ struct llm_graph_context { const enum llama_pooling_type pooling_type; const enum llama_rope_type rope_type; - ggml_context * ctx0 = nullptr; - ggml_backend_sched_t sched; ggml_backend_t backend_cpu; // TODO: needed by build_attn_mha, figure out a way to remove? @@ -498,7 +605,9 @@ struct llm_graph_context { const llm_graph_cb & cb_func; - std::unique_ptr res; + llm_graph_result * res; + + ggml_context * ctx0 = nullptr; llm_graph_context(const llm_graph_params & params); virtual ~llm_graph_context() = default; diff --git a/src/llama-hparams.cpp b/src/llama-hparams.cpp index 86c814d51b901..a59e96a92fe7e 100644 --- a/src/llama-hparams.cpp +++ b/src/llama-hparams.cpp @@ -102,3 +102,12 @@ bool llama_hparams::is_swa(uint32_t il) const { GGML_ABORT("fatal error"); } + +bool llama_hparams::is_same(const llama_hparams & other) const { + return + n_ctx_train == other.n_ctx_train && + n_embd == other.n_embd && + n_layer == other.n_layer && + n_expert == other.n_expert && + n_expert_used == other.n_expert_used; +} diff --git a/src/llama-hparams.h b/src/llama-hparams.h index 476d0a5eade28..dfddb4f3d9534 100644 --- a/src/llama-hparams.h +++ b/src/llama-hparams.h @@ -202,6 +202,8 @@ struct llama_hparams { uint32_t n_pos_per_embd() const; bool is_swa(uint32_t il) const; + + bool is_same(const llama_hparams & other) const; }; static_assert(std::is_trivially_copyable::value, "llama_hparams must be trivially copyable"); diff --git a/src/llama-kv-cache-unified.cpp b/src/llama-kv-cache-unified.cpp index d3129cc53281e..e91b92b6095e1 100644 --- a/src/llama-kv-cache-unified.cpp +++ b/src/llama-kv-cache-unified.cpp @@ -68,6 +68,8 @@ llama_kv_cache_unified::llama_kv_cache_unified( cells.resize(kv_size); + gf_res.reset(new llm_graph_result(32768)); // note: the max nodes will be updated later + for (uint32_t il = 0; il < n_layer_cache; il++) { if (filter && !filter(il)) { LLAMA_LOG_DEBUG("%s: layer %3d: skipped\n", __func__, il); @@ -158,7 +160,7 @@ llama_kv_cache_unified::llama_kv_cache_unified( debug = LLAMA_KV_CACHE_DEBUG ? atoi(LLAMA_KV_CACHE_DEBUG) : 0; const char * LLAMA_SET_ROWS = getenv("LLAMA_SET_ROWS"); - supports_set_rows = LLAMA_SET_ROWS ? atoi(LLAMA_SET_ROWS) : 0; + supports_set_rows = LLAMA_SET_ROWS ? 
atoi(LLAMA_SET_ROWS) != 0 : 0; if (!supports_set_rows) { LLAMA_LOG_WARN("%s: LLAMA_SET_ROWS=0, using old ggml_cpy() method for backwards compatibility\n", __func__); @@ -480,14 +482,12 @@ bool llama_kv_cache_unified::update(llama_context * lctx, bool do_shift, const d if (hparams.rope_type != LLAMA_ROPE_TYPE_NONE) { ggml_backend_sched_reset(sched); - auto * gf = lctx->graph_init(); + auto * res = gf_res.get(); - auto res = build_graph_shift(lctx->get_cparams(), lctx->get_ctx_compute(), gf); - if (!res) { - LLAMA_LOG_ERROR("%s: failed to build graph for K-shift\n", __func__); - return updated; - } + res->set_max_nodes(lctx->graph_max_nodes()); + res->reset(); + auto * gf = build_graph_shift(res, lctx); if (!ggml_backend_sched_alloc_graph(sched, gf)) { LLAMA_LOG_ERROR("%s: failed to allocate compute graph for K-shift\n", __func__); return updated; @@ -529,14 +529,12 @@ bool llama_kv_cache_unified::update(llama_context * lctx, bool do_shift, const d ggml_backend_sched_reset(sched); - auto * gf = lctx->graph_init(); + auto * res = gf_res.get(); - auto res = build_graph_defrag(lctx->get_cparams(), lctx->get_ctx_compute(), gf, dinfo); - if (!res) { - LLAMA_LOG_ERROR("%s: failed to build graph for defrag\n", __func__); - return updated; - } + res->set_max_nodes(lctx->graph_max_nodes()); + res->reset(); + auto * gf = build_graph_defrag(res, lctx, dinfo); if (!ggml_backend_sched_alloc_graph(sched, gf)) { LLAMA_LOG_ERROR("%s: failed to allocate compute graph for defrag\n", __func__); return updated; @@ -780,6 +778,10 @@ uint32_t llama_kv_cache_unified::get_n_kv() const { return std::min(cells.size(), std::max(n_pad, GGML_PAD(cells.used_max_p1(), n_pad))); } +bool llama_kv_cache_unified::get_supports_set_rows() const { + return supports_set_rows; +} + ggml_tensor * llama_kv_cache_unified::get_k(ggml_context * ctx, int32_t il, uint32_t n_kv) const { const int32_t ikv = map_layer_ids.at(il); @@ -1142,11 +1144,9 @@ void llm_graph_input_k_shift::set_input(const llama_ubatch * ubatch) { } } -llm_graph_result_ptr llama_kv_cache_unified::build_graph_shift( - const llama_cparams & cparams, - ggml_context * ctx, - ggml_cgraph * gf) const { - auto res = std::make_unique(); +ggml_cgraph * llama_kv_cache_unified::build_graph_shift(llm_graph_result * res, llama_context * lctx) const { + auto * ctx = res->get_ctx(); + auto * gf = res->get_gf(); const auto & n_embd_head_k = hparams.n_embd_head_k; //const auto & n_embd_head_v = hparams.n_embd_head_v; @@ -1156,6 +1156,8 @@ llm_graph_result_ptr llama_kv_cache_unified::build_graph_shift( inp->k_shift = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, cells.size()); ggml_set_input(inp->k_shift); + const auto & cparams = lctx->get_cparams(); + for (const auto & layer : layers) { const uint32_t il = layer.il; @@ -1181,18 +1183,20 @@ llm_graph_result_ptr llama_kv_cache_unified::build_graph_shift( res->add_input(std::move(inp)); - return res; + return gf; } -llm_graph_result_ptr llama_kv_cache_unified::build_graph_defrag( - const llama_cparams & cparams, - ggml_context * ctx, - ggml_cgraph * gf, - const defrag_info & dinfo) const { - auto res = std::make_unique(); +ggml_cgraph * llama_kv_cache_unified::build_graph_defrag( + llm_graph_result * res, + llama_context * lctx, + const defrag_info & dinfo) const { + auto * ctx = res->get_ctx(); + auto * gf = res->get_gf(); const auto & ids = dinfo.ids; + const auto & cparams = lctx->get_cparams(); + #if 0 // CPU defrag // @@ -1329,7 +1333,7 @@ llm_graph_result_ptr llama_kv_cache_unified::build_graph_defrag( //LLAMA_LOG_INFO("gf->n_nodes = 
%d\n", gf->n_nodes); #endif - return res; + return gf; } llama_kv_cache_unified::defrag_info llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) const { @@ -1940,6 +1944,10 @@ uint32_t llama_kv_cache_unified_context::get_n_kv() const { return n_kv; } +bool llama_kv_cache_unified_context::get_supports_set_rows() const { + return kv->get_supports_set_rows(); +} + ggml_tensor * llama_kv_cache_unified_context::get_k(ggml_context * ctx, int32_t il) const { return kv->get_k(ctx, il, n_kv); } diff --git a/src/llama-kv-cache-unified.h b/src/llama-kv-cache-unified.h index b8b0356e830c8..3eff137fee953 100644 --- a/src/llama-kv-cache-unified.h +++ b/src/llama-kv-cache-unified.h @@ -121,6 +121,9 @@ class llama_kv_cache_unified : public llama_memory_i { uint32_t get_n_kv() const; + // TODO: temporary + bool get_supports_set_rows() const; + // get views of the current state of the cache ggml_tensor * get_k(ggml_context * ctx, int32_t il, uint32_t n_kv) const; ggml_tensor * get_v(ggml_context * ctx, int32_t il, uint32_t n_kv) const; @@ -193,13 +196,15 @@ class llama_kv_cache_unified : public llama_memory_i { // env: LLAMA_SET_ROWS (temporary) // ref: https://github.com/ggml-org/llama.cpp/pull/14285 - int supports_set_rows = false; + bool supports_set_rows = false; const llama_swa_type swa_type = LLAMA_SWA_TYPE_NONE; std::vector ctxs; std::vector bufs; + std::unique_ptr gf_res; + llama_kv_cells_unified cells; std::vector layers; @@ -226,15 +231,13 @@ class llama_kv_cache_unified : public llama_memory_i { float freq_base, float freq_scale) const; - llm_graph_result_ptr build_graph_shift( - const llama_cparams & cparams, - ggml_context * ctx, - ggml_cgraph * gf) const; + ggml_cgraph * build_graph_shift( + llm_graph_result * res, + llama_context * lctx) const; - llm_graph_result_ptr build_graph_defrag( - const llama_cparams & cparams, - ggml_context * ctx, - ggml_cgraph * gf, + ggml_cgraph * build_graph_defrag( + llm_graph_result * res, + llama_context * lctx, const defrag_info & dinfo) const; void state_write_meta(llama_io_write_i & io, const std::vector> & cell_ranges, llama_seq_id seq_id = -1) const; @@ -288,6 +291,9 @@ class llama_kv_cache_unified_context : public llama_memory_context_i { uint32_t get_n_kv() const; + // TODO: temporary + bool get_supports_set_rows() const; + // get views of the current state of the cache ggml_tensor * get_k(ggml_context * ctx, int32_t il) const; ggml_tensor * get_v(ggml_context * ctx, int32_t il) const; diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 0573c5bcea0a4..caab6a0d270be 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -14751,10 +14751,10 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params, return res; } -llm_graph_result_ptr llama_model::build_graph( - const llm_graph_params & params, - ggml_cgraph * gf, - llm_graph_type type) const { +ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const { + // TODO: temporary - will refactor this to keep the "gf" instance in the llm_graph_context and avoid passing it everywhere + auto * gf = params.res->get_gf(); + std::unique_ptr llm; switch (arch) { @@ -14961,7 +14961,7 @@ llm_graph_result_ptr llama_model::build_graph( } break; case LLM_ARCH_T5: { - switch (type) { + switch (params.gtype) { case LLM_GRAPH_TYPE_ENCODER: llm = std::make_unique(*this, params, gf); break; @@ -15047,7 +15047,10 @@ llm_graph_result_ptr llama_model::build_graph( // add on pooling layer llm->build_pooling(gf, cls, cls_b, cls_out, cls_out_b); - return 
std::move(llm->res); + // TODO: updating the graph parameters here is a little bit obscure - figure out something better + llm->res->params = params; + + return llm->res->get_gf(); } // diff --git a/src/llama-model.h b/src/llama-model.h index 979fff62045f9..9bf9cf327dab9 100644 --- a/src/llama-model.h +++ b/src/llama-model.h @@ -436,10 +436,7 @@ struct llama_model { llama_memory_i * create_memory(const llama_memory_params & params, llama_cparams & cparams) const; // TODO: move this to new llm_arch_model_i interface - llm_graph_result_ptr build_graph( - const llm_graph_params & params, - ggml_cgraph * gf, - llm_graph_type type) const; + ggml_cgraph * build_graph(const llm_graph_params & params) const; private: struct impl; From 0d2038f90a7738ce4c0a86808d09800453251bf7 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 7 Jul 2025 09:07:15 +0300 Subject: [PATCH 2/6] llama-bench : add graph reuse parameter ggml-ci --- tools/llama-bench/llama-bench.cpp | 37 ++++++++++++++++++++++++++++--- 1 file changed, 34 insertions(+), 3 deletions(-) diff --git a/tools/llama-bench/llama-bench.cpp b/tools/llama-bench/llama-bench.cpp index b80e984d0245b..ad74dbaf27a11 100644 --- a/tools/llama-bench/llama-bench.cpp +++ b/tools/llama-bench/llama-bench.cpp @@ -261,6 +261,7 @@ struct cmd_params { std::vector use_mmap; std::vector embeddings; std::vector no_op_offload; + std::vector graph_reuse; ggml_numa_strategy numa; int reps; ggml_sched_priority prio; @@ -298,6 +299,7 @@ static const cmd_params cmd_params_defaults = { /* use_mmap */ { true }, /* embeddings */ { false }, /* no_op_offload */ { false }, + /* graph_reuse */ { false }, /* numa */ GGML_NUMA_STRATEGY_DISABLED, /* reps */ 5, /* prio */ GGML_SCHED_PRIO_NORMAL, @@ -377,6 +379,7 @@ static void print_usage(int /* argc */, char ** argv) { printf(" -ot --override-tensors =;...\n"); printf(" (default: disabled)\n"); printf(" -nopo, --no-op-offload <0|1> (default: 0)\n"); + printf(" -gr, --graph-reuse <0|1> (default: 0)\n"); printf("\n"); printf( "Multiple values can be given for each parameter by separating them with ','\n" @@ -620,6 +623,13 @@ static cmd_params parse_cmd_params(int argc, char ** argv) { } auto p = string_split(argv[i], split_delim); params.no_kv_offload.insert(params.no_kv_offload.end(), p.begin(), p.end()); + } else if (arg == "-gr" || arg == "--graph-reuse") { + if (++i >= argc) { + invalid_param = true; + break; + } + auto p = string_split(argv[i], split_delim); + params.graph_reuse.insert(params.graph_reuse.end(), p.begin(), p.end()); } else if (arg == "--numa") { if (++i >= argc) { invalid_param = true; @@ -885,6 +895,9 @@ static cmd_params parse_cmd_params(int argc, char ** argv) { if (params.no_op_offload.empty()) { params.no_op_offload = cmd_params_defaults.no_op_offload; } + if (params.graph_reuse.empty()) { + params.graph_reuse = cmd_params_defaults.graph_reuse; + } if (params.n_threads.empty()) { params.n_threads = cmd_params_defaults.n_threads; } @@ -926,6 +939,7 @@ struct cmd_params_instance { bool use_mmap; bool embeddings; bool no_op_offload; + bool graph_reuse; llama_model_params to_llama_mparams() const { llama_model_params mparams = llama_model_default_params(); @@ -998,6 +1012,7 @@ struct cmd_params_instance { cparams.embeddings = embeddings; cparams.op_offload = !no_op_offload; cparams.swa_full = false; + cparams.graph_reuse = graph_reuse; return cparams; } @@ -1018,6 +1033,7 @@ static std::vector get_cmd_params_instances(const cmd_param for (const auto & mmp : params.use_mmap) for (const auto & embd : 
params.embeddings) for (const auto & nopo : params.no_op_offload) + for (const auto & gr : params.graph_reuse) for (const auto & nb : params.n_batch) for (const auto & nub : params.n_ubatch) for (const auto & tk : params.type_k) @@ -1059,6 +1075,7 @@ static std::vector get_cmd_params_instances(const cmd_param /* .use_mmap = */ mmp, /* .embeddings = */ embd, /* .no_op_offload= */ nopo, + /* .graph_reuse = */ gr, }; instances.push_back(instance); } @@ -1092,6 +1109,7 @@ static std::vector get_cmd_params_instances(const cmd_param /* .use_mmap = */ mmp, /* .embeddings = */ embd, /* .no_op_offload= */ nopo, + /* .graph_reuse = */ gr, }; instances.push_back(instance); } @@ -1125,6 +1143,7 @@ static std::vector get_cmd_params_instances(const cmd_param /* .use_mmap = */ mmp, /* .embeddings = */ embd, /* .no_op_offload= */ nopo, + /* .graph_reuse = */ gr, }; instances.push_back(instance); } @@ -1162,6 +1181,7 @@ struct test { bool use_mmap; bool embeddings; bool no_op_offload; + bool graph_reuse; int n_prompt; int n_gen; int n_depth; @@ -1197,6 +1217,7 @@ struct test { use_mmap = inst.use_mmap; embeddings = inst.embeddings; no_op_offload = inst.no_op_offload; + graph_reuse = inst.graph_reuse; n_prompt = inst.n_prompt; n_gen = inst.n_gen; n_depth = inst.n_depth; @@ -1243,8 +1264,8 @@ struct test { "cpu_mask", "cpu_strict", "poll", "type_k", "type_v", "n_gpu_layers", "split_mode", "main_gpu", "no_kv_offload", "flash_attn", "tensor_split", "tensor_buft_overrides", "defrag_thold", - "use_mmap", "embeddings", "no_op_offload", "n_prompt", "n_gen", "n_depth", "test_time", - "avg_ns", "stddev_ns", "avg_ts", "stddev_ts", + "use_mmap", "embeddings", "no_op_offload", "graph_reuse", "n_prompt", "n_gen", "n_depth", + "test_time", "avg_ns", "stddev_ns", "avg_ts", "stddev_ts", }; return fields; } @@ -1259,7 +1280,7 @@ struct test { return INT; } if (field == "f16_kv" || field == "no_kv_offload" || field == "cpu_strict" || field == "flash_attn" || - field == "use_mmap" || field == "embeddings") { + field == "use_mmap" || field == "embeddings" || field == "graph_reuse") { return BOOL; } if (field == "avg_ts" || field == "stddev_ts" || field == "defrag_thold") { @@ -1333,6 +1354,7 @@ struct test { std::to_string(use_mmap), std::to_string(embeddings), std::to_string(no_op_offload), + std::to_string(graph_reuse), std::to_string(n_prompt), std::to_string(n_gen), std::to_string(n_depth), @@ -1518,6 +1540,9 @@ struct markdown_printer : public printer { if (field == "no_op_offload") { return 4; } + if (field == "graph_reuse") { + return 4; + } int width = std::max((int) field.length(), 10); @@ -1552,6 +1577,9 @@ struct markdown_printer : public printer { if (field == "no_op_offload") { return "nopo"; } + if (field == "graph_reuse") { + return "gr"; + } if (field == "tensor_split") { return "ts"; } @@ -1626,6 +1654,9 @@ struct markdown_printer : public printer { if (params.no_op_offload.size() > 1 || params.no_op_offload != cmd_params_defaults.no_op_offload) { fields.emplace_back("no_op_offload"); } + if (params.graph_reuse.size() > 1 || params.graph_reuse != cmd_params_defaults.graph_reuse) { + fields.emplace_back("graph_reuse"); + } fields.emplace_back("test"); fields.emplace_back("t/s"); From 5143876791d7ebec40b0e4df5cf61330415154c4 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 8 Jul 2025 20:26:57 +0300 Subject: [PATCH 3/6] cont : remove the parameter and the sched resets ggml-ci --- common/arg.cpp | 8 ------- common/common.cpp | 1 - common/common.h | 1 - src/llama-context.cpp | 21 +++++++----------- 
src/llama-kv-cache-unified.cpp | 1 + tools/llama-bench/llama-bench.cpp | 37 +++---------------------------- 6 files changed, 12 insertions(+), 57 deletions(-) diff --git a/common/arg.cpp b/common/arg.cpp index d4a3f5b7f21ec..40af7e574830f 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -1464,14 +1464,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex params.swa_full = true; } ).set_env("LLAMA_ARG_SWA_FULL")); - add_opt(common_arg( - {"--graph-reuse", "-gr"}, - string_format("reuse previous compute graphs when possible (default: %s)" - "[(more info)](https://github.com/ggml-org/llama.cpp/pull/14482)", params.graph_reuse ? "true" : "false"), - [](common_params & params) { - params.graph_reuse = true; - } - ).set_env("LLAMA_ARG_GRAPH_REUSE")); add_opt(common_arg( {"--no-context-shift"}, string_format("disables context shift on infinite text generation (default: %s)", params.ctx_shift ? "disabled" : "enabled"), diff --git a/common/common.cpp b/common/common.cpp index c7c163404a2af..e4e71ad13fb59 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -1157,7 +1157,6 @@ struct llama_context_params common_context_params_to_llama(const common_params & cparams.no_perf = params.no_perf; cparams.op_offload = !params.no_op_offload; cparams.swa_full = params.swa_full; - cparams.graph_reuse = params.graph_reuse; cparams.type_k = params.cache_type_k; cparams.type_v = params.cache_type_v; diff --git a/common/common.h b/common/common.h index 465d05672d2a6..8922090e7b10d 100644 --- a/common/common.h +++ b/common/common.h @@ -330,7 +330,6 @@ struct common_params { bool no_perf = false; // disable performance metrics bool ctx_shift = true; // context shift on inifinite text generation bool swa_full = false; // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055) - bool graph_reuse = false; // reuse previous compute graphs when possible bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix bool use_mmap = true; // use mmap for faster loads diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 8a1e2a86f9709..b3c94da597f2c 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -718,7 +718,14 @@ llm_graph_result_i * llama_context::process_ubatch(const llama_ubatch & ubatch, } } - res->set_inputs(&ubatch); + // set the input data for the input tensors + { + //const auto t_start_us = ggml_time_us(); + + res->set_inputs(&ubatch); + + //LLAMA_LOG_INFO("graph set inputs time: %.3f ms\n", (ggml_time_us() - t_start_us)/1000.0); + } const auto status = graph_compute(res->get_gf(), ubatch.n_tokens > 1); if (status != GGML_STATUS_SUCCESS) { @@ -852,12 +859,6 @@ int llama_context::encode(const llama_batch & batch_inp) { } } - // Reset state for the next token before backend sync, to allow the CPU activities in the reset to - // overlap with device computation. - if (!cparams.graph_reuse) { - ggml_backend_sched_reset(sched.get()); - } - // TODO: hacky solution if (model.arch == LLM_ARCH_T5 && t_embd) { //cross.t_embd = t_embd; @@ -1197,12 +1198,6 @@ int llama_context::decode(const llama_batch & batch_inp) { // wait for the computation to finish (automatically done when obtaining the model output) //synchronize(); - // Reset state for the next token before backend sync, to allow the CPU activities in the reset to - // overlap with device computation. 
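Note: even with the option removed, decode graphs that go through the unified KV cache are only reported as reusable while the temporary LLAMA_SET_ROWS path is enabled (see the get_supports_set_rows() checks added in the first patch). A sketch of guarding for that from application code, assuming a POSIX environment where setenv() is available:

    #include <cstdlib>
    #include <cstdio>

    // Must run before the llama_context (and its KV cache) is created, because
    // the unified KV cache reads LLAMA_SET_ROWS in its constructor.
    static void enable_set_rows_for_graph_reuse() {
        const char * v = std::getenv("LLAMA_SET_ROWS");
        if (!v || std::atoi(v) == 0) {
            setenv("LLAMA_SET_ROWS", "1", /*overwrite =*/ 1); // POSIX-only
            printf("LLAMA_SET_ROWS=1 set so that decode graphs can be reused\n");
        }
    }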
- if (!cparams.graph_reuse) { - ggml_backend_sched_reset(sched.get()); - } - return 0; } diff --git a/src/llama-kv-cache-unified.cpp b/src/llama-kv-cache-unified.cpp index e91b92b6095e1..246eaec164a11 100644 --- a/src/llama-kv-cache-unified.cpp +++ b/src/llama-kv-cache-unified.cpp @@ -963,6 +963,7 @@ void llama_kv_cache_unified::set_input_kq_mask(ggml_tensor * dst, const llama_ub // xxxxx----- // xxxxx----- // To visualize the mask, see https://github.com/ggml-org/llama.cpp/pull/12615 + // TODO: optimize this section for (uint32_t h = 0; h < 1; ++h) { for (uint32_t i = 0; i < n_tokens; ++i) { const llama_seq_id seq_id = ubatch->seq_id[i][0]; diff --git a/tools/llama-bench/llama-bench.cpp b/tools/llama-bench/llama-bench.cpp index ad74dbaf27a11..b80e984d0245b 100644 --- a/tools/llama-bench/llama-bench.cpp +++ b/tools/llama-bench/llama-bench.cpp @@ -261,7 +261,6 @@ struct cmd_params { std::vector use_mmap; std::vector embeddings; std::vector no_op_offload; - std::vector graph_reuse; ggml_numa_strategy numa; int reps; ggml_sched_priority prio; @@ -299,7 +298,6 @@ static const cmd_params cmd_params_defaults = { /* use_mmap */ { true }, /* embeddings */ { false }, /* no_op_offload */ { false }, - /* graph_reuse */ { false }, /* numa */ GGML_NUMA_STRATEGY_DISABLED, /* reps */ 5, /* prio */ GGML_SCHED_PRIO_NORMAL, @@ -379,7 +377,6 @@ static void print_usage(int /* argc */, char ** argv) { printf(" -ot --override-tensors =;...\n"); printf(" (default: disabled)\n"); printf(" -nopo, --no-op-offload <0|1> (default: 0)\n"); - printf(" -gr, --graph-reuse <0|1> (default: 0)\n"); printf("\n"); printf( "Multiple values can be given for each parameter by separating them with ','\n" @@ -623,13 +620,6 @@ static cmd_params parse_cmd_params(int argc, char ** argv) { } auto p = string_split(argv[i], split_delim); params.no_kv_offload.insert(params.no_kv_offload.end(), p.begin(), p.end()); - } else if (arg == "-gr" || arg == "--graph-reuse") { - if (++i >= argc) { - invalid_param = true; - break; - } - auto p = string_split(argv[i], split_delim); - params.graph_reuse.insert(params.graph_reuse.end(), p.begin(), p.end()); } else if (arg == "--numa") { if (++i >= argc) { invalid_param = true; @@ -895,9 +885,6 @@ static cmd_params parse_cmd_params(int argc, char ** argv) { if (params.no_op_offload.empty()) { params.no_op_offload = cmd_params_defaults.no_op_offload; } - if (params.graph_reuse.empty()) { - params.graph_reuse = cmd_params_defaults.graph_reuse; - } if (params.n_threads.empty()) { params.n_threads = cmd_params_defaults.n_threads; } @@ -939,7 +926,6 @@ struct cmd_params_instance { bool use_mmap; bool embeddings; bool no_op_offload; - bool graph_reuse; llama_model_params to_llama_mparams() const { llama_model_params mparams = llama_model_default_params(); @@ -1012,7 +998,6 @@ struct cmd_params_instance { cparams.embeddings = embeddings; cparams.op_offload = !no_op_offload; cparams.swa_full = false; - cparams.graph_reuse = graph_reuse; return cparams; } @@ -1033,7 +1018,6 @@ static std::vector get_cmd_params_instances(const cmd_param for (const auto & mmp : params.use_mmap) for (const auto & embd : params.embeddings) for (const auto & nopo : params.no_op_offload) - for (const auto & gr : params.graph_reuse) for (const auto & nb : params.n_batch) for (const auto & nub : params.n_ubatch) for (const auto & tk : params.type_k) @@ -1075,7 +1059,6 @@ static std::vector get_cmd_params_instances(const cmd_param /* .use_mmap = */ mmp, /* .embeddings = */ embd, /* .no_op_offload= */ nopo, - /* .graph_reuse = */ gr, }; 
instances.push_back(instance); } @@ -1109,7 +1092,6 @@ static std::vector get_cmd_params_instances(const cmd_param /* .use_mmap = */ mmp, /* .embeddings = */ embd, /* .no_op_offload= */ nopo, - /* .graph_reuse = */ gr, }; instances.push_back(instance); } @@ -1143,7 +1125,6 @@ static std::vector get_cmd_params_instances(const cmd_param /* .use_mmap = */ mmp, /* .embeddings = */ embd, /* .no_op_offload= */ nopo, - /* .graph_reuse = */ gr, }; instances.push_back(instance); } @@ -1181,7 +1162,6 @@ struct test { bool use_mmap; bool embeddings; bool no_op_offload; - bool graph_reuse; int n_prompt; int n_gen; int n_depth; @@ -1217,7 +1197,6 @@ struct test { use_mmap = inst.use_mmap; embeddings = inst.embeddings; no_op_offload = inst.no_op_offload; - graph_reuse = inst.graph_reuse; n_prompt = inst.n_prompt; n_gen = inst.n_gen; n_depth = inst.n_depth; @@ -1264,8 +1243,8 @@ struct test { "cpu_mask", "cpu_strict", "poll", "type_k", "type_v", "n_gpu_layers", "split_mode", "main_gpu", "no_kv_offload", "flash_attn", "tensor_split", "tensor_buft_overrides", "defrag_thold", - "use_mmap", "embeddings", "no_op_offload", "graph_reuse", "n_prompt", "n_gen", "n_depth", - "test_time", "avg_ns", "stddev_ns", "avg_ts", "stddev_ts", + "use_mmap", "embeddings", "no_op_offload", "n_prompt", "n_gen", "n_depth", "test_time", + "avg_ns", "stddev_ns", "avg_ts", "stddev_ts", }; return fields; } @@ -1280,7 +1259,7 @@ struct test { return INT; } if (field == "f16_kv" || field == "no_kv_offload" || field == "cpu_strict" || field == "flash_attn" || - field == "use_mmap" || field == "embeddings" || field == "graph_reuse") { + field == "use_mmap" || field == "embeddings") { return BOOL; } if (field == "avg_ts" || field == "stddev_ts" || field == "defrag_thold") { @@ -1354,7 +1333,6 @@ struct test { std::to_string(use_mmap), std::to_string(embeddings), std::to_string(no_op_offload), - std::to_string(graph_reuse), std::to_string(n_prompt), std::to_string(n_gen), std::to_string(n_depth), @@ -1540,9 +1518,6 @@ struct markdown_printer : public printer { if (field == "no_op_offload") { return 4; } - if (field == "graph_reuse") { - return 4; - } int width = std::max((int) field.length(), 10); @@ -1577,9 +1552,6 @@ struct markdown_printer : public printer { if (field == "no_op_offload") { return "nopo"; } - if (field == "graph_reuse") { - return "gr"; - } if (field == "tensor_split") { return "ts"; } @@ -1654,9 +1626,6 @@ struct markdown_printer : public printer { if (params.no_op_offload.size() > 1 || params.no_op_offload != cmd_params_defaults.no_op_offload) { fields.emplace_back("no_op_offload"); } - if (params.graph_reuse.size() > 1 || params.graph_reuse != cmd_params_defaults.graph_reuse) { - fields.emplace_back("graph_reuse"); - } fields.emplace_back("test"); fields.emplace_back("t/s"); From ee7c644f82d4cc327d184ddf898b0da92b1ef05d Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 8 Jul 2025 20:30:32 +0300 Subject: [PATCH 4/6] graph : rename update() to can_reuse() ggml-ci --- include/llama.h | 2 -- src/llama-context.cpp | 10 ++++------ src/llama-cparams.cpp | 1 - src/llama-cparams.h | 1 - src/llama-graph.cpp | 10 +++++----- src/llama-graph.h | 20 ++++++++++---------- 6 files changed, 19 insertions(+), 25 deletions(-) diff --git a/include/llama.h b/include/llama.h index 1e8228997f1d8..af1765b53b7e2 100644 --- a/include/llama.h +++ b/include/llama.h @@ -374,8 +374,6 @@ extern "C" { bool swa_full; // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055) // NOTE: setting to false 
when n_seq_max > 1 can cause bad performance in some cases // ref: https://github.com/ggml-org/llama.cpp/pull/13845#issuecomment-2924800573 - - bool graph_reuse; // reuse previous compute graphs when possible }; // model quantization parameters diff --git a/src/llama-context.cpp b/src/llama-context.cpp index b3c94da597f2c..7ec6e24a94ada 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -101,8 +101,7 @@ llama_context::llama_context( cparams.n_ubatch = std::min(cparams.n_batch, params.n_ubatch == 0 ? params.n_batch : params.n_ubatch); - cparams.op_offload = params.op_offload; - cparams.graph_reuse = params.graph_reuse; + cparams.op_offload = params.op_offload; const uint32_t n_ctx_per_seq = cparams.n_ctx / cparams.n_seq_max; @@ -689,9 +688,9 @@ llm_graph_result_i * llama_context::process_ubatch(const llama_ubatch & ubatch, // in order to correctly reuse a graph, it's full topology has to be uniquely determined by these parameters const auto gparams = graph_params(res, ubatch, mctx, gtype); - const bool can_reuse = cparams.graph_reuse && res->update(gparams); - if (can_reuse) { - LLAMA_LOG_DEBUG("%s: reusing previous graph\n", __func__); + if (res->can_reuse(gparams)) { + //LLAMA_LOG_DEBUG("%s: reusing previous graph\n", __func__); + n_reused++; } else { res->reset(); @@ -2186,7 +2185,6 @@ llama_context_params llama_context_default_params() { /*.no_perf =*/ true, /*.op_offload =*/ true, /*.swa_full =*/ true, - /*.graph_reuse =*/ false, }; return result; diff --git a/src/llama-cparams.cpp b/src/llama-cparams.cpp index f81cd10823d3b..2abe8feb0cd99 100644 --- a/src/llama-cparams.cpp +++ b/src/llama-cparams.cpp @@ -27,7 +27,6 @@ bool llama_cparams::is_same(const llama_cparams & other) const { no_perf == other.no_perf && warmup == other.warmup && op_offload == other.op_offload && - graph_reuse == other.graph_reuse && pooling_type == other.pooling_type && cb_eval == other.cb_eval && cb_eval_user_data == other.cb_eval_user_data; diff --git a/src/llama-cparams.h b/src/llama-cparams.h index 064767d51d92c..2c38f92e423a3 100644 --- a/src/llama-cparams.h +++ b/src/llama-cparams.h @@ -33,7 +33,6 @@ struct llama_cparams { bool no_perf; bool warmup; bool op_offload; - bool graph_reuse; enum llama_pooling_type pooling_type; diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp index 264491b4de83a..99c6fe0e0353a 100644 --- a/src/llama-graph.cpp +++ b/src/llama-graph.cpp @@ -28,7 +28,7 @@ void llm_graph_input_embd::set_input(const llama_ubatch * ubatch) { } } -bool llm_graph_input_embd::update(const llm_graph_params & params) { +bool llm_graph_input_embd::can_reuse(const llm_graph_params & params) { bool res = true; res &= (!tokens && !params.ubatch.token) || (tokens && tokens->ne[0] == params.ubatch.n_tokens); @@ -59,7 +59,7 @@ void llm_graph_input_pos::set_input(const llama_ubatch * ubatch) { } } -bool llm_graph_input_pos::update(const llm_graph_params & params) { +bool llm_graph_input_pos::can_reuse(const llm_graph_params & params) { bool res = true; res &= pos->ne[0] == params.ubatch.n_tokens; @@ -135,7 +135,7 @@ void llm_graph_input_out_ids::set_input(const llama_ubatch * ubatch) { } } -bool llm_graph_input_out_ids::update(const llm_graph_params & params) { +bool llm_graph_input_out_ids::can_reuse(const llm_graph_params & params) { bool res = true; res &= n_outputs == params.n_outputs; @@ -312,7 +312,7 @@ void llm_graph_input_attn_kv_unified::set_input(const llama_ubatch * ubatch) { mctx->set_input_kq_mask(self_kq_mask, ubatch, cparams.causal_attn); } -bool 
llm_graph_input_attn_kv_unified::update(const llm_graph_params & params) { +bool llm_graph_input_attn_kv_unified::can_reuse(const llm_graph_params & params) { const auto * mctx = static_cast(params.mctx); this->mctx = mctx; @@ -342,7 +342,7 @@ void llm_graph_input_attn_kv_unified_iswa::set_input(const llama_ubatch * ubatch mctx->get_swa()->set_input_kq_mask(self_kq_mask_swa, ubatch, cparams.causal_attn); } -bool llm_graph_input_attn_kv_unified_iswa::update(const llm_graph_params & params) { +bool llm_graph_input_attn_kv_unified_iswa::can_reuse(const llm_graph_params & params) { const auto * mctx = static_cast(params.mctx); this->mctx = mctx; diff --git a/src/llama-graph.h b/src/llama-graph.h index ae2fd6481b609..49a3835e90492 100644 --- a/src/llama-graph.h +++ b/src/llama-graph.h @@ -83,7 +83,7 @@ class llm_graph_input_i { // return true if the resulting input tensors using the provided graph parameters would be // the same as the previous input tensors that we have currently stored in the object - virtual bool update(const llm_graph_params & params) { + virtual bool can_reuse(const llm_graph_params & params) { // returning false here by default will prevent from reusing the graph if the check // for the input type has not been implemented yet GGML_UNUSED(params); @@ -100,7 +100,7 @@ class llm_graph_input_embd : public llm_graph_input_i { void set_input(const llama_ubatch * ubatch) override; - bool update(const llm_graph_params & params) override; + bool can_reuse(const llm_graph_params & params) override; ggml_tensor * tokens = nullptr; // I32 [n_batch] ggml_tensor * embd = nullptr; // F32 [n_embd, n_batch] @@ -113,7 +113,7 @@ class llm_graph_input_pos : public llm_graph_input_i { void set_input(const llama_ubatch * ubatch) override; - bool update(const llm_graph_params & params) override; + bool can_reuse(const llm_graph_params & params) override; ggml_tensor * pos = nullptr; // I32 [n_batch] @@ -173,7 +173,7 @@ class llm_graph_input_out_ids : public llm_graph_input_i { void set_input(const llama_ubatch * ubatch) override; - bool update(const llm_graph_params & params) override; + bool can_reuse(const llm_graph_params & params) override; ggml_tensor * out_ids; // I32 [n_outputs] @@ -265,7 +265,7 @@ class llm_graph_input_attn_kv_unified : public llm_graph_input_i { void set_input(const llama_ubatch * ubatch) override; - bool update(const llm_graph_params & params) override; + bool can_reuse(const llm_graph_params & params) override; ggml_tensor * get_k_idxs() const { return self_k_idxs; } ggml_tensor * get_v_idxs() const { return self_v_idxs; } @@ -298,7 +298,7 @@ class llm_graph_input_attn_kv_unified_iswa : public llm_graph_input_i { void set_input(const llama_ubatch * ubatch) override; - bool update(const llm_graph_params & params) override; + bool can_reuse(const llm_graph_params & params) override; ggml_tensor * get_k_idxs() const { return self_k_idxs; } ggml_tensor * get_v_idxs() const { return self_v_idxs; } @@ -410,7 +410,7 @@ class llm_graph_result_i { virtual void set_inputs(const llama_ubatch * ubatch) = 0; - virtual bool update(const llm_graph_params & params) = 0; + virtual bool can_reuse(const llm_graph_params & params) = 0; }; using llm_graph_result_ptr = std::unique_ptr; @@ -504,12 +504,12 @@ class llm_graph_result : public llm_graph_result_i { } } - // try to update the existing graph result using the new graph parameters + // try to update the existing graph result using the new graph parameters in order to reuse it // this can only be done if we determine that the 
resulting graph using the new graph parameters // would be identical to the existing graph. in that case, we simply have to update the memory // contexts of the input tensors of the graph and we can reuse it for another computation // return true if the graph was updated and can be reused - bool update(const llm_graph_params & params) override { + bool can_reuse(const llm_graph_params & params) override { if (!this->params.is_same(params)) { return false; } @@ -517,7 +517,7 @@ class llm_graph_result : public llm_graph_result_i { bool res = true; for (auto & input : inputs) { - res &= input->update(params); + res &= input->can_reuse(params); } return res; From 600e69fdbb921b8f84c718e39d5c578f536cd638 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 8 Jul 2025 20:58:08 +0300 Subject: [PATCH 5/6] params : remove is_same() ggml-ci --- src/llama-batch.h | 25 ------------------------- src/llama-cparams.cpp | 28 ---------------------------- src/llama-cparams.h | 2 -- src/llama-graph.h | 37 ++++++++++++++++++++++++++++++++----- src/llama-hparams.cpp | 9 --------- src/llama-hparams.h | 2 -- 6 files changed, 32 insertions(+), 71 deletions(-) diff --git a/src/llama-batch.h b/src/llama-batch.h index b27134c9331a9..3420803ff9469 100644 --- a/src/llama-batch.h +++ b/src/llama-batch.h @@ -34,31 +34,6 @@ struct llama_ubatch { llama_seq_id * seq_id_unq; // [n_seqs_unq] | s | seq_id int32_t * seq_idx; // [LLAMA_MAX_SEQ] | - | seq_idx int8_t * output; // [n_tokens] | i | - - - bool is_same(const llama_ubatch & other) const { - bool res = - equal_seqs == other.equal_seqs && - n_tokens == other.n_tokens && - n_seq_tokens == other.n_seq_tokens && - n_seqs == other.n_seqs && - n_seqs_unq == other.n_seqs_unq && - ( - (!token && !other.token) || - (!embd && !other.embd) - ); - - if (!res) { - return false; - } - - // TODO: this won't work because seq_id_unq ptr can point to an old balloc that has - // been freed by this point. 
find a way to fix this - //for (uint32_t s = 0; s < n_seqs_unq; ++s) { - // res &= seq_id_unq[s] == other.seq_id_unq[s]; - //} - - return res; - } }; // a helper for sanitizing, fulfilling and splitting a batch diff --git a/src/llama-cparams.cpp b/src/llama-cparams.cpp index 2abe8feb0cd99..a3e7a37ee36d7 100644 --- a/src/llama-cparams.cpp +++ b/src/llama-cparams.cpp @@ -3,31 +3,3 @@ size_t llama_max_parallel_sequences(void) { return LLAMA_MAX_SEQ; } - -bool llama_cparams::is_same(const llama_cparams & other) const { - return - n_ctx == other.n_ctx && - n_batch == other.n_batch && - n_ubatch == other.n_ubatch && - n_seq_max == other.n_seq_max && - n_threads == other.n_threads && - n_threads_batch == other.n_threads_batch && - rope_freq_base == other.rope_freq_base && - rope_freq_scale == other.rope_freq_scale && - n_ctx_orig_yarn == other.n_ctx_orig_yarn && - yarn_ext_factor == other.yarn_ext_factor && - yarn_attn_factor == other.yarn_attn_factor && - yarn_beta_fast == other.yarn_beta_fast && - yarn_beta_slow == other.yarn_beta_slow && - defrag_thold == other.defrag_thold && - embeddings == other.embeddings && - causal_attn == other.causal_attn && - offload_kqv == other.offload_kqv && - flash_attn == other.flash_attn && - no_perf == other.no_perf && - warmup == other.warmup && - op_offload == other.op_offload && - pooling_type == other.pooling_type && - cb_eval == other.cb_eval && - cb_eval_user_data == other.cb_eval_user_data; -} diff --git a/src/llama-cparams.h b/src/llama-cparams.h index 2c38f92e423a3..118615d5bd2d5 100644 --- a/src/llama-cparams.h +++ b/src/llama-cparams.h @@ -38,6 +38,4 @@ struct llama_cparams { ggml_backend_sched_eval_callback cb_eval; void * cb_eval_user_data; - - bool is_same(const llama_cparams & other) const; }; diff --git a/src/llama-graph.h b/src/llama-graph.h index 49a3835e90492..2e73681b90e44 100644 --- a/src/llama-graph.h +++ b/src/llama-graph.h @@ -443,11 +443,38 @@ struct llm_graph_params { // TODO: temporary llm_graph_result_i * res; - bool is_same(const llm_graph_params & other) const { + // return true if the "other" params would result in a graph with the same topology as with the current params + // having the same topology allows us to reuse the graph in some cases + bool allow_reuse(const llm_graph_params & other) const { + // first check the ubatch + bool can_reuse_ubatch = + ubatch.equal_seqs == other.ubatch.equal_seqs && + ubatch.n_tokens == other.ubatch.n_tokens && + ubatch.n_seq_tokens == other.ubatch.n_seq_tokens && + ubatch.n_seqs == other.ubatch.n_seqs && + ubatch.n_seqs_unq == other.ubatch.n_seqs_unq && + ( + (!ubatch.token && !other.ubatch.token) || + (!ubatch.embd && !other.ubatch.embd) + ); + + // TODO: this won't work because seq_id_unq ptr can point to an old balloc that has + // been freed by this point. 
find a way to fix this + //for (uint32_t s = 0; s < n_seqs_unq; ++s) { + // can_reuse_ubatch &= seq_id_unq[s] == other.seq_id_unq[s]; + //} + + // for now conservatively disallow, until the issue above is resolved + // ref: https://github.com/ggml-org/llama.cpp/pull/14363 + can_reuse_ubatch = can_reuse_ubatch && !ubatch.equal_seqs; + + if (!can_reuse_ubatch) { + return false; + } + return - hparams.is_same(other.hparams) && - cparams.is_same(other.cparams) && - ubatch .is_same(other.ubatch) && + cparams.embeddings == other.cparams.embeddings && + cparams.causal_attn == other.cparams.causal_attn && arch == other.arch && gtype == other.gtype && cvec == other.cvec && @@ -510,7 +537,7 @@ class llm_graph_result : public llm_graph_result_i { // contexts of the input tensors of the graph and we can reuse it for another computation // return true if the graph was updated and can be reused bool can_reuse(const llm_graph_params & params) override { - if (!this->params.is_same(params)) { + if (!this->params.allow_reuse(params)) { return false; } diff --git a/src/llama-hparams.cpp b/src/llama-hparams.cpp index a59e96a92fe7e..86c814d51b901 100644 --- a/src/llama-hparams.cpp +++ b/src/llama-hparams.cpp @@ -102,12 +102,3 @@ bool llama_hparams::is_swa(uint32_t il) const { GGML_ABORT("fatal error"); } - -bool llama_hparams::is_same(const llama_hparams & other) const { - return - n_ctx_train == other.n_ctx_train && - n_embd == other.n_embd && - n_layer == other.n_layer && - n_expert == other.n_expert && - n_expert_used == other.n_expert_used; -} diff --git a/src/llama-hparams.h b/src/llama-hparams.h index dfddb4f3d9534..476d0a5eade28 100644 --- a/src/llama-hparams.h +++ b/src/llama-hparams.h @@ -202,8 +202,6 @@ struct llama_hparams { uint32_t n_pos_per_embd() const; bool is_swa(uint32_t il) const; - - bool is_same(const llama_hparams & other) const; }; static_assert(std::is_trivially_copyable::value, "llama_hparams must be trivially copyable"); From a28677d170f10ef713c6f8f8b416606e023536da Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 9 Jul 2025 17:08:32 +0300 Subject: [PATCH 6/6] graph : set res->params in llm_graph_context constructor ggml-ci --- src/llama-graph.cpp | 1 + src/llama-model.cpp | 3 --- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp index 99c6fe0e0353a..5caa24c4e50a1 100644 --- a/src/llama-graph.cpp +++ b/src/llama-graph.cpp @@ -471,6 +471,7 @@ llm_graph_context::llm_graph_context(const llm_graph_params & params) : cb_func (params.cb), res (static_cast(params.res)), ctx0 (res->get_ctx()) { + res->params = params; } void llm_graph_context::cb(ggml_tensor * cur, const char * name, int il) const { diff --git a/src/llama-model.cpp b/src/llama-model.cpp index caab6a0d270be..bdff24c3a4cfc 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -15047,9 +15047,6 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const { // add on pooling layer llm->build_pooling(gf, cls, cls_b, cls_out, cls_out_b); - // TODO: updating the graph parameters here is a little bit obscure - figure out something better - llm->res->params = params; - return llm->res->get_gf(); }
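To see how the pieces of this series fit together: the context keeps the previously built llm_graph_result, derives the new graph parameters for the incoming ubatch, and rebuilds the graph only when those parameters no longer allow reuse; once the later patches drop the public graph_reuse flag, this path is taken unconditionally. A simplified sketch of the decision in llama_context::process_ubatch() (names follow the patch; error handling and backend scheduling are omitted):

    // simplified sketch of the reuse decision in llama_context::process_ubatch()
    auto * res = gf_res_prev.get();

    // the full graph topology must be uniquely determined by these parameters
    const auto gparams = graph_params(res, ubatch, mctx, gtype);

    if (res->can_reuse(gparams)) {
        // same topology as before: keep the graph, only its inputs are refreshed
        n_reused++;
    } else {
        // topology changed: rebuild the graph from scratch
        res->reset();
        // ... graph construction and scheduler allocation follow ...
    }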
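llm_graph_result::can_reuse() first compares the stored graph parameters against the new ones and then delegates to every stored input via llm_graph_input_i::can_reuse(); the base class returns false by default, so reuse is enabled per input type only once its check has been implemented. The position input from the patch is the smallest example of such an override (comment added here for illustration):

    // per-input reuse check for the position input (src/llama-graph.cpp)
    bool llm_graph_input_pos::can_reuse(const llm_graph_params & params) {
        bool res = true;

        // the stored tensor must match the token count of the new ubatch
        res &= pos->ne[0] == params.ubatch.n_tokens;

        return res;
    }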
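The graph-level check itself lives in llm_graph_params::allow_reuse(), which replaces the earlier is_same() helpers on hparams/cparams/ubatch with a comparison of only the fields that affect topology, and conservatively refuses to reuse when ubatch.equal_seqs is set, because the per-sequence ids cannot be compared safely yet (ref: https://github.com/ggml-org/llama.cpp/pull/14363). A condensed version of that check, omitting several of the compared members (cvec, loras, memory context, and others):

    // condensed from llm_graph_params::allow_reuse() in src/llama-graph.h
    bool can_reuse_ubatch =
        ubatch.n_tokens     == other.ubatch.n_tokens     &&
        ubatch.n_seq_tokens == other.ubatch.n_seq_tokens &&
        ubatch.n_seqs       == other.ubatch.n_seqs       &&
        ((!ubatch.token && !other.ubatch.token) ||
         (!ubatch.embd  && !other.ubatch.embd));

    // conservative: seq_id_unq may point into a freed balloc, so equal_seqs batches are not reused
    can_reuse_ubatch = can_reuse_ubatch && !ubatch.equal_seqs;

    return can_reuse_ubatch                                &&
           cparams.embeddings  == other.cparams.embeddings &&
           cparams.causal_attn == other.cparams.causal_attn &&
           arch  == other.arch                             &&
           gtype == other.gtype;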