@@ -227,8 +227,14 @@ llama_context::llama_context(
 
     LLAMA_LOG_DEBUG("%s: max_nodes = %zu\n", __func__, max_nodes);
 
-    // buffer used to store the computation graph and the tensor meta data
-    buf_compute_meta.resize(ggml_tensor_overhead()*max_nodes + ggml_graph_overhead_custom(max_nodes, false));
+    // buffers used to store the computation graph and the tensor meta data
+    for (auto & res : gf_res) {
+        res.reset(new llm_graph_result());
+        res->reserve(max_nodes);
+    };
+
+    gf_res_reserve.reset(new llm_graph_result());
+    gf_res_reserve->reserve(max_nodes);
 
     // TODO: move these checks to ggml_backend_sched
     // enabling pipeline parallelism in the scheduler increases memory usage, so it is only done when necessary
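Note: a minimal sketch of what llm_graph_result::reserve() presumably does, pieced together from the buf_compute_meta sizing removed above and the graph_init() helper removed later in this diff. The member names buf_compute_meta, ctx and gf inside llm_graph_result are assumptions; the actual implementation lives in llama-graph.cpp and may differ.

    // sketch only (assumed, not part of this diff): graph meta storage owned by llm_graph_result
    void llm_graph_result::reserve(int32_t max_nodes) {
        // same sizing as the removed llama_context::buf_compute_meta: tensor structs + graph bookkeeping
        buf_compute_meta.resize(ggml_tensor_overhead()*max_nodes + ggml_graph_overhead_custom(max_nodes, false));

        ggml_init_params params = {
            /*.mem_size   =*/ buf_compute_meta.size(),
            /*.mem_buffer =*/ buf_compute_meta.data(),
            /*.no_alloc   =*/ true, // only tensor metadata is stored here
        };

        ctx.reset(ggml_init(params));

        gf = ggml_new_graph_custom(ctx.get(), max_nodes, false);
    }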
@@ -388,10 +394,6 @@ ggml_backend_sched_t llama_context::get_sched() const {
     return sched.get();
 }
 
-ggml_context * llama_context::get_ctx_compute() const {
-    return ctx_compute.get();
-}
-
 uint32_t llama_context::n_ctx() const {
     return cparams.n_ctx;
 }
@@ -678,36 +680,40 @@ bool llama_context::apply_adapter_cvec(
     return cvec.apply(model, data, len, n_embd, il_start, il_end);
 }
 
-llm_graph_result_ptr llama_context::process_ubatch(const llama_ubatch & ubatch, llm_graph_type gtype, llama_memory_context_i * mctx, ggml_status & ret) {
+llm_graph_result_i * llama_context::process_ubatch(const llama_ubatch & ubatch, llm_graph_type gtype, llama_memory_context_i * mctx, ggml_status & ret) {
     if (mctx && !mctx->apply()) {
         LLAMA_LOG_ERROR("%s: failed to apply memory context\n", __func__);
         ret = GGML_STATUS_FAILED;
         return nullptr;
     }
 
-    auto * gf = graph_init();
+    gf_res_next()->init();
+
+    auto * gf = gf_res_cur()->get_gf();
     if (!gf) {
         LLAMA_LOG_ERROR("%s: failed to initialize graph\n", __func__);
         ret = GGML_STATUS_FAILED;
         return nullptr;
     }
 
-    auto res = graph_build(ctx_compute.get(), gf, ubatch, gtype, mctx);
-    if (!res) {
-        LLAMA_LOG_ERROR("%s: failed to build graph\n", __func__);
-        ret = GGML_STATUS_FAILED;
-        return nullptr;
-    }
+    const bool can_reuse = graph_build(gf_res_cur(), gf_res_prv(), ubatch, gtype, mctx);
+    if (can_reuse) {
+        LLAMA_LOG_DEBUG("%s: reusing previous graph\n", __func__);
+        gf_res_next()->update(mctx);
+    } else {
+        // LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs);
 
-    // LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs);
+        ggml_backend_sched_reset(sched.get());
+        ggml_backend_sched_set_eval_callback(sched.get(), cparams.cb_eval, cparams.cb_eval_user_data);
 
-    if (!ggml_backend_sched_alloc_graph(sched.get(), gf)) {
-        LLAMA_LOG_ERROR("%s: failed to allocate graph\n", __func__);
-        ret = GGML_STATUS_ALLOC_FAILED;
-        return nullptr;
+        if (!ggml_backend_sched_alloc_graph(sched.get(), gf)) {
+            LLAMA_LOG_ERROR("%s: failed to allocate graph\n", __func__);
+            ret = GGML_STATUS_ALLOC_FAILED;
+            return nullptr;
+        }
     }
 
-    res->set_inputs(&ubatch);
+    gf_res_cur()->set_inputs(&ubatch);
 
     const auto status = graph_compute(gf, ubatch.n_tokens > 1);
     if (status != GGML_STATUS_SUCCESS) {
@@ -718,7 +724,7 @@ llm_graph_result_ptr llama_context::process_ubatch(const llama_ubatch & ubatch,
 
     ret = GGML_STATUS_SUCCESS;
 
-    return res;
+    return gf_res_cur();
 }
 
 int llama_context::encode(const llama_batch & batch_inp) {
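Note: the calls above imply an llm_graph_result_i interface roughly like the sketch below. It is inferred purely from usage in this diff (init/get_gf/update/set_inputs plus the accessors used elsewhere in this commit) and is not the actual declaration from llama-graph.h.

    // sketch only (inferred from usage): interface returned by gf_res_cur()/gf_res_prv()/gf_res_next()
    struct llm_graph_result_i {
        virtual ~llm_graph_result_i() = default;

        virtual ggml_tensor  * get_tokens() = 0; // used by opt_epoch_iter() below
        virtual ggml_tensor  * get_logits() = 0; // used by opt_epoch_iter() below

        virtual ggml_context * get_ctx() = 0;    // context holding the graph's tensor metadata
        virtual ggml_cgraph  * get_gf()  = 0;    // the computation graph itself

        virtual void init() = 0;                                  // start a fresh, empty graph
        virtual void update(llama_memory_context_i * mctx) = 0;   // refresh memory-dependent state when reusing a graph
        virtual void set_inputs(const llama_ubatch * ubatch) = 0; // fill the input tensors for this ubatch
    };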
@@ -767,6 +773,8 @@ int llama_context::encode(const llama_batch & batch_inp) {
 
     n_outputs = n_tokens;
 
+    // TODO: when resetting the scheduler, clear prev graph buffers
+    gf_res_next()->init();
     ggml_backend_sched_reset(sched.get());
     ggml_backend_sched_set_eval_callback(sched.get(), cparams.cb_eval, cparams.cb_eval_user_data);
 
@@ -778,7 +786,7 @@ int llama_context::encode(const llama_batch & batch_inp) {
     cparams.causal_attn = false;
 
     ggml_status status;
-    const auto res = process_ubatch(ubatch, LLM_GRAPH_TYPE_ENCODER, nullptr, status);
+    const auto * res = process_ubatch(ubatch, LLM_GRAPH_TYPE_ENCODER, nullptr, status);
 
     cparams.causal_attn = causal_attn_org;
 
@@ -846,7 +854,7 @@ int llama_context::encode(const llama_batch & batch_inp) {
 
     // Reset state for the next token before backend sync, to allow the CPU activities in the reset to
     // overlap with device computation.
-    ggml_backend_sched_reset(sched.get());
+    // ggml_backend_sched_reset(sched.get());
 
     // TODO: hacky solution
     if (model.arch == LLM_ARCH_T5 && t_embd) {
@@ -1005,11 +1013,8 @@ int llama_context::decode(const llama_batch & batch_inp) {
             n_outputs = n_outputs_new;
         }
 
-        ggml_backend_sched_reset(sched.get());
-        ggml_backend_sched_set_eval_callback(sched.get(), cparams.cb_eval, cparams.cb_eval_user_data);
-
         ggml_status status;
-        const auto res = process_ubatch(ubatch, LLM_GRAPH_TYPE_DECODER, mctx.get(), status);
+        const auto * res = process_ubatch(ubatch, LLM_GRAPH_TYPE_DECODER, mctx.get(), status);
 
         if (!res) {
             // the last ubatch failed or was aborted -> remove all positions of that ubatch from the KV cache
@@ -1192,7 +1197,7 @@ int llama_context::decode(const llama_batch & batch_inp) {
 
     // Reset state for the next token before backend sync, to allow the CPU activities in the reset to
     // overlap with device computation.
-    ggml_backend_sched_reset(sched.get());
+    // ggml_backend_sched_reset(sched.get());
 
     return 0;
 }
@@ -1279,18 +1284,6 @@ int32_t llama_context::graph_max_nodes() const {
     return std::max<int32_t>(65536, 5*model.n_tensors());
 }
 
-ggml_cgraph * llama_context::graph_init() {
-    ggml_init_params params = {
-        /*.mem_size   =*/ buf_compute_meta.size(),
-        /*.mem_buffer =*/ buf_compute_meta.data(),
-        /*.no_alloc   =*/ true,
-    };
-
-    ctx_compute.reset(ggml_init(params));
-
-    return ggml_new_graph_custom(ctx_compute.get(), graph_max_nodes(), false);
-}
-
 ggml_cgraph * llama_context::graph_reserve(uint32_t n_tokens, uint32_t n_seqs, uint32_t n_outputs, const llama_memory_context_i * mctx) {
     LLAMA_LOG_DEBUG("%s: reserving a graph for ubatch with n_tokens = %4u, n_seqs = %2u, n_outputs = %4u\n", __func__, n_tokens, n_seqs, n_outputs);
 
@@ -1301,6 +1294,10 @@ ggml_cgraph * llama_context::graph_reserve(uint32_t n_tokens, uint32_t n_seqs, u
         LLAMA_LOG_DEBUG("%s: making n_tokens a multiple of n_seqs - n_tokens = %u, n_seqs = %u, n_outputs = %u\n", __func__, n_tokens, n_seqs, n_outputs);
     }
 
+    // TODO: when resetting the scheduler, clear prev graph buffers
+    gf_res_next()->init();
+    ggml_backend_sched_reset(sched.get());
+
     // store the n_outputs as it is, and restore it afterwards
     // TODO: not sure if needed, might simplify in the future by removing this
     const auto save_n_outputs = this->n_outputs;
@@ -1310,17 +1307,13 @@ ggml_cgraph * llama_context::graph_reserve(uint32_t n_tokens, uint32_t n_seqs, u
     llama_batch_allocr balloc(model.hparams.n_pos_per_embd());
     llama_ubatch ubatch = balloc.ubatch_reserve(n_tokens/n_seqs, n_seqs);
 
-    auto * gf = graph_init();
-    auto res = graph_build(ctx_compute.get(), gf, ubatch, LLM_GRAPH_TYPE_DEFAULT, mctx);
+    gf_res_reserve->init();
+    auto * gf = gf_res_reserve->get_gf();
 
-    this->n_outputs = save_n_outputs;
-
-    if (!res) {
-        LLAMA_LOG_ERROR("%s: failed to build worst-case graph\n", __func__);
-        return nullptr;
-    }
+    const bool can_reuse = graph_build(gf_res_reserve.get(), nullptr, ubatch, LLM_GRAPH_TYPE_DEFAULT, mctx);
+    GGML_ASSERT(!can_reuse); // cannot reuse reserve graphs
 
-    ggml_backend_sched_reset(sched.get());
+    this->n_outputs = save_n_outputs;
 
     // initialize scheduler with the specified graph
     if (!ggml_backend_sched_reserve(sched.get(), gf)) {
@@ -1331,15 +1324,17 @@ ggml_cgraph * llama_context::graph_reserve(uint32_t n_tokens, uint32_t n_seqs, u
     return gf;
 }
 
-llm_graph_result_ptr llama_context::graph_build(
-                      ggml_context * ctx,
-                       ggml_cgraph * gf,
+bool llama_context::graph_build(
+                llm_graph_result_i * gf_res_cur,
+                llm_graph_result_i * gf_res_prv,
                 const llama_ubatch & ubatch,
                     llm_graph_type   gtype,
       const llama_memory_context_i * mctx) {
     return model.build_graph(
         {
-            /*.ctx         =*/ ctx,
+            /*.ctx         =*/ gf_res_cur->get_ctx(),
+            /*.gf_res_cur  =*/ static_cast<llm_graph_result *>(gf_res_cur),
+            /*.gf_res_prv  =*/ static_cast<llm_graph_result *>(gf_res_prv),
             /*.arch        =*/ model.arch,
            /*.hparams     =*/ model.hparams,
            /*.cparams     =*/ cparams,
@@ -1352,7 +1347,7 @@ llm_graph_result_ptr llama_context::graph_build(
             /*.cross       =*/ &cross,
             /*.n_outputs   =*/ n_outputs,
             /*.cb          =*/ graph_get_cb(),
-        }, gf, gtype);
+        }, gtype);
 }
 
 ggml_status llama_context::graph_compute(
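Note: the two new initializer fields above suggest that llm_graph_params (in llama-graph.h) gained matching members; a hedged sketch of that assumed change follows. Only the fields visible in this diff are listed, and the exact declaration may differ.

    // sketch only (assumed): additions to llm_graph_params mirroring the initializer list above
    struct llm_graph_params {
        ggml_context * ctx;

        llm_graph_result * gf_res_cur; // result being built for the current ubatch
        llm_graph_result * gf_res_prv; // result of the previous ubatch, consulted when checking for graph reuse

        // ... existing fields as initialized above: arch, hparams, cparams, ..., cross, n_outputs, cb ...
    };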
@@ -2064,8 +2059,11 @@ void llama_context::opt_epoch_iter(
             break;
         }
 
-        auto * gf = graph_init();
-        auto res = graph_build(ctx_compute.get(), gf, ubatch, LLM_GRAPH_TYPE_DEFAULT, mctx.get());
+        gf_res_cur()->init();
+        auto * gf = gf_res_cur()->get_gf();
+
+        const bool can_reuse = graph_build(gf_res_cur(), nullptr, ubatch, LLM_GRAPH_TYPE_DEFAULT, mctx.get());
+        GGML_ASSERT(!can_reuse); // cannot reuse optimization graphs
 
         struct ggml_context * ctx_compute_opt;
         {
@@ -2078,10 +2076,10 @@ void llama_context::opt_epoch_iter(
             };
             ctx_compute_opt = ggml_init(params);
         }
-        ggml_opt_prepare_alloc(opt_ctx, ctx_compute_opt, gf, res->get_tokens(), res->get_logits());
+        ggml_opt_prepare_alloc(opt_ctx, ctx_compute_opt, gf, gf_res_cur()->get_tokens(), gf_res_cur()->get_logits());
         ggml_opt_alloc(opt_ctx, train);
 
-        res->set_inputs(&ubatch);
+        gf_res_cur()->set_inputs(&ubatch);
         {
             struct ggml_tensor * labels = ggml_opt_labels(opt_ctx);
             GGML_ASSERT(labels->ne[1] == n_ubatch);