@@ -1436,7 +1436,8 @@ struct server_slot_prompt {
 struct server_prompt_cache {
     std::list<server_slot_prompt> states;
 
-    size_t limit_size = 0; // 0 = no limit
+    // in bytes, 0 = no limit
+    size_t limit_size = 2ull*1024*1024*1024;
 
     size_t size() const {
         size_t res = 0;
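
With this change the host-memory prompt cache defaults to a 2 GiB byte budget instead of being unlimited. Below is a minimal standalone sketch, not part of the patch, of the bookkeeping this struct implements, with plain byte vectors standing in for the serialized llama sequence states:

// sketch: byte-budgeted prompt cache bookkeeping (stand-in types)
#include <cstdint>
#include <cstdio>
#include <list>
#include <vector>

struct cached_prompt {
    std::vector<uint8_t> data; // stand-in for a serialized sequence state

    size_t size() const { return data.size(); }
};

struct prompt_cache {
    std::list<cached_prompt> states;

    // in bytes, 0 = no limit
    size_t limit_size = 2ull*1024*1024*1024;

    // total cache size = sum of all stored state blobs
    size_t size() const {
        size_t res = 0;
        for (const auto & s : states) {
            res += s.size();
        }
        return res;
    }
};

int main() {
    prompt_cache cache;
    cache.states.push_back({std::vector<uint8_t>(1024*1024)});
    printf("cache size = %.3f MiB (limit = %.3f MiB)\n",
            cache.size() / (1024.0*1024.0), cache.limit_size / (1024.0*1024.0));
    return 0;
}
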
@@ -1532,7 +1533,7 @@ struct server_slot {
     std::vector<std::string> generated_tool_call_ids;
 
     // stats
-    size_t n_sent_text        = 0; // number of sent text character
+    size_t n_sent_text = 0; // number of sent text character
 
     int64_t t_start_process_prompt;
     int64_t t_start_generation;
@@ -1792,7 +1793,7 @@ void server_slot::prompt_save(server_prompt_cache & prompt_cache) {
         const int cur_lcs_len = cached_prompt.get_common_prefix(prompt.tokens);
 
         if (cur_lcs_len == (int) prompt.tokens.size()) {
-            SRV_INF("%s", " - prompt is already cached, skipping\n");
+            SRV_WRN("%s", " - prompt is already cached, skipping\n");
             return;
         }
     }
@@ -1804,7 +1805,7 @@ void server_slot::prompt_save(server_prompt_cache & prompt_cache) {
         const int len = cached_prompt.get_common_prefix(prompt.tokens);
 
         if (len == (int) cached_prompt.size()) {
-            SRV_INF(" - removing cached prompt with length %d\n", len);
+            SRV_WRN(" - removing cached prompt with length %d\n", len);
 
             it = states.erase(it);
         } else {
@@ -1814,7 +1815,7 @@ void server_slot::prompt_save(server_prompt_cache & prompt_cache) {
 
     const size_t cur_size = llama_state_seq_get_size_ext(ctx, id, 0);
 
-    SRV_INF(" - saving prompt with length %d, total cache size = %.3f MiB\n",
+    SRV_WRN(" - saving prompt with length %d, total cache size = %.3f MiB\n",
             (int) prompt.tokens.size(), cur_size / (1024.0 * 1024.0));
 
     // if there is a limit, remove the oldest entries to make room
@@ -1824,6 +1825,8 @@ void server_slot::prompt_save(server_prompt_cache & prompt_cache) {
                 break;
             }
 
+            SRV_WRN(" - cache size limit reached, removing oldest entry (size = %.3f MiB)\n", states.front().size() / (1024.0 * 1024.0));
+
             states.pop_front();
         }
     } else {
@@ -1833,6 +1836,8 @@ void server_slot::prompt_save(server_prompt_cache & prompt_cache) {
                 break;
             }
 
+            SRV_WRN(" - cache token limit reached, removing oldest entry (size = %.3f MiB)\n", states.front().size() / (1024.0 * 1024.0));
+
             states.pop_front();
         }
     }
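
The two hunks above only add logging; the eviction loops themselves already drop entries oldest-first (states.front()) until the pending state fits, under the byte budget when limit_size is set and under a token budget otherwise. A standalone sketch of that pattern, using simplified stand-in types rather than the server's structures:

// sketch: oldest-first eviction under a byte or token budget (stand-in types)
#include <cstdio>
#include <list>

struct entry {
    size_t n_bytes;
    size_t n_tokens;
};

// drop oldest entries until the pending state fits under the active budget;
// a byte limit takes precedence, otherwise the token limit applies
static void make_room(std::list<entry> & states, size_t limit_bytes, size_t limit_tokens, const entry & pending) {
    auto total = [&](auto field) {
        size_t res = field(pending);
        for (const auto & e : states) {
            res += field(e);
        }
        return res;
    };

    if (limit_bytes > 0) {
        while (!states.empty() && total([](const entry & e) { return e.n_bytes; }) > limit_bytes) {
            states.pop_front();
        }
    } else if (limit_tokens > 0) {
        while (!states.empty() && total([](const entry & e) { return e.n_tokens; }) > limit_tokens) {
            states.pop_front();
        }
    }
}

int main() {
    std::list<entry> states = {{600, 100}, {600, 100}};

    make_room(states, /*limit_bytes =*/ 1024, /*limit_tokens =*/ 0, /*pending =*/ {600, 100});

    printf("entries remaining: %zu\n", states.size());
    return 0;
}

As in the patch, eviction only makes room: the new state is stored afterwards even if it alone exceeds the budget.
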
@@ -1847,15 +1852,19 @@ void server_slot::prompt_save(server_prompt_cache & prompt_cache) {
 
     llama_state_seq_get_data_ext(ctx, cur.data.data(), cur_size, id, 0);
 
-    SRV_INF(" - cache state: %zu prompts, %.3f MiB\n", states.size(), prompt_cache.size() / (1024.0 * 1024.0));
+    SRV_WRN(" - cache state: %zu prompts, %.3f MiB\n", states.size(), prompt_cache.size() / (1024.0 * 1024.0));
+
+    for (const auto & state : states) {
+        SRV_WRN(" - prompt %p: %7d tokens, checkpoints: %2zu, %.3f MiB\n", (const void *)&state, state.n_tokens(), state.checkpoints.size(), state.size() / (1024.0 * 1024.0));
+    }
 }
 
 void server_slot::prompt_load(server_prompt_cache & prompt_cache, const server_tokens & tokens) {
     auto & states = prompt_cache.states;
 
     int lcs_len = prompt.tokens.get_common_prefix(tokens);
 
-    SRV_INF(" - looking for better prompt, base lcs_len = %d\n", lcs_len);
+    SRV_WRN(" - looking for better prompt, base lcs_len = %d\n", lcs_len);
 
     auto it_best = states.end();
 
@@ -1872,7 +1881,7 @@ void server_slot::prompt_load(server_prompt_cache & prompt_cache, const server_tokens & tokens) {
     }
 
     if (it_best != states.end()) {
-        SRV_INF(" - found better prompt with lcs_len = %d\n", lcs_len);
+        SRV_WRN(" - found better prompt with lcs_len = %d\n", lcs_len);
 
         const size_t size = it_best->data.size();
         const size_t n = llama_state_seq_set_data_ext(ctx, it_best->data.data(), size, id, 0);
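
prompt_load scans the cached states for the entry whose common prefix with the incoming request is longest, and restores it only if it beats the prefix the slot already shares. A standalone sketch of the selection logic, with plain int token IDs and a get_common_prefix helper written here to mirror the server_tokens method:

// sketch: pick the cached prompt with the longest common prefix
#include <algorithm>
#include <cstdio>
#include <list>
#include <vector>

using tokens = std::vector<int>;

static int get_common_prefix(const tokens & a, const tokens & b) {
    const size_t n = std::min(a.size(), b.size());
    size_t i = 0;
    while (i < n && a[i] == b[i]) {
        i++;
    }
    return (int) i;
}

int main() {
    const tokens request = {1, 2, 3, 4, 5, 6};
    const tokens current = {1, 2, 9}; // what the slot currently holds

    std::list<tokens> states = {
        {1, 2, 3, 7},       // lcs = 3
        {1, 2, 3, 4, 5, 8}, // lcs = 5 -> best
    };

    int lcs_len = get_common_prefix(current, request); // baseline = 2

    auto it_best = states.end();
    for (auto it = states.begin(); it != states.end(); ++it) {
        const int cur = get_common_prefix(*it, request);
        if (cur > lcs_len) {
            lcs_len = cur;
            it_best = it;
        }
    }

    if (it_best != states.end()) {
        printf("found better prompt with lcs_len = %d\n", lcs_len);
        // the server would now restore this entry's serialized state
        // into the slot's sequence via llama_state_seq_set_data_ext()
    }
    return 0;
}
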
@@ -2454,7 +2463,7 @@ struct server_context {
                 SRV_ERR("%s", "failed to create speculator\n");
                 return;
             }
-            for (auto &pair : params_base.speculative.replacements) {
+            for (auto & pair : params_base.speculative.replacements) {
                 common_speculative_add_replacement_tgt_dft(slot.spec, pair.first.c_str(), pair.second.c_str());
             }
         }
@@ -2483,7 +2492,7 @@ struct server_context {
         // 1. It's not explicitly disabled (reasoning_budget == 0)
         // 2. The chat template supports it
         const bool enable_thinking = params_base.use_jinja && params_base.reasoning_budget != 0 && common_chat_templates_support_enable_thinking(chat_templates.get());
-        SRV_INF("Enable thinking? %d\n", enable_thinking);
+        SRV_INF("thinking = %d\n", enable_thinking);
 
         oai_parser_opt = {
             /* use_jinja             */ params_base.use_jinja,
@@ -2585,21 +2594,24 @@ struct server_context {
         if (ret) {
             const auto & tokens = ret->prompt.tokens;
 
+            // cache prompts only for completion tasks
+            update_cache = update_cache && task.type == SERVER_TASK_TYPE_COMPLETION;
+
             // don't update the cache if the slot's context is empty
             update_cache = update_cache && tokens.size() > 0;
 
             // TODO: mtmd does not support prompt cache
             update_cache = update_cache && (ret->mctx == nullptr);
 
             if (update_cache) {
-                SRV_INF("%s", "updating prompt cache\n");
+                SRV_WRN("%s", "updating prompt cache\n");
 
                 const int64_t t_start = ggml_time_us();
 
                 ret->prompt_save(prompt_cache);
                 ret->prompt_load(prompt_cache, task.tokens);
 
-                SRV_INF("prompt cache update took %.2f ms\n", (ggml_time_us() - t_start) / 1000.0);
+                SRV_WRN("prompt cache update took %.2f ms\n", (ggml_time_us() - t_start) / 1000.0);
             }
         }
 
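
The new gating composes as a chain of &&-assignments, so each condition can only switch the update off. A standalone sketch of the gate plus the timing wrapper, with std::chrono in place of ggml_time_us() and a comment in place of the prompt_save/prompt_load calls:

// sketch: gate the cache update, then time the save/load pair
#include <chrono>
#include <cstdio>

enum task_type { TASK_COMPLETION, TASK_EMBEDDING };

int main() {
    const task_type type = TASK_COMPLETION;
    const size_t n_tokens = 128; // tokens currently in the slot's context
    const bool has_mtmd = false; // multimodal context attached?

    bool update_cache = true;
    update_cache = update_cache && type == TASK_COMPLETION; // completions only
    update_cache = update_cache && n_tokens > 0;            // non-empty context
    update_cache = update_cache && !has_mtmd;               // mtmd unsupported

    if (update_cache) {
        const auto t_start = std::chrono::steady_clock::now();

        // prompt_save(...) and prompt_load(...) would run here

        const auto t_end = std::chrono::steady_clock::now();
        printf("prompt cache update took %.2f ms\n",
                std::chrono::duration<double, std::milli>(t_end - t_start).count());
    }
    return 0;
}
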
@@ -3734,16 +3746,16 @@ struct server_context {
 
                 if (!do_reset) {
                     // restore the context checkpoint
-                    const size_t ctx_checkpoint_size = it->data.size();
-                    const size_t n = llama_state_seq_set_data_ext(ctx, it->data.data(), ctx_checkpoint_size, slot.id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY);
+                    const size_t checkpoint_size = it->data.size();
+                    const size_t n = llama_state_seq_set_data_ext(ctx, it->data.data(), checkpoint_size, slot.id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY);
 
-                    if (n != ctx_checkpoint_size) {
-                        SLT_ERR(slot, "failed to restore context checkpoint (pos_min = %d, pos_max = %d, size = %.3f MiB)\n", it->pos_min, it->pos_max, (float) ctx_checkpoint_size / 1024 / 1024);
+                    if (n != checkpoint_size) {
+                        SLT_ERR(slot, "failed to restore context checkpoint (pos_min = %d, pos_max = %d, size = %.3f MiB)\n", it->pos_min, it->pos_max, (float) checkpoint_size / 1024 / 1024);
                         do_reset = true;
                         //printf("[DEBUG] `do_reset` was set to `true` after failing to restore a checkpoint");
                     } else {
                         slot.n_past = std::min(slot.n_past, std::max(it->pos_min + 1, it->pos_max));
-                        SLT_WRN(slot, "restored context checkpoint (pos_min = %d, pos_max = %d, size = %.3f MiB)\n", it->pos_min, it->pos_max, (float) ctx_checkpoint_size / 1024 / 1024);
+                        SLT_WRN(slot, "restored context checkpoint (pos_min = %d, pos_max = %d, size = %.3f MiB)\n", it->pos_min, it->pos_max, (float) checkpoint_size / 1024 / 1024);
                     }
                 }
 
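
The restore path relies on llama_state_seq_set_data_ext() returning the number of bytes it consumed: anything short of the full blob counts as a failed restore and forces a context reset. A standalone sketch of that restore-or-reset pattern, with restore_state() as a hypothetical stand-in:

// sketch: restore a checkpoint, fall back to a reset on a partial restore
#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <vector>

struct checkpoint {
    int pos_min;
    int pos_max;
    std::vector<uint8_t> data; // serialized (partial) sequence state
};

// hypothetical stand-in for llama_state_seq_set_data_ext():
// returns the number of bytes it accepted
static size_t restore_state(const uint8_t * /*data*/, size_t size) {
    return size; // simulate a full, successful restore
}

int main() {
    checkpoint cp = {10, 20, std::vector<uint8_t>(4096)};

    int  n_past   = 100;
    bool do_reset = false;

    const size_t checkpoint_size = cp.data.size();
    const size_t n = restore_state(cp.data.data(), checkpoint_size);

    if (n != checkpoint_size) {
        do_reset = true; // partial restore -> wipe the slot and reprocess
        printf("restore failed, resetting context\n");
    } else {
        // resume no further than the checkpoint can vouch for
        n_past = std::min(n_past, std::max(cp.pos_min + 1, cp.pos_max));
        printf("restored checkpoint, n_past = %d\n", n_past);
    }
    return do_reset ? 1 : 0;
}

On success the slot resumes from the checkpoint, clamping slot.n_past so generation never continues past positions the checkpoint can guarantee.
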
@@ -3842,6 +3854,9 @@ struct server_context {
 
                 bool do_checkpoint = params_base.n_ctx_checkpoints > 0;
 
+                // make checkpoints only for completion tasks
+                do_checkpoint = do_checkpoint && slot.task->type == SERVER_TASK_TYPE_COMPLETION;
+
                 // make a checkpoint of the parts of the memory that cannot be rolled back.
                 // checkpoints are created only if:
                 // - the model uses SWA and we are not using `swa_full`
@@ -3941,7 +3956,7 @@ struct server_context {
 
                     llama_state_seq_get_data_ext(ctx, cur.data.data(), checkpoint_size, slot.id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY);
 
-                    SLT_WRN(slot, "saved context checkpoint %d of %d (pos_min = %d, pos_max = %d, size = %.3f MiB)\n",
+                    SLT_WRN(slot, "created context checkpoint %d of %d (pos_min = %d, pos_max = %d, size = %.3f MiB)\n",
                             (int) slot.prompt.checkpoints.size(), params_base.n_ctx_checkpoints, cur.pos_min, cur.pos_max, (float) cur.data.size() / 1024 / 1024);
                 }
             }