From 15ce574d1d395669d75ca940fe551de281633554 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Thu, 4 Dec 2025 23:56:40 +0100 Subject: [PATCH 01/11] backend support --- tools/server/server-common.cpp | 24 ++++++++++------- tools/server/server-common.h | 2 ++ tools/server/server-context.cpp | 46 +++++++++++++++++++++++++++++++-- tools/server/server-task.h | 12 +++++++++ 4 files changed, 72 insertions(+), 12 deletions(-) diff --git a/tools/server/server-common.cpp b/tools/server/server-common.cpp index cfdd0c656f4..7579f4eb5d9 100644 --- a/tools/server/server-common.cpp +++ b/tools/server/server-common.cpp @@ -494,6 +494,18 @@ int32_t server_tokens::process_chunk( return 0; } +server_tokens server_tokens::clone() const { + server_tokens res; + res.has_mtmd = has_mtmd; + res.tokens = tokens; + for (auto it = map_idx_to_media.begin(); it != map_idx_to_media.end(); ++it) { + size_t idx = it->first; + const mtmd::input_chunk_ptr & chunk = it->second; + res.map_idx_to_media[idx] = mtmd::input_chunk_ptr(mtmd_input_chunk_copy(chunk.get())); + } + return res; +} + // // tokenizer and input processing utils // @@ -745,11 +757,7 @@ json oaicompat_completion_params_parse(const json & body) { llama_params["stop"] = json_value(body, "stop", json::array()); } - // Handle "n" field - int n_choices = json_value(body, "n", 1); - if (n_choices != 1) { - throw std::runtime_error("Only one completion choice is allowed"); - } + llama_params["n"] = json_value(body, "n", 1); // Handle "echo" field if (json_value(body, "echo", false)) { @@ -1049,11 +1057,7 @@ json oaicompat_chat_params_parse( llama_params["chat_parser"] = chat_params.parser; } - // Handle "n" field - int n_choices = json_value(body, "n", 1); - if (n_choices != 1) { - throw std::invalid_argument("Only one completion choice is allowed"); - } + llama_params["n"] = json_value(body, "n", 1); // Handle "logprobs" field // TODO: The response format of this option is not yet OAI-compatible, but seems like no one really using it; We may need to fix it in the future diff --git a/tools/server/server-common.h b/tools/server/server-common.h index bb04e82b4f5..db942ca6ff9 100644 --- a/tools/server/server-common.h +++ b/tools/server/server-common.h @@ -215,6 +215,8 @@ struct server_tokens { llama_pos pos, int32_t seq_id, size_t & n_tokens_out) const; + + server_tokens clone() const; }; diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp index f3f2edc0cc4..d1575c0dc8d 100644 --- a/tools/server/server-context.cpp +++ b/tools/server/server-context.cpp @@ -35,9 +35,10 @@ constexpr int HTTP_POLLING_SECONDS = 1; // state diagram: https://github.com/ggml-org/llama.cpp/pull/9283 enum slot_state { SLOT_STATE_IDLE, - SLOT_STATE_STARTED, // TODO: this state is only used for setting up the initial prompt processing; maybe merge it with launch_slot_with_task in the future + SLOT_STATE_STARTED, // after assigning a task SLOT_STATE_PROCESSING_PROMPT, SLOT_STATE_DONE_PROMPT, + SLOT_STATE_WAIT_OTHER, // prompt processed, but waiting for other slots to copy the state SLOT_STATE_GENERATING, }; @@ -383,6 +384,15 @@ struct server_slot { return res; } + + void copy_state_to(server_slot & other) { + llama_memory_seq_cp(llama_get_memory(ctx), id, other.id, 0, -1); + other.n_decoded = n_decoded; + other.n_remaining = n_remaining; + other.n_prompt_tokens_cache = n_prompt_tokens_cache; + other.n_prompt_tokens_processed = n_prompt_tokens_processed; + other.prompt = prompt.clone(); + } }; @@ -2143,7 +2153,9 @@ struct server_context_impl { // entire prompt has 
been processed if (slot.prompt.n_tokens() == slot.task->n_tokens()) { - slot.state = SLOT_STATE_DONE_PROMPT; + slot.state = slot.task->n_children == 0 + ? SLOT_STATE_DONE_PROMPT // state not being reused by any other slots + : SLOT_STATE_WAIT_OTHER; GGML_ASSERT(batch.n_tokens > 0); @@ -2211,6 +2223,36 @@ struct server_context_impl { } } + // may need to copy state to other slots + for (auto & slot : slots) { + if (slot.state == SLOT_STATE_WAIT_OTHER) { + GGML_ASSERT(slot.task->n_children > 0); + + size_t n_waiting = 0; + std::vector child_slots; + for (auto & slot : slots) { + if (slot.task->id == slot.task->id_parent) { + n_waiting++; + child_slots.push_back(&slot); + } + } + + // we can only proceed if all "child" slots are having the correct tasks + if (n_waiting < slot.task->n_children) { + continue; + } + + // copy state to the child slots + for (auto & child : child_slots) { + SLT_INF(*child, "copying state from slot %d to child %d\n", slot.id, child->id); + slot.copy_state_to(*child); + child->state = SLOT_STATE_DONE_PROMPT; + } + + slot.state = SLOT_STATE_DONE_PROMPT; + } + } + if (batch.n_tokens == 0) { SRV_WRN("%s", "no tokens to decode\n"); return; diff --git a/tools/server/server-task.h b/tools/server/server-task.h index 8e7b9e3e310..5fbc6348b77 100644 --- a/tools/server/server-task.h +++ b/tools/server/server-task.h @@ -89,6 +89,10 @@ struct server_task { int id_target = -1; int id_slot = -1; + // used by parallel sampling (multiple completions from same prompt) + size_t n_children = 0; // number of tasks reusing this prompt + int id_parent = -1; + // used by SERVER_TASK_TYPE_INFERENCE task_params params; server_tokens tokens; @@ -466,6 +470,14 @@ struct server_prompt { int n_tokens() const { return tokens.size(); } + + server_prompt clone() const { + return server_prompt { + tokens.clone(), + data, + checkpoints + }; + } }; struct server_prompt_cache { From 0d842cb575ee50a36b2e444b6299c1ecf5c88f34 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Fri, 5 Dec 2025 01:09:52 +0100 Subject: [PATCH 02/11] server: support multiple generations from one prompt (OAI "n" option) --- tools/server/server-common.cpp | 4 +- tools/server/server-context.cpp | 85 ++++++++++++++++++++------------- tools/server/server-task.cpp | 5 ++ tools/server/server-task.h | 12 +++++ 4 files changed, 72 insertions(+), 34 deletions(-) diff --git a/tools/server/server-common.cpp b/tools/server/server-common.cpp index 7579f4eb5d9..51e76084e6d 100644 --- a/tools/server/server-common.cpp +++ b/tools/server/server-common.cpp @@ -757,7 +757,7 @@ json oaicompat_completion_params_parse(const json & body) { llama_params["stop"] = json_value(body, "stop", json::array()); } - llama_params["n"] = json_value(body, "n", 1); + llama_params["n_cmpl"] = json_value(body, "n", 1); // Handle "echo" field if (json_value(body, "echo", false)) { @@ -1057,7 +1057,7 @@ json oaicompat_chat_params_parse( llama_params["chat_parser"] = chat_params.parser; } - llama_params["n"] = json_value(body, "n", 1); + llama_params["n_cmpl"] = json_value(body, "n", 1); // Handle "logprobs" field // TODO: The response format of this option is not yet OAI-compatible, but seems like no one really using it; We may need to fix it in the future diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp index d1575c0dc8d..136a374c182 100644 --- a/tools/server/server-context.cpp +++ b/tools/server/server-context.cpp @@ -385,10 +385,11 @@ struct server_slot { return res; } - void copy_state_to(server_slot & other) { + void 
copy_state_to(server_slot & other) const { llama_memory_seq_cp(llama_get_memory(ctx), id, other.id, 0, -1); other.n_decoded = n_decoded; other.n_remaining = n_remaining; + other.i_batch = i_batch; other.n_prompt_tokens_cache = n_prompt_tokens_cache; other.n_prompt_tokens_processed = n_prompt_tokens_processed; other.prompt = prompt.clone(); @@ -1788,6 +1789,12 @@ struct server_context_impl { if (slot.state == SLOT_STATE_PROCESSING_PROMPT || slot.state == SLOT_STATE_STARTED) { const auto & input_tokens = slot.task->tokens; + if (slot.task->id_parent >= 0) { + slot.state = SLOT_STATE_PROCESSING_PROMPT; + // do nothing, parent slot will handle prompt processing + continue; + } + // TODO: maybe move branch to outside of this loop in the future if (slot.state == SLOT_STATE_STARTED) { slot.t_start_process_prompt = ggml_time_us(); @@ -2223,36 +2230,6 @@ struct server_context_impl { } } - // may need to copy state to other slots - for (auto & slot : slots) { - if (slot.state == SLOT_STATE_WAIT_OTHER) { - GGML_ASSERT(slot.task->n_children > 0); - - size_t n_waiting = 0; - std::vector child_slots; - for (auto & slot : slots) { - if (slot.task->id == slot.task->id_parent) { - n_waiting++; - child_slots.push_back(&slot); - } - } - - // we can only proceed if all "child" slots are having the correct tasks - if (n_waiting < slot.task->n_children) { - continue; - } - - // copy state to the child slots - for (auto & child : child_slots) { - SLT_INF(*child, "copying state from slot %d to child %d\n", slot.id, child->id); - slot.copy_state_to(*child); - child->state = SLOT_STATE_DONE_PROMPT; - } - - slot.state = SLOT_STATE_DONE_PROMPT; - } - } - if (batch.n_tokens == 0) { SRV_WRN("%s", "no tokens to decode\n"); return; @@ -2349,7 +2326,38 @@ struct server_context_impl { // on successful decode, restore the original batch size n_batch = llama_n_batch(ctx); + // may need to copy state to other slots for (auto & slot : slots) { + if (slot.state == SLOT_STATE_WAIT_OTHER) { + GGML_ASSERT(slot.task->n_children > 0); + + size_t n_waiting = 0; + std::vector child_slots; + for (auto & other : slots) { + if (!other.is_processing()) { + continue; + } + if (slot.task->id == other.task->id_parent) { + n_waiting++; + child_slots.push_back(&other); + } + } + + // we can only proceed if all "child" slots are having the correct tasks + if (n_waiting < slot.task->n_children) { + continue; + } + + // copy state to the child slots + for (auto & child : child_slots) { + SLT_INF(*child, "copying state from slot %d to child %d\n", slot.id, child->id); + slot.copy_state_to(*child); + child->state = SLOT_STATE_DONE_PROMPT; + } + + slot.state = SLOT_STATE_DONE_PROMPT; + } + // optionally send prompt processing progress if (slot.state == SLOT_STATE_PROCESSING_PROMPT || slot.state == SLOT_STATE_DONE_PROMPT) { if (slot.task->params.stream && slot.task->params.return_progress) { @@ -2635,11 +2643,12 @@ static std::unique_ptr handle_completions_impl( } tasks.reserve(inputs.size()); states.reserve(inputs.size()); + int idx = 0; for (size_t i = 0; i < inputs.size(); i++) { server_task task = server_task(type); task.id = ctx_server.queue_tasks.get_new_id(); - task.index = i; + task.index = idx++; task.tokens = std::move(inputs[i]); task.params = server_task::params_from_json_cmpl( @@ -2654,6 +2663,18 @@ static std::unique_ptr handle_completions_impl( task.params.oaicompat_model = ctx_server.model_name; states.push_back(task.params.oaicompat_chat_syntax); + if (task.params.n_cmpl > 1) { + task.n_children = task.params.n_cmpl - 1; + for 
(size_t j = 0; j < task.n_children; j++) { + server_task child = task.create_child( + task.id, + ctx_server.queue_tasks.get_new_id(), + idx++); + states.push_back(child.params.oaicompat_chat_syntax); + tasks.push_back(std::move(child)); + } + } + tasks.push_back(std::move(task)); } diff --git a/tools/server/server-task.cpp b/tools/server/server-task.cpp index df066264778..8a84c4ec539 100644 --- a/tools/server/server-task.cpp +++ b/tools/server/server-task.cpp @@ -175,6 +175,7 @@ task_params server_task::params_from_json_cmpl( params.n_indent = json_value(data, "n_indent", defaults.n_indent); params.n_keep = json_value(data, "n_keep", defaults.n_keep); params.n_discard = json_value(data, "n_discard", defaults.n_discard); + params.n_cmpl = json_value(data, "n_cmpl", defaults.n_cmpl); //params.t_max_prompt_ms = json_value(data, "t_max_prompt_ms", defaults.t_max_prompt_ms); // TODO: implement params.t_max_predict_ms = json_value(data, "t_max_predict_ms", defaults.t_max_predict_ms); params.response_fields = json_value(data, "response_fields", std::vector()); @@ -453,6 +454,10 @@ task_params server_task::params_from_json_cmpl( } } + if (params.n_cmpl > params_base.n_parallel) { + throw std::runtime_error("n_cmpl cannot be greater than the number of slots, please increase -np"); + } + return params; } diff --git a/tools/server/server-task.h b/tools/server/server-task.h index 5fbc6348b77..4e4840fc83b 100644 --- a/tools/server/server-task.h +++ b/tools/server/server-task.h @@ -53,6 +53,7 @@ struct task_params { int32_t n_discard = 0; // number of tokens after n_keep that may be discarded when shifting context, 0 defaults to half int32_t n_predict = -1; // new tokens to predict int32_t n_indent = 0; // minimum line indentation for the generated text in number of whitespace characters + int32_t n_cmpl = 1; // number of completions to generate from this prompt int64_t t_max_prompt_ms = -1; // TODO: implement int64_t t_max_predict_ms = -1; // if positive, limit the generation phase to this time limit @@ -134,6 +135,17 @@ struct server_task { } return ids; } + + server_task create_child(int id_parent, int id_child, int idx) const { + server_task copy; + copy.id = id_child; + copy.index = idx; + copy.id_parent = id_parent; + copy.params = params; + copy.type = type; + copy.tokens = tokens.clone(); + return copy; + } }; struct result_timings { From bf33d13b64bdeaecbd828053f4d51eefa4fed5dd Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Fri, 5 Dec 2025 10:04:46 +0100 Subject: [PATCH 03/11] fix invalid batch --- tools/server/server-context.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp index 136a374c182..15e9ad19e1e 100644 --- a/tools/server/server-context.cpp +++ b/tools/server/server-context.cpp @@ -386,6 +386,7 @@ struct server_slot { } void copy_state_to(server_slot & other) const { + llama_memory_seq_rm(llama_get_memory(ctx), other.id, 0, -1); llama_memory_seq_cp(llama_get_memory(ctx), id, other.id, 0, -1); other.n_decoded = n_decoded; other.n_remaining = n_remaining; From a768a5e871c997e41a62b5c5cb02106057cf7563 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Fri, 5 Dec 2025 10:33:51 +0100 Subject: [PATCH 04/11] format oai --- tools/server/server-context.cpp | 17 +++++++++++++++-- tools/server/server-task.cpp | 2 +- 2 files changed, 16 insertions(+), 3 deletions(-) diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp index 15e9ad19e1e..98e70e967db 100644 --- a/tools/server/server-context.cpp 
+++ b/tools/server/server-context.cpp @@ -2702,8 +2702,21 @@ static std::unique_ptr handle_completions_impl( GGML_ASSERT(dynamic_cast(res.get()) != nullptr); arr.push_back(res->to_json()); } - // if single request, return single object instead of array - res->ok(arr.size() == 1 ? arr[0] : arr); + GGML_ASSERT(!arr.empty() && "empty results"); + if (arr.size() == 1) { + // if single request, return single object instead of array + res->ok(arr[0]); + } else if (res_type == TASK_RESPONSE_TYPE_OAI_CHAT || res_type == TASK_RESPONSE_TYPE_OAI_CMPL) { + // if multiple results in OAI format, we need to re-format them + json & choices = arr[0]["choices"]; + for (size_t i = 1; i < arr.size(); i++) { + choices.push_back(std::move(arr[i]["choices"][0])); + } + res->ok(arr[0]); + } else { + // multi-results, non-OAI compat + res->ok(arr); + } } } else { // in streaming mode, the first error must be treated as non-stream response diff --git a/tools/server/server-task.cpp b/tools/server/server-task.cpp index 8a84c4ec539..57a830fea1a 100644 --- a/tools/server/server-task.cpp +++ b/tools/server/server-task.cpp @@ -1069,7 +1069,7 @@ json server_task_result_cmpl_partial::to_json_oaicompat_chat() { {"choices", json::array({ json { {"finish_reason", nullptr}, - {"index", 0}, + {"index", index}, {"delta", delta}, }, })}, From 5cc3156f096bb2d981165d823d276e9ec942fb3b Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Fri, 5 Dec 2025 16:28:09 +0100 Subject: [PATCH 05/11] clean up --- tools/server/server-context.cpp | 56 ++++++++++++++------------------- tools/server/server-task.cpp | 2 +- 2 files changed, 25 insertions(+), 33 deletions(-) diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp index 98e70e967db..efa4923ae41 100644 --- a/tools/server/server-context.cpp +++ b/tools/server/server-context.cpp @@ -255,6 +255,14 @@ struct server_slot { generated_token_probs.push_back(token); } + bool is_parent() const { + return is_processing() && task->n_children > 0; + } + + bool is_child() const { + return is_processing() && task->id_parent >= 0; + } + void release() { if (is_processing()) { GGML_ASSERT(task); @@ -1034,7 +1042,9 @@ struct server_context_impl { slot.task = std::make_unique(std::move(task)); - slot.state = SLOT_STATE_STARTED; + slot.state = slot.is_child() + ? SLOT_STATE_WAIT_OTHER // wait for the parent to process prompt + : SLOT_STATE_STARTED; SLT_INF(slot, "%s", "processing task\n"); @@ -1790,12 +1800,6 @@ struct server_context_impl { if (slot.state == SLOT_STATE_PROCESSING_PROMPT || slot.state == SLOT_STATE_STARTED) { const auto & input_tokens = slot.task->tokens; - if (slot.task->id_parent >= 0) { - slot.state = SLOT_STATE_PROCESSING_PROMPT; - // do nothing, parent slot will handle prompt processing - continue; - } - // TODO: maybe move branch to outside of this loop in the future if (slot.state == SLOT_STATE_STARTED) { slot.t_start_process_prompt = ggml_time_us(); @@ -2161,9 +2165,7 @@ struct server_context_impl { // entire prompt has been processed if (slot.prompt.n_tokens() == slot.task->n_tokens()) { - slot.state = slot.task->n_children == 0 - ? 
SLOT_STATE_DONE_PROMPT // state not being reused by any other slots - : SLOT_STATE_WAIT_OTHER; + slot.state = SLOT_STATE_DONE_PROMPT; GGML_ASSERT(batch.n_tokens > 0); @@ -2327,36 +2329,26 @@ struct server_context_impl { // on successful decode, restore the original batch size n_batch = llama_n_batch(ctx); - // may need to copy state to other slots for (auto & slot : slots) { - if (slot.state == SLOT_STATE_WAIT_OTHER) { - GGML_ASSERT(slot.task->n_children > 0); - - size_t n_waiting = 0; + // may need to copy state to other slots + if (slot.state == SLOT_STATE_DONE_PROMPT && slot.is_parent()) { std::vector child_slots; for (auto & other : slots) { - if (!other.is_processing()) { - continue; - } - if (slot.task->id == other.task->id_parent) { - n_waiting++; + if (other.state == SLOT_STATE_WAIT_OTHER && slot.task->id == other.task->id_parent) { child_slots.push_back(&other); } } - // we can only proceed if all "child" slots are having the correct tasks - if (n_waiting < slot.task->n_children) { - continue; - } - - // copy state to the child slots - for (auto & child : child_slots) { - SLT_INF(*child, "copying state from slot %d to child %d\n", slot.id, child->id); - slot.copy_state_to(*child); - child->state = SLOT_STATE_DONE_PROMPT; + // we can only proceed if all child slots are having the correct tasks + if (child_slots.size() == slot.task->n_children) { + // copy state to the child slots + for (auto & child : child_slots) { + SLT_INF(slot, "copying state to child %d\n", child->id); + slot.copy_state_to(*child); + child->state = SLOT_STATE_DONE_PROMPT; + } + slot.state = SLOT_STATE_DONE_PROMPT; } - - slot.state = SLOT_STATE_DONE_PROMPT; } // optionally send prompt processing progress diff --git a/tools/server/server-task.cpp b/tools/server/server-task.cpp index 57a830fea1a..b2e61a81130 100644 --- a/tools/server/server-task.cpp +++ b/tools/server/server-task.cpp @@ -669,7 +669,7 @@ json server_task_result_cmpl_final::to_json_oaicompat_chat() { json choice { {"finish_reason", finish_reason}, - {"index", 0}, + {"index", index}, {"message", msg.to_json_oaicompat()}, }; From 2a7728f56f51077aee0631f406d6b4ed46c4c174 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Fri, 5 Dec 2025 16:30:30 +0100 Subject: [PATCH 06/11] disable ctx shift --- tools/server/server-context.cpp | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp index efa4923ae41..7ef25790a20 100644 --- a/tools/server/server-context.cpp +++ b/tools/server/server-context.cpp @@ -255,6 +255,7 @@ struct server_slot { generated_token_probs.push_back(token); } + // note: a slot can also be either a parent or a child bool is_parent() const { return is_processing() && task->n_children > 0; } @@ -1706,6 +1707,12 @@ struct server_context_impl { GGML_ABORT("not supported by multimodal"); } + if (slot.is_parent() || slot.is_child()) { + send_error(slot, "context shift cannot be used for shared prompt", ERROR_TYPE_SERVER); + slot.release(); + continue; + } + // Shift context int n_keep = slot.task->params.n_keep < 0 ? 
slot.task->n_tokens() : slot.task->params.n_keep; From e06607102b7c30dc33b7f8903db2e47e0c2c3d72 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Fri, 5 Dec 2025 16:34:56 +0100 Subject: [PATCH 07/11] add test --- .../server/tests/unit/test_chat_completion.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/tools/server/tests/unit/test_chat_completion.py b/tools/server/tests/unit/test_chat_completion.py index aa6229c93a5..64f3158b986 100644 --- a/tools/server/tests/unit/test_chat_completion.py +++ b/tools/server/tests/unit/test_chat_completion.py @@ -477,3 +477,22 @@ def make_cmpl_request(): assert last_progress["total"] > 0 assert last_progress["processed"] == last_progress["total"] assert total_batch_count == batch_count + + +def test_chat_completions_multiple_choices(): + global server + server.start() + res = server.make_request("POST", "/chat/completions", data={ + "max_tokens": 8, + "n": 2, + "messages": [ + {"role": "system", "content": "Book"}, + {"role": "user", "content": "What is the best book"}, + ], + }) + assert res.status_code == 200 + assert len(res.body["choices"]) == 2 + for choice in res.body["choices"]: + assert "assistant" == choice["message"]["role"] + assert match_regex("Suddenly", choice["message"]["content"]) + assert choice["finish_reason"] == "length" From 46f6fd26063cefa761edfd378b2a30704c5b7429 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Fri, 5 Dec 2025 16:40:39 +0100 Subject: [PATCH 08/11] update comments --- tools/server/server-context.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp index 7ef25790a20..8e0b58be544 100644 --- a/tools/server/server-context.cpp +++ b/tools/server/server-context.cpp @@ -35,10 +35,10 @@ constexpr int HTTP_POLLING_SECONDS = 1; // state diagram: https://github.com/ggml-org/llama.cpp/pull/9283 enum slot_state { SLOT_STATE_IDLE, - SLOT_STATE_STARTED, // after assigning a task + SLOT_STATE_WAIT_OTHER, // after assigning a task, but waiting for parent slot to process prompt + SLOT_STATE_STARTED, // after assigning a task and about to process prompt SLOT_STATE_PROCESSING_PROMPT, SLOT_STATE_DONE_PROMPT, - SLOT_STATE_WAIT_OTHER, // prompt processed, but waiting for other slots to copy the state SLOT_STATE_GENERATING, }; From b65ee6476759ae9a1d150d456dc520ccc9d08c55 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Fri, 5 Dec 2025 17:34:41 +0100 Subject: [PATCH 09/11] fix style --- tools/server/server-common.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/server/server-common.h b/tools/server/server-common.h index db942ca6ff9..0c4d84ffa06 100644 --- a/tools/server/server-common.h +++ b/tools/server/server-common.h @@ -215,7 +215,7 @@ struct server_tokens { llama_pos pos, int32_t seq_id, size_t & n_tokens_out) const; - + server_tokens clone() const; }; From 6fb3226d0af625085def4bd3d74291d1ae3681ee Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Sat, 6 Dec 2025 11:03:13 +0100 Subject: [PATCH 10/11] add n_cmpl to docs [no ci] --- tools/server/README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/server/README.md b/tools/server/README.md index cb2fbcf8eb7..bf274db79d4 100644 --- a/tools/server/README.md +++ b/tools/server/README.md @@ -493,6 +493,8 @@ Note for `multimodal_data` in JSON object prompts. This should be an array of st `n_keep`: Specify the number of tokens from the prompt to retain when the context size is exceeded and tokens need to be discarded. 
The number excludes the BOS token. By default, this value is set to `0`, meaning no tokens are kept. Use `-1` to retain all tokens from the prompt. +`n_cmpl`: Number of completions to generate from the current prompt. If input has multiple prompts, the output will have N prompts times `n_cmpl` entries. + `stream`: Allows receiving each predicted token in real-time instead of waiting for the completion to finish (uses a different response format). To enable this, set to `true`. `stop`: Specify a JSON array of stopping strings. From ea7f0669fb128cb566abf0d193b4e49bf4ab1666 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Sat, 6 Dec 2025 13:56:48 +0100 Subject: [PATCH 11/11] allowing using both n_cmpl and n --- tools/server/server-common.cpp | 4 ---- tools/server/server-context.cpp | 1 - tools/server/server-task.cpp | 2 +- 3 files changed, 1 insertion(+), 6 deletions(-) diff --git a/tools/server/server-common.cpp b/tools/server/server-common.cpp index 51e76084e6d..b403864e0ee 100644 --- a/tools/server/server-common.cpp +++ b/tools/server/server-common.cpp @@ -757,8 +757,6 @@ json oaicompat_completion_params_parse(const json & body) { llama_params["stop"] = json_value(body, "stop", json::array()); } - llama_params["n_cmpl"] = json_value(body, "n", 1); - // Handle "echo" field if (json_value(body, "echo", false)) { throw std::runtime_error("Only no echo is supported"); @@ -1057,8 +1055,6 @@ json oaicompat_chat_params_parse( llama_params["chat_parser"] = chat_params.parser; } - llama_params["n_cmpl"] = json_value(body, "n", 1); - // Handle "logprobs" field // TODO: The response format of this option is not yet OAI-compatible, but seems like no one really using it; We may need to fix it in the future if (json_value(body, "logprobs", false)) { diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp index 8e0b58be544..12a4e94e5d8 100644 --- a/tools/server/server-context.cpp +++ b/tools/server/server-context.cpp @@ -2354,7 +2354,6 @@ struct server_context_impl { slot.copy_state_to(*child); child->state = SLOT_STATE_DONE_PROMPT; } - slot.state = SLOT_STATE_DONE_PROMPT; } } diff --git a/tools/server/server-task.cpp b/tools/server/server-task.cpp index b2e61a81130..c401f47a788 100644 --- a/tools/server/server-task.cpp +++ b/tools/server/server-task.cpp @@ -175,7 +175,7 @@ task_params server_task::params_from_json_cmpl( params.n_indent = json_value(data, "n_indent", defaults.n_indent); params.n_keep = json_value(data, "n_keep", defaults.n_keep); params.n_discard = json_value(data, "n_discard", defaults.n_discard); - params.n_cmpl = json_value(data, "n_cmpl", defaults.n_cmpl); + params.n_cmpl = json_value(data, "n_cmpl", json_value(data, "n", 1)); //params.t_max_prompt_ms = json_value(data, "t_max_prompt_ms", defaults.t_max_prompt_ms); // TODO: implement params.t_max_predict_ms = json_value(data, "t_max_predict_ms", defaults.t_max_predict_ms); params.response_fields = json_value(data, "response_fields", std::vector());
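For reference, a minimal client-side sketch of what this series enables, mirroring the new test in test_chat_completion.py. It assumes a llama-server built from these patches, listening on the default port 8080 and started with at least "-np 2", since n_cmpl cannot exceed the number of slots; the "requests" library is used only for illustration.

    import requests  # third-party HTTP client, illustration only

    # Ask for two choices generated from the same prompt. The "n" field is
    # accepted directly (patch 11 maps it to n_cmpl in params_from_json_cmpl);
    # server-side, one parent slot processes the prompt and its KV state is
    # copied to a child slot before generation starts.
    resp = requests.post(
        "http://localhost:8080/chat/completions",
        json={
            "n": 2,           # number of completions from one prompt
            "max_tokens": 8,
            "messages": [
                {"role": "system", "content": "Book"},
                {"role": "user", "content": "What is the best book"},
            ],
        },
    )
    body = resp.json()
    assert len(body["choices"]) == 2  # one choice per requested completion
    for choice in body["choices"]:
        # "index" distinguishes the choices (see the to_json_oaicompat_chat changes)
        print(choice["index"], choice["message"]["content"])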