
Commit aa39a27

Merge pull request #342 from janhq/313-bug-busy-waiting-is-causing-cpu-usage
313 bug busy waiting is causing cpu usage
2 parents 50fa2e1 + 9a31ee8 commit aa39a27

File tree

2 files changed: +23 -14 lines

  controllers/llamaCPP.cc
  controllers/llamaCPP.h


controllers/llamaCPP.cc

Lines changed: 12 additions & 9 deletions
@@ -153,7 +153,7 @@ void llamaCPP::chatCompletion(
     const HttpRequestPtr &req,
     std::function<void(const HttpResponsePtr &)> &&callback) {
 
-  if (!model_loaded) {
+  if (!llama.model_loaded_external) {
     Json::Value jsonResp;
     jsonResp["message"] =
         "Model has not been loaded, please load model into nitro";
@@ -391,7 +391,7 @@ void llamaCPP::unloadModel(
     std::function<void(const HttpResponsePtr &)> &&callback) {
   Json::Value jsonResp;
   jsonResp["message"] = "No model loaded";
-  if (model_loaded) {
+  if (llama.model_loaded_external) {
     stopBackgroundTask();
 
     llama_free(llama.ctx);
@@ -408,7 +408,7 @@ void llamaCPP::modelStatus(
     const HttpRequestPtr &req,
     std::function<void(const HttpResponsePtr &)> &&callback) {
   Json::Value jsonResp;
-  bool is_model_loaded = this->model_loaded;
+  bool is_model_loaded = llama.model_loaded_external;
   if (is_model_loaded) {
     jsonResp["model_loaded"] = is_model_loaded;
     jsonResp["model_data"] = llama.get_model_props().dump();
@@ -456,7 +456,7 @@ bool llamaCPP::loadModelImpl(const Json::Value &jsonBody) {
       log_enable();
       std::string llama_log_folder = jsonBody["llama_log_folder"].asString();
       log_set_target(llama_log_folder + "llama.log");
-    } // Set folder for llama log
+    } // Set folder for llama log
   }
 #ifdef GGML_USE_CUBLAS
   LOG_INFO << "Setting up GGML CUBLAS PARAMS";
@@ -483,7 +483,9 @@ bool llamaCPP::loadModelImpl(const Json::Value &jsonBody) {
     return false; // Indicate failure
   }
   llama.initialize();
-  model_loaded = true;
+
+  llama.model_loaded_external = true;
+
   LOG_INFO << "Started background task here!";
   backgroundThread = std::thread(&llamaCPP::backgroundTask, this);
   warmupModel();
@@ -494,7 +496,7 @@ void llamaCPP::loadModel(
     const HttpRequestPtr &req,
     std::function<void(const HttpResponsePtr &)> &&callback) {
 
-  if (model_loaded) {
+  if (llama.model_loaded_external) {
     LOG_INFO << "model loaded";
     Json::Value jsonResp;
     jsonResp["message"] = "Model already loaded";
@@ -522,7 +524,7 @@ void llamaCPP::loadModel(
 }
 
 void llamaCPP::backgroundTask() {
-  while (model_loaded) {
+  while (llama.model_loaded_external) {
     // model_loaded =
     llama.update_slots();
   }
@@ -533,8 +535,9 @@ void llamaCPP::backgroundTask() {
 }
 
 void llamaCPP::stopBackgroundTask() {
-  if (model_loaded) {
-    model_loaded = false;
+  if (llama.model_loaded_external) {
+    llama.model_loaded_external = false;
+    llama.condition_tasks.notify_one();
     LOG_INFO << "changed to false";
     if (backgroundThread.joinable()) {
       backgroundThread.join();
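
The controller-side changes follow a common start/stop pattern for a worker thread: loading the model sets llama.model_loaded_external and spawns backgroundThread, and stopBackgroundTask() clears the flag, notifies condition_tasks so a worker blocked in a wait can re-check the flag, and then joins the thread. Below is a minimal, self-contained sketch of that pattern; the class and member names are illustrative, not the actual nitro/llamaCPP API.

```cpp
#include <atomic>
#include <condition_variable>
#include <mutex>
#include <thread>

// Minimal sketch of the start/stop pattern; names are illustrative only.
class BackgroundWorker {
 public:
  void start() {
    running_ = true;
    worker_ = std::thread([this] { loop(); });
  }

  // Producer side: mark work pending and wake the worker.
  void submit() {
    {
      std::lock_guard<std::mutex> lk(mutex_);
      has_work_ = true;
    }
    cv_.notify_one();
  }

  // Same sequence as stopBackgroundTask(): clear the flag, wake the worker
  // so it can observe the flag instead of sleeping forever, then join.
  void stop() {
    {
      std::lock_guard<std::mutex> lk(mutex_);
      running_ = false;
    }
    cv_.notify_one();
    if (worker_.joinable()) {
      worker_.join();
    }
  }

 private:
  void loop() {
    while (running_) {
      std::unique_lock<std::mutex> lock(mutex_);
      // Block until there is work or a shutdown request; no polling.
      cv_.wait(lock, [this] { return has_work_ || !running_; });
      if (!running_) {
        break;
      }
      has_work_ = false;
      // ... do one unit of work here (update_slots() in the real code) ...
    }
  }

  std::atomic<bool> running_{false};
  bool has_work_ = false;
  std::mutex mutex_;
  std::condition_variable cv_;
  std::thread worker_;
};

int main() {
  BackgroundWorker w;
  w.start();
  w.submit();
  w.stop();  // returns promptly even when no work is pending
}
```

In the sketch, clearing the flag while holding the mutex before notifying is the textbook way to avoid a lost wakeup, where the worker evaluates the wait predicate just before the flag flips and then blocks without ever seeing the notification.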

controllers/llamaCPP.h

Lines changed: 11 additions & 5 deletions
@@ -503,6 +503,9 @@ struct llama_server_context {
   int32_t id_gen;
   int32_t n_ctx; // total context for all clients / slots
 
+  // Internal
+  std::atomic<bool> model_loaded_external = false;
+
   // system prompt
   bool system_need_update = false;
 
@@ -1538,10 +1541,13 @@ struct llama_server_context {
             "cache\n");
         kv_cache_clear();
       }
-      std::this_thread::sleep_for(std::chrono::milliseconds(5));
-      // TODO: Need to implement queueing using CV for better performance
-      // std::unique_lock<std::mutex> lock(mutex_tasks);
-      // condition_tasks.wait(lock, [&] { return !queue_tasks.empty(); });
+      // std::this_thread::sleep_for(std::chrono::milliseconds(5));
+      // TODO: Need to implement queueing using CV for better performance
+      std::unique_lock<std::mutex> lock(mutex_tasks);
+      condition_tasks.wait(lock, [&] {
+        return (!queue_tasks.empty() && model_loaded_external) ||
+               (!model_loaded_external);
+      });
     }
 
     for (llama_client_slot &slot : slots) {
@@ -2554,7 +2560,7 @@ class llamaCPP : public drogon::HttpController<llamaCPP> {
 
  private:
   llama_server_context llama;
-  std::atomic<bool> model_loaded = false;
+  //std::atomic<bool> model_loaded = false;
   size_t sent_count = 0;
   size_t sent_token_probs_index = 0;
   std::thread backgroundThread;
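
The header change is where the CPU fix actually lands: instead of waking every 5 ms via std::this_thread::sleep_for to poll for work, update_slots() now blocks on condition_tasks until either a task is queued while the model is loaded, or the model is being unloaded (so the loop can exit). A condensed sketch of that producer/consumer handshake follows; it reuses the member names from the diff, but the task type and helper functions are illustrative and not part of the real llama_server_context.

```cpp
#include <atomic>
#include <condition_variable>
#include <deque>
#include <mutex>

// Condensed sketch of the queue/wait handshake shown in the diff above.
// queue_tasks, mutex_tasks, condition_tasks, and model_loaded_external follow
// the diff; task_t and the helper functions are illustrative stand-ins.
struct task_t { int id; };

struct server_context_sketch {
  std::deque<task_t> queue_tasks;
  std::mutex mutex_tasks;
  std::condition_variable condition_tasks;
  std::atomic<bool> model_loaded_external{false};

  // Producer side (e.g. an HTTP handler queueing an inference request):
  // push under the mutex, then notify so a blocked update loop wakes up.
  void push_task(task_t t) {
    {
      std::lock_guard<std::mutex> lock(mutex_tasks);
      queue_tasks.push_back(t);
    }
    condition_tasks.notify_one();
  }

  // Consumer side, as in update_slots(): instead of sleeping 5 ms per
  // iteration, block until there is either work to do or an unload request.
  void wait_for_work() {
    std::unique_lock<std::mutex> lock(mutex_tasks);
    condition_tasks.wait(lock, [&] {
      return (!queue_tasks.empty() && model_loaded_external) ||
             (!model_loaded_external);
    });
  }
};

int main() {
  server_context_sketch ctx;
  ctx.model_loaded_external = true;
  ctx.push_task({1});
  ctx.wait_for_work();  // returns immediately: a task is already queued
}
```

The second half of the predicate, (!model_loaded_external), is what lets stopBackgroundTask() in controllers/llamaCPP.cc wake the loop during unload even when the task queue is empty.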
