@@ -77,7 +77,7 @@ Json::Value create_embedding_payload(const std::vector<float>& embedding,
   return dataItem;
 }
 
-std::string create_full_return_json(const std::string& id,
+Json::Value create_full_return_json(const std::string& id,
                                     const std::string& model,
                                     const std::string& content,
                                     const std::string& system_fingerprint,
@@ -110,9 +110,7 @@ std::string create_full_return_json(const std::string& id,
   usage["total_tokens"] = prompt_tokens + completion_tokens;
   root["usage"] = usage;
 
-  Json::StreamWriterBuilder writer;
-  writer["indentation"] = "";  // Compact output
-  return Json::writeString(writer, root);
+  return root;
 }
 
 std::string create_return_json(const std::string& id, const std::string& model,
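
Since create_full_return_json now returns the Json::Value itself, serialization moves to the call site. A minimal sketch of how a caller could still produce the compact string form, reusing the StreamWriterBuilder settings removed above (variable names here are illustrative):

    Json::Value root = create_full_return_json(id, model, content,
                                               system_fingerprint,
                                               prompt_tokens, completion_tokens);
    Json::StreamWriterBuilder writer;
    writer["indentation"] = "";  // compact output, as the old helper did
    std::string body = Json::writeString(writer, root);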
@@ -422,7 +420,6 @@ void llamaCPP::InferenceImpl(
     });
   } else {
     Json::Value respData;
-    auto resp = nitro_utils::nitroHttpResponse();
     int task_id = llama.request_completion(data, false, false, -1);
     LOG_INFO_REQUEST(request_id) << "Non stream, waiting for respone";
     if (!json_value(data, "stream", false)) {
@@ -431,16 +428,14 @@ void llamaCPP::InferenceImpl(
       if (!result.error && result.stop) {
         int prompt_tokens = result.result_json["tokens_evaluated"];
         int predicted_tokens = result.result_json["tokens_predicted"];
-        std::string full_return =
-            create_full_return_json(nitro_utils::generate_random_string(20),
-                                    "_", result.result_json["content"], "_",
-                                    prompt_tokens, predicted_tokens);
-        resp->setBody(full_return);
+        respData = create_full_return_json(nitro_utils::generate_random_string(20),
+                                           "_", result.result_json["content"], "_",
+                                           prompt_tokens, predicted_tokens);
       } else {
         respData["message"] = "Internal error during inference";
-        resp = nitro_utils::nitroHttpJsonResponse(respData);
         LOG_ERROR_REQUEST(request_id) << "Error during inference";
       }
+      auto resp = nitro_utils::nitroHttpJsonResponse(respData);
       callback(resp);
       LOG_INFO_REQUEST(request_id) << "Inference completed";
     }
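
Net effect of this hunk: both the success and the error path of the non-stream branch now fill the same Json::Value, and the response object is created exactly once after the if/else. A condensed sketch of the resulting flow (arguments elided):

    Json::Value respData;
    if (!result.error && result.stop) {
      respData = create_full_return_json(/* id, model, content, ... */);
    } else {
      respData["message"] = "Internal error during inference";
    }
    auto resp = nitro_utils::nitroHttpJsonResponse(respData);
    callback(resp);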
@@ -496,7 +491,6 @@ void llamaCPP::EmbeddingImpl(
       }
     }
 
-    auto resp = nitro_utils::nitroHttpResponse();
     Json::Value root;
     root["data"] = responseData;
     root["model"] = "_";
@@ -506,8 +500,7 @@ void llamaCPP::EmbeddingImpl(
     usage["total_tokens"] = 0;
     root["usage"] = usage;
 
-    resp->setBody(Json::writeString(Json::StreamWriterBuilder(), root));
-    resp->setContentTypeString("application/json");
+    auto resp = nitro_utils::nitroHttpJsonResponse(root);
     callback(resp);
     LOG_INFO_REQUEST(request_id) << "Embedding completed";
   });
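
Both handlers now delegate body serialization and the Content-Type header to nitro_utils::nitroHttpJsonResponse. Its definition is not part of this diff; assuming it is a thin wrapper over Drogon's JSON response factory (Nitro is Drogon-based), it might look roughly like this hypothetical sketch:

    #include <drogon/HttpResponse.h>
    #include <json/json.h>

    // Hypothetical sketch; the real helper lives in nitro_utils and may differ.
    inline drogon::HttpResponsePtr nitroHttpJsonResponse(const Json::Value& data) {
      // newHttpJsonResponse serializes the value and sets
      // Content-Type: application/json, replacing the manual
      // setBody/setContentTypeString pair removed above.
      return drogon::HttpResponse::newHttpJsonResponse(data);
    }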