@@ -22,8 +22,9 @@ std::shared_ptr<inferenceState> create_inference_state(llamaCPP *instance) {
 // --------------------------------------------
 
 // Function to check if the model is loaded
-void check_model_loaded(llama_server_context &llama, const HttpRequestPtr &req,
-                        std::function<void(const HttpResponsePtr &)> &callback) {
+void check_model_loaded(
+    llama_server_context &llama, const HttpRequestPtr &req,
+    std::function<void(const HttpResponsePtr &)> &callback) {
   if (!llama.model_loaded_external) {
     Json::Value jsonResp;
     jsonResp["message"] =
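
For context between the hunks: a minimal, hedged sketch of how a guard like check_model_loaded is typically completed and used. It assumes plain Drogon APIs (drogon::HttpResponse::newHttpJsonResponse, k409Conflict) and an illustrative message string; the actual file may use the project's nitro_utils response helpers instead.

// Sketch only -- not the exact body from this commit.
// llama_server_context comes from llama.cpp's server example;
// HttpRequestPtr / HttpResponsePtr are Drogon types.
void check_model_loaded(
    llama_server_context &llama, const HttpRequestPtr &req,
    std::function<void(const HttpResponsePtr &)> &callback) {
  if (!llama.model_loaded_external) {
    Json::Value jsonResp;
    jsonResp["message"] = "Model has not been loaded yet";  // illustrative text
    auto resp = drogon::HttpResponse::newHttpJsonResponse(jsonResp);
    resp->setStatusCode(drogon::k409Conflict);  // assumed status; reject early
    callback(resp);
  }
}

The 409 status is an assumption; the point is simply to answer the request with a JSON error instead of dereferencing an unloaded model.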
@@ -299,13 +300,9 @@ void llamaCPP::chatCompletion(
   LOG_INFO << "Current completion text";
   LOG_INFO << formatted_output;
 #endif
-  int task_id;
-
-  LOG_INFO << "Resolved request for task_id:" << task_id;
 
   if (is_streamed) {
     auto state = create_inference_state(this);
-    state->task_id = task_id;
     auto chunked_content_provider =
         [state, data](char *pBuffer, std::size_t nBuffSize) -> std::size_t {
       if (!state->is_streaming) {
@@ -386,9 +383,12 @@ void llamaCPP::chatCompletion(
386383 } else {
387384 Json::Value respData;
388385 auto resp = nitro_utils::nitroHttpResponse ();
386+ int task_id = llama.request_completion (data, false , false , -1 );
387+ LOG_INFO << " sent the non stream, waiting for respone" ;
389388 if (!json_value (data, " stream" , false )) {
390389 std::string completion_text;
391390 task_result result = llama.next_result (task_id);
391+ LOG_INFO << " Here is the result:" << result.error ;
392392 if (!result.error && result.stop ) {
393393 int prompt_tokens = result.result_json [" tokens_evaluated" ];
394394 int predicted_tokens = result.result_json [" tokens_predicted" ];
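
Net effect of the last two hunks: task_id is no longer an uninitialized local that gets logged and copied onto the streaming state; in the non-streamed branch it now comes from llama.request_completion() before llama.next_result() is awaited. A hedged sketch of that blocking flow, using only the calls visible in the diff (the parameter-name comments and the usage payload are assumptions, not the project's exact OpenAI-compatible response):

// Non-streaming completion flow, sketched from the calls shown above.
int task_id = llama.request_completion(data, /*infill=*/false,
                                       /*embedding=*/false, /*multitask_id=*/-1);
task_result result = llama.next_result(task_id);  // blocks until the task finishes

if (!result.error && result.stop) {
  int prompt_tokens = result.result_json["tokens_evaluated"];
  int predicted_tokens = result.result_json["tokens_predicted"];
  Json::Value usage;  // illustrative usage object, not the file's exact payload
  usage["prompt_tokens"] = prompt_tokens;
  usage["completion_tokens"] = predicted_tokens;
  usage["total_tokens"] = prompt_tokens + predicted_tokens;
  // ... assemble the rest of the response JSON and pass it to the callback ...
} else {
  // A failed or truncated task would be reported back as an error response.
}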