Commit 67d62a5

Partially updated examples.
1 parent e022286 commit 67d62a5

9 files changed: +41, -40 lines changed

examples/llama.android/llama/src/main/cpp/llama-android.cpp

Lines changed: 5 additions & 5 deletions

@@ -53,7 +53,7 @@
 // auto path_to_model = env->GetStringUTFChars(filename, 0);
 // LOGi("Loading model from %s", path_to_model);
 //
-// auto model = llama_load_model_from_file(path_to_model, model_params);
+// auto model = llama_model_load_from_file(path_to_model, model_params);
 // env->ReleaseStringUTFChars(filename, path_to_model);
 //
 // if (!model) {
@@ -91,12 +91,12 @@
 // ctx_params.n_threads = n_threads;
 // ctx_params.n_threads_batch = n_threads;
 //
-// llama_context * context = llama_new_context_with_model(model, ctx_params);
+// llama_context * context = llama_init_from_model(model, ctx_params);
 //
 // if (!context) {
-//     LOGe("llama_new_context_with_model() returned null)");
+//     LOGe("llama_init_from_model() returned null)");
 //     env->ThrowNew(env->FindClass("java/lang/IllegalStateException"),
-//                   "llama_new_context_with_model() returned null)");
+//                   "llama_init_from_model() returned null)");
 //     return 0;
 // }
 //
@@ -374,7 +374,7 @@
 // const auto new_token_id = llama_sample_token_greedy(context, &candidates_p);
 //
 // const auto n_cur = env->CallIntMethod(intvar_ncur, la_int_var_value);
-// if (llama_token_is_eog(model, new_token_id) || n_cur == n_len) {
+// if (llama_vocab_is_eog(model, new_token_id) || n_cur == n_len) {
     // return nullptr;
 // }
 //
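Migration note: these edits track the upstream llama.cpp refactor in which llama_load_model_from_file and llama_new_context_with_model were renamed to llama_model_load_from_file and llama_init_from_model. A minimal standalone sketch of the updated load path, assuming the post-refactor llama.cpp headers (the model path and the cleanup calls are illustrative, not part of this commit):

#include "llama.h"

int main() {
    // Default parameters; override individual fields before loading as needed.
    llama_model_params model_params = llama_model_default_params();
    llama_context_params ctx_params = llama_context_default_params();

    // New name: llama_model_load_from_file (was llama_load_model_from_file).
    llama_model * model = llama_model_load_from_file("model.gguf", model_params);
    if (!model) {
        return 1;
    }

    // New name: llama_init_from_model (was llama_new_context_with_model).
    llama_context * ctx = llama_init_from_model(model, ctx_params);
    if (!ctx) {
        llama_model_free(model);
        return 1;
    }

    llama_free(ctx);
    llama_model_free(model);
    return 0;
}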

examples/nexa-omni-audio/main-encode.cpp

Lines changed: 2 additions & 1 deletion

@@ -2,7 +2,8 @@
 #include "common-nexa.h"
 
 #include "whisper.h"
-#include "grammar-parser.h"
+// #include "grammar-parser.h"
+#include "llama-grammar.h"
 
 #include <cmath>
 #include <fstream>

examples/nexa-omni-audio/omni.cpp

Lines changed: 7 additions & 7 deletions

@@ -628,7 +628,7 @@ static const char * sample(struct common_sampler * ctx_sampling,
     const llama_token id = common_sampler_sample(ctx_sampling, ctx_llama, -1);
     common_sampler_accept(ctx_sampling, id, true);
     static std::string ret;
-    if (llama_token_is_eog(llama_get_model(ctx_llama), id)) {
+    if (llama_vocab_is_eog(llama_model_get_vocab(llama_get_model(ctx_llama)), id)) {
         ret = "</s>";
     } else {
         ret = common_token_to_piece(ctx_llama, id);
@@ -661,7 +661,7 @@ struct omni_context *omni_init_context(omni_context_params &params)
 
     llama_model_params model_params = common_model_params_to_llama(all_params.gpt);
 
-    llama_model *model = llama_load_model_from_file(all_params.gpt.model.c_str(), model_params);
+    llama_model *model = llama_model_load_from_file(all_params.gpt.model.c_str(), model_params);
     if (model == NULL)
     {
         LLAMA_LOG_ERROR("%s: unable to load model\n", __func__);
@@ -671,7 +671,7 @@ struct omni_context *omni_init_context(omni_context_params &params)
     llama_context_params ctx_params = common_context_params_to_llama(all_params.gpt);
     ctx_params.n_ctx = all_params.gpt.n_ctx < 2048 ? 2048 : all_params.gpt.n_ctx; // we need a longer context size to process image embeddings
 
-    llama_context *ctx_llama = llama_new_context_with_model(model, ctx_params);
+    llama_context *ctx_llama = llama_init_from_model(model, ctx_params);
 
     if (ctx_llama == NULL)
     {
@@ -729,7 +729,7 @@ void omni_free(struct omni_context *ctx_omni)
 
 static bool omni_eval_audio_embed(llama_context *ctx_llama, ggml_tensor *audio_embed, int n_batch, int *n_past)
 {
-    int n_embd = llama_n_embd(llama_get_model(ctx_llama));
+    int n_embd = llama_model_n_embd(llama_get_model(ctx_llama));
 
     int n_audio_embed = audio_embed->ne[1];
     GGML_ASSERT(audio_embed->ne[0] == n_embd);
@@ -829,7 +829,7 @@ const char* omni_process_prompt(struct omni_context *ctx_omni, ggml_tensor *audi
 
     LOG("\n");
 
-    struct common_sampler * ctx_sampling = common_sampler_init(ctx_omni->model, params.gpt.sparams);
+    struct common_sampler * ctx_sampling = common_sampler_init(ctx_omni->model, params.gpt.sampling);
     if (!ctx_sampling) {
         fprintf(stderr, "%s: failed to initialize sampling subsystem\n", __func__);
         exit(1);
@@ -890,14 +890,14 @@ struct omni_streaming {
         : ctx_omni_(ctx), params_(params) {
         dec_cnt_ = 0;
         n_past_ = 0;
-        ctx_sampling_ = common_sampler_init(ctx_omni_->model, params_.gpt.sparams);
+        ctx_sampling_ = common_sampler_init(ctx_omni_->model, params_.gpt.sampling);
     };
 
     int32_t sample() {
         llama_token id = common_sampler_sample(ctx_sampling_, ctx_omni_->ctx_llama, -1);
         common_sampler_accept(ctx_sampling_, id, true);
         static std::string ret_str;
-        if (llama_token_is_eog(llama_get_model(ctx_omni_->ctx_llama), id)) {
+        if (llama_vocab_is_eog(llama_model_get_vocab(llama_get_model(ctx_omni_->ctx_llama)), id)) {
             ret_str = "</s>";
         } else {
             ret_str = common_token_to_piece(ctx_omni_->ctx_llama, id);
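The end-of-generation check now goes through the vocabulary object: llama_token_is_eog(model, id) becomes llama_vocab_is_eog(vocab, id), with the vocab obtained via llama_model_get_vocab. A hedged sketch of the pattern repeated in the sample() helpers above, assuming the post-refactor API (the helper name is illustrative):

#include "llama.h"

// True when the sampled token ends generation (EOS/EOT), new-style API.
static bool is_end_of_generation(llama_context * ctx, llama_token id) {
    const llama_model * model = llama_get_model(ctx);
    const llama_vocab * vocab = llama_model_get_vocab(model);
    return llama_vocab_is_eog(vocab, id); // was: llama_token_is_eog(model, id)
}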

examples/nexa-omni-audio/omni.h

Lines changed: 1 addition & 1 deletion

@@ -2,7 +2,7 @@
 
 #include "whisper.h"
 #include "llama.h"
-#include "grammar-parser.h"
+// #include "grammar-parser.h"
 #include "common.h"
 #include "common-nexa.h"
 

examples/omni-vlm/omni-vlm-cli.cpp

Lines changed: 7 additions & 7 deletions

@@ -52,7 +52,7 @@ static const char * sample(struct common_sampler * ctx_sampling,
     const llama_token id = common_sampler_sample(ctx_sampling, ctx_llama, -1);
     common_sampler_accept(ctx_sampling, id, true);
     static std::string ret;
-    if (llama_token_is_eog(llama_get_model(ctx_llama), id)) {
+    if (llama_vocab_is_eog(llama_model_get_vocab(llama_get_model(ctx_llama)), id)) {
         ret = "</s>";
     } else {
         ret = common_token_to_piece(ctx_llama, id);
@@ -167,10 +167,10 @@ static void process_prompt(struct omnivlm_context * ctx_omnivlm, struct omni_ima
 
     LOG("\n");
 
-    params->sparams.temp = 0.0f;
-    params->sparams.top_k = 1;
-    params->sparams.top_p = 1.0f;
-    struct common_sampler * ctx_sampling = common_sampler_init(ctx_omnivlm->model, params->sparams);
+    params->sampling.temp = 0.0f;
+    params->sampling.top_k = 1;
+    params->sampling.top_p = 1.0f;
+    struct common_sampler * ctx_sampling = common_sampler_init(ctx_omnivlm->model, params->sampling);
     if (!ctx_sampling) {
         LOG_ERR("%s: failed to initialize sampling subsystem\n", __func__);
         exit(1);
@@ -201,7 +201,7 @@ static struct llama_model * omnivlm_init(common_params * params) {
 
     llama_model_params model_params = common_model_params_to_llama(*params);
 
-    llama_model * model = llama_load_model_from_file(params->model.c_str(), model_params);
+    llama_model * model = llama_model_load_from_file(params->model.c_str(), model_params);
     if (model == NULL) {
         LOG_ERR("%s: unable to load model\n" , __func__);
         return NULL;
@@ -223,7 +223,7 @@ static struct omnivlm_context * omnivlm_init_context(common_params * params, lla
     llama_context_params ctx_params = common_context_params_to_llama(*params);
     ctx_params.n_ctx = params->n_ctx < 2048 ? 2048 : params->n_ctx; // we need a longer context size to process image embeddings
 
-    llama_context * ctx_llama = llama_new_context_with_model(model, ctx_params);
+    llama_context * ctx_llama = llama_init_from_model(model, ctx_params);
 
     if (ctx_llama == NULL) {
         LOG_ERR("%s: failed to create the llama_context\n" , __func__);
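The sparams -> sampling edits track the rename of the sampling-parameters member of common_params in upstream common (the struct itself became common_params_sampling). A sketch of the near-greedy sampler setup shown above, under that assumption; the helper name is illustrative:

#include "common.h"
#include "sampling.h"

// Build a near-greedy sampler using the renamed `sampling` member (was `sparams`).
static common_sampler * make_greedy_sampler(llama_model * model, common_params & params) {
    params.sampling.temp  = 0.0f; // temperature 0: pick the most likely token
    params.sampling.top_k = 1;    // keep only the single best candidate
    params.sampling.top_p = 1.0f; // disable nucleus filtering
    return common_sampler_init(model, params.sampling);
}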

examples/omni-vlm/omni-vlm-wrapper.cpp

Lines changed: 10 additions & 10 deletions

@@ -47,15 +47,15 @@ struct omni_streaming_sample {
         :image_(image) {
         n_past_ = 0;
         dec_cnt_ = 0;
-        params.sparams.top_k = 1;
-        params.sparams.top_p = 1.0f;
-        ctx_sampling_ = common_sampler_init(model, params.sparams);
+        params.sampling.top_k = 1;
+        params.sampling.top_p = 1.0f;
+        ctx_sampling_ = common_sampler_init(model, params.sampling);
     }
 
     int32_t sample() {
         const llama_token id = common_sampler_sample(ctx_sampling_, ctx_omnivlm->ctx_llama, -1);
         common_sampler_accept(ctx_sampling_, id, true);
-        if (llama_token_is_eog(llama_get_model(ctx_omnivlm->ctx_llama), id)) {
+        if (llama_vocab_is_eog(llama_model_get_vocab(llama_get_model(ctx_omnivlm->ctx_llama)), id)) {
             ret_str_ = "</s>";
         } else {
             ret_str_ = common_token_to_piece(ctx_omnivlm->ctx_llama, id);
@@ -97,7 +97,7 @@ static struct llama_model * omnivlm_init(common_params * params) {
 
     llama_model_params model_params = common_model_params_to_llama(*params);
 
-    llama_model * model = llama_load_model_from_file(params->model.c_str(), model_params);
+    llama_model * model = llama_model_load_from_file(params->model.c_str(), model_params);
     if (model == NULL) {
         LOG_ERR("%s: unable to load model\n" , __func__);
         return NULL;
@@ -120,7 +120,7 @@ static struct omnivlm_context * omnivlm_init_context(common_params * params, lla
     llama_context_params ctx_params = common_context_params_to_llama(*params);
     ctx_params.n_ctx = params->n_ctx < 2048 ? 2048 : params->n_ctx; // we need a longer context size to process image embeddings
 
-    llama_context * ctx_llama = llama_new_context_with_model(model, ctx_params);
+    llama_context * ctx_llama = llama_init_from_model(model, ctx_params);
 
     if (ctx_llama == NULL) {
         LOG_ERR("%s: failed to create the llama_context\n" , __func__);
@@ -170,7 +170,7 @@ static const char * sample(struct common_sampler * smpl,
     const llama_token id = common_sampler_sample(smpl, ctx_llama, -1);
     common_sampler_accept(smpl, id, true);
     static std::string ret;
-    if (llama_token_is_eog(llama_get_model(ctx_llama), id)) {
+    if (llama_vocab_is_eog(llama_model_get_vocab(llama_get_model(ctx_llama)), id)) {
         ret = "</s>";
     } else {
         ret = common_token_to_piece(ctx_llama, id);
@@ -206,8 +206,8 @@ static const char* process_prompt(struct omnivlm_context * ctx_omnivlm, struct o
         }
     }
 
-    params->sparams.top_k = 1;
-    params->sparams.top_p = 1.0f;
+    params->sampling.top_k = 1;
+    params->sampling.top_p = 1.0f;
 
     eval_string(ctx_omnivlm->ctx_llama, system_prompt.c_str(), params->n_batch, &n_past, true);
     omnivlm_eval_image_embed(ctx_omnivlm->ctx_llama, image_embed, params->n_batch, &n_past);
@@ -217,7 +217,7 @@ static const char* process_prompt(struct omnivlm_context * ctx_omnivlm, struct o
 
     LOG("\n");
 
-    struct common_sampler * smpl = common_sampler_init(ctx_omnivlm->model, params->sparams);
+    struct common_sampler * smpl = common_sampler_init(ctx_omnivlm->model, params->sampling);
     if (!smpl) {
         LOG_ERR("%s: failed to initialize sampling subsystem\n", __func__);
         exit(1);

examples/qwen2-audio/main-encode.cpp

Lines changed: 1 addition & 1 deletion

@@ -2,7 +2,7 @@
 #include "common-nexa.h"
 
 #include "whisper.h"
-#include "grammar-parser.h"
+// #include "grammar-parser.h"
 
 #include <cmath>
 #include <fstream>

examples/qwen2-audio/qwen2.cpp

Lines changed: 7 additions & 7 deletions

@@ -626,7 +626,7 @@ static const char * sample(struct common_sampler * ctx_sampling,
     const llama_token id = common_sampler_sample(ctx_sampling, ctx_llama, -1);
     common_sampler_accept(ctx_sampling, id, true);
     static std::string ret;
-    if (llama_token_is_eog(llama_get_model(ctx_llama), id)) {
+    if (llama_vocab_is_eog(llama_model_get_vocab(llama_get_model(ctx_llama)), id)) {
         ret = "</s>";
     } else {
         ret = common_token_to_piece(ctx_llama, id);
@@ -659,7 +659,7 @@ struct omni_context *omni_init_context(omni_context_params &params)
 
     llama_model_params model_params = common_model_params_to_llama(all_params.gpt);
 
-    llama_model *model = llama_load_model_from_file(all_params.gpt.model.c_str(), model_params);
+    llama_model *model = llama_model_load_from_file(all_params.gpt.model.c_str(), model_params);
     if (model == NULL)
     {
         LLAMA_LOG_ERROR("%s: unable to load model\n", __func__);
@@ -669,7 +669,7 @@ struct omni_context *omni_init_context(omni_context_params &params)
     llama_context_params ctx_params = common_context_params_to_llama(all_params.gpt);
     ctx_params.n_ctx = all_params.gpt.n_ctx < 2048 ? 2048 : all_params.gpt.n_ctx; // we need a longer context size to process image embeddings
 
-    llama_context *ctx_llama = llama_new_context_with_model(model, ctx_params);
+    llama_context *ctx_llama = llama_init_from_model(model, ctx_params);
 
     if (ctx_llama == NULL)
     {
@@ -730,7 +730,7 @@ void omni_free(struct omni_context *ctx_omni)
 
 static bool omni_eval_audio_embed(llama_context *ctx_llama, ggml_tensor *audio_embed, int n_batch, int *n_past)
 {
-    int n_embd = llama_n_embd(llama_get_model(ctx_llama));
+    int n_embd = llama_model_n_embd(llama_get_model(ctx_llama));
 
     int n_audio_embed = audio_embed->ne[1];
     GGML_ASSERT(audio_embed->ne[0] == n_embd);
@@ -839,7 +839,7 @@ const char* omni_process_prompt(struct omni_context *ctx_omni, ggml_tensor *audi
 
     LOG("\n");
 
-    struct common_sampler * ctx_sampling = common_sampler_init(ctx_omni->model, params.gpt.sparams);
+    struct common_sampler * ctx_sampling = common_sampler_init(ctx_omni->model, params.gpt.sampling);
     if (!ctx_sampling) {
         fprintf(stderr, "%s: failed to initialize sampling subsystem\n", __func__);
         exit(1);
@@ -901,14 +901,14 @@ struct omni_streaming {
         : ctx_omni_(ctx), params_(params) {
         dec_cnt_ = 0;
         n_past_ = 0;
-        ctx_sampling_ = common_sampler_init(ctx_omni_->model, params_.gpt.sparams);
+        ctx_sampling_ = common_sampler_init(ctx_omni_->model, params_.gpt.sampling);
     };
 
     int32_t sample() {
         llama_token id = common_sampler_sample(ctx_sampling_, ctx_omni_->ctx_llama, -1);
         common_sampler_accept(ctx_sampling_, id, true);
         static std::string ret_str;
-        if (llama_token_is_eog(llama_get_model(ctx_omni_->ctx_llama), id)) {
+        if (llama_vocab_is_eog(llama_model_get_vocab(llama_get_model(ctx_omni_->ctx_llama)), id)) {
            ret_str = "</s>";
         } else {
             ret_str = common_token_to_piece(ctx_omni_->ctx_llama, id);
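Similarly, llama_n_embd(model) is now llama_model_n_embd(model). A minimal sketch of the embedding-size check used by omni_eval_audio_embed above, assuming the post-refactor API and the tensor layout the originals assert (ne[0] = embedding width, ne[1] = number of positions); the helper name is illustrative:

#include "ggml.h"
#include "llama.h"

// Check that an audio embedding tensor matches the model's embedding width.
static bool audio_embed_fits(llama_context * ctx, const ggml_tensor * audio_embed) {
    const int n_embd = llama_model_n_embd(llama_get_model(ctx)); // was: llama_n_embd(...)
    return audio_embed->ne[0] == n_embd;
}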

examples/qwen2-audio/qwen2.h

Lines changed: 1 addition & 1 deletion

@@ -2,7 +2,7 @@
 
 #include "whisper.h"
 #include "llama.h"
-#include "grammar-parser.h"
+// #include "grammar-parser.h"
 #include "common.h"
 #include "common-nexa.h"
 
