
Commit 448a8c3

Remove default template and unify template loading (#3722)
1 parent 0fc4507 commit 448a8c3

8 files changed (+114, -116 lines)

ci/lib_search.py

Lines changed: 1 addition & 0 deletions
@@ -153,6 +153,7 @@ def check_dir(start_dir):
         "windows_sign.bat",
         "config.yaml",
         "kserve-openvino.yaml",
+        "dummy_facebook_template.jinja",
     ]
 
     exclude_directories = ['/dist/', 'release_files/thirdparty-licenses', 'extras/chat_template_examples']
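For context: exclusion lists like these are normally consumed by a directory walker that skips the listed files and directories before running the actual check (the check itself is not shown in this diff). A minimal sketch of that pattern in Python, with hypothetical names, follows; the new entry presumably exempts the dummy Jinja template from whatever markers the script scans for.

    import os

    EXCLUDED_FILES = {"dummy_facebook_template.jinja"}
    EXCLUDED_DIRECTORIES = ["/dist/", "release_files/thirdparty-licenses", "extras/chat_template_examples"]

    def files_to_check(start_dir):
        # Walk the tree, skipping excluded directory subtrees and exempt files.
        for root, _, files in os.walk(start_dir):
            if any(excluded in root for excluded in EXCLUDED_DIRECTORIES):
                continue
            for name in files:
                if name in EXCLUDED_FILES:
                    continue
                yield os.path.join(root, name)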

prepare_llm_models.sh

Lines changed: 12 additions & 23 deletions
@@ -20,7 +20,7 @@ if [ -z "$1" ]; then
     exit 1
 fi
 
-CB_MODEL="facebook/opt-125m"
+TEXT_GENERATION_MODEL="facebook/opt-125m"
 TOKENIZER_FILE="openvino_tokenizer.bin"
 LEGACY_MODEL_FILE="1/model.bin"
 EMBEDDING_MODEL="thenlper/gte-small"
@@ -35,23 +35,6 @@ PHI4_MODEL="microsoft/Phi-4-mini-instruct"
 MISTRAL_MODEL="mistralai/Mistral-7B-Instruct-v0.3"
 GPT_OSS="openai/gpt-oss-20b"
 
-MODELS=("$CB_MODEL/$TOKENIZER_FILE" "$RERANK_MODEL/rerank/$LEGACY_MODEL_FILE" "$VLM_MODEL/$TOKENIZER_FILE" "$QWEN3_MODEL/$TOKENIZER_FILE" "$LLAMA3_MODEL/$TOKENIZER_FILE" "$HERMES3_MODEL/$TOKENIZER_FILE" "$PHI4_MODEL/$TOKENIZER_FILE" "$MISTRAL_MODEL/$TOKENIZER_FILE" "$GPT_OSS/$TOKENIZER_FILE" "$EMBEDDING_MODEL/ov/$TOKENIZER_FILE" "$RERANK_MODEL/ov/$TOKENIZER_FILE")
-
-all_exist=true
-for model in "${MODELS[@]}"; do
-    if [ ! -f "$1/$model" ]; then
-        echo "Model file does not exist $1/$model"
-        all_exist=false
-        break
-    fi
-    echo "Model file exist $1/$model"
-done
-
-if $all_exist; then
-    echo "All model directories exist in $1. Skipping downloading models."
-    exit 0
-fi
-
 if [ "$(python3 -c 'import sys; print(sys.version_info[1])')" -le "8" ]; then echo "Prepare models with python > 3.8."; exit 1 ; fi
 
 echo "Downloading LLM testing models to directory $1"
@@ -69,16 +52,22 @@ else
 fi
 mkdir -p $1
 
-if [ -f "$1/$CB_MODEL/$TOKENIZER_FILE" ]; then
-    echo "Models file $1/$CB_MODEL/$TOKENIZER_FILE exists. Skipping downloading models."
+if [ -f "$1/$TEXT_GENERATION_MODEL/$TOKENIZER_FILE" ]; then
+    echo "Models file $1/$TEXT_GENERATION_MODEL/$TOKENIZER_FILE exists. Skipping downloading models."
 else
-    python3 demos/common/export_models/export_model.py text_generation --source_model "$CB_MODEL" --weight-format int8 --model_repository_path $1
+    python3 demos/common/export_models/export_model.py text_generation --source_model "$TEXT_GENERATION_MODEL" --weight-format int8 --model_repository_path $1
 fi
-if [ ! -f "$1/$CB_MODEL/$TOKENIZER_FILE" ]; then
-    echo "[ERROR] Models file $1/$CB_MODEL/$TOKENIZER_FILE does not exist."
+
+if [ ! -f "$1/$TEXT_GENERATION_MODEL/$TOKENIZER_FILE" ]; then
+    echo "[ERROR] Models file $1/$TEXT_GENERATION_MODEL/$TOKENIZER_FILE does not exist."
     exit 1
 fi
 
+if [ ! -f "$1/$TEXT_GENERATION_MODEL/chat_template.jinja" ]; then
+    echo "Copying dummy chat template to $TEXT_GENERATION_MODEL model directory."
+    cp src/test/llm/dummy_facebook_template.jinja "$1/$TEXT_GENERATION_MODEL/chat_template.jinja"
+fi
+
 if [ -f "$1/$VLM_MODEL/$TOKENIZER_FILE" ]; then
     echo "Model file $1/$VLM_MODEL/$TOKENIZER_FILE exists. Skipping downloading models."
 else
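Since the server no longer injects a default chat template, the text-generation test model facebook/opt-125m (which ships without one) must get a template on disk; the new block copies the dummy template only when chat_template.jinja is absent. The same guard, sketched in Python (the helper name and example paths are illustrative):

    import shutil
    from pathlib import Path

    def ensure_chat_template(model_dir: Path, dummy_template: Path) -> None:
        # Copy the dummy template only if the model does not ship its own.
        target = model_dir / "chat_template.jinja"
        if not target.is_file():
            print(f"Copying dummy chat template to {model_dir}")
            shutil.copy(dummy_template, target)

    ensure_chat_template(Path("models/facebook/opt-125m"),
                         Path("src/test/llm/dummy_facebook_template.jinja"))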

src/llm/servable.cpp

Lines changed: 6 additions & 1 deletion
@@ -141,7 +141,12 @@ absl::Status GenAiServable::prepareInputs(std::shared_ptr<GenAiServableExecution
 #else
     ov::genai::ChatHistory& chatHistory = executionContext->apiHandler->getChatHistory();
     constexpr bool add_generation_prompt = true;  // confirm it should be hardcoded
-    inputText = getProperties()->tokenizer.apply_chat_template(chatHistory, add_generation_prompt);
+    try {
+        inputText = getProperties()->tokenizer.apply_chat_template(chatHistory, add_generation_prompt);
+    } catch (const std::exception& e) {
+        SPDLOG_LOGGER_DEBUG(llm_calculator_logger, "Failed to apply chat template: {}", e.what());
+        return absl::Status(absl::StatusCode::kInvalidArgument, "Failed to apply chat template. The model either does not have chat template or has an invalid one.");
+    }
 #endif
     if (inputText.size() == 0) {
         return absl::Status(absl::StatusCode::kInvalidArgument, "Final prompt after applying chat template is empty");
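With this change a request to a model whose chat template is missing or invalid now fails fast with kInvalidArgument instead of being served through a silent fallback template. A hypothetical client-side view (the endpoint path matches OVMS's OpenAI-compatible API; the port and the exact HTTP status mapping of kInvalidArgument are assumptions):

    import requests

    resp = requests.post(
        "http://localhost:8000/v3/chat/completions",
        json={
            "model": "facebook/opt-125m",
            "messages": [{"role": "user", "content": "Hi"}],
        },
    )
    # Without a usable chat template the request is now rejected outright
    # (an error status carrying the message above) rather than rendered
    # with a built-in default template.
    print(resp.status_code, resp.text)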

src/llm/servable_initializer.cpp

Lines changed: 52 additions & 71 deletions
@@ -51,15 +51,17 @@
 namespace ovms {
 
 static const std::string CHAT_TEMPLATE_WARNING_MESSAGE = "Warning: Chat template has not been loaded properly. Servable will not respond to /chat/completions endpoint.";
-static const std::string DEFAULT_CHAT_TEMPLATE = R"({% for message in messages %}{% if message['role'] == 'user' %}{{ 'User: ' + message['content'] }}{% elif message['role'] == 'system' %}{{ '<|system|>\n' + message['content'] + eos_token }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token }}{% endif %}{% endfor %})";
 
 void GenAiServableInitializer::loadChatTemplate(std::shared_ptr<GenAiServableProperties> properties, const std::string& chatTemplateDirectory) {
 #if (PYTHON_DISABLE == 0)
     ExtraGenerationInfo extraGenInfo = readExtraGenerationInfo(properties, chatTemplateDirectory);
     loadPyTemplateProcessor(properties, extraGenInfo);
 #else
-    loadDefaultTemplateProcessorIfNeeded(properties);
+    if (properties->tokenizer.get_chat_template().empty()) {
+        SPDLOG_LOGGER_DEBUG(modelmanager_logger, CHAT_TEMPLATE_WARNING_MESSAGE);
+    }
 #endif
+    // In the non-Python build, GenAI handles chat template loading
 }
 
 #if (PYTHON_DISABLE == 0)
@@ -123,29 +125,37 @@ ExtraGenerationInfo GenAiServableInitializer::readExtraGenerationInfo(std::share
 }
 
 void GenAiServableInitializer::loadPyTemplateProcessor(std::shared_ptr<GenAiServableProperties> properties, const ExtraGenerationInfo& extraGenInfo) {
-    // GGUF models specific validation
-    if (extraGenInfo.isGgufModel) {
-        bool errorFound = false;
-        if (extraGenInfo.eosTokenFromTokenizer.empty()) {
-            SPDLOG_ERROR("Tokenizer eos token not found in tokenizer nor in vocabulary but required for GGUF models.");
-            errorFound = true;
-        }
-        if (extraGenInfo.bosTokenFromTokenizer.empty()) {
-            SPDLOG_ERROR("Tokenizer bos token not found in tokenizer nor in vocabulary but required for GGUF models.");
-            errorFound = true;
-        }
-        if (extraGenInfo.chatTemplateFromTokenizer.empty()) {
-            SPDLOG_ERROR("Tokenizer chat template not found in tokenizer but required for GGUF models.");
-            errorFound = true;
-        }
-        if (errorFound)
-            return;
+    // At this point the tokenizer cannot be uninitialized, as we need to access its methods to prepare for chat template processing
+    if (properties->tokenizer == ov::genai::Tokenizer()) {
+        SPDLOG_LOGGER_ERROR(modelmanager_logger, "Tokenizer is not initialized. Cannot load chat template processor.");
+        return;
+    }
+    std::string chatTemplate = properties->tokenizer.get_original_chat_template();
+    std::string bosToken = properties->tokenizer.get_bos_token();
+    std::string eosToken = properties->tokenizer.get_eos_token();
+    if (bosToken.empty()) {
+        SPDLOG_ERROR("BOS token was not found in model files.");
+        return;
     }
+    if (eosToken.empty()) {
+        SPDLOG_ERROR("EOS token was not found in model files.");
+        return;
+    }
+    if (chatTemplate.empty()) {
+        SPDLOG_ERROR("Chat template was not found in model files.");
+        return;
+    }
+
+    properties->templateProcessor.bosToken = bosToken;
+    properties->templateProcessor.eosToken = eosToken;
+
+    SPDLOG_LOGGER_DEBUG(modelmanager_logger, "Loading Python Jinja template processor with chat template from tokenizer. Bos token: {}, Eos token: {}, chat template: \n{}",
+        bosToken, eosToken, chatTemplate);
+
     py::gil_scoped_acquire acquire;
     try {
-        auto locals = py::dict("tokenizer_template"_a = extraGenInfo.chatTemplateFromTokenizer,
-            "templates_directory"_a = extraGenInfo.chatTemplateDirectory,
-            "is_gguf_model"_a = extraGenInfo.isGgufModel);
+        auto locals = py::dict("chat_template"_a = chatTemplate,
+            "templates_directory"_a = extraGenInfo.chatTemplateDirectory);
         py::exec(R"(
 # Following the logic from:
 # https://github.com/huggingface/transformers/blob/25245ec26dc29bcf6102e1b4ddd0dfd02e720cf5/src/transformers/tokenization_utils_base.py#L1837
@@ -214,71 +224,51 @@ void GenAiServableInitializer::loadPyTemplateProcessor(std::shared_ptr<GenAiServ
         self._rendered_blocks = None
         self._generation_indices = None
 
-
-# Default chat template accepts only single message and outputs only it's 'content'
-# effectively turning it into a regular prompt.
-default_chat_template = "{% for message in messages %}{% if message['role'] == 'user' %}{{ 'User: ' + message['content'] }}{% elif message['role'] == 'system' %}{{ '<|system|>\n' + message['content'] + eos_token }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token }}{% endif %}{% endfor %}"
-
-bos_token = ""
-eos_token = ""
-chat_template = default_chat_template
+
+# Optional dedicated tool chat template (might not be present)
 tool_chat_template = None
 
+# Variables that need to be set by the end of this script execution
 template = None
 tool_template = None
 
-# Try to read template from template.jinja file
-jinja_file = Path(templates_directory + "/chat_template.jinja")
-jinja_file_legacy = Path(templates_directory + "/template.jinja")
+# Load Jinja2 environment
 template_loader = jinja2.FileSystemLoader(searchpath=templates_directory)
 jinja_env = ImmutableSandboxedEnvironment(trim_blocks=True, lstrip_blocks=True, extensions=[AssistantTracker, jinja2.ext.loopcontrols], loader=template_loader)
 jinja_env.policies["json.dumps_kwargs"]["ensure_ascii"] = False
 jinja_env.globals["raise_exception"] = raise_exception
 jinja_env.globals["strftime_now"] = strftime_now
 jinja_env.filters["from_json"] = json.loads
-if jinja_file.is_file():
-    template = jinja_env.get_template("chat_template.jinja")
-elif jinja_file_legacy.is_file():
-    template = jinja_env.get_template("template.jinja")
 
-# Try to read data from tokenizer_config.json
+# Try to read data from tokenizer_config.json to get additional tool chat template if present
 tokenizer_config_file = Path(templates_directory + "/tokenizer_config.json")
 if tokenizer_config_file.is_file():
     f = open(templates_directory + "/tokenizer_config.json", "r", encoding="utf-8")
     data = json.load(f)
-    bos_token = data.get("bos_token", "")
-    bos_token = "" if bos_token is None else bos_token # Null token conversion to empty string.
-    eos_token = data.get("eos_token", "")
-    eos_token = "" if eos_token is None else eos_token # Null token conversion to empty string.
-
-    chat_template = data.get("chat_template", default_chat_template)
-    if isinstance(chat_template, list):
-        for template_entry in chat_template:
+
+    chat_template_from_tokenizer_config = data.get("chat_template", None)
+    if isinstance(chat_template_from_tokenizer_config, list):
+        for template_entry in chat_template_from_tokenizer_config:
             if isinstance(template_entry, dict):
-                if template_entry.get("name") == "default":
-                    chat_template = template_entry.get("template")
-                elif template_entry.get("name") == "tool_use":
+                if template_entry.get("name") == "tool_use":
                     tool_chat_template = template_entry.get("template")
-if template is None:
-    if is_gguf_model and (chat_template == default_chat_template):
-        # GGUF model directory might not contain files with chat template and in that case we use template read from the tokenizer
-        template = jinja_env.from_string(tokenizer_template)
-    else:
-        template = jinja_env.from_string(chat_template)
+
+# Try to read the tool_use.jinja template file from the additional_chat_templates directory if it exists
+additional_templates_dir = Path(templates_directory + "/additional_chat_templates")
+tool_use_template_file = additional_templates_dir / "tool_use.jinja"
+if tool_use_template_file.is_file():
+    with open(tool_use_template_file, "r", encoding="utf-8") as f:
+        tool_chat_template = f.read()
+
+# Load templates from strings
+template = jinja_env.from_string(chat_template)
 if tool_chat_template is not None:
     tool_template = jinja_env.from_string(tool_chat_template)
 else:
     tool_template = template
 )",
             py::globals(), locals);
 
-        if (extraGenInfo.isGgufModel) {
-            properties->templateProcessor.bosToken = extraGenInfo.bosTokenFromTokenizer;
-            properties->templateProcessor.eosToken = extraGenInfo.eosTokenFromTokenizer;
-        } else {
-            properties->templateProcessor.bosToken = locals["bos_token"].cast<std::string>();
-            properties->templateProcessor.eosToken = locals["eos_token"].cast<std::string>();
-        }
         properties->templateProcessor.chatTemplate = std::make_unique<PyObjectWrapper<py::object>>(locals["template"]);
         properties->templateProcessor.toolTemplate = std::make_unique<PyObjectWrapper<py::object>>(locals["tool_template"]);
     } catch (const pybind11::error_already_set& e) {
@@ -298,15 +288,6 @@ void GenAiServableInitializer::loadPyTemplateProcessor(std::shared_ptr<GenAiServ
         SPDLOG_DEBUG("Chat template loading failed with an unexpected error");
     }
 }
-
-#else
-void GenAiServableInitializer::loadDefaultTemplateProcessorIfNeeded(std::shared_ptr<GenAiServableProperties> properties) {
-    const std::string modelChatTemplate = properties->tokenizer.get_chat_template();
-    if (modelChatTemplate.empty()) {
-        SPDLOG_LOGGER_DEBUG(modelmanager_logger, "Could not load model chat template. Using default template.");
-        properties->tokenizer.set_chat_template(DEFAULT_CHAT_TEMPLATE);
-    }
-}
 #endif
 
 Status parseModelsPath(std::string& outPath, std::string modelsPath, std::string graphPath) {
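The net effect is a single resolution order: the main template always comes verbatim from the tokenizer (get_original_chat_template), while an optional tool template is looked up first in the chat_template list of tokenizer_config.json and then, with higher priority, in additional_chat_templates/tool_use.jinja. A condensed, runnable Python sketch of that order (the function name is illustrative, and the sandboxed environment omits the AssistantTracker extension the real script installs):

    import json
    from pathlib import Path

    from jinja2.sandbox import ImmutableSandboxedEnvironment

    def load_templates(chat_template: str, templates_directory: str):
        env = ImmutableSandboxedEnvironment(trim_blocks=True, lstrip_blocks=True)
        tool_chat_template = None

        # 1. tokenizer_config.json may hold a list of named templates;
        #    only the "tool_use" entry is consulted now.
        config = Path(templates_directory) / "tokenizer_config.json"
        if config.is_file():
            data = json.loads(config.read_text(encoding="utf-8"))
            entries = data.get("chat_template")
            if isinstance(entries, list):
                for entry in entries:
                    if isinstance(entry, dict) and entry.get("name") == "tool_use":
                        tool_chat_template = entry.get("template")

        # 2. additional_chat_templates/tool_use.jinja overrides the config entry.
        tool_file = Path(templates_directory) / "additional_chat_templates" / "tool_use.jinja"
        if tool_file.is_file():
            tool_chat_template = tool_file.read_text(encoding="utf-8")

        # 3. The main template comes from the tokenizer; there is no default fallback.
        template = env.from_string(chat_template)
        tool_template = env.from_string(tool_chat_template) if tool_chat_template else template
        return template, tool_template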

src/llm/servable_initializer.hpp

Lines changed: 0 additions & 4 deletions
@@ -52,10 +52,6 @@ class GenAiServableInitializer {
     // Use Python Jinja module for template processing
     static void loadPyTemplateProcessor(std::shared_ptr<GenAiServableProperties> properties, const ExtraGenerationInfo& extraGenInfo);
     static ExtraGenerationInfo readExtraGenerationInfo(std::shared_ptr<GenAiServableProperties> properties, const std::string& chatTemplateDirectory);
-#else
-    // In C++ only version we use GenAI for template processing, but to have the same behavior as in Python-enabled version
-    // we use default template if model does not have its own, so that servable can also work on chat/completion endpoint.
-    static void loadDefaultTemplateProcessorIfNeeded(std::shared_ptr<GenAiServableProperties> properties);
 #endif
     /*
     initialize method implementation MUST fill servable with all required properties i.e. pipeline, tokenizer, configs etc. based on mediapipe node options.

src/test/llm/dummy_facebook_template.jinja

Lines changed: 1 addition & 0 deletions

@@ -0,0 +1 @@
+{% for message in messages %}{% if message['role'] == 'user' %}{{ 'User: ' + message['content'] }}{% elif message['role'] == 'system' %}{{ '<|system|>\n' + message['content'] + eos_token }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token }}{% endif %}{% endfor %}
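To see what the dummy template produces, it can be rendered directly with jinja2 (a quick sketch; the eos_token value is an arbitrary placeholder):

    import jinja2

    template_text = open("src/test/llm/dummy_facebook_template.jinja", encoding="utf-8").read()
    template = jinja2.Environment().from_string(template_text)
    print(template.render(
        messages=[
            {"role": "system", "content": "You are helpful."},
            {"role": "user", "content": "Hello"},
        ],
        eos_token="</s>",
    ))
    # Output:
    # <|system|>
    # You are helpful.</s>User: Hello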
