
Commit 448a8c3

Remove default template and unify template loading (#3722)
1 parent 0fc4507 commit 448a8c3

8 files changed (+114, -116 lines)

ci/lib_search.py

Lines changed: 1 addition & 0 deletions
@@ -153,6 +153,7 @@ def check_dir(start_dir):
         "windows_sign.bat",
         "config.yaml",
         "kserve-openvino.yaml",
+        "dummy_facebook_template.jinja",
     ]
 
     exclude_directories = ['/dist/', 'release_files/thirdparty-licenses', 'extras/chat_template_examples']
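For context: exclusion lists like these are normally consumed by a directory walker that skips the listed files and directories before running the actual check (the check itself is not shown in this diff). A minimal sketch of that pattern in Python, with hypothetical names, follows; the new entry presumably exempts the dummy Jinja template from whatever markers the script scans for.

    import os

    EXCLUDED_FILES = {"dummy_facebook_template.jinja"}
    EXCLUDED_DIRECTORIES = ["/dist/", "release_files/thirdparty-licenses", "extras/chat_template_examples"]

    def files_to_check(start_dir):
        # Walk the tree, skipping excluded directory subtrees and exempt files.
        for root, _, files in os.walk(start_dir):
            if any(excluded in root for excluded in EXCLUDED_DIRECTORIES):
                continue
            for name in files:
                if name in EXCLUDED_FILES:
                    continue
                yield os.path.join(root, name)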

prepare_llm_models.sh

Lines changed: 12 additions & 23 deletions
@@ -20,7 +20,7 @@ if [ -z "$1" ]; then
     exit 1
 fi
 
-CB_MODEL="facebook/opt-125m"
+TEXT_GENERATION_MODEL="facebook/opt-125m"
 TOKENIZER_FILE="openvino_tokenizer.bin"
 LEGACY_MODEL_FILE="1/model.bin"
 EMBEDDING_MODEL="thenlper/gte-small"
@@ -35,23 +35,6 @@ PHI4_MODEL="microsoft/Phi-4-mini-instruct"
 MISTRAL_MODEL="mistralai/Mistral-7B-Instruct-v0.3"
 GPT_OSS="openai/gpt-oss-20b"
 
-MODELS=("$CB_MODEL/$TOKENIZER_FILE" "$RERANK_MODEL/rerank/$LEGACY_MODEL_FILE" "$VLM_MODEL/$TOKENIZER_FILE" "$QWEN3_MODEL/$TOKENIZER_FILE" "$LLAMA3_MODEL/$TOKENIZER_FILE" "$HERMES3_MODEL/$TOKENIZER_FILE" "$PHI4_MODEL/$TOKENIZER_FILE" "$MISTRAL_MODEL/$TOKENIZER_FILE" "$GPT_OSS/$TOKENIZER_FILE" "$EMBEDDING_MODEL/ov/$TOKENIZER_FILE" "$RERANK_MODEL/ov/$TOKENIZER_FILE")
-
-all_exist=true
-for model in "${MODELS[@]}"; do
-    if [ ! -f "$1/$model" ]; then
-        echo "Model file does not exist $1/$model"
-        all_exist=false
-        break
-    fi
-    echo "Model file exist $1/$model"
-done
-
-if $all_exist; then
-    echo "All model directories exist in $1. Skipping downloading models."
-    exit 0
-fi
-
 if [ "$(python3 -c 'import sys; print(sys.version_info[1])')" -le "8" ]; then echo "Prepare models with python > 3.8."; exit 1 ; fi
 
 echo "Downloading LLM testing models to directory $1"
@@ -69,16 +52,22 @@ else
 fi
 mkdir -p $1
 
-if [ -f "$1/$CB_MODEL/$TOKENIZER_FILE" ]; then
-    echo "Models file $1/$CB_MODEL/$TOKENIZER_FILE exists. Skipping downloading models."
+if [ -f "$1/$TEXT_GENERATION_MODEL/$TOKENIZER_FILE" ]; then
+    echo "Models file $1/$TEXT_GENERATION_MODEL/$TOKENIZER_FILE exists. Skipping downloading models."
 else
-    python3 demos/common/export_models/export_model.py text_generation --source_model "$CB_MODEL" --weight-format int8 --model_repository_path $1
+    python3 demos/common/export_models/export_model.py text_generation --source_model "$TEXT_GENERATION_MODEL" --weight-format int8 --model_repository_path $1
 fi
-if [ ! -f "$1/$CB_MODEL/$TOKENIZER_FILE" ]; then
-    echo "[ERROR] Models file $1/$CB_MODEL/$TOKENIZER_FILE does not exist."
+
+if [ ! -f "$1/$TEXT_GENERATION_MODEL/$TOKENIZER_FILE" ]; then
+    echo "[ERROR] Models file $1/$TEXT_GENERATION_MODEL/$TOKENIZER_FILE does not exist."
     exit 1
 fi
 
+if [ ! -f "$1/$TEXT_GENERATION_MODEL/chat_template.jinja" ]; then
+    echo "Copying dummy chat template to $TEXT_GENERATION_MODEL model directory."
+    cp src/test/llm/dummy_facebook_template.jinja "$1/$TEXT_GENERATION_MODEL/chat_template.jinja"
+fi
+
 if [ -f "$1/$VLM_MODEL/$TOKENIZER_FILE" ]; then
     echo "Model file $1/$VLM_MODEL/$TOKENIZER_FILE exists. Skipping downloading models."
 else
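Since the server no longer injects a default chat template, the text-generation test model facebook/opt-125m (which ships without one) must get a template on disk; the new block copies the dummy template only when chat_template.jinja is absent. The same guard, sketched in Python (the helper name and example paths are illustrative):

    import shutil
    from pathlib import Path

    def ensure_chat_template(model_dir: Path, dummy_template: Path) -> None:
        # Copy the dummy template only if the model does not ship its own.
        target = model_dir / "chat_template.jinja"
        if not target.is_file():
            print(f"Copying dummy chat template to {model_dir}")
            shutil.copy(dummy_template, target)

    ensure_chat_template(Path("models/facebook/opt-125m"),
                         Path("src/test/llm/dummy_facebook_template.jinja"))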

src/llm/servable.cpp

Lines changed: 6 additions & 1 deletion
@@ -141,7 +141,12 @@ absl::Status GenAiServable::prepareInputs(std::shared_ptr<GenAiServableExecution
 #else
     ov::genai::ChatHistory& chatHistory = executionContext->apiHandler->getChatHistory();
     constexpr bool add_generation_prompt = true;  // confirm it should be hardcoded
-    inputText = getProperties()->tokenizer.apply_chat_template(chatHistory, add_generation_prompt);
+    try {
+        inputText = getProperties()->tokenizer.apply_chat_template(chatHistory, add_generation_prompt);
+    } catch (const std::exception& e) {
+        SPDLOG_LOGGER_DEBUG(llm_calculator_logger, "Failed to apply chat template: {}", e.what());
+        return absl::Status(absl::StatusCode::kInvalidArgument, "Failed to apply chat template. The model either does not have chat template or has an invalid one.");
+    }
 #endif
     if (inputText.size() == 0) {
         return absl::Status(absl::StatusCode::kInvalidArgument, "Final prompt after applying chat template is empty");
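With this change a request to a model whose chat template is missing or invalid now fails fast with kInvalidArgument instead of being served through a silent fallback template. A hypothetical client-side view (the endpoint path matches OVMS's OpenAI-compatible API; the port and the exact HTTP status mapping of kInvalidArgument are assumptions):

    import requests

    resp = requests.post(
        "http://localhost:8000/v3/chat/completions",
        json={
            "model": "facebook/opt-125m",
            "messages": [{"role": "user", "content": "Hi"}],
        },
    )
    # Without a usable chat template the request is now rejected outright
    # (an error status carrying the message above) rather than rendered
    # with a built-in default template.
    print(resp.status_code, resp.text)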

src/llm/servable_initializer.cpp

Lines changed: 52 additions & 71 deletions
@@ -51,15 +51,17 @@
 namespace ovms {
 
 static const std::string CHAT_TEMPLATE_WARNING_MESSAGE = "Warning: Chat template has not been loaded properly. Servable will not respond to /chat/completions endpoint.";
-static const std::string DEFAULT_CHAT_TEMPLATE = R"({% for message in messages %}{% if message['role'] == 'user' %}{{ 'User: ' + message['content'] }}{% elif message['role'] == 'system' %}{{ '<|system|>\n' + message['content'] + eos_token }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token }}{% endif %}{% endfor %})";
 
 void GenAiServableInitializer::loadChatTemplate(std::shared_ptr<GenAiServableProperties> properties, const std::string& chatTemplateDirectory) {
 #if (PYTHON_DISABLE == 0)
     ExtraGenerationInfo extraGenInfo = readExtraGenerationInfo(properties, chatTemplateDirectory);
     loadPyTemplateProcessor(properties, extraGenInfo);
 #else
-    loadDefaultTemplateProcessorIfNeeded(properties);
+    if (properties->tokenizer.get_chat_template().empty()) {
+        SPDLOG_LOGGER_DEBUG(modelmanager_logger, CHAT_TEMPLATE_WARNING_MESSAGE);
+    }
 #endif
+    // In the non-Python build, GenAI handles chat template loading
 }
 
 #if (PYTHON_DISABLE == 0)
@@ -123,29 +125,37 @@ ExtraGenerationInfo GenAiServableInitializer::readExtraGenerationInfo(std::share
 }
 
 void GenAiServableInitializer::loadPyTemplateProcessor(std::shared_ptr<GenAiServableProperties> properties, const ExtraGenerationInfo& extraGenInfo) {
-    // GGUF models specific validation
-    if (extraGenInfo.isGgufModel) {
-        bool errorFound = false;
-        if (extraGenInfo.eosTokenFromTokenizer.empty()) {
-            SPDLOG_ERROR("Tokenizer eos token not found in tokenizer nor in vocabulary but required for GGUF models.");
-            errorFound = true;
-        }
-        if (extraGenInfo.bosTokenFromTokenizer.empty()) {
-            SPDLOG_ERROR("Tokenizer bos token not found in tokenizer nor in vocabulary but required for GGUF models.");
-            errorFound = true;
-        }
-        if (extraGenInfo.chatTemplateFromTokenizer.empty()) {
-            SPDLOG_ERROR("Tokenizer chat template not found in tokenizer but required for GGUF models.");
-            errorFound = true;
-        }
-        if (errorFound)
-            return;
+    // At this point the tokenizer cannot be uninitialized, as we need to access its methods to prepare for chat template processing
+    if (properties->tokenizer == ov::genai::Tokenizer()) {
+        SPDLOG_LOGGER_ERROR(modelmanager_logger, "Tokenizer is not initialized. Cannot load chat template processor.");
+        return;
+    }
+    std::string chatTemplate = properties->tokenizer.get_original_chat_template();
+    std::string bosToken = properties->tokenizer.get_bos_token();
+    std::string eosToken = properties->tokenizer.get_eos_token();
+    if (bosToken.empty()) {
+        SPDLOG_ERROR("BOS token was not found in model files.");
+        return;
     }
+    if (eosToken.empty()) {
+        SPDLOG_ERROR("EOS token was not found in model files.");
+        return;
+    }
+    if (chatTemplate.empty()) {
+        SPDLOG_ERROR("Chat template was not found in model files.");
+        return;
+    }
+
+    properties->templateProcessor.bosToken = bosToken;
+    properties->templateProcessor.eosToken = eosToken;
+
+    SPDLOG_LOGGER_DEBUG(modelmanager_logger, "Loading Python Jinja template processor with chat template from tokenizer. Bos token: {}, Eos token: {}, chat template: \n{}",
+        bosToken, eosToken, chatTemplate);
+
     py::gil_scoped_acquire acquire;
     try {
-        auto locals = py::dict("tokenizer_template"_a = extraGenInfo.chatTemplateFromTokenizer,
-            "templates_directory"_a = extraGenInfo.chatTemplateDirectory,
-            "is_gguf_model"_a = extraGenInfo.isGgufModel);
+        auto locals = py::dict("chat_template"_a = chatTemplate,
+            "templates_directory"_a = extraGenInfo.chatTemplateDirectory);
         py::exec(R"(
 # Following the logic from:
 # https://github.com/huggingface/transformers/blob/25245ec26dc29bcf6102e1b4ddd0dfd02e720cf5/src/transformers/tokenization_utils_base.py#L1837
@@ -214,71 +224,51 @@ void GenAiServableInitializer::loadPyTemplateProcessor(std::shared_ptr<GenAiServ
         self._rendered_blocks = None
         self._generation_indices = None
 
-
-# Default chat template accepts only single message and outputs only it's 'content'
-# effectively turning it into a regular prompt.
-default_chat_template = "{% for message in messages %}{% if message['role'] == 'user' %}{{ 'User: ' + message['content'] }}{% elif message['role'] == 'system' %}{{ '<|system|>\n' + message['content'] + eos_token }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token }}{% endif %}{% endfor %}"
-
-bos_token = ""
-eos_token = ""
-chat_template = default_chat_template
+
+# Optional dedicated tool chat template (might not be present)
 tool_chat_template = None
 
+# Variables that need to be set by the end of this script execution
 template = None
 tool_template = None
 
-# Try to read template from template.jinja file
-jinja_file = Path(templates_directory + "/chat_template.jinja")
-jinja_file_legacy = Path(templates_directory + "/template.jinja")
+# Load Jinja2 environment
 template_loader = jinja2.FileSystemLoader(searchpath=templates_directory)
 jinja_env = ImmutableSandboxedEnvironment(trim_blocks=True, lstrip_blocks=True, extensions=[AssistantTracker, jinja2.ext.loopcontrols], loader=template_loader)
 jinja_env.policies["json.dumps_kwargs"]["ensure_ascii"] = False
 jinja_env.globals["raise_exception"] = raise_exception
 jinja_env.globals["strftime_now"] = strftime_now
 jinja_env.filters["from_json"] = json.loads
-if jinja_file.is_file():
-    template = jinja_env.get_template("chat_template.jinja")
-elif jinja_file_legacy.is_file():
-    template = jinja_env.get_template("template.jinja")
 
-# Try to read data from tokenizer_config.json
+# Try to read data from tokenizer_config.json to get additional tool chat template if present
 tokenizer_config_file = Path(templates_directory + "/tokenizer_config.json")
 if tokenizer_config_file.is_file():
     f = open(templates_directory + "/tokenizer_config.json", "r", encoding="utf-8")
     data = json.load(f)
-    bos_token = data.get("bos_token", "")
-    bos_token = "" if bos_token is None else bos_token # Null token conversion to empty string.
-    eos_token = data.get("eos_token", "")
-    eos_token = "" if eos_token is None else eos_token # Null token conversion to empty string.
-
-    chat_template = data.get("chat_template", default_chat_template)
-    if isinstance(chat_template, list):
-        for template_entry in chat_template:
+
+    chat_template_from_tokenizer_config = data.get("chat_template", None)
+    if isinstance(chat_template_from_tokenizer_config, list):
+        for template_entry in chat_template_from_tokenizer_config:
             if isinstance(template_entry, dict):
-                if template_entry.get("name") == "default":
-                    chat_template = template_entry.get("template")
-                elif template_entry.get("name") == "tool_use":
+                if template_entry.get("name") == "tool_use":
                     tool_chat_template = template_entry.get("template")
-if template is None:
-    if is_gguf_model and (chat_template == default_chat_template):
-        # GGUF model directory might not contain files with chat template and in that case we use template read from the tokenizer
-        template = jinja_env.from_string(tokenizer_template)
-    else:
-        template = jinja_env.from_string(chat_template)
+
+# Try to read the tool_use.jinja template file from the additional_chat_templates directory if it exists
+additional_templates_dir = Path(templates_directory + "/additional_chat_templates")
+tool_use_template_file = additional_templates_dir / "tool_use.jinja"
+if tool_use_template_file.is_file():
+    with open(tool_use_template_file, "r", encoding="utf-8") as f:
+        tool_chat_template = f.read()
+
+# Load templates from strings
+template = jinja_env.from_string(chat_template)
 if tool_chat_template is not None:
     tool_template = jinja_env.from_string(tool_chat_template)
 else:
     tool_template = template
 )",
             py::globals(), locals);
 
-        if (extraGenInfo.isGgufModel) {
-            properties->templateProcessor.bosToken = extraGenInfo.bosTokenFromTokenizer;
-            properties->templateProcessor.eosToken = extraGenInfo.eosTokenFromTokenizer;
-        } else {
-            properties->templateProcessor.bosToken = locals["bos_token"].cast<std::string>();
-            properties->templateProcessor.eosToken = locals["eos_token"].cast<std::string>();
-        }
         properties->templateProcessor.chatTemplate = std::make_unique<PyObjectWrapper<py::object>>(locals["template"]);
         properties->templateProcessor.toolTemplate = std::make_unique<PyObjectWrapper<py::object>>(locals["tool_template"]);
     } catch (const pybind11::error_already_set& e) {
@@ -298,15 +288,6 @@ void GenAiServableInitializer::loadPyTemplateProcessor(std::shared_ptr<GenAiServ
         SPDLOG_DEBUG("Chat template loading failed with an unexpected error");
     }
 }
-
-#else
-void GenAiServableInitializer::loadDefaultTemplateProcessorIfNeeded(std::shared_ptr<GenAiServableProperties> properties) {
-    const std::string modelChatTemplate = properties->tokenizer.get_chat_template();
-    if (modelChatTemplate.empty()) {
-        SPDLOG_LOGGER_DEBUG(modelmanager_logger, "Could not load model chat template. Using default template.");
-        properties->tokenizer.set_chat_template(DEFAULT_CHAT_TEMPLATE);
-    }
-}
 #endif
 
 Status parseModelsPath(std::string& outPath, std::string modelsPath, std::string graphPath) {
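The net effect is a single resolution order: the main template always comes verbatim from the tokenizer (get_original_chat_template), while an optional tool template is looked up first in the chat_template list of tokenizer_config.json and then, with higher priority, in additional_chat_templates/tool_use.jinja. A condensed, runnable Python sketch of that order (the function name is illustrative, and the sandboxed environment omits the AssistantTracker extension the real script installs):

    import json
    from pathlib import Path

    from jinja2.sandbox import ImmutableSandboxedEnvironment

    def load_templates(chat_template: str, templates_directory: str):
        env = ImmutableSandboxedEnvironment(trim_blocks=True, lstrip_blocks=True)
        tool_chat_template = None

        # 1. tokenizer_config.json may hold a list of named templates;
        #    only the "tool_use" entry is consulted now.
        config = Path(templates_directory) / "tokenizer_config.json"
        if config.is_file():
            data = json.loads(config.read_text(encoding="utf-8"))
            entries = data.get("chat_template")
            if isinstance(entries, list):
                for entry in entries:
                    if isinstance(entry, dict) and entry.get("name") == "tool_use":
                        tool_chat_template = entry.get("template")

        # 2. additional_chat_templates/tool_use.jinja overrides the config entry.
        tool_file = Path(templates_directory) / "additional_chat_templates" / "tool_use.jinja"
        if tool_file.is_file():
            tool_chat_template = tool_file.read_text(encoding="utf-8")

        # 3. The main template comes from the tokenizer; there is no default fallback.
        template = env.from_string(chat_template)
        tool_template = env.from_string(tool_chat_template) if tool_chat_template else template
        return template, tool_template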

src/llm/servable_initializer.hpp

Lines changed: 0 additions & 4 deletions
@@ -52,10 +52,6 @@ class GenAiServableInitializer {
     // Use Python Jinja module for template processing
     static void loadPyTemplateProcessor(std::shared_ptr<GenAiServableProperties> properties, const ExtraGenerationInfo& extraGenInfo);
     static ExtraGenerationInfo readExtraGenerationInfo(std::shared_ptr<GenAiServableProperties> properties, const std::string& chatTemplateDirectory);
-#else
-    // In C++ only version we use GenAI for template processing, but to have the same behavior as in Python-enabled version
-    // we use default template if model does not have its own, so that servable can also work on chat/completion endpoint.
-    static void loadDefaultTemplateProcessorIfNeeded(std::shared_ptr<GenAiServableProperties> properties);
 #endif
     /*
     initialize method implementation MUST fill servable with all required properties i.e. pipeline, tokenizer, configs etc. based on mediapipe node options.

src/test/llm/dummy_facebook_template.jinja

Lines changed: 1 addition & 0 deletions

@@ -0,0 +1 @@
+{% for message in messages %}{% if message['role'] == 'user' %}{{ 'User: ' + message['content'] }}{% elif message['role'] == 'system' %}{{ '<|system|>\n' + message['content'] + eos_token }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token }}{% endif %}{% endfor %}
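To see what the dummy template produces, it can be rendered directly with jinja2 (a quick sketch; the eos_token value is an arbitrary placeholder):

    import jinja2

    template_text = open("src/test/llm/dummy_facebook_template.jinja", encoding="utf-8").read()
    template = jinja2.Environment().from_string(template_text)
    print(template.render(
        messages=[
            {"role": "system", "content": "You are helpful."},
            {"role": "user", "content": "Hello"},
        ],
        eos_token="</s>",
    ))
    # Output:
    # <|system|>
    # You are helpful.</s>User: Hello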
