Skip to content
This repository was archived by the owner on Mar 25, 2026. It is now read-only.

Commit d408d3c

Browse files
Fix chat template (#3)
* Cache chat template in HATTokenizer
* Apply HAT chat template correctly
1 parent 899d83f commit d408d3c

File tree

3 files changed

+30
-1
lines changed

3 files changed

+30
-1
lines changed

vllm/entrypoints/chat_utils.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1346,5 +1346,22 @@ def apply_mistral_chat_template(
13461346
"template")
13471347
raise ValueError(str(e)) from e
13481348

1349+
def apply_hat_chat_template(
    tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast],
    conversation: list[ConversationMessage],
    chat_template: Optional[str],
    tools: Optional[list[dict[str, Any]]],
    tokenize: bool = False,
    **kwargs: Any,
) -> str:
    """Render a conversation through a HAT tokenizer's chat template.

    The template is first resolved via ``tokenizer.get_chat_template`` (which
    falls back to the tokenizer's own template when ``chat_template`` is
    ``None``), then applied to the conversation.

    Args:
        tokenizer: Tokenizer providing ``get_chat_template`` and
            ``apply_chat_template``.
        conversation: Messages to render.
        chat_template: Explicit template string, or ``None`` to use the
            tokenizer's default.
        tools: Optional tool schemas forwarded to template resolution and
            rendering.
        tokenize: Whether the tokenizer should tokenize the rendered output.
        **kwargs: Extra keyword arguments forwarded to
            ``tokenizer.apply_chat_template``.

    Returns:
        The rendered prompt.
    """
    # Resolve the effective template up front so resolution errors surface
    # before any rendering work happens.
    resolved_template = tokenizer.get_chat_template(chat_template, tools=tools)
    rendered = tokenizer.apply_chat_template(
        conversation=conversation,
        chat_template=resolved_template,
        tools=tools,
        tokenize=tokenize,
        **kwargs,
    )
    return rendered
1365+
13491366
def random_tool_call_id() -> str:
    """Return a fresh, unique identifier for a chat-completion tool call."""
    return "chatcmpl-tool-{}".format(random_uuid())

vllm/entrypoints/openai/serving_engine.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@
3333
ConversationMessage,
3434
apply_hf_chat_template,
3535
apply_mistral_chat_template,
36+
apply_hat_chat_template,
3637
parse_chat_messages_futures,
3738
resolve_chat_template_content_format)
3839
from vllm.entrypoints.logger import RequestLogger
@@ -76,6 +77,7 @@
7677
from vllm.transformers_utils.tokenizer import AnyTokenizer, MistralTokenizer
7778
from vllm.utils import (AsyncMicrobatchTokenizer, is_list_of,
7879
merge_async_iterators, random_uuid)
80+
from vllm.v1.hat.hat_tokenizer import HATTokenizer
7981

8082
logger = init_logger(__name__)
8183

@@ -882,6 +884,12 @@ async def _preprocess_chat(
882884
messages=messages,
883885
**_chat_template_kwargs,
884886
)
887+
elif isinstance(tokenizer, HATTokenizer):
888+
request_prompt = apply_hat_chat_template(
889+
tokenizer,
890+
conversation=conversation,
891+
**_chat_template_kwargs,
892+
)
885893
else:
886894
request_prompt = apply_hf_chat_template(
887895
tokenizer=tokenizer,

vllm/v1/hat/hat_tokenizer.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,8 @@ def __init__(self, special_token_dict: Dict[str, int]):
2424
self.name_or_path = "HAT"
2525
self.jinja2_env = ImmutableSandboxedEnvironment()
2626
self.special_tokens_map = None
27+
# Cache for compiled Jinja2 templates to avoid recompiling on every request
28+
self._template_cache: Dict[str, Any] = {}
2729

2830
@property
2931
def all_special_tokens_extended(self) -> List[str]:
@@ -141,7 +143,9 @@ def apply_chat_template(self,
141143
tokenize: bool,
142144
tools: Optional[List[Dict[str, Any]]] = None,
143145
**kwargs) -> str:
144-
compiled_template = self.jinja2_env.from_string(chat_template)
146+
if chat_template not in self._template_cache:
147+
self._template_cache[chat_template] = self.jinja2_env.from_string(chat_template)
148+
compiled_template = self._template_cache[chat_template]
145149
rendered = compiled_template.render(messages=conversation,
146150
add_generation_prompt=True)
147151
return rendered

0 commit comments

Comments
 (0)