From 8733bf3ea99e3a53f424556edaec2d43ccbf364b Mon Sep 17 00:00:00 2001 From: ttsyliu Date: Tue, 23 Dec 2025 14:49:41 +0800 Subject: [PATCH 1/6] Support Youtu-VL Model --- convert_hf_to_gguf.py | 87 +++++++++++++-- gguf-py/gguf/constants.py | 1 + gguf-py/gguf/tensor_mapping.py | 12 +++ src/llama-vocab.cpp | 11 ++ src/llama-vocab.h | 1 + src/unicode.cpp | 37 ++++--- tools/mtmd/CMakeLists.txt | 1 + tools/mtmd/clip-impl.h | 2 + tools/mtmd/clip.cpp | 152 +++++++++++++++++++++++++- tools/mtmd/models/models.h | 5 + tools/mtmd/models/utuvl.cpp | 190 +++++++++++++++++++++++++++++++++ tools/mtmd/mtmd.cpp | 2 +- 12 files changed, 477 insertions(+), 24 deletions(-) create mode 100644 tools/mtmd/models/utuvl.cpp diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 16c5acf346d..2161149f3d6 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -1173,6 +1173,8 @@ def get_vocab_base_pre(self, tokenizer) -> str: if chkhsh == "877081d19cf6996e2c4ff0e1236341e9b7bde288f5311a56a937f0afbbb3aeb5": # ref: https://huggingface.co/deepseek-ai/DeepSeek-V3 res = "deepseek-v3" + if chkhsh == "9d70134b369a70e5735009b6de918f7581b5211f7c074d1f89f753aea8248af1": + res = "utu-vl" if chkhsh == "b3f499bb4255f8ca19fccd664443283318f2fd2414d5e0b040fbdd0cc195d6c5": # ref: https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B res = "deepseek-r1-qwen" @@ -7133,6 +7135,7 @@ def prepare_tensors(self): "DeepseekV2ForCausalLM", "DeepseekV3ForCausalLM", "KimiVLForConditionalGeneration", + "UTUVLForCausalLM", ) class DeepseekV2Model(TextModel): model_arch = gguf.MODEL_ARCH.DEEPSEEK2 @@ -7211,11 +7214,26 @@ def set_gguf_parameters(self): self.gguf_writer.add_key_length_mla(hparams["qk_nope_head_dim"] + hparams["qk_rope_head_dim"]) self.gguf_writer.add_value_length_mla(hparams["v_head_dim"]) - self.gguf_writer.add_expert_feed_forward_length(hparams["moe_intermediate_size"]) - self.gguf_writer.add_expert_count(hparams["n_routed_experts"]) - self.gguf_writer.add_expert_shared_count(hparams["n_shared_experts"]) - self.gguf_writer.add_expert_weights_scale(hparams["routed_scaling_factor"]) - self.gguf_writer.add_expert_weights_norm(hparams["norm_topk_prob"]) + if hparams.get("moe_intermediate_size") is not None: + self.gguf_writer.add_expert_feed_forward_length(hparams["moe_intermediate_size"]) + else: + self.gguf_writer.add_expert_feed_forward_length(hparams.get("intermediate_size", 0)) + + if hparams.get("n_routed_experts") is not None: + self.gguf_writer.add_expert_count(hparams["n_routed_experts"]) + + if hparams.get("n_shared_experts") is not None: + self.gguf_writer.add_expert_shared_count(hparams["n_shared_experts"]) + else: + self.gguf_writer.add_expert_shared_count(0) + + if hparams.get("routed_scaling_factor") is not None: + self.gguf_writer.add_expert_weights_scale(hparams["routed_scaling_factor"]) + else: + self.gguf_writer.add_expert_weights_scale(1.0) + + if hparams.get("norm_topk_prob") is not None and hparams["norm_topk_prob"]: + self.gguf_writer.add_expert_weights_norm(hparams["norm_topk_prob"]) self.gguf_writer.add_rope_dimension_count(hparams["qk_rope_head_dim"]) @@ -7226,15 +7244,26 @@ def set_gguf_parameters(self): self.gguf_writer.add_rope_scaling_yarn_log_mul(0.1 * rope_mscale_all) _experts: list[dict[str, Tensor]] | None = None + _token_embd: Tensor | None = None def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: # skip vision tensors and remove "language_model." 
for Kimi-VL if "vision_tower" in name or "multi_modal_projector" in name: return [] - + if name.startswith("siglip2.") or name.startswith("merger."): + return [] if name.startswith("language_model."): name = name.replace("language_model.", "") + # skip lm_head.weight if tie_word_embeddings is True + if self.hparams.get("tie_word_embeddings", False): + # Save token_embd for potential duplication as output if tie_word_embeddings is True + if name == "model.embed_tokens.weight": + self._token_embd = data_torch + if name == "lm_head.weight" or name == "model.lm_head.weight": + logger.info("Skipping tied output layer 'lm_head.weight' - will duplicate from token_embd.weight") + return [] + # rename e_score_correction_bias tensors if name.endswith("e_score_correction_bias"): name = name.replace("e_score_correction_bias", "e_score_correction.bias") @@ -7246,7 +7275,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter return [] # process the experts separately - if name.find("mlp.experts") != -1: + if name.find("mlp.experts") != -1 and self.hparams.get("n_routed_experts") is not None: n_experts = self.hparams["n_routed_experts"] assert bid is not None @@ -7308,7 +7337,10 @@ def prepare_tensors(self): experts = [k for d in self._experts for k in d.keys()] if len(experts) > 0: raise ValueError(f"Unprocessed experts: {experts}") - + if self._token_embd is not None: + logger.info("Model has tie_word_embeddings=True but no lm_head.weight found - adding output.weight from token_embd.weight") + output_name = self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT) + self.gguf_writer.add_tensor(output_name, self._token_embd.numpy()) @ModelBase.register("MiniMaxM2ForCausalLM") class MiniMaxM2Model(TextModel): @@ -10466,7 +10498,46 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter return [] +@ModelBase.register("UtuVLForConditionalGeneration", "UTUVLForCausalLM") +class UtuVLVisionModel(MmprojModel): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + assert self.hparams_vision is not None + self.hparams_vision["image_size"] = self.hparams_vision.get("image_size", 560) + + def set_gguf_parameters(self): + super().set_gguf_parameters() + + self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.UTUVL) + self.gguf_writer.add_vision_attention_layernorm_eps(self.hparams.get("layer_norm_eps", 1e-6)) + + # Handle activation function + hidden_act = str(self.hparams.get("hidden_act", "gelu_pytorch_tanh")).lower() + if hidden_act in ("gelu", "gelu_pytorch_tanh", "gelu_fast", "gelu_new", "gelu_accurate"): + self.gguf_writer.add_vision_use_gelu(True) + elif hidden_act == "silu": + self.gguf_writer.add_vision_use_silu(True) + else: + raise ValueError(f"Unsupported activation function for UTUVL: {hidden_act}") + + self.gguf_writer.add_vision_spatial_merge_size(self.hparams.get("spatial_merge_size", 2)) + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + del bid # unused + + # Skip language model tensors + skip_prefixes = ('lm_head.', 'model.layers.', 'model.embed_tokens.', 'model.norm.') + if name.startswith(skip_prefixes): + return [] + + # Try to map the tensor using TensorNameMap (handles vision encoder and projector) + try: + new_name = self.map_tensor_name(name) + return [(new_name, data_torch)] + except ValueError: + # If mapping fails, log warning and skip + logger.warning(f"Cannot map tensor: {name}") + return [] ###### CONVERSION LOGIC ###### diff --git 
a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 41d3bd4faf2..473ca8b407f 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -3432,6 +3432,7 @@ class VisionProjectorType: JANUS_PRO = "janus_pro" LFM2A = "lfm2a" # audio GLM4V = "glm4v" + UTUVL = "utuvl" # Items here are (block size, type size) diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index 276720fcde9..c9eb4476341 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -1218,6 +1218,7 @@ class TensorNameMap: MODEL_TENSOR.V_MMPROJ: ( "multi_modal_projector.linear_{bid}", "visual.merger.mlp.{bid}", # qwen2vl + "merger.mlp.{bid}", ), MODEL_TENSOR.V_MMPROJ_FC: ( @@ -1255,6 +1256,7 @@ class TensorNameMap: "visual.patch_embed.proj", # qwen2vl "vision_tower.patch_embed.proj", # kimi-vl "model.vision.patch_embedding.proj", # cogvlm + "siglip2.vision_model.embeddings.patch_embedding", ), MODEL_TENSOR.V_ENC_EMBD_NORM: ( @@ -1288,6 +1290,7 @@ class TensorNameMap: "vision_encoder.transformer.layers.{bid}.attention.wq", # pixtral "visual.blocks.{bid}.attn.q", # qwen2vl, generated "vision_tower.encoder.blocks.{bid}.wq", # kimi-vl, generated + "siglip2.vision_model.encoder.layers.{bid}.self_attn.q_proj", # utuvl ), MODEL_TENSOR.V_ENC_ATTN_Q_NORM: ( @@ -1305,6 +1308,7 @@ class TensorNameMap: "vision_encoder.transformer.layers.{bid}.attention.wk", # pixtral "visual.blocks.{bid}.attn.k", # qwen2vl, generated "vision_tower.encoder.blocks.{bid}.wk", # kimi-vl, generated + "siglip2.vision_model.encoder.layers.{bid}.self_attn.k_proj", ), MODEL_TENSOR.V_ENC_ATTN_K_NORM: ( @@ -1322,6 +1326,7 @@ class TensorNameMap: "vision_encoder.transformer.layers.{bid}.attention.wv", # pixtral "visual.blocks.{bid}.attn.v", # qwen2vl, generated "vision_tower.encoder.blocks.{bid}.wv", # kimi-vl, generated + "siglip2.vision_model.encoder.layers.{bid}.self_attn.v_proj", ), MODEL_TENSOR.V_ENC_INPUT_NORM: ( @@ -1336,6 +1341,7 @@ class TensorNameMap: "visual.blocks.{bid}.norm1", # qwen2vl "vision_tower.encoder.blocks.{bid}.norm0", # kimi-vl (norm0/norm1) "model.vision.transformer.layers.{bid}.input_layernorm", # cogvlm + "siglip2.vision_model.encoder.layers.{bid}.layer_norm1", ), MODEL_TENSOR.V_ENC_ATTN_O: ( @@ -1351,6 +1357,7 @@ class TensorNameMap: "visual.blocks.{bid}.attn.proj", # qwen2vl "vision_tower.encoder.blocks.{bid}.wo", # kimi-vl "model.vision.transformer.layers.{bid}.attention.dense", # cogvlm + "siglip2.vision_model.encoder.layers.{bid}.self_attn.out_proj", # utuvl ), MODEL_TENSOR.V_ENC_POST_ATTN_NORM: ( @@ -1365,6 +1372,7 @@ class TensorNameMap: "visual.blocks.{bid}.norm2", # qwen2vl "vision_tower.encoder.blocks.{bid}.norm1", # kimi-vl (norm0/norm1) "model.vision.transformer.layers.{bid}.post_attention_layernorm", # cogvlm + "siglip2.vision_model.encoder.layers.{bid}.layer_norm2", ), MODEL_TENSOR.V_ENC_FFN_UP: ( @@ -1380,6 +1388,7 @@ class TensorNameMap: "visual.blocks.{bid}.mlp.linear_fc1", # qwen3vl "vision_tower.encoder.blocks.{bid}.mlp.fc0", # kimi-vl (fc0/fc1) "model.vision.transformer.layers.{bid}.mlp.fc1", # cogvlm + "siglip2.vision_model.encoder.layers.{bid}.mlp.fc1", ), MODEL_TENSOR.V_ENC_FFN_GATE: ( @@ -1401,6 +1410,7 @@ class TensorNameMap: "visual.blocks.{bid}.mlp.linear_fc2", # qwen3vl "vision_tower.encoder.blocks.{bid}.mlp.fc1", # kimi-vl (fc0/fc1) "model.vision.transformer.layers.{bid}.mlp.fc2", # cogvlm + "siglip2.vision_model.encoder.layers.{bid}.mlp.fc2", ), MODEL_TENSOR.V_LAYER_SCALE_1: ( @@ -1427,6 +1437,7 @@ class TensorNameMap: "visual.merger.ln_q", 
# qwen2vl "vision_tower.encoder.final_layernorm", # kimi-vl "visual.post_layernorm", # glm4v + "siglip2.vision_model.post_layernorm", ), MODEL_TENSOR.V_MM_POST_NORM: ( @@ -1443,6 +1454,7 @@ class TensorNameMap: "multi_modal_projector.pre_norm", "pre_mm_projector_norm", "model.vision.linear_proj.norm1", # cogvlm + "merger.ln_q", ), MODEL_TENSOR.V_MM_SOFT_EMB_NORM: ( diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp index cd4092ca077..b2e148d32f4 100644 --- a/src/llama-vocab.cpp +++ b/src/llama-vocab.cpp @@ -314,6 +314,12 @@ struct llm_tokenizer_bpe : llm_tokenizer { "[!\"#$%&'()*+,\\-./:;<=>?@\\[\\\\\\]^_`{|}~][A-Za-z]+|[^\r\n\\p{L}\\p{P}\\p{S}]?[\\p{L}\\p{M}]+| ?[\\p{P}\\p{S}]+[\r\n]*|\\s*[\r\n]+|\\s+(?!\\S)|\\s+", }; break; + case LLAMA_VOCAB_PRE_TYPE_UTU_VL: + regex_exprs = { + "[가-힣ㄱ-ㆎ]+|[!…“”‘’—:;,、-〿︰-﹏]+|[ㄅ-ㄯ]+|[一-龥぀-ゟ゠-ヿ]+", + "[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]*[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]+(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?|[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]+[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]*(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+", + }; + break; case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER: regex_exprs = { "[\r\n]", @@ -1860,6 +1866,11 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) { tokenizer_pre == "deepseek-v3") { pre_type = LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM; clean_spaces = false; + } else if ( + tokenizer_pre == "utu-vl") { + pre_type = LLAMA_VOCAB_PRE_TYPE_UTU_VL; + clean_spaces = false; + ignore_merges = true; } else if ( tokenizer_pre == "falcon") { pre_type = LLAMA_VOCAB_PRE_TYPE_FALCON; diff --git a/src/llama-vocab.h b/src/llama-vocab.h index 55f8f3923c9..19ae099a3fb 100644 --- a/src/llama-vocab.h +++ b/src/llama-vocab.h @@ -51,6 +51,7 @@ enum llama_vocab_pre_type { LLAMA_VOCAB_PRE_TYPE_GRANITE_DOCLING = 40, LLAMA_VOCAB_PRE_TYPE_MINIMAX_M2 = 41, LLAMA_VOCAB_PRE_TYPE_AFMOE = 42, + LLAMA_VOCAB_PRE_TYPE_UTU_VL = 43, }; struct LLM_KV; diff --git a/src/unicode.cpp b/src/unicode.cpp index bb44edfaddf..b47dcbe6198 100644 --- a/src/unicode.cpp +++ b/src/unicode.cpp @@ -964,6 +964,11 @@ std::vector unicode_regex_split(const std::string & text, const std { "\\p{P}", unicode_cpt_flags::PUNCTUATION }, { "\\p{M}", unicode_cpt_flags::ACCENT_MARK }, { "\\p{S}", unicode_cpt_flags::SYMBOL }, + { "\\p{Lu}", unicode_cpt_flags::LETTER }, // Uppercase letter + { "\\p{Ll}", unicode_cpt_flags::LETTER }, // Lowercase letter + { "\\p{Lt}", unicode_cpt_flags::LETTER }, // Titlecase letter + { "\\p{Lm}", unicode_cpt_flags::LETTER }, // Modifier letter + { "\\p{Lo}", unicode_cpt_flags::LETTER }, // Other letter }; static const std::map k_ucat_cpt = { @@ -1074,22 +1079,26 @@ std::vector unicode_regex_split(const std::string & text, const std continue; } - if (regex_expr[i + 0] == '\\' && i + 4 < regex_expr.size() && + // Match \p{...} Unicode properties of varying lengths + if (regex_expr[i + 0] == '\\' && i + 3 < regex_expr.size() && regex_expr[i + 1] == 'p' && - regex_expr[i + 2] == '{' && - regex_expr[i + 4] == '}') { - const std::string pat = regex_expr.substr(i, 5); - if (k_ucat_enum.find(pat) != k_ucat_enum.end()) { - if (!inside) { - regex_expr_collapsed += '['; + regex_expr[i + 2] == '{') { + // Find the closing brace + size_t closing_brace = regex_expr.find('}', i + 3); + if (closing_brace != std::string::npos && closing_brace <= i + 10) { // reasonable limit + const std::string pat = regex_expr.substr(i, 
closing_brace - i + 1); + if (k_ucat_enum.find(pat) != k_ucat_enum.end()) { + if (!inside) { + regex_expr_collapsed += '['; + } + regex_expr_collapsed += k_ucat_cpt.at(k_ucat_enum.at(pat)); + regex_expr_collapsed += k_ucat_map.at(k_ucat_enum.at(pat)); + if (!inside) { + regex_expr_collapsed += ']'; + } + i = closing_brace; + continue; } - regex_expr_collapsed += k_ucat_cpt.at(k_ucat_enum.at(pat)); - regex_expr_collapsed += k_ucat_map.at(k_ucat_enum.at(pat)); - if (!inside) { - regex_expr_collapsed += ']'; - } - i += 4; - continue; } } diff --git a/tools/mtmd/CMakeLists.txt b/tools/mtmd/CMakeLists.txt index 317d5f19fd9..0e862994d67 100644 --- a/tools/mtmd/CMakeLists.txt +++ b/tools/mtmd/CMakeLists.txt @@ -27,6 +27,7 @@ add_library(mtmd models/qwen3vl.cpp models/siglip.cpp models/whisper-enc.cpp + models/utuvl.cpp ) set_target_properties(mtmd PROPERTIES diff --git a/tools/mtmd/clip-impl.h b/tools/mtmd/clip-impl.h index a0939865e3f..113d65736cd 100644 --- a/tools/mtmd/clip-impl.h +++ b/tools/mtmd/clip-impl.h @@ -187,6 +187,7 @@ enum projector_type { PROJECTOR_TYPE_JANUS_PRO, PROJECTOR_TYPE_LFM2A, PROJECTOR_TYPE_GLM4V, + PROJECTOR_TYPE_UTUVL, PROJECTOR_TYPE_UNKNOWN, }; @@ -216,6 +217,7 @@ static std::map PROJECTOR_TYPE_NAMES = { { PROJECTOR_TYPE_JANUS_PRO, "janus_pro"}, { PROJECTOR_TYPE_LFM2A, "lfm2a"}, { PROJECTOR_TYPE_GLM4V, "glm4v"}, + { PROJECTOR_TYPE_UTUVL, "utuvl"}, }; static projector_type clip_projector_type_from_string(const std::string & str) { diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp index 3ba0823defb..23bf968ab95 100644 --- a/tools/mtmd/clip.cpp +++ b/tools/mtmd/clip.cpp @@ -845,6 +845,10 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 { builder = std::make_unique(ctx, img); } break; + case PROJECTOR_TYPE_UTUVL: + { + builder = std::make_unique(ctx, img); + } break; default: GGML_ABORT("missing cgraph builder"); } @@ -1158,6 +1162,17 @@ struct clip_model_loader { LOG_WRN("%s: more info: https://github.com/ggml-org/llama.cpp/issues/16842\n\n", __func__); } } break; + case PROJECTOR_TYPE_UTUVL: + { + hparams.n_merge = 2; + get_u32(KEY_SPATIAL_MERGE_SIZE, hparams.n_merge, false); + hparams.set_limit_image_tokens(8, 4096); + hparams.set_warmup_n_tokens(46*46); // avoid OOM on warmup + const int warn_min_pixels = 1024 * hparams.n_merge * hparams.n_merge * hparams.patch_size * hparams.patch_size; + if (hparams.image_min_pixels < warn_min_pixels) { + LOG_WRN("%s: Youtu-VL models require at minimum 1024 image tokens to function correctly on grounding tasks\n", __func__); + } + } break; case PROJECTOR_TYPE_GLM4V: { hparams.rope_theta = 10000.0f; @@ -1493,6 +1508,14 @@ struct clip_model_loader { model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight")); model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias")); } break; + case PROJECTOR_TYPE_UTUVL: + { + model.mm_input_norm_w = get_tensor(TN_MM_INP_NORM); // merger.ln_q (RMS norm) + model.mm_0_w = get_tensor(string_format(TN_LLAVA_PROJ, 0, "weight")); // merger.mlp.0 + model.mm_0_b = get_tensor(string_format(TN_LLAVA_PROJ, 0, "bias")); + model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight")); // merger.mlp.2 + model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias")); + } break; case PROJECTOR_TYPE_GLM4V: { model.projection = get_tensor(TN_MM_PROJECTOR); @@ -2684,6 +2707,57 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str // res_imgs->data[0] = *res; res_imgs->entries.push_back(std::move(img_f32)); } break; + case 
PROJECTOR_TYPE_UTUVL: + { + const int patch_size = params.patch_size; // typically 16 + const int merge_size = params.n_merge; // typically 2 + const int align_size = patch_size * merge_size; // 32 + + const int max_num_patches = params.image_max_pixels > 0 ? + params.image_max_pixels / (patch_size * patch_size) : 256; + + // Binary search for optimal scale to fit within max_num_patches + float scale = 1.0f; + int target_height = original_size.height; + int target_width = original_size.width; + + auto get_scaled_image_size = [align_size](float scale, int size) -> int { + float scaled_size = size * scale; + // Round up to nearest multiple of align_size + int aligned = static_cast(std::ceil(scaled_size / align_size)) * align_size; + // Ensure at least one patch + return std::max(align_size, aligned); + }; + + // Binary search with 0.02 step size + while (scale > 0.0f) { + target_height = get_scaled_image_size(scale, original_size.height); + target_width = get_scaled_image_size(scale, original_size.width); + + int num_patches_h = target_height / patch_size; + int num_patches_w = target_width / patch_size; + int num_patches = num_patches_h * num_patches_w; + + if (num_patches > max_num_patches) { + scale -= 0.02f; + } else { + break; + } + } + + clip_image_size new_size = {target_width, target_height}; + + // Resize the image + clip_image_u8 resized; + img_tool::resize(*img, resized, new_size, img_tool::RESIZE_ALGO_BILINEAR, false); + + // Normalize to float32 + clip_image_f32_ptr img_f32(clip_image_f32_init()); + normalize_image_u8_to_f32(resized, *img_f32, params.image_mean, params.image_std); + + // Add to results + res_imgs->entries.push_back(std::move(img_f32)); + } break; case PROJECTOR_TYPE_IDEFICS3: { @@ -2916,6 +2990,7 @@ int clip_n_output_tokens_x(const struct clip_ctx * ctx, struct clip_image_f32 * case PROJECTOR_TYPE_QWEN25VL: case PROJECTOR_TYPE_QWEN3VL: case PROJECTOR_TYPE_GLM4V: + case PROJECTOR_TYPE_UTUVL: return (img->nx / params.patch_size) / 2; default: break; @@ -2931,6 +3006,7 @@ int clip_n_output_tokens_y(const struct clip_ctx * ctx, struct clip_image_f32 * case PROJECTOR_TYPE_QWEN25VL: case PROJECTOR_TYPE_QWEN3VL: case PROJECTOR_TYPE_GLM4V: + case PROJECTOR_TYPE_UTUVL: return (img->ny / params.patch_size) / 2; default: break; @@ -2991,6 +3067,7 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im case PROJECTOR_TYPE_QWEN25VL: case PROJECTOR_TYPE_QWEN3VL: case PROJECTOR_TYPE_GLM4V: + case PROJECTOR_TYPE_UTUVL: { // dynamic size (2 conv, so double patch size) int x_patch = img->nx / (params.patch_size * 2); @@ -3117,7 +3194,6 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima const int pos_w = image_size_width / patch_size; const int pos_h = image_size_height / patch_size; - const bool use_window_attn = hparams.n_wa_pattern > 0; // for qwen2.5vl auto get_inp_tensor = [&gf](const char * name) { ggml_tensor * inp = ggml_graph_get_tensor(gf, name); @@ -3269,6 +3345,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima { // pw * ph = number of tokens output by ViT after apply patch merger // ipw * ipw = number of vision token been processed inside ViT + const bool use_window_attn = hparams.n_wa_pattern > 0; // for qwen2.5vl const int merge_ratio = 2; const int pw = image_size_width / patch_size / merge_ratio; const int ph = image_size_height / patch_size / merge_ratio; @@ -3344,6 +3421,78 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima } } + 
set_input_i32("positions", positions); + } break; + case PROJECTOR_TYPE_UTUVL: + { + const bool use_window_attn = true; + const int merge_ratio = 2; + const int pw = image_size_width / patch_size / merge_ratio; // patches after merger + const int ph = image_size_height / patch_size / merge_ratio; + const int ipw = image_size_width / patch_size; // internal patches in ViT + const int iph = image_size_height / patch_size; + std::vector idx (ph * pw); + std::vector inv_idx(ph * pw); + if (use_window_attn) { + const int attn_window_size = patch_size * 2 * 8; + const int grid_window = attn_window_size / patch_size / merge_ratio; + int dst = 0; + // [num_vision_tokens, num_vision_tokens] attention mask tensor + std::vector mask(pow(ipw * iph, 2), std::numeric_limits::lowest()); + int mask_row = 0; + for (int y = 0; y < ph; y += grid_window) { + for (int x = 0; x < pw; x += grid_window) { + const int win_h = std::min(grid_window, ph - y); + const int win_w = std::min(grid_window, pw - x); + const int dst_0 = dst; + // group all tokens belong to the same window togather (to a continue range) + for (int dy = 0; dy < win_h; dy++) { + for (int dx = 0; dx < win_w; dx++) { + const int src = (y + dy) * pw + (x + dx); + GGML_ASSERT(src < (int)idx.size()); + GGML_ASSERT(dst < (int)inv_idx.size()); + idx [src] = dst; + inv_idx[dst] = src; + dst++; + } + } + for (int r=0; r < win_h * win_w * merge_ratio * merge_ratio; r++) { + int row_offset = mask_row * (ipw * iph); + std::fill( + mask.begin() + row_offset + (dst_0 * merge_ratio * merge_ratio), + mask.begin() + row_offset + (dst * merge_ratio * merge_ratio), + 0.0); + mask_row++; + } + } + } + set_input_i32("window_idx", idx); + set_input_i32("inv_window_idx", inv_idx); + set_input_f32("window_mask", mask); + } else { + for (int i = 0; i < ph * pw; i++) { + idx[i] = i; + } + } + const int mpow = merge_ratio * merge_ratio; + std::vector positions(n_pos * 4); + int ptr = 0; + for (int y = 0; y < iph; y += merge_ratio) { + for (int x = 0; x < ipw; x += merge_ratio) { + for (int dy = 0; dy < merge_ratio; dy++) { + for (int dx = 0; dx < merge_ratio; dx++) { + // Remap positions to match window-grouped order + auto remap = idx[ptr / mpow]; + remap = (remap * mpow) + (ptr % mpow); + positions[ remap] = y + dy; + positions[ num_patches + remap] = x + dx; + positions[2 * num_patches + remap] = y + dy; + positions[3 * num_patches + remap] = x + dx; + ptr++; + } + } + } + } set_input_i32("positions", positions); } break; case PROJECTOR_TYPE_PIXTRAL: @@ -3516,6 +3665,7 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) { case PROJECTOR_TYPE_QWEN2VL: case PROJECTOR_TYPE_QWEN25VL: case PROJECTOR_TYPE_JANUS_PRO: + case PROJECTOR_TYPE_UTUVL: return ctx->model.mm_1_b->ne[0]; case PROJECTOR_TYPE_QWEN3VL: // main path + deepstack paths diff --git a/tools/mtmd/models/models.h b/tools/mtmd/models/models.h index 8d6d4ef67be..8360c72d050 100644 --- a/tools/mtmd/models/models.h +++ b/tools/mtmd/models/models.h @@ -22,6 +22,11 @@ struct clip_graph_qwen3vl : clip_graph { ggml_cgraph * build() override; }; +struct clip_graph_utuvl : clip_graph { + clip_graph_utuvl(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {} + ggml_cgraph * build() override; +}; + struct clip_graph_minicpmv : clip_graph { clip_graph_minicpmv(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {} ggml_cgraph * build() override; diff --git a/tools/mtmd/models/utuvl.cpp b/tools/mtmd/models/utuvl.cpp new file mode 100644 index 00000000000..558ebec687a --- /dev/null +++ 
b/tools/mtmd/models/utuvl.cpp @@ -0,0 +1,190 @@ +#include "models.h" + +ggml_cgraph * clip_graph_utuvl::build() { + GGML_ASSERT(model.class_embedding == nullptr); + const int batch_size = 1; + const bool use_window_attn = true; + const int n_pos = n_patches; + const int num_position_ids = n_pos * 4; + const int m = 2; + const int Wp = n_patches_x; + const int Hp = n_patches_y; + const int Hm = Hp / m; + const int Wm = Wp / m; + norm_type norm_t = NORM_TYPE_NORMAL; + + int mrope_sections[4] = {d_head/4, d_head/4, d_head/4, d_head/4}; + + ggml_tensor * inp = build_inp_raw(); + + inp = ggml_reshape_4d( + ctx0, inp, + Wm * m * patch_size, m * patch_size, Hm, 3); + inp = ggml_permute(ctx0, inp, 1, 2, 3, 0); + inp = ggml_cont_4d( + ctx0, inp, + m * patch_size * 3, Wm, m * patch_size, Hm); + + inp = ggml_permute(ctx0, inp, 0, 2, 1, 3); + inp = ggml_cont_4d( + ctx0, inp, + m * patch_size * 3, patch_size, m, Hm * Wm); + + inp = ggml_permute(ctx0, inp, 1, 0, 2, 3); + inp = ggml_cont_4d( + ctx0, inp, + patch_size, 3, patch_size, Hm * Wm * m * m); + + inp = ggml_permute(ctx0, inp, 2, 0, 1, 3); + inp = ggml_cont_3d( + ctx0, inp, + 3*patch_size* patch_size, Hm * Wm * m * m, 1); + + inp = ggml_mul_mat(ctx0, model.patch_embeddings_0, inp); + + if (model.patch_bias) { + inp = ggml_add(ctx0, inp, model.patch_bias); + } + + inp = ggml_reshape_2d(ctx0, inp, n_embd, n_patches); + + ggml_tensor * inpL = inp; + ggml_tensor * window_mask = nullptr; + ggml_tensor * window_idx = nullptr; + ggml_tensor * inv_window_idx = nullptr; + + ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_position_ids); + ggml_set_name(positions, "positions"); + ggml_set_input(positions); + + // pre-layernorm + if (model.pre_ln_w) { + inpL = build_norm(inpL, model.pre_ln_w, model.pre_ln_b, norm_t, eps, -1); + } + if (use_window_attn) { + inv_window_idx = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos / 4); + ggml_set_name(inv_window_idx, "inv_window_idx"); + ggml_set_input(inv_window_idx); + // mask for window attention + window_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_pos, n_pos); + ggml_set_name(window_mask, "window_mask"); + ggml_set_input(window_mask); + + // if flash attn is used, we need to pad the mask and cast to f16 + if (flash_attn_type == CLIP_FLASH_ATTN_TYPE_ENABLED) { + window_mask = ggml_cast(ctx0, window_mask, GGML_TYPE_F16); + } + + // inpL shape: [n_embd, n_patches_x * n_patches_y, batch_size] + GGML_ASSERT(batch_size == 1); + inpL = ggml_reshape_2d(ctx0, inpL, n_embd * 4, n_patches_x * n_patches_y * batch_size / 4); + inpL = ggml_get_rows(ctx0, inpL, inv_window_idx); + inpL = ggml_reshape_3d(ctx0, inpL, n_embd, n_patches_x * n_patches_y, batch_size); + } + + // loop over layers + for (int il = 0; il < n_layer; il++) { + const auto & layer = model.layers[il]; + const bool full_attn = (il + 1) % 8 == 0 || il == n_layer - 1; + + ggml_tensor * cur = inpL; // inpL = residual, cur = hidden_states + + // layernorm1 + cur = build_norm(cur, layer.ln_1_w, layer.ln_1_b, norm_t, eps, il); + // self-attention + { + ggml_tensor * Qcur = ggml_add(ctx0, + ggml_mul_mat(ctx0, layer.q_w, cur), layer.q_b); + ggml_tensor * Kcur = ggml_add(ctx0, + ggml_mul_mat(ctx0, layer.k_w, cur), layer.k_b); + ggml_tensor * Vcur = ggml_add(ctx0, + ggml_mul_mat(ctx0, layer.v_w, cur), layer.v_b); + + Qcur = ggml_reshape_3d(ctx0, Qcur, d_head, n_head, n_patches); + Kcur = ggml_reshape_3d(ctx0, Kcur, d_head, n_head, n_patches); + Vcur = ggml_reshape_3d(ctx0, Vcur, d_head, n_head, n_patches); + + Qcur = ggml_rope_multi( + ctx0, Qcur, 
positions, nullptr, + d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, 10000, 1, 0, 1, 32, 1); + Kcur = ggml_rope_multi( + ctx0, Kcur, positions, nullptr, + d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, 10000, 1, 0, 1, 32, 1); + + ggml_tensor * attn_mask = full_attn ? nullptr : window_mask; + + cur = build_attn(layer.o_w, layer.o_b, + Qcur, Kcur, Vcur, attn_mask, kq_scale, il); + } + // re-add the layer input, e.g., residual + cur = ggml_add(ctx0, cur, inpL); + + inpL = cur; // inpL = residual, cur = hidden_states + + // layernorm2 + cur = build_norm(cur, layer.ln_2_w, layer.ln_2_b, norm_t, eps, il); + + // ffn + cur = build_ffn(cur, + layer.ff_up_w, layer.ff_up_b, + nullptr, nullptr, + layer.ff_down_w, layer.ff_down_b, + hparams.ffn_op, il); + + // residual 2 + cur = ggml_add(ctx0, inpL, cur); + + inpL = cur; + } + + ggml_tensor * embeddings = inpL; + if (use_window_attn) { + const int spatial_merge_unit = 4; + window_idx = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos / spatial_merge_unit); + ggml_set_name(window_idx, "window_idx"); + ggml_set_input(window_idx); + GGML_ASSERT(batch_size == 1); + embeddings = ggml_reshape_2d(ctx0, embeddings, n_embd * spatial_merge_unit, n_patches / spatial_merge_unit); + embeddings = ggml_get_rows(ctx0, embeddings, window_idx); + embeddings = ggml_reshape_3d(ctx0, embeddings, n_embd, n_patches, batch_size); + cb(embeddings, "window_order_restored", -1); + } + + // post-layernorm (part of Siglip2VisionTransformer, applied after encoder) + if (model.post_ln_w) { + embeddings = build_norm(embeddings, model.post_ln_w, model.post_ln_b, norm_t, eps, n_layer); + } + + // Now apply merger (VLPatchMerger): + // 1. Apply RMS norm (ln_q in VLPatchMerger) + embeddings = build_norm(embeddings, model.mm_input_norm_w, nullptr, NORM_TYPE_RMS, 1e-6, -1); + cb(embeddings, "merger_normed", -1); + + // 2. First reshape for spatial merge (merge 2x2 patches) + embeddings = ggml_reshape_3d(ctx0, embeddings, n_embd * 4, n_pos / 4, batch_size); + cb(embeddings, "merger_reshaped", -1); + + embeddings = build_ffn(embeddings, + model.mm_0_w, model.mm_0_b, + nullptr, nullptr, + model.mm_1_w, model.mm_1_b, + FFN_GELU, + -1); + // // 3. First linear layer + // embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings); + // embeddings = ggml_add(ctx0, embeddings, model.mm_0_b); + // cb(embeddings, "merger_fc1", -1); + + // // 4. GELU activation + // embeddings = ggml_gelu(ctx0, embeddings); + // cb(embeddings, "merger_gelu", -1); + + // // 5. Second linear layer + // embeddings = ggml_mul_mat(ctx0, model.mm_1_w, embeddings); + // embeddings = ggml_add(ctx0, embeddings, model.mm_1_b); + + // build the graph + ggml_build_forward_expand(gf, embeddings); + + return gf; +} diff --git a/tools/mtmd/mtmd.cpp b/tools/mtmd/mtmd.cpp index b9c4fa90980..3f8bf53454a 100644 --- a/tools/mtmd/mtmd.cpp +++ b/tools/mtmd/mtmd.cpp @@ -283,7 +283,7 @@ struct mtmd_context { // https://github.com/huggingface/transformers/blob/1cd110c6cb6a6237614130c470e9a902dbc1a4bd/docs/source/en/model_doc/pixtral.md img_end = "[IMG_END]"; - } else if (proj == PROJECTOR_TYPE_QWEN2VL || proj == PROJECTOR_TYPE_QWEN25VL || proj == PROJECTOR_TYPE_QWEN3VL) { + } else if (proj == PROJECTOR_TYPE_QWEN2VL || proj == PROJECTOR_TYPE_QWEN25VL || proj == PROJECTOR_TYPE_QWEN3VL || proj == PROJECTOR_TYPE_UTUVL) { // <|vision_start|> ... (image embeddings) ... 
<|vision_end|> img_beg = "<|vision_start|>"; img_end = "<|vision_end|>"; From 160097417733a6aa86c01bd3eae6fa6c53f05421 Mon Sep 17 00:00:00 2001 From: ttsyliu Date: Tue, 23 Dec 2025 20:14:40 +0800 Subject: [PATCH 2/6] merge code --- tools/mtmd/clip.cpp | 75 +------------------------------------ tools/mtmd/models/utuvl.cpp | 14 ------- 2 files changed, 2 insertions(+), 87 deletions(-) diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp index 23bf968ab95..37f766f077f 100644 --- a/tools/mtmd/clip.cpp +++ b/tools/mtmd/clip.cpp @@ -3342,6 +3342,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima set_input_i32("positions", positions); } break; case PROJECTOR_TYPE_QWEN25VL: + case PROJECTOR_TYPE_UTUVL: { // pw * ph = number of tokens output by ViT after apply patch merger // ipw * ipw = number of vision token been processed inside ViT @@ -3356,7 +3357,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima std::vector inv_idx(ph * pw); if (use_window_attn) { - const int attn_window_size = 112; + const int attn_window_size = ctx->model.proj_type == PROJECTOR_TYPE_QWEN25VL ? 112 : patch_size * 2 * 8; const int grid_window = attn_window_size / patch_size / merge_ratio; int dst = 0; // [num_vision_tokens, num_vision_tokens] attention mask tensor @@ -3421,78 +3422,6 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima } } - set_input_i32("positions", positions); - } break; - case PROJECTOR_TYPE_UTUVL: - { - const bool use_window_attn = true; - const int merge_ratio = 2; - const int pw = image_size_width / patch_size / merge_ratio; // patches after merger - const int ph = image_size_height / patch_size / merge_ratio; - const int ipw = image_size_width / patch_size; // internal patches in ViT - const int iph = image_size_height / patch_size; - std::vector idx (ph * pw); - std::vector inv_idx(ph * pw); - if (use_window_attn) { - const int attn_window_size = patch_size * 2 * 8; - const int grid_window = attn_window_size / patch_size / merge_ratio; - int dst = 0; - // [num_vision_tokens, num_vision_tokens] attention mask tensor - std::vector mask(pow(ipw * iph, 2), std::numeric_limits::lowest()); - int mask_row = 0; - for (int y = 0; y < ph; y += grid_window) { - for (int x = 0; x < pw; x += grid_window) { - const int win_h = std::min(grid_window, ph - y); - const int win_w = std::min(grid_window, pw - x); - const int dst_0 = dst; - // group all tokens belong to the same window togather (to a continue range) - for (int dy = 0; dy < win_h; dy++) { - for (int dx = 0; dx < win_w; dx++) { - const int src = (y + dy) * pw + (x + dx); - GGML_ASSERT(src < (int)idx.size()); - GGML_ASSERT(dst < (int)inv_idx.size()); - idx [src] = dst; - inv_idx[dst] = src; - dst++; - } - } - for (int r=0; r < win_h * win_w * merge_ratio * merge_ratio; r++) { - int row_offset = mask_row * (ipw * iph); - std::fill( - mask.begin() + row_offset + (dst_0 * merge_ratio * merge_ratio), - mask.begin() + row_offset + (dst * merge_ratio * merge_ratio), - 0.0); - mask_row++; - } - } - } - set_input_i32("window_idx", idx); - set_input_i32("inv_window_idx", inv_idx); - set_input_f32("window_mask", mask); - } else { - for (int i = 0; i < ph * pw; i++) { - idx[i] = i; - } - } - const int mpow = merge_ratio * merge_ratio; - std::vector positions(n_pos * 4); - int ptr = 0; - for (int y = 0; y < iph; y += merge_ratio) { - for (int x = 0; x < ipw; x += merge_ratio) { - for (int dy = 0; dy < merge_ratio; dy++) { - for (int dx = 0; dx < merge_ratio; dx++) { 
- // Remap positions to match window-grouped order - auto remap = idx[ptr / mpow]; - remap = (remap * mpow) + (ptr % mpow); - positions[ remap] = y + dy; - positions[ num_patches + remap] = x + dx; - positions[2 * num_patches + remap] = y + dy; - positions[3 * num_patches + remap] = x + dx; - ptr++; - } - } - } - } set_input_i32("positions", positions); } break; case PROJECTOR_TYPE_PIXTRAL: diff --git a/tools/mtmd/models/utuvl.cpp b/tools/mtmd/models/utuvl.cpp index 558ebec687a..ce477caa1da 100644 --- a/tools/mtmd/models/utuvl.cpp +++ b/tools/mtmd/models/utuvl.cpp @@ -170,20 +170,6 @@ ggml_cgraph * clip_graph_utuvl::build() { model.mm_1_w, model.mm_1_b, FFN_GELU, -1); - // // 3. First linear layer - // embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings); - // embeddings = ggml_add(ctx0, embeddings, model.mm_0_b); - // cb(embeddings, "merger_fc1", -1); - - // // 4. GELU activation - // embeddings = ggml_gelu(ctx0, embeddings); - // cb(embeddings, "merger_gelu", -1); - - // // 5. Second linear layer - // embeddings = ggml_mul_mat(ctx0, model.mm_1_w, embeddings); - // embeddings = ggml_add(ctx0, embeddings, model.mm_1_b); - - // build the graph ggml_build_forward_expand(gf, embeddings); return gf; From 867709ca75db92137a544f94808a9f52ea7de9ee Mon Sep 17 00:00:00 2001 From: ttsyliu Date: Wed, 24 Dec 2025 17:31:16 +0800 Subject: [PATCH 3/6] fix bug --- convert_hf_to_gguf.py | 53 ++++++++++++++++------------------- convert_hf_to_gguf_update.py | 1 + gguf-py/gguf/constants.py | 3 +- gguf-py/gguf/gguf_writer.py | 7 +++-- src/llama-model.cpp | 12 ++++++-- tools/mtmd/clip-impl.h | 2 +- tools/mtmd/clip-model.h | 2 +- tools/mtmd/clip.cpp | 27 +++++++++++++++--- tools/mtmd/models/qwen2vl.cpp | 5 ++-- tools/mtmd/models/utuvl.cpp | 53 ++++++++++++++++++----------------- 10 files changed, 97 insertions(+), 68 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 2161149f3d6..4309bf34f97 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -1173,8 +1173,6 @@ def get_vocab_base_pre(self, tokenizer) -> str: if chkhsh == "877081d19cf6996e2c4ff0e1236341e9b7bde288f5311a56a937f0afbbb3aeb5": # ref: https://huggingface.co/deepseek-ai/DeepSeek-V3 res = "deepseek-v3" - if chkhsh == "9d70134b369a70e5735009b6de918f7581b5211f7c074d1f89f753aea8248af1": - res = "utu-vl" if chkhsh == "b3f499bb4255f8ca19fccd664443283318f2fd2414d5e0b040fbdd0cc195d6c5": # ref: https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B res = "deepseek-r1-qwen" @@ -1232,6 +1230,9 @@ def get_vocab_base_pre(self, tokenizer) -> str: if chkhsh == "4a2e2abae11ca2b86d570fc5b44be4d5eb5e72cc8f22dd136a94b37da83ab665": # ref: https://huggingface.co/KORMo-Team/KORMo-tokenizer res = "kormo" + if chkhsh == "9d70134b369a70e5735009b6de918f7581b5211f7c074d1f89f753aea8248af1": + # ref: ./Youtu-VL + res = "utu-vl" if res is None: logger.warning("\n") @@ -3808,15 +3809,10 @@ def set_gguf_parameters(self): else: self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.QWEN25VL) self.gguf_writer.add_vision_use_silu(True) - # find n_wa_pattern (window attention pattern) + # save window attention layers (full attention block indexes) fullatt_block_indexes = hparams.get("fullatt_block_indexes") assert fullatt_block_indexes is not None, "fullatt_block_indexes is required for qwen2_5_vl" - n_wa_pattern = fullatt_block_indexes[0] + 1 - # validate n_wa_pattern - for i in range(1, len(fullatt_block_indexes)): - if fullatt_block_indexes[i] - fullatt_block_indexes[i - 1] != n_wa_pattern: - raise ValueError(f"Invalid 
fullatt_block_indexes: {fullatt_block_indexes}") - self.gguf_writer.add_vision_n_wa_pattern(n_wa_pattern) + self.gguf_writer.add_vision_wa_layers(fullatt_block_indexes) else: raise ValueError(f"Unknown QwenVL model type: {self.global_config['model_type']}") # default values below are taken from HF tranformers code @@ -7214,26 +7210,26 @@ def set_gguf_parameters(self): self.gguf_writer.add_key_length_mla(hparams["qk_nope_head_dim"] + hparams["qk_rope_head_dim"]) self.gguf_writer.add_value_length_mla(hparams["v_head_dim"]) - if hparams.get("moe_intermediate_size") is not None: - self.gguf_writer.add_expert_feed_forward_length(hparams["moe_intermediate_size"]) + if (moe_intermediate_size := hparams.get("moe_intermediate_size")) is not None: + self.gguf_writer.add_expert_feed_forward_length(moe_intermediate_size) else: self.gguf_writer.add_expert_feed_forward_length(hparams.get("intermediate_size", 0)) - if hparams.get("n_routed_experts") is not None: - self.gguf_writer.add_expert_count(hparams["n_routed_experts"]) + if (n_routed_experts := hparams.get("n_routed_experts")) is not None: + self.gguf_writer.add_expert_count(n_routed_experts) - if hparams.get("n_shared_experts") is not None: - self.gguf_writer.add_expert_shared_count(hparams["n_shared_experts"]) + if (n_shared_experts := hparams.get("n_shared_experts")) is not None: + self.gguf_writer.add_expert_shared_count(n_shared_experts) else: self.gguf_writer.add_expert_shared_count(0) - if hparams.get("routed_scaling_factor") is not None: - self.gguf_writer.add_expert_weights_scale(hparams["routed_scaling_factor"]) + if (routed_scaling_factor := hparams.get("routed_scaling_factor")) is not None: + self.gguf_writer.add_expert_weights_scale(routed_scaling_factor) else: self.gguf_writer.add_expert_weights_scale(1.0) - if hparams.get("norm_topk_prob") is not None and hparams["norm_topk_prob"]: - self.gguf_writer.add_expert_weights_norm(hparams["norm_topk_prob"]) + if (norm_topk_prob := hparams.get("norm_topk_prob")) is not None and norm_topk_prob: + self.gguf_writer.add_expert_weights_norm(norm_topk_prob) self.gguf_writer.add_rope_dimension_count(hparams["qk_rope_head_dim"]) @@ -7244,7 +7240,6 @@ def set_gguf_parameters(self): self.gguf_writer.add_rope_scaling_yarn_log_mul(0.1 * rope_mscale_all) _experts: list[dict[str, Tensor]] | None = None - _token_embd: Tensor | None = None def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: # skip vision tensors and remove "language_model." 
for Kimi-VL @@ -7257,11 +7252,8 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter # skip lm_head.weight if tie_word_embeddings is True if self.hparams.get("tie_word_embeddings", False): - # Save token_embd for potential duplication as output if tie_word_embeddings is True - if name == "model.embed_tokens.weight": - self._token_embd = data_torch if name == "lm_head.weight" or name == "model.lm_head.weight": - logger.info("Skipping tied output layer 'lm_head.weight' - will duplicate from token_embd.weight") + logger.info("Skipping tied output layer 'lm_head.weight' (will use token_embd.weight)") return [] # rename e_score_correction_bias tensors @@ -7337,10 +7329,6 @@ def prepare_tensors(self): experts = [k for d in self._experts for k in d.keys()] if len(experts) > 0: raise ValueError(f"Unprocessed experts: {experts}") - if self._token_embd is not None: - logger.info("Model has tie_word_embeddings=True but no lm_head.weight found - adding output.weight from token_embd.weight") - output_name = self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT) - self.gguf_writer.add_tensor(output_name, self._token_embd.numpy()) @ModelBase.register("MiniMaxM2ForCausalLM") class MiniMaxM2Model(TextModel): @@ -10521,7 +10509,14 @@ def set_gguf_parameters(self): raise ValueError(f"Unsupported activation function for UTUVL: {hidden_act}") self.gguf_writer.add_vision_spatial_merge_size(self.hparams.get("spatial_merge_size", 2)) - + + window_size = self.hparams.get("window_size") + if window_size is not None: + self.gguf_writer.add_vision_window_size(window_size) + fullatt_block_indexes = self.hparams.get("fullatt_block_indexes") + assert fullatt_block_indexes is not None, "fullatt_block_indexes is required for utuvl" + self.gguf_writer.add_vision_wa_layers(layers=fullatt_block_indexes) + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: del bid # unused diff --git a/convert_hf_to_gguf_update.py b/convert_hf_to_gguf_update.py index 4378378309f..78a2a0168ce 100755 --- a/convert_hf_to_gguf_update.py +++ b/convert_hf_to_gguf_update.py @@ -145,6 +145,7 @@ class TOKENIZER_TYPE(IntEnum): {"name": "granite-docling", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/ibm-granite/granite-docling-258M", }, {"name": "minimax-m2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/MiniMaxAI/MiniMax-M2", }, {"name": "kormo", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/KORMo-Team/KORMo-tokenizer", }, + {"name": "utu-vl", "tokt": TOKENIZER_TYPE.BPE, "repo": "./Youtu-VL", }, ] # some models are known to be broken upstream, so we will skip them as exceptions diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 473ca8b407f..e06589097fe 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -293,8 +293,9 @@ class ClipVision: SPATIAL_MERGE_SIZE = "clip.vision.spatial_merge_size" USE_GELU = "clip.use_gelu" USE_SILU = "clip.use_silu" - N_WA_PATTERN = "clip.vision.n_wa_pattern" # used by qwen2.5vl + WA_LAYERS = "clip.vision.wa_layers" # used by qwen2.5vl and utuvl IS_DEEPSTACK_LAYERS = "clip.vision.is_deepstack_layers" + WINDOW_SIZE = "clip.vision.window_size" class Attention: HEAD_COUNT = "clip.vision.attention.head_count" diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py index 6a4a504f8dc..2521879f272 100644 --- a/gguf-py/gguf/gguf_writer.py +++ b/gguf-py/gguf/gguf_writer.py @@ -1128,12 +1128,15 @@ def add_vision_use_silu(self, value: bool) -> None: def 
add_vision_projector_scale_factor(self, value: int) -> None: self.add_uint32(Keys.ClipVision.Projector.SCALE_FACTOR, value) - def add_vision_n_wa_pattern(self, value: int) -> None: - self.add_uint32(Keys.ClipVision.N_WA_PATTERN, value) + def add_vision_wa_layers(self, layers: Sequence[int]) -> None: + self.add_array(Keys.ClipVision.WA_LAYERS, layers) def add_vision_is_deepstack_layers(self, layers: Sequence[bool]) -> None: self.add_array(Keys.ClipVision.IS_DEEPSTACK_LAYERS, layers) + def add_vision_window_size(self, value: int) -> None: + self.add_uint32(Keys.ClipVision.WINDOW_SIZE, value) + # audio models def add_audio_projection_dim(self, value: int) -> None: diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 0d5bcc64fe5..1bb98bed09f 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -4699,7 +4699,11 @@ bool llama_model::load_tensors(llama_model_loader & ml) { // output output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0); - output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0); + // try to load output.weight, if not found, use token_embd (tied embeddings) + output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED); + if (!output) { + output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); + } for (int i = 0; i < n_layer; ++i) { auto & layer = layers[i]; @@ -4762,7 +4766,11 @@ bool llama_model::load_tensors(llama_model_loader & ml) { // output output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0); - output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0); + // try to load output.weight, if not found, use token_embd (tied embeddings) + output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED); + if (!output) { + output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); + } for (int i = 0; i < n_layer; ++i) { auto & layer = layers[i]; diff --git a/tools/mtmd/clip-impl.h b/tools/mtmd/clip-impl.h index 113d65736cd..310a526fcb4 100644 --- a/tools/mtmd/clip-impl.h +++ b/tools/mtmd/clip-impl.h @@ -48,7 +48,7 @@ #define KEY_MM_PATCH_MERGE_TYPE "clip.vision.mm_patch_merge_type" #define KEY_IMAGE_GRID_PINPOINTS "clip.vision.image_grid_pinpoints" #define KEY_IMAGE_CROP_RESOLUTION "clip.vision.image_crop_resolution" -#define KEY_WIN_ATTN_PATTERN "clip.vision.n_wa_pattern" +#define KEY_WIN_ATTN_LAYERS "clip.vision.wa_layers" #define KEY_ATTN_WINDOW_SIZE "clip.vision.window_size" #define KEY_MINICPMV_VERSION "clip.minicpmv_version" #define KEY_MINICPMV_QUERY_NUM "clip.minicpmv_query_num" diff --git a/tools/mtmd/clip-model.h b/tools/mtmd/clip-model.h index b4c31cdde6b..dd21aa60bab 100644 --- a/tools/mtmd/clip-model.h +++ b/tools/mtmd/clip-model.h @@ -60,7 +60,7 @@ struct clip_hparams { int32_t image_crop_resolution; std::unordered_set vision_feature_layer; int32_t attn_window_size = 0; - int32_t n_wa_pattern = 0; + std::unordered_set wa_layers; // window attention full layers // audio int32_t n_mel_bins = 0; // whisper preprocessor diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp index 37f766f077f..66834da4d3f 100644 --- a/tools/mtmd/clip.cpp +++ b/tools/mtmd/clip.cpp @@ -1151,7 +1151,14 @@ struct clip_model_loader { { hparams.n_merge = 2; // default value for Qwen 2 and 2.5 get_u32(KEY_SPATIAL_MERGE_SIZE, hparams.n_merge, false); - get_u32(KEY_WIN_ATTN_PATTERN, hparams.n_wa_pattern, model.proj_type == 
PROJECTOR_TYPE_QWEN25VL); // only 2.5 requires it + // load window attention layers (only 2.5 requires it) + if (model.proj_type == PROJECTOR_TYPE_QWEN25VL) { + std::vector wa_layers_vec; + get_arr_int(KEY_WIN_ATTN_LAYERS, wa_layers_vec, true); + for (auto & layer : wa_layers_vec) { + hparams.wa_layers.insert(layer); + } + } // ref: https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct/blob/main/preprocessor_config.json hparams.set_limit_image_tokens(8, 4096); hparams.set_warmup_n_tokens(46*46); // avoid OOM on warmup @@ -1166,6 +1173,12 @@ struct clip_model_loader { { hparams.n_merge = 2; get_u32(KEY_SPATIAL_MERGE_SIZE, hparams.n_merge, false); + get_u32(KEY_ATTN_WINDOW_SIZE, hparams.attn_window_size, true); + std::vector wa_layers_vec; + get_arr_int(KEY_WIN_ATTN_LAYERS, wa_layers_vec, true); + for (auto & layer : wa_layers_vec) { + hparams.wa_layers.insert(layer); + } hparams.set_limit_image_tokens(8, 4096); hparams.set_warmup_n_tokens(46*46); // avoid OOM on warmup const int warn_min_pixels = 1024 * hparams.n_merge * hparams.n_merge * hparams.patch_size * hparams.patch_size; @@ -1240,7 +1253,13 @@ struct clip_model_loader { LOG_INF("%s: has_llava_proj: %d\n", __func__, hparams.has_llava_projector); LOG_INF("%s: minicpmv_version: %d\n", __func__, hparams.minicpmv_version); LOG_INF("%s: n_merge: %d\n", __func__, hparams.n_merge); - LOG_INF("%s: n_wa_pattern: %d\n", __func__, hparams.n_wa_pattern); + if (!hparams.wa_layers.empty()) { + LOG_INF("%s: wa_layers: ", __func__); + for (auto & layer : hparams.wa_layers) { + LOG_INF("%d ", layer); + } + LOG_INF("\n"); + } if (hparams.image_min_pixels > 0) { LOG_INF("%s: image_min_pixels: %d%s\n", __func__, hparams.image_min_pixels, hparams.custom_image_min_tokens > 0 ? " (custom value)" : ""); } @@ -3346,7 +3365,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima { // pw * ph = number of tokens output by ViT after apply patch merger // ipw * ipw = number of vision token been processed inside ViT - const bool use_window_attn = hparams.n_wa_pattern > 0; // for qwen2.5vl + const bool use_window_attn = !hparams.wa_layers.empty(); // for qwen2.5vl const int merge_ratio = 2; const int pw = image_size_width / patch_size / merge_ratio; const int ph = image_size_height / patch_size / merge_ratio; @@ -3357,7 +3376,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima std::vector inv_idx(ph * pw); if (use_window_attn) { - const int attn_window_size = ctx->model.proj_type == PROJECTOR_TYPE_QWEN25VL ? 112 : patch_size * 2 * 8; + const int attn_window_size = hparams.attn_window_size > 0 ? 
hparams.attn_window_size : 112; const int grid_window = attn_window_size / patch_size / merge_ratio; int dst = 0; // [num_vision_tokens, num_vision_tokens] attention mask tensor diff --git a/tools/mtmd/models/qwen2vl.cpp b/tools/mtmd/models/qwen2vl.cpp index 85f158bb1c0..389c5a94f63 100644 --- a/tools/mtmd/models/qwen2vl.cpp +++ b/tools/mtmd/models/qwen2vl.cpp @@ -5,8 +5,7 @@ ggml_cgraph * clip_graph_qwen2vl::build() { GGML_ASSERT(model.class_embedding == nullptr); const int batch_size = 1; - const bool use_window_attn = hparams.n_wa_pattern > 0; - const int n_wa_pattern = hparams.n_wa_pattern; + const bool use_window_attn = !hparams.wa_layers.empty(); const int n_pos = n_patches; const int num_position_ids = n_pos * 4; // m-rope requires 4 dim per position @@ -79,7 +78,7 @@ ggml_cgraph * clip_graph_qwen2vl::build() { // loop over layers for (int il = 0; il < n_layer; il++) { const auto & layer = model.layers[il]; - const bool full_attn = use_window_attn ? (il + 1) % n_wa_pattern == 0 : true; + const bool full_attn = use_window_attn ? hparams.wa_layers.count(il) > 0 : true; ggml_tensor * cur = inpL; // inpL = residual, cur = hidden_states diff --git a/tools/mtmd/models/utuvl.cpp b/tools/mtmd/models/utuvl.cpp index ce477caa1da..aa3c9857dc9 100644 --- a/tools/mtmd/models/utuvl.cpp +++ b/tools/mtmd/models/utuvl.cpp @@ -3,7 +3,7 @@ ggml_cgraph * clip_graph_utuvl::build() { GGML_ASSERT(model.class_embedding == nullptr); const int batch_size = 1; - const bool use_window_attn = true; + const bool use_window_attn = !hparams.wa_layers.empty(); const int n_pos = n_patches; const int num_position_ids = n_pos * 4; const int m = 2; @@ -17,29 +17,32 @@ ggml_cgraph * clip_graph_utuvl::build() { ggml_tensor * inp = build_inp_raw(); - inp = ggml_reshape_4d( - ctx0, inp, - Wm * m * patch_size, m * patch_size, Hm, 3); - inp = ggml_permute(ctx0, inp, 1, 2, 3, 0); - inp = ggml_cont_4d( - ctx0, inp, - m * patch_size * 3, Wm, m * patch_size, Hm); - - inp = ggml_permute(ctx0, inp, 0, 2, 1, 3); - inp = ggml_cont_4d( - ctx0, inp, - m * patch_size * 3, patch_size, m, Hm * Wm); - - inp = ggml_permute(ctx0, inp, 1, 0, 2, 3); - inp = ggml_cont_4d( - ctx0, inp, - patch_size, 3, patch_size, Hm * Wm * m * m); - - inp = ggml_permute(ctx0, inp, 2, 0, 1, 3); - inp = ggml_cont_3d( - ctx0, inp, - 3*patch_size* patch_size, Hm * Wm * m * m, 1); - + // change conv3d to linear + // reshape and permute to get patches, permute from (patch_size, m, Wm, patch_size, m, Hm, C) to (C, patch_size, patch_size, m, m, Wm, Hm) + { + inp = ggml_reshape_4d( + ctx0, inp, + Wm * m * patch_size, m * patch_size, Hm, 3); + inp = ggml_permute(ctx0, inp, 1, 2, 3, 0); + inp = ggml_cont_4d( + ctx0, inp, + m * patch_size * 3, Wm, m * patch_size, Hm); + + inp = ggml_permute(ctx0, inp, 0, 2, 1, 3); + inp = ggml_cont_4d( + ctx0, inp, + m * patch_size * 3, patch_size, m, Hm * Wm); + + inp = ggml_permute(ctx0, inp, 1, 0, 2, 3); + inp = ggml_cont_4d( + ctx0, inp, + patch_size, 3, patch_size, Hm * Wm * m * m); + + inp = ggml_permute(ctx0, inp, 2, 0, 1, 3); + inp = ggml_cont_3d( + ctx0, inp, + 3*patch_size* patch_size, Hm * Wm * m * m, 1); + } inp = ggml_mul_mat(ctx0, model.patch_embeddings_0, inp); if (model.patch_bias) { @@ -85,7 +88,7 @@ ggml_cgraph * clip_graph_utuvl::build() { // loop over layers for (int il = 0; il < n_layer; il++) { const auto & layer = model.layers[il]; - const bool full_attn = (il + 1) % 8 == 0 || il == n_layer - 1; + const bool full_attn = use_window_attn ? 
hparams.wa_layers.count(il) > 0 : true; ggml_tensor * cur = inpL; // inpL = residual, cur = hidden_states From 3e816b28024aac1395a7ab52abbe399370bace4f Mon Sep 17 00:00:00 2001 From: ttsyliu Date: Thu, 25 Dec 2025 11:05:16 +0800 Subject: [PATCH 4/6] revert qwen2 code & support rsplit in minja.hpp --- convert_hf_to_gguf.py | 9 +++++++-- gguf-py/gguf/constants.py | 3 ++- gguf-py/gguf/gguf_writer.py | 3 +++ tools/mtmd/clip-impl.h | 1 + tools/mtmd/clip-model.h | 1 + tools/mtmd/clip.cpp | 12 +++--------- tools/mtmd/models/qwen2vl.cpp | 5 +++-- vendor/minja/minja.hpp | 25 ++++++++++++++++++++++++- 8 files changed, 44 insertions(+), 15 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 4309bf34f97..e1deb089cd3 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -3809,10 +3809,15 @@ def set_gguf_parameters(self): else: self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.QWEN25VL) self.gguf_writer.add_vision_use_silu(True) - # save window attention layers (full attention block indexes) + # find n_wa_pattern (window attention pattern) fullatt_block_indexes = hparams.get("fullatt_block_indexes") assert fullatt_block_indexes is not None, "fullatt_block_indexes is required for qwen2_5_vl" - self.gguf_writer.add_vision_wa_layers(fullatt_block_indexes) + n_wa_pattern = fullatt_block_indexes[0] + 1 + # validate n_wa_pattern + for i in range(1, len(fullatt_block_indexes)): + if fullatt_block_indexes[i] - fullatt_block_indexes[i - 1] != n_wa_pattern: + raise ValueError(f"Invalid fullatt_block_indexes: {fullatt_block_indexes}") + self.gguf_writer.add_vision_n_wa_pattern(n_wa_pattern) else: raise ValueError(f"Unknown QwenVL model type: {self.global_config['model_type']}") # default values below are taken from HF tranformers code diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index e06589097fe..4c1741a12d2 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -293,7 +293,8 @@ class ClipVision: SPATIAL_MERGE_SIZE = "clip.vision.spatial_merge_size" USE_GELU = "clip.use_gelu" USE_SILU = "clip.use_silu" - WA_LAYERS = "clip.vision.wa_layers" # used by qwen2.5vl and utuvl + N_WA_PATTERN = "clip.vision.n_wa_pattern" # used by qwen2.5vl + WA_LAYERS = "clip.vision.wa_layers" # used by utuvl IS_DEEPSTACK_LAYERS = "clip.vision.is_deepstack_layers" WINDOW_SIZE = "clip.vision.window_size" diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py index 2521879f272..937550bb53f 100644 --- a/gguf-py/gguf/gguf_writer.py +++ b/gguf-py/gguf/gguf_writer.py @@ -1128,6 +1128,9 @@ def add_vision_use_silu(self, value: bool) -> None: def add_vision_projector_scale_factor(self, value: int) -> None: self.add_uint32(Keys.ClipVision.Projector.SCALE_FACTOR, value) + def add_vision_n_wa_pattern(self, value: int) -> None: + self.add_uint32(Keys.ClipVision.N_WA_PATTERN, value) + def add_vision_wa_layers(self, layers: Sequence[int]) -> None: self.add_array(Keys.ClipVision.WA_LAYERS, layers) diff --git a/tools/mtmd/clip-impl.h b/tools/mtmd/clip-impl.h index 310a526fcb4..90f53f0cdb8 100644 --- a/tools/mtmd/clip-impl.h +++ b/tools/mtmd/clip-impl.h @@ -48,6 +48,7 @@ #define KEY_MM_PATCH_MERGE_TYPE "clip.vision.mm_patch_merge_type" #define KEY_IMAGE_GRID_PINPOINTS "clip.vision.image_grid_pinpoints" #define KEY_IMAGE_CROP_RESOLUTION "clip.vision.image_crop_resolution" +#define KEY_WIN_ATTN_PATTERN "clip.vision.n_wa_pattern" #define KEY_WIN_ATTN_LAYERS "clip.vision.wa_layers" #define KEY_ATTN_WINDOW_SIZE "clip.vision.window_size" #define 
diff --git a/tools/mtmd/clip-model.h b/tools/mtmd/clip-model.h
index dd21aa60bab..3b17e5a8e82 100644
--- a/tools/mtmd/clip-model.h
+++ b/tools/mtmd/clip-model.h
@@ -60,6 +60,7 @@ struct clip_hparams {
     int32_t image_crop_resolution;
     std::unordered_set<int32_t> vision_feature_layer;
     int32_t attn_window_size = 0;
+    int32_t n_wa_pattern = 0;
    std::unordered_set<int32_t> wa_layers; // window attention full layers

     // audio

diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp
index 66834da4d3f..acf318c8893 100644
--- a/tools/mtmd/clip.cpp
+++ b/tools/mtmd/clip.cpp
@@ -1151,14 +1151,7 @@ struct clip_model_loader {

                 hparams.n_merge = 2; // default value for Qwen 2 and 2.5
                 get_u32(KEY_SPATIAL_MERGE_SIZE, hparams.n_merge, false);
-                // load window attention layers (only 2.5 requires it)
-                if (model.proj_type == PROJECTOR_TYPE_QWEN25VL) {
-                    std::vector<int> wa_layers_vec;
-                    get_arr_int(KEY_WIN_ATTN_LAYERS, wa_layers_vec, true);
-                    for (auto & layer : wa_layers_vec) {
-                        hparams.wa_layers.insert(layer);
-                    }
-                }
+                get_u32(KEY_WIN_ATTN_PATTERN, hparams.n_wa_pattern, model.proj_type == PROJECTOR_TYPE_QWEN25VL); // only 2.5 requires it
                 // ref: https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct/blob/main/preprocessor_config.json
                 hparams.set_limit_image_tokens(8, 4096);
                 hparams.set_warmup_n_tokens(46*46); // avoid OOM on warmup
@@ -1253,6 +1246,7 @@ struct clip_model_loader {
         LOG_INF("%s: has_llava_proj: %d\n", __func__, hparams.has_llava_projector);
         LOG_INF("%s: minicpmv_version: %d\n", __func__, hparams.minicpmv_version);
         LOG_INF("%s: n_merge: %d\n", __func__, hparams.n_merge);
+        LOG_INF("%s: n_wa_pattern: %d\n", __func__, hparams.n_wa_pattern);
         if (!hparams.wa_layers.empty()) {
             LOG_INF("%s: wa_layers: ", __func__);
             for (auto & layer : hparams.wa_layers) {
@@ -3365,7 +3359,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
     {
         // pw * ph = number of tokens output by ViT after apply patch merger
         // ipw * ipw = number of vision token been processed inside ViT
-        const bool use_window_attn = !hparams.wa_layers.empty(); // for qwen2.5vl
+        const bool use_window_attn = ctx->model.proj_type == PROJECTOR_TYPE_QWEN25VL ? hparams.n_wa_pattern > 0 : !hparams.wa_layers.empty();
         const int merge_ratio = 2;
         const int pw = image_size_width / patch_size / merge_ratio;
         const int ph = image_size_height / patch_size / merge_ratio;

diff --git a/tools/mtmd/models/qwen2vl.cpp b/tools/mtmd/models/qwen2vl.cpp
index 389c5a94f63..85f158bb1c0 100644
--- a/tools/mtmd/models/qwen2vl.cpp
+++ b/tools/mtmd/models/qwen2vl.cpp
@@ -5,7 +5,8 @@ ggml_cgraph * clip_graph_qwen2vl::build() {
     GGML_ASSERT(model.class_embedding == nullptr);

     const int batch_size = 1;
-    const bool use_window_attn = !hparams.wa_layers.empty();
+    const bool use_window_attn = hparams.n_wa_pattern > 0;
+    const int n_wa_pattern = hparams.n_wa_pattern;
     const int n_pos = n_patches;
     const int num_position_ids = n_pos * 4; // m-rope requires 4 dim per position

@@ -78,7 +79,7 @@ ggml_cgraph * clip_graph_qwen2vl::build() {
     // loop over layers
     for (int il = 0; il < n_layer; il++) {
         const auto & layer = model.layers[il];
-        const bool full_attn = use_window_attn ? hparams.wa_layers.count(il) > 0 : true;
+        const bool full_attn = use_window_attn ? (il + 1) % n_wa_pattern == 0 : true;

         ggml_tensor * cur = inpL; // inpL = residual, cur = hidden_states
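The two layer-selection rules that appear above pick the same layers whenever the full-attention blocks are evenly spaced: qwen2vl.cpp goes back to the modulo test on n_wa_pattern, while utuvl.cpp keeps the explicit wa_layers set loaded from clip.vision.wa_layers. A small sketch (not part of the patches; the layer count and set contents are assumed values):

    n_layer = 32
    n_wa_pattern = 8              # qwen2.5vl: every 8th layer gets full attention
    wa_layers = {7, 15, 23, 31}   # utuvl style: explicit 0-based layer indexes (assumed)

    full_by_pattern = [il for il in range(n_layer) if (il + 1) % n_wa_pattern == 0]
    full_by_set     = [il for il in range(n_layer) if il in wa_layers]

    assert full_by_pattern == full_by_set == [7, 15, 23, 31]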
diff --git a/vendor/minja/minja.hpp b/vendor/minja/minja.hpp
index 873ece8c180..905fc47f8a6 100644
--- a/vendor/minja/minja.hpp
+++ b/vendor/minja/minja.hpp
@@ -1446,7 +1446,7 @@ struct ArgumentsExpression {
 static std::string strip(const std::string & s, const std::string & chars = "", bool left = true, bool right = true) {
     auto charset = chars.empty() ? " \t\n\r" : chars;
     auto start = left ? s.find_first_not_of(charset) : 0;
-    if (start == std::string::npos) return "";
+    if (start == std::string::npos) return "";
     auto end = right ? s.find_last_not_of(charset) : s.size() - 1;
     return s.substr(start, end - start + 1);
 }
@@ -1464,6 +1464,20 @@ static std::vector<std::string> split(const std::string & s, const std::string &
     return result;
 }

+static std::vector<std::string> rsplit(const std::string & s, const std::string & sep) {
+    std::vector<std::string> result;
+    size_t end = s.length();
+    size_t pos = s.rfind(sep);
+    while (pos != std::string::npos) {
+        result.insert(result.begin(), s.substr(pos + sep.length(), end - pos - sep.length()));
+        end = pos;
+        if (pos == 0) break;
+        pos = s.rfind(sep, pos - 1);
+    }
+    result.insert(result.begin(), s.substr(0, end));
+    return result;
+}
+
 static std::string capitalize(const std::string & s) {
     if (s.empty()) return s;
     auto result = s;
@@ -1573,6 +1587,15 @@ class MethodCallExpr : public Expression {
                     result.push_back(Value(part));
                 }
                 return result;
+            } else if (method->get_name() == "rsplit") {
+                vargs.expectArgs("rsplit method", {1, 1}, {0, 0});
+                auto sep = vargs.args[0].get<std::string>();
+                auto parts = rsplit(str, sep);
+                Value result = Value::array();
+                for (const auto& part : parts) {
+                    result.push_back(Value(part));
+                }
+                return result;
             } else if (method->get_name() == "capitalize") {
                 vargs.expectArgs("capitalize method", {0, 0}, {0, 0});
                 return Value(capitalize(str));

From 3ec91fb9a7fb5fde3faf25b4ca14700603fda4fa Mon Sep 17 00:00:00 2001
From: ttsyliu
Date: Thu, 25 Dec 2025 11:22:55 +0800
Subject: [PATCH 5/6] update warning info

---
 tools/mtmd/clip.cpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp
index acf318c8893..9a888a4f00a 100644
--- a/tools/mtmd/clip.cpp
+++ b/tools/mtmd/clip.cpp
@@ -1172,11 +1172,11 @@ struct clip_model_loader {
                 for (auto & layer : wa_layers_vec) {
                     hparams.wa_layers.insert(layer);
                 }
-                hparams.set_limit_image_tokens(8, 4096);
-                hparams.set_warmup_n_tokens(46*46); // avoid OOM on warmup
-                const int warn_min_pixels = 1024 * hparams.n_merge * hparams.n_merge * hparams.patch_size * hparams.patch_size;
+                hparams.set_limit_image_tokens(1, 62500);
+                hparams.set_warmup_n_tokens(16*16); // avoid OOM on warmup
+                const int warn_min_pixels = 1 * hparams.n_merge * hparams.n_merge * hparams.patch_size * hparams.patch_size;
                 if (hparams.image_min_pixels < warn_min_pixels) {
-                    LOG_WRN("%s: Youtu-VL models require at minimum 1024 image tokens to function correctly on grounding tasks\n", __func__);
+                    LOG_WRN("%s: Youtu-VL models require at least 1 image token to function correctly on grounding tasks\n", __func__);
                 }
             } break;
         case PROJECTOR_TYPE_GLM4V:
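The rsplit support added to vendor/minja/minja.hpp in the previous patch only accepts a separator (expectArgs pins it to exactly one argument), so it behaves like Python's str.rsplit without a count, which yields the same parts as split. A quick check of the intended semantics in Python (not part of the patches; the sample string is made up):

    s = "tool_call.function.name"
    assert s.rsplit(".") == s.split(".") == ["tool_call", "function", "name"]
    # Python's optional count limits splits from the right; the C++ helper above takes no count
    assert s.rsplit(".", 1) == ["tool_call.function", "name"]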
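The limits changed in this patch are expressed in merged vision tokens, and the warn_min_pixels line shows the conversion: one merged token covers n_merge * n_merge * patch_size * patch_size pixels. A rough back-of-the-envelope sketch (not part of the patches; patch_size = 14 and n_merge = 2 are assumed values, not stated in this hunk):

    patch_size, n_merge = 14, 2                                      # assumed for illustration
    pixels_per_token = n_merge * n_merge * patch_size * patch_size   # 784 pixels per merged token

    min_pixels = 1 * pixels_per_token        # new lower bound: 784 pixels (about 28x28)
    max_pixels = 62500 * pixels_per_token    # upper bound: 49,000,000 pixels (about 7000x7000)
    print(min_pixels, max_pixels)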
From 251852a6e068b5abc73a29a8343c231d37e8a960 Mon Sep 17 00:00:00 2001
From: ttsyliu
Date: Thu, 25 Dec 2025 11:28:12 +0800
Subject: [PATCH 6/6] fix annotation

---
 tools/mtmd/clip.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp
index 9a888a4f00a..637180f07ac 100644
--- a/tools/mtmd/clip.cpp
+++ b/tools/mtmd/clip.cpp
@@ -2729,7 +2729,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
         const int max_num_patches = params.image_max_pixels > 0
             ? params.image_max_pixels / (patch_size * patch_size)
             : 256;
-        // Binary search for optimal scale to fit within max_num_patches
+        // Linear search for optimal scale to fit within max_num_patches
         float scale = 1.0f;
         int target_height = original_size.height;
         int target_width = original_size.width;
@@ -2742,7 +2742,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
             return std::max(align_size, aligned);
         };

-        // Binary search with 0.02 step size
+        // Linear search with 0.02 step size
         while (scale > 0.0f) {
             target_height = get_scaled_image_size(scale, original_size.height);
             target_width = get_scaled_image_size(scale, original_size.width);
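The comments corrected above describe a linear search: start at scale 1.0 and step down by 0.02 until the patch-aligned size fits the patch budget. A compact Python sketch of that loop (not part of the patches; the helper names, default values and example size are assumptions, and the real code does the alignment inside its get_scaled_image_size lambda):

    def aligned(v: int, align_size: int) -> int:
        # round down to a multiple of align_size, but never below one unit
        return max(align_size, (v // align_size) * align_size)

    def fit_to_patch_budget(width: int, height: int, patch_size: int = 14, max_num_patches: int = 256):
        scale = 1.0
        while scale > 0.0:
            w = aligned(int(width * scale), patch_size)
            h = aligned(int(height * scale), patch_size)
            if (w // patch_size) * (h // patch_size) <= max_num_patches:
                return w, h
            scale -= 0.02   # the 0.02 step size named in the comment
        return patch_size, patch_size

    print(fit_to_patch_budget(1280, 960))   # a width/height whose patch grid fits the budget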