58 changes: 58 additions & 0 deletions convert_hf_to_gguf.py
@@ -1257,6 +1257,9 @@ def get_vocab_base_pre(self, tokenizer) -> str:
if chkhsh == "6c81ce329e0802883b22eabab0d3fa48357337ef1ecb45443828bf1f6254833f":
# ref: https://huggingface.co/LGAI-EXAONE/K-EXAONE-236B-A23B
res = "exaone-moe"
if chkhsh == "27d87c17bcffe5262a1e80b2ceb9a5e002c4f8a17d796fd5afac9180dd8bd96e":
# ref: https://huggingface.co/meituan-longcat/LongCat-Flash-Chat
res = "longcat-flash"

if res is None:
logger.warning("\n")
@@ -10914,6 +10917,61 @@ def set_vocab(self):
special_vocab.add_to_gguf(self.gguf_writer)


@ModelBase.register("LongcatFlashForCausalLM")
class LongcatFlashModel(DeepseekV2Model):
model_arch = gguf.MODEL_ARCH.LONGCAT_FLASH

def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
# the model uses two sub-blocks per layer, so double the block count
self.block_count = self.hparams["num_layers"] * 2
self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count)
# map hparam names for compatibility with the deepseek2 base class
self.hparams["num_hidden_layers"] = self.block_count
self.hparams["num_key_value_heads"] = self.hparams["num_attention_heads"]
self.hparams["intermediate_size"] = self.hparams["ffn_hidden_size"]
self.hparams["moe_intermediate_size"] = self.hparams["expert_ffn_hidden_size"]
self.hparams["num_experts_per_tok"] = self.hparams["moe_topk"]

def set_gguf_parameters(self):
super().set_gguf_parameters()

zero_expert_num = self.hparams["zero_expert_num"]
zero_expert_type = self.hparams["zero_expert_type"]
assert zero_expert_type == "identity", "cpp implementation only supports 'identity' type"
self.gguf_writer.add_n_zero_experts(zero_expert_num)

def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
if bid is not None:
bid = bid * 2 # double block id

# Rename rules (each HF layer N maps to GGUF blocks 2N and 2N+1), examples:
# model.layers.1.input_layernorm.0.weight --> model.layers.2.input_layernorm.weight
# model.layers.1.input_layernorm.1.weight --> model.layers.3.input_layernorm.weight
# model.layers.1.mlp.experts.0 --> model.layers.2.mlp.experts.0 (special case: experts keep their sub-index)

name = name.replace('.mlps.', '.mlp.')
name = name.replace('.router.classifier.', '.gate.')
name = name.replace('.router.e_score_correction_bias', '.e_score_correction_bias')

# handle sub-block remapping
match = re.match(r'.*\.(\d+)\.([a-z_\.]+)\.(\d+)\..*', name)
if match and ".mlp.experts." not in name:
# convert block id from N.(name).M to (2N+M).(name)
N = int(match.group(1))
middle = match.group(2)
M = int(match.group(3))
assert N * 2 == bid
new_bid = N * 2 + M
new_name = re.sub(r'\.(\d+)\.([a-z_\.]+)\.(\d+)\.', f'.{new_bid}.{middle}.', name)
yield from super().modify_tensors(data_torch, new_name, new_bid)
else:
# rewrite the block id inside the name (experts tensors keep their sub-index)
if bid is not None:
name = name.replace(f'.{bid // 2}.', f'.{bid}.', 1)
yield from super().modify_tensors(data_torch, name, bid)
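
For reference, a minimal standalone sketch of the renaming rule above (the tensor names and the remap_longcat_name helper are illustrative only, not part of the converter; the real conversion still goes through super().modify_tensors()):

import re

def remap_longcat_name(name: str, bid: int) -> tuple[str, int]:
    # bid arrives already doubled (HF layer N -> GGUF block 2N)
    match = re.match(r'.*\.(\d+)\.([a-z_\.]+)\.(\d+)\..*', name)
    if match and ".mlp.experts." not in name:
        n, middle, m = int(match.group(1)), match.group(2), int(match.group(3))
        new_bid = n * 2 + m  # sub-block M of HF layer N lands in GGUF block 2N+M
        return re.sub(r'\.(\d+)\.([a-z_\.]+)\.(\d+)\.', f'.{new_bid}.{middle}.', name), new_bid
    # experts keep their sub-index; only the block id is rewritten
    return name.replace(f'.{bid // 2}.', f'.{bid}.', 1), bid

assert remap_longcat_name("model.layers.1.input_layernorm.0.weight", 2)[0] == "model.layers.2.input_layernorm.weight"
assert remap_longcat_name("model.layers.1.input_layernorm.1.weight", 2)[0] == "model.layers.3.input_layernorm.weight"
assert remap_longcat_name("model.layers.1.mlp.experts.0.gate_proj.weight", 2)[0] == "model.layers.2.mlp.experts.0.gate_proj.weight"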


###### CONVERSION LOGIC ######


1 change: 1 addition & 0 deletions convert_hf_to_gguf_update.py
@@ -148,6 +148,7 @@ class TOKENIZER_TYPE(IntEnum):
{"name": "youtu", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tencent/Youtu-LLM-2B", },
{"name": "solar-open", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/upstage/Solar-Open-100B", },
{"name": "exaone-moe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LGAI-EXAONE/K-EXAONE-236B-A23B", },
{"name": "longcat-flash", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/meituan-longcat/LongCat-Flash-Chat", },
]

# some models are known to be broken upstream, so we will skip them as exceptions
33 changes: 33 additions & 0 deletions gguf-py/gguf/constants.py
@@ -148,6 +148,7 @@ class LLM:
EMBD_LENGTH_PER_LAYER_INP = "{arch}.embedding_length_per_layer_input"
DENSE_FEAT_IN_SIZE = "{arch}.{dense}_feat_in"
DENSE_FEAT_OUT_SIZE = "{arch}.{dense}_feat_out"
N_ZERO_EXPERTS = "{arch}.n_zero_experts" # longcat-flash

class Attention:
HEAD_COUNT = "{arch}.attention.head_count"
@@ -459,6 +460,7 @@ class MODEL_ARCH(IntEnum):
MIMO2 = auto()
LLAMA_EMBED = auto()
MAINCODER = auto()
LONGCAT_FLASH = auto()


class VISION_PROJECTOR_TYPE(IntEnum):
@@ -880,6 +882,7 @@ class MODEL_TENSOR(IntEnum):
MODEL_ARCH.MIMO2: "mimo2",
MODEL_ARCH.LLAMA_EMBED: "llama-embed",
MODEL_ARCH.MAINCODER: "maincoder",
MODEL_ARCH.LONGCAT_FLASH: "longcat-flash",
}

VISION_PROJECTOR_TYPE_NAMES: dict[VISION_PROJECTOR_TYPE, str] = {
@@ -3377,6 +3380,36 @@ class MODEL_TENSOR(IntEnum):
MODEL_TENSOR.FFN_DOWN,
MODEL_TENSOR.FFN_UP,
],
MODEL_ARCH.LONGCAT_FLASH: [
MODEL_TENSOR.TOKEN_EMBD,
MODEL_TENSOR.OUTPUT_NORM,
MODEL_TENSOR.OUTPUT,
MODEL_TENSOR.ROPE_FREQS,
MODEL_TENSOR.ATTN_NORM,
MODEL_TENSOR.ATTN_Q,
MODEL_TENSOR.ATTN_Q_A,
MODEL_TENSOR.ATTN_Q_B,
MODEL_TENSOR.ATTN_KV_A_MQA,
MODEL_TENSOR.ATTN_KV_B,
MODEL_TENSOR.ATTN_K_B,
MODEL_TENSOR.ATTN_V_B,
MODEL_TENSOR.ATTN_Q_A_NORM,
MODEL_TENSOR.ATTN_KV_A_NORM,
MODEL_TENSOR.ATTN_OUT,
MODEL_TENSOR.ATTN_ROT_EMBD,
MODEL_TENSOR.FFN_GATE_INP,
MODEL_TENSOR.FFN_NORM,
MODEL_TENSOR.FFN_GATE,
MODEL_TENSOR.FFN_DOWN,
MODEL_TENSOR.FFN_UP,
MODEL_TENSOR.FFN_GATE_EXP,
MODEL_TENSOR.FFN_DOWN_EXP,
MODEL_TENSOR.FFN_UP_EXP,
MODEL_TENSOR.FFN_GATE_SHEXP,
MODEL_TENSOR.FFN_DOWN_SHEXP,
MODEL_TENSOR.FFN_UP_SHEXP,
MODEL_TENSOR.FFN_EXP_PROBS_B,
],
# TODO
}

3 changes: 3 additions & 0 deletions gguf-py/gguf/gguf_writer.py
@@ -1075,6 +1075,9 @@ def add_eom_token_id(self, id: int) -> None:
def add_classifier_output_labels(self, labels: Sequence[str]) -> None:
self.add_array(Keys.Classifier.OUTPUT_LABELS.format(arch=self.arch), labels)

def add_n_zero_experts(self, n: int) -> None:
self.add_uint32(Keys.LLM.N_ZERO_EXPERTS.format(arch=self.arch), n)
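
A hedged usage sketch of the new writer method (the file name and the value 256 are placeholders, not LongCat-Flash's actual config):

from gguf import GGUFWriter

w = GGUFWriter("model.gguf", "longcat-flash")
w.add_n_zero_experts(256)  # stored as uint32 under 'longcat-flash.n_zero_experts'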

# for vision models

def add_clip_has_vision_encoder(self, value: bool) -> None:
1 change: 1 addition & 0 deletions src/CMakeLists.txt
@@ -89,6 +89,7 @@ add_library(llama
models/llada.cpp
models/llama-iswa.cpp
models/llama.cpp
models/longcat-flash.cpp
models/maincoder.cpp
models/mamba.cpp
models/mimo2-iswa.cpp
3 changes: 3 additions & 0 deletions src/llama-arch.cpp
@@ -120,6 +120,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
{ LLM_ARCH_MIMO2, "mimo2" },
{ LLM_ARCH_LLAMA_EMBED, "llama-embed" },
{ LLM_ARCH_MAINCODER, "maincoder" },
{ LLM_ARCH_LONGCAT_FLASH, "longcat-flash" },
{ LLM_ARCH_UNKNOWN, "(unknown)" },
};

@@ -191,6 +192,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
{ LLM_KV_EMBEDDING_SCALE, "%s.embedding_scale" },
{ LLM_KV_TOKEN_SHIFT_COUNT, "%s.token_shift_count" },
{ LLM_KV_INTERLEAVE_MOE_LAYER_STEP, "%s.interleave_moe_layer_step" },
{ LLM_KV_N_ZERO_EXPERTS, "%s.n_zero_experts" },

{ LLM_KV_ATTENTION_HEAD_COUNT, "%s.attention.head_count" },
{ LLM_KV_ATTENTION_HEAD_COUNT_KV, "%s.attention.head_count_kv" },
@@ -1475,6 +1477,7 @@ static std::set<llm_tensor> llm_get_tensor_names(llm_arch arch) {
LLM_TENSOR_FFN_UP_SHEXP,
};
case LLM_ARCH_DEEPSEEK2:
case LLM_ARCH_LONGCAT_FLASH:
return {
LLM_TENSOR_TOKEN_EMBD,
LLM_TENSOR_OUTPUT_NORM,
2 changes: 2 additions & 0 deletions src/llama-arch.h
@@ -124,6 +124,7 @@ enum llm_arch {
LLM_ARCH_MIMO2,
LLM_ARCH_LLAMA_EMBED,
LLM_ARCH_MAINCODER,
LLM_ARCH_LONGCAT_FLASH,
LLM_ARCH_UNKNOWN,
};

@@ -195,6 +196,7 @@ enum llm_kv {
LLM_KV_EMBEDDING_SCALE,
LLM_KV_TOKEN_SHIFT_COUNT,
LLM_KV_INTERLEAVE_MOE_LAYER_STEP,
LLM_KV_N_ZERO_EXPERTS,

LLM_KV_ATTENTION_HEAD_COUNT,
LLM_KV_ATTENTION_HEAD_COUNT_KV,
13 changes: 11 additions & 2 deletions src/llama-graph.cpp
@@ -1114,6 +1114,9 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
const int64_t n_tokens = cur->ne[1];
const bool weight_before_ffn = arch == LLM_ARCH_LLAMA4; // for llama4, we apply the sigmoid-ed weights before the FFN

// longcat-flash routes over additional zero-computation experts, so the router produces n_expert + n_zero_experts logits
const int64_t n_probs = n_expert + hparams.n_zero_experts;

ggml_tensor * logits = nullptr;

if (probs_in == nullptr) {
@@ -1169,7 +1172,7 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
// select top n_group_used expert groups
// https://huggingface.co/deepseek-ai/DeepSeek-V3/blob/e815299b0bcbac849fa540c768ef21845365c9eb/modeling_deepseek.py#L440-L457
if (hparams.n_expert_groups > 1 && n_tokens > 0) {
const int64_t n_exp_per_group = n_expert / hparams.n_expert_groups;
const int64_t n_exp_per_group = n_probs / hparams.n_expert_groups;

// organize experts into n_expert_groups
ggml_tensor * selection_groups = ggml_reshape_3d(ctx0, selection_probs, n_exp_per_group, hparams.n_expert_groups, n_tokens); // [n_exp_per_group, n_expert_groups, n_tokens]
@@ -1187,7 +1190,7 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
// mask out the other groups
selection_probs = ggml_get_rows(ctx0, selection_groups, expert_groups); // [n_exp_per_group, n_group_used, n_tokens]
selection_probs = ggml_set_rows(ctx0, ggml_fill(ctx0, selection_groups, -INFINITY), selection_probs, expert_groups); // [n_exp_per_group, n_expert_groups, n_tokens]
selection_probs = ggml_reshape_2d(ctx0, selection_probs, n_expert, n_tokens); // [n_expert, n_tokens]
selection_probs = ggml_reshape_2d(ctx0, selection_probs, n_probs, n_tokens); // [n_probs, n_tokens]
cb(selection_probs, "ffn_moe_probs_masked", il);
}

@@ -1201,6 +1204,12 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
ggml_tensor * f_sel = ggml_cast(ctx0, selected_experts, GGML_TYPE_F32);
selected_experts = ggml_cast(ctx0, ggml_scale(ctx0, f_sel, 1.0f / float(hparams.n_group_experts)), GGML_TYPE_I32);
probs = ggml_reshape_3d(ctx0, probs, 1, hparams.n_expert, n_tokens);

} else if (arch == LLM_ARCH_LONGCAT_FLASH && hparams.n_zero_experts > 0) {
// TODO (hard): how to implement zero-computation experts here?
probs = ggml_reshape_3d(ctx0, probs, 1, n_probs, n_tokens);

} else {
probs = ggml_reshape_3d(ctx0, probs, 1, n_expert, n_tokens);
}
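
To make the open TODO concrete, a rough numpy sketch of what identity zero-computation experts mean semantically, assuming the 'identity' type checked at convert time (this is not the ggml graph implementation, and the routing-weight normalization is simplified):

import numpy as np

def moe_with_zero_experts(x, expert_fns, router_logits, top_k):
    # router logits cover len(expert_fns) real experts followed by the zero experts
    n_expert = len(expert_fns)
    top = np.argsort(router_logits)[::-1][:top_k]  # top-k expert indices
    w = np.exp(router_logits[top])
    w /= w.sum()  # normalize the selected routing weights
    out = np.zeros_like(x)
    for i, wi in zip(top, w):
        # a selected zero expert contributes its weight times the input, unchanged
        out += wi * (expert_fns[i](x) if i < n_expert else x)
    return out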
1 change: 1 addition & 0 deletions src/llama-hparams.h
@@ -77,6 +77,7 @@ struct llama_hparams {
uint32_t n_expert_groups = 0;
uint32_t n_group_used = 0;
uint32_t n_group_experts = 0;
uint32_t n_zero_experts = 0;

float expert_group_scale = 0.05f;
float expert_weights_scale = 0.0f;
11 changes: 11 additions & 0 deletions src/llama-model-loader.cpp
@@ -857,6 +857,8 @@ struct ggml_tensor * llama_model_loader::create_tensor(struct ggml_context * ctx
n_created++;
}

loaded_tensor_names.insert(name);

return tensor;

}
@@ -886,11 +888,20 @@ struct ggml_tensor * llama_model_loader::create_tensor_as_view(struct ggml_conte

n_created++;

loaded_tensor_names.insert(name);

return tensor;
}

void llama_model_loader::done_getting_tensors() const {
if (n_created != n_tensors) {
// for debugging
for (const auto & it : weights_map) {
const std::string & name = it.first;
if (loaded_tensor_names.find(name) == loaded_tensor_names.end()) {
LLAMA_LOG_DEBUG("%s: tensor '%s' was not created\n", __func__, name.c_str());
}
}
throw std::runtime_error(format("%s: wrong number of tensors; expected %d, got %d", __func__, n_tensors, n_created));
}
}
3 changes: 3 additions & 0 deletions src/llama-model-loader.h
@@ -10,6 +10,7 @@

#include <cstddef>
#include <map>
#include <set>
#include <stdexcept>
#include <unordered_map>

Expand Down Expand Up @@ -94,6 +95,8 @@ struct llama_model_loader {
size_t size_data = 0;
std::vector<std::pair<size_t, size_t>> mmaps_used;

std::set<std::string> loaded_tensor_names; // for debugging

llama_model_loader(
const std::string & fname,
std::vector<std::string> & splits, // optional, only need if the split does not follow naming scheme