Merged
62 changes: 61 additions & 1 deletion convert_hf_to_gguf.py
@@ -657,7 +657,9 @@ def get_vocab_base_pre(self, tokenizer) -> str:
if chkhsh == "9b1be57e70d20d9501b2b3186e792d81181ae36ada3903c26f9fea418cf87206":
# ref: https://huggingface.co/inclusionAI/Ling-mini-base-2.0
res = "bailingmoe2"

if chkhsh == "f4f37b6c8eb9ea29b3eac6bb8c8487c5ab7885f8d8022e67edc1c68ce8403e95":
# ref: https://huggingface.co/MiniMaxAI/MiniMax-M2
res = "minimax-m2"
if res is None:
logger.warning("\n")
logger.warning("**************************************************************************************")
@@ -4122,6 +4124,63 @@ def prepare_tensors(self):
super().prepare_tensors()
self.gguf_writer.add_max_alibi_bias(self.max_alibi_bias)

@Model.register("MiniMaxM2ForCausalLM")
class MiniMaxM2Model(Model):
model_arch = gguf.MODEL_ARCH.MINIMAXM2
_experts_cache: dict[int, dict[str, Tensor]] = {}

def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.hparams["num_experts"] = self.hparams["num_local_experts"]

def set_gguf_parameters(self):
super().set_gguf_parameters()
if self.hparams["scoring_func"] == "sigmoid":
self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SIGMOID)
elif self.hparams["scoring_func"] == "softmax":
self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SOFTMAX)
else:
raise ValueError(f"Unsupported scoring_func value: {self.hparams['scoring_func']}")

self.gguf_writer.add_expert_feed_forward_length(self.find_hparam(["intermediate_size"]))
self.gguf_writer.add_rope_dimension_count(self.find_hparam(["rotary_dim"]))

def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None):
if name.endswith("e_score_correction_bias"):
name = name.replace("e_score_correction_bias", "e_score_correction.bias")

# merge expert weights
if 'experts' in name:
n_experts = self.hparams["num_experts"]
assert bid is not None

expert_cache = self._experts_cache.setdefault(bid, {})
expert_cache[name] = data_torch
expert_weights = ["w1", "w2", "w3"]

# not enough expert weights to merge
if len(expert_cache) < n_experts * len(expert_weights):
return []

tensors: list[tuple[str, Tensor]] = []
for w_name in expert_weights:
datas: list[Tensor] = []

for xid in range(n_experts):
ename = f"model.layers.{bid}.block_sparse_moe.experts.{xid}.{w_name}.weight"
datas.append(expert_cache[ename])
del expert_cache[ename]

data_torch = torch.stack(datas, dim=0)
merged_name = f"model.layers.{bid}.block_sparse_moe.experts.{w_name}.weight"
new_name = self.map_tensor_name(merged_name)
tensors.append((new_name, data_torch))

del self._experts_cache[bid]
return tensors

return super().modify_tensors(data_torch, name, bid)


@Model.register("Dots1ForCausalLM")
class Dots1Model(Qwen2MoeModel):
@@ -4150,6 +4209,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None):
return [(self.map_tensor_name(name), data_torch)]
return super().modify_tensors(data_torch, name, bid)


@Model.register("Glm4MoeForCausalLM")
class Glm4MoeModel(Model):
model_arch = gguf.MODEL_ARCH.GLM4_MOE
3 changes: 2 additions & 1 deletion convert_hf_to_gguf_update.py
@@ -100,6 +100,7 @@ class TOKENIZER_TYPE(IntEnum):
{"name": "glm4", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/zai-org/GLM-4.5-Air", "chkhsh": "9ca2dd618e8afaf09731a7cf6e2105b373ba6a1821559f258b272fe83e6eb902", },
{"name": "kimi-k2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/moonshotai/Kimi-K2-Base", "chkhsh": "81212dc7cdb7e0c1074ca62c5aeab0d43c9f52b8a737be7b12a777c953027890", },
{"name": "grok-2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/alvarobartt/grok-2-tokenizer", "chkhsh": "66b8d4e19ab16c3bfd89bce5d785fb7e0155e8648708a1f42077cb9fe002c273"},
{"name": "minimax-m2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/MiniMaxAI/MiniMax-M2", },
]


@@ -164,7 +165,7 @@ def download_model(model):
tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}", use_fast=False)
else:
tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}")
except OSError as e:
except (OSError, TypeError) as e:
logger.error(f"Error loading tokenizer for model {name}. The model may not exist or is not accessible with the provided token. Error: {e}")
continue # Skip to the next model if the tokenizer can't be loaded

24 changes: 22 additions & 2 deletions gguf-py/gguf/constants.py
@@ -248,7 +248,8 @@ class MODEL_ARCH(IntEnum):
ERNIE4_5 = auto()
ERNIE4_5_MOE = auto()
BAILINGMOE2 = auto()

MINIMAXM2 = auto()

class MODEL_TENSOR(IntEnum):
TOKEN_EMBD = auto()
TOKEN_EMBD_NORM = auto()
@@ -384,7 +385,7 @@ class MODEL_TENSOR(IntEnum):
MODEL_ARCH.ARCTIC: "arctic",
MODEL_ARCH.DEEPSEEK2: "deepseek2",
MODEL_ARCH.CHATGLM: "chatglm",
MODEL_ARCH.GLM4_MOE: "glm4moe",
MODEL_ARCH.GLM4_MOE: "glm4moe",
MODEL_ARCH.BITNET: "bitnet",
MODEL_ARCH.BITNET_25: "bitnet-25",
MODEL_ARCH.T5: "t5",
@@ -394,6 +395,7 @@ class MODEL_TENSOR(IntEnum):
MODEL_ARCH.ERNIE4_5: "ernie4_5",
MODEL_ARCH.ERNIE4_5_MOE: "ernie4_5-moe",
MODEL_ARCH.BAILINGMOE2: "bailingmoe2",
MODEL_ARCH.MINIMAXM2: "minimax-m2",
}

TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
@@ -1324,6 +1326,24 @@ class MODEL_TENSOR(IntEnum):
MODEL_TENSOR.NEXTN_SHARED_HEAD_NORM,
MODEL_TENSOR.LAYER_OUT_NORM,
],
MODEL_ARCH.MINIMAXM2: [
MODEL_TENSOR.TOKEN_EMBD,
MODEL_TENSOR.OUTPUT_NORM,
MODEL_TENSOR.OUTPUT,
MODEL_TENSOR.ATTN_NORM,
MODEL_TENSOR.ATTN_Q,
MODEL_TENSOR.ATTN_Q_NORM,
MODEL_TENSOR.ATTN_K,
MODEL_TENSOR.ATTN_K_NORM,
MODEL_TENSOR.ATTN_V,
MODEL_TENSOR.ATTN_OUT,
MODEL_TENSOR.FFN_NORM,
MODEL_TENSOR.FFN_GATE_INP,
MODEL_TENSOR.FFN_GATE_EXP,
MODEL_TENSOR.FFN_DOWN_EXP,
MODEL_TENSOR.FFN_UP_EXP,
MODEL_TENSOR.FFN_EXP_PROBS_B,
],
# TODO
}

1 change: 1 addition & 0 deletions gguf-py/gguf/tensor_mapping.py
@@ -267,6 +267,7 @@ class TensorNameMap:
"model.layers.{bid}.mlp.gate.e_score_correction", # deepseek-v3 dots1
"model.layers.{bid}.mlp.moe_statics.e_score_correction", # ernie4.5-moe
"model.layers.{bid}.mlp.gate.expert_bias", # bailingmoe2
"model.layers.{bid}.block_sparse_moe.e_score_correction", # minimax-m2
),

# Feed-forward up
1 change: 1 addition & 0 deletions src/llama-arch.cpp
@@ -66,6 +66,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
{ LLM_ARCH_HUNYUAN_MOE, "hunyuan-moe" },
{ LLM_ARCH_OPENAI_MOE, "gpt-oss" },
{ LLM_ARCH_BAILINGMOE2, "bailingmoe2" },
{ LLM_ARCH_MINIMAX_M2, "minimax-m2" },
{ LLM_ARCH_UNKNOWN, "(unknown)" },
};

1 change: 1 addition & 0 deletions src/llama-arch.h
@@ -65,6 +65,7 @@ enum llm_arch {
LLM_ARCH_HUNYUAN_MOE,
LLM_ARCH_OPENAI_MOE,
LLM_ARCH_BAILINGMOE2,
LLM_ARCH_MINIMAX_M2,
LLM_ARCH_UNKNOWN,
};

129 changes: 127 additions & 2 deletions src/llama-build-context.cpp
@@ -7847,7 +7847,6 @@ ggml_cgraph * llm_build_context::build_ernie4_5_moe() {
cur = llm_build_norm(ctx0, inpL, hparams, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, cb, il);
cb(cur, "attn_norm", il);

// self-attention
// self-attention
{
// Q, K, V projections
@@ -7899,7 +7898,6 @@ ggml_cgraph * llm_build_context::build_ernie4_5_moe() {
}

if (il == n_layer - 1 && inp_out_ids) {
ggml_tensor * inp_out_ids = build_inp_out_ids();
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
}
@@ -8366,6 +8364,129 @@ ggml_cgraph * llm_build_context::build_bailingmoe2() {
return gf;
}

ggml_cgraph* llm_build_context::build_minimaxm2() {
ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
const int64_t n_embd_head = hparams.n_embd_head_v;
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
// GGML_ASSERT(n_embd_head == hparams.n_rot); this is wrong in case of minimax, head_dim = 128, n_rot = 64

ggml_tensor * cur;
ggml_tensor * inpL;

inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);

ggml_tensor * inp_pos = build_inp_pos();


//auto * inp_attn = build_attn_inp_kv();
ggml_tensor * inp_out_ids = build_inp_out_ids();
ggml_tensor * KQ_mask = build_inp_KQ_mask();

for (int il = 0; il < n_layer; ++il) {
ggml_tensor* inpSA = inpL;

cur = inpL;

// self_attention
{
cur = llm_build_norm(ctx0, inpL, hparams, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, cb, il);
cb(cur, "attn_norm", il);

// Q, K, V projections
ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
Review thread on this line:

Owner:
The code from here to line 8416 can now be replaced with

            auto [Qcur, Kcur, Vcur] = llm_build_mul_mat_qkv(gf, cur, 
                    model.layers[il].wqkv, nullptr, model.layers[il].wqk, nullptr,
                    model.layers[il].wq, nullptr, model.layers[il].wk, nullptr, model.layers[il].wv, nullptr,
                    model.layers[il].attn_q_norm, model.layers[il].attn_k_norm, 0, il); 

In that way we don't need to worry about how Q, K, V should be reshaped, we automatically get Q/K/V fusion for this model if the user requests it via -mqkv, and we use a standardized way to compute attention (when attention is standard, as it is here).

It would be useful to do that so it can get tested as part of this PR.

firecoperana (Collaborator, Author) replied on Nov 6, 2025:
I get the error ik_llama.cpp\ggml\src\ggml.c:6358: GGML_ASSERT(ggml_can_repeat(b, a)) failed when loading the model with the CPU-only backend after incorporating both changes. I think the buffer of a being null is what caused the crash.
Call Stack:

 	ggml.dll!ggml_abort(const char * file, int line, const char * fmt, ...) Line 272	C
 	ggml.dll!ggml_mul_impl(ggml_context * ctx, ggml_tensor * a, ggml_tensor * b, bool inplace) Line 6360	C
>	ggml.dll!ggml_fused_rms_norm_impl(ggml_context * ctx, ggml_tensor * a, ggml_tensor * b, float eps, bool inplace) Line 7277	C
 	ggml.dll!ggml_fused_rms_norm(ggml_context * ctx, ggml_tensor * a, ggml_tensor * b, float eps) Line 7305	C
 	llama.dll!llm_build_context::llm_build_norm(ggml_context * ctx, ggml_tensor * cur, const llama_hparams & hparams, ggml_tensor * mw, ggml_tensor * mb, llm_norm_type type, const std::function<void __cdecl(ggml_tensor *,char const *,int)> & cb, int il, float scale_eps) Line 582	C++
 	llama.dll!llm_build_context::llm_build_mul_mat_qkv(ggml_cgraph * gf, ggml_tensor * cur, ggml_tensor * wqkv, ggml_tensor * bqkv, ggml_tensor * wqk, ggml_tensor * bqk, ggml_tensor * wq, ggml_tensor * bq, ggml_tensor * wk, ggml_tensor * bk, ggml_tensor * wv, ggml_tensor * bv, ggml_tensor * q_norm, ggml_tensor * k_norm, float attention_scale, int il) Line 1348	C++
 	llama.dll!llm_build_context::build_minimaxm2() Line 8418	C++

Owner replied:
Oh, I see. The attn_q_norm and attn_k_norm tensors are not just 1d with the size of one attention head, but rather head_size x head_count, so the usual trick of fusing the RMS_NORM does not work. Sorry. Ignore the two comments then.
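For readers following the thread, a minimal numpy sketch of the shape issue described above; this is not part of the PR. head_dim = 128 comes from the diff, the head and token counts are arbitrary example values, and numpy broadcasting stands in for the ggml_can_repeat(b, a) check.

import numpy as np

head_dim, n_head, n_tokens = 128, 8, 4

def rms_norm(x, weight, eps=1e-6):
    # normalize over the trailing axis, then scale elementwise by `weight`
    # (weight must broadcast against the trailing dims of x)
    return x / np.sqrt(np.mean(x * x, axis=-1, keepdims=True) + eps) * weight

q = np.random.randn(n_tokens, n_head * head_dim).astype(np.float32)

# Roughly what the graph above does: the q/k norm weight spans the whole projection
# (head_size x head_count values), so it is applied BEFORE the per-head reshape.
w_full = np.random.randn(n_head * head_dim).astype(np.float32)
q_normed = rms_norm(q, w_full).reshape(n_tokens, n_head, head_dim)        # ok

# What a per-head fused path expects: a 1d weight of head_size values,
# which broadcasts fine AFTER the reshape.
w_per_head = np.random.randn(head_dim).astype(np.float32)
q_normed2 = rms_norm(q.reshape(n_tokens, n_head, head_dim), w_per_head)   # ok

# The failing combination from the crash report: a full-size weight applied to the
# per-head layout cannot broadcast/repeat, the numpy analogue of the failed assert.
try:
    rms_norm(q.reshape(n_tokens, n_head, head_dim), w_full)
except ValueError as err:
    print("broadcast fails as expected:", err)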

cb(Qcur, "Qcur", il);

ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
cb(Kcur, "Kcur", il);

ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
cb(Vcur, "Vcur", il);

Qcur = llm_build_norm(ctx0, Qcur, hparams, model.layers[il].attn_q_norm, NULL,
LLM_NORM_RMS, cb, il);
cb(Qcur, "Qcur_normed", il);

Kcur = llm_build_norm(ctx0, Kcur, hparams, model.layers[il].attn_k_norm, NULL,
LLM_NORM_RMS, cb, il);
cb(Kcur, "Kcur_normed", il);

// reshape for multi-head
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
// Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);


// apply RoPE
Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, nullptr,
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow);
Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, nullptr,
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow);
cb(Qcur, "Qcur", il);
cb(Kcur, "Kcur", il);
cb(Vcur, "Vcur", il);

cur = llm_build_kv(ctx0, lctx, kv_self, gf,
model.layers[il].wo, NULL,
Kcur, Vcur, Qcur, KQ_mask,
n_tokens, kv_head, n_kv,
1.0f / sqrtf(float(n_embd_head)), cb, il);
}

if (il == n_layer - 1 && inp_out_ids) {
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
}

ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
cb(ffn_inp, "ffn_inp", il);

// MoE branch
cur = llm_build_norm(ctx0, ffn_inp, hparams,
model.layers[il].ffn_norm, NULL,
LLM_NORM_RMS,cb, il);
cb(cur, "ffn_norm", il);

cur = llm_build_moe_ffn(ctx0, lctx, cur,
model.layers[il].ffn_gate_inp,
model.layers[il].ffn_up_exps,
model.layers[il].ffn_gate_exps,
model.layers[il].ffn_down_exps,
model.layers[il].ffn_exp_probs_b,
n_expert, n_expert_used,
LLM_FFN_SILU, true,
false, 0,
(llm_expert_gating_func_type)hparams.expert_gating_func,
cb, il, gf);
cb(cur, "ffn_moe_out", il);

cur = ggml_add(ctx0, cur, ffn_inp);

cur = lctx.cvec.apply_to(ctx0, cur, il);
cb(cur, "l_out", il);

// input for next layer
inpL = cur;
}

cur = inpL;

cur = llm_build_norm(ctx0, cur,
hparams, model.output_norm, NULL,
LLM_NORM_RMS, cb, -1);

cb(cur, "result_norm", -1);

// lm_head
cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);

cb(cur, "result_output", -1);

ggml_build_forward_expand(gf, cur);
return gf;
}
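Aside, not part of the diff: the commented-out assert near the top of build_minimaxm2() notes that only n_rot = 64 of the 128 head dimensions are rotated. A rough numpy sketch of such a partial rotation, assuming NeoX-style pairing, with the position, base, and head values chosen only for illustration (ggml_rope_ext handles the real tensor layout):

import numpy as np

head_dim, n_rot = 128, 64          # from the comment in build_minimaxm2()
pos, base = 7, 10000.0             # example position and RoPE base
x = np.random.randn(head_dim).astype(np.float32)   # one head of Q or K for one token

half = n_rot // 2
theta = pos * base ** (-2.0 * np.arange(half) / n_rot)   # angles for the rotated pairs
x0, x1 = x[:half], x[half:n_rot]                          # NeoX pairing within the first n_rot dims
rotated = np.concatenate([
    x0 * np.cos(theta) - x1 * np.sin(theta),
    x0 * np.sin(theta) + x1 * np.cos(theta),
    x[n_rot:],                                            # remaining head_dim - n_rot dims pass through unrotated
])
assert rotated.shape == (head_dim,)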

ggml_cgraph * llm_build_context::llama_build_graph_defrag(llama_context & lctx, const std::vector<uint32_t> & ids) {
llama_batch dummy;
dummy.n_tokens = 0;
@@ -8712,6 +8833,10 @@ ggml_cgraph * llm_build_context::llama_build_graph(
{
result = llm.build_bailingmoe2();
} break;
case LLM_ARCH_MINIMAX_M2:
{
result = llm.build_minimaxm2();
} break;
default:
GGML_ABORT("fatal error");
}
2 changes: 2 additions & 0 deletions src/llama-build-context.h
@@ -268,6 +268,8 @@ struct llm_build_context {

ggml_cgraph * build_bailingmoe2();

ggml_cgraph * build_minimaxm2();

//
static ggml_tensor * llm_build_lora_mm(llama_context & lctx, ggml_context * ctx0,
ggml_tensor * w, ggml_tensor * cur);
11 changes: 11 additions & 0 deletions src/llama-hparams.cpp
@@ -1012,6 +1012,17 @@ void llm_load_hparams(
// TODO: switch (hparams.n_layer)

} break;
case LLM_ARCH_MINIMAX_M2:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func, false);

switch (hparams.n_layer) {
case 62: model.type = e_model::MODEL_230B_A10B; break;
default: model.type = e_model::MODEL_UNKNOWN;
}
} break;
default: (void)0;
}
