58 changes: 58 additions & 0 deletions convert_hf_to_gguf.py
@@ -1257,6 +1257,9 @@ def get_vocab_base_pre(self, tokenizer) -> str:
if chkhsh == "6c81ce329e0802883b22eabab0d3fa48357337ef1ecb45443828bf1f6254833f":
# ref: https://huggingface.co/LGAI-EXAONE/K-EXAONE-236B-A23B
res = "exaone-moe"
if chkhsh == "27d87c17bcffe5262a1e80b2ceb9a5e002c4f8a17d796fd5afac9180dd8bd96e":
# ref: https://huggingface.co/meituan-longcat/LongCat-Flash-Chat
res = "longcat-flash"

if res is None:
logger.warning("\n")
@@ -10914,6 +10917,61 @@ def set_vocab(self):
special_vocab.add_to_gguf(self.gguf_writer)


@ModelBase.register("LongcatFlashForCausalLM")
class LongcatFlashModel(DeepseekV2Model):
model_arch = gguf.MODEL_ARCH.LONGCAT_FLASH

def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
# the model uses two sub-blocks per layer, so double the block count
self.block_count = self.hparams["num_layers"] * 2
self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count)
# map hparam names for compatibility with the deepseek2 base class
self.hparams["num_hidden_layers"] = self.block_count
self.hparams["num_key_value_heads"] = self.hparams["num_attention_heads"]
self.hparams["intermediate_size"] = self.hparams["ffn_hidden_size"]
self.hparams["moe_intermediate_size"] = self.hparams["expert_ffn_hidden_size"]
self.hparams["num_experts_per_tok"] = self.hparams["moe_topk"]

def set_gguf_parameters(self):
super().set_gguf_parameters()

zero_expert_num = self.hparams["zero_expert_num"]
zero_expert_type = self.hparams["zero_expert_type"]
assert zero_expert_type == "identity", "cpp implementation only supports 'identity' type"
self.gguf_writer.add_n_zero_experts(zero_expert_num)

def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
if bid is not None:
bid = bid * 2 # double block id

# Rename rules (each HF layer N maps to GGUF blocks 2N and 2N+1), examples:
# model.layers.1.input_layernorm.0.weight --> model.layers.2.input_layernorm.weight
# model.layers.1.input_layernorm.1.weight --> model.layers.3.input_layernorm.weight
# model.layers.1.mlp.experts.0 --> model.layers.2.mlp.experts.0 (special case: experts keep their sub-index)

name = name.replace('.mlps.', '.mlp.')
name = name.replace('.router.classifier.', '.gate.')
name = name.replace('.router.e_score_correction_bias', '.e_score_correction_bias')

# handle sub-block remapping
match = re.match(r'.*\.(\d+)\.([a-z_\.]+)\.(\d+)\..*', name)
if match and ".mlp.experts." not in name:
# convert block id from N.(name).M to (2N+M).(name)
N = int(match.group(1))
middle = match.group(2)
M = int(match.group(3))
assert N * 2 == bid
new_bid = N * 2 + M
new_name = re.sub(r'\.(\d+)\.([a-z_\.]+)\.(\d+)\.', f'.{new_bid}.{middle}.', name)
yield from super().modify_tensors(data_torch, new_name, new_bid)
else:
# rewrite the block id inside the name (experts tensors keep their sub-index)
if bid is not None:
name = name.replace(f'.{bid // 2}.', f'.{bid}.', 1)
yield from super().modify_tensors(data_torch, name, bid)
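
For reference, a minimal standalone sketch of the renaming rule above (the tensor names and the remap_longcat_name helper are illustrative only, not part of the converter; the real conversion still goes through super().modify_tensors()):

import re

def remap_longcat_name(name: str, bid: int) -> tuple[str, int]:
    # bid arrives already doubled (HF layer N -> GGUF block 2N)
    match = re.match(r'.*\.(\d+)\.([a-z_\.]+)\.(\d+)\..*', name)
    if match and ".mlp.experts." not in name:
        n, middle, m = int(match.group(1)), match.group(2), int(match.group(3))
        new_bid = n * 2 + m  # sub-block M of HF layer N lands in GGUF block 2N+M
        return re.sub(r'\.(\d+)\.([a-z_\.]+)\.(\d+)\.', f'.{new_bid}.{middle}.', name), new_bid
    # experts keep their sub-index; only the block id is rewritten
    return name.replace(f'.{bid // 2}.', f'.{bid}.', 1), bid

assert remap_longcat_name("model.layers.1.input_layernorm.0.weight", 2)[0] == "model.layers.2.input_layernorm.weight"
assert remap_longcat_name("model.layers.1.input_layernorm.1.weight", 2)[0] == "model.layers.3.input_layernorm.weight"
assert remap_longcat_name("model.layers.1.mlp.experts.0.gate_proj.weight", 2)[0] == "model.layers.2.mlp.experts.0.gate_proj.weight"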


###### CONVERSION LOGIC ######


1 change: 1 addition & 0 deletions convert_hf_to_gguf_update.py
@@ -148,6 +148,7 @@ class TOKENIZER_TYPE(IntEnum):
{"name": "youtu", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tencent/Youtu-LLM-2B", },
{"name": "solar-open", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/upstage/Solar-Open-100B", },
{"name": "exaone-moe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LGAI-EXAONE/K-EXAONE-236B-A23B", },
{"name": "longcat-flash", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/meituan-longcat/LongCat-Flash-Chat", },
]

# some models are known to be broken upstream, so we will skip them as exceptions
33 changes: 33 additions & 0 deletions gguf-py/gguf/constants.py
@@ -148,6 +148,7 @@ class LLM:
EMBD_LENGTH_PER_LAYER_INP = "{arch}.embedding_length_per_layer_input"
DENSE_FEAT_IN_SIZE = "{arch}.{dense}_feat_in"
DENSE_FEAT_OUT_SIZE = "{arch}.{dense}_feat_out"
N_ZERO_EXPERTS = "{arch}.n_zero_experts" # longcat-flash

class Attention:
HEAD_COUNT = "{arch}.attention.head_count"
@@ -459,6 +460,7 @@ class MODEL_ARCH(IntEnum):
MIMO2 = auto()
LLAMA_EMBED = auto()
MAINCODER = auto()
LONGCAT_FLASH = auto()


class VISION_PROJECTOR_TYPE(IntEnum):
@@ -880,6 +882,7 @@ class MODEL_TENSOR(IntEnum):
MODEL_ARCH.MIMO2: "mimo2",
MODEL_ARCH.LLAMA_EMBED: "llama-embed",
MODEL_ARCH.MAINCODER: "maincoder",
MODEL_ARCH.LONGCAT_FLASH: "longcat-flash",
}

VISION_PROJECTOR_TYPE_NAMES: dict[VISION_PROJECTOR_TYPE, str] = {
@@ -3377,6 +3380,36 @@ class MODEL_TENSOR(IntEnum):
MODEL_TENSOR.FFN_DOWN,
MODEL_TENSOR.FFN_UP,
],
MODEL_ARCH.LONGCAT_FLASH: [
MODEL_TENSOR.TOKEN_EMBD,
MODEL_TENSOR.OUTPUT_NORM,
MODEL_TENSOR.OUTPUT,
MODEL_TENSOR.ROPE_FREQS,
MODEL_TENSOR.ATTN_NORM,
MODEL_TENSOR.ATTN_Q,
MODEL_TENSOR.ATTN_Q_A,
MODEL_TENSOR.ATTN_Q_B,
MODEL_TENSOR.ATTN_KV_A_MQA,
MODEL_TENSOR.ATTN_KV_B,
MODEL_TENSOR.ATTN_K_B,
MODEL_TENSOR.ATTN_V_B,
MODEL_TENSOR.ATTN_Q_A_NORM,
MODEL_TENSOR.ATTN_KV_A_NORM,
MODEL_TENSOR.ATTN_OUT,
MODEL_TENSOR.ATTN_ROT_EMBD,
MODEL_TENSOR.FFN_GATE_INP,
MODEL_TENSOR.FFN_NORM,
MODEL_TENSOR.FFN_GATE,
MODEL_TENSOR.FFN_DOWN,
MODEL_TENSOR.FFN_UP,
MODEL_TENSOR.FFN_GATE_EXP,
MODEL_TENSOR.FFN_DOWN_EXP,
MODEL_TENSOR.FFN_UP_EXP,
MODEL_TENSOR.FFN_GATE_SHEXP,
MODEL_TENSOR.FFN_DOWN_SHEXP,
MODEL_TENSOR.FFN_UP_SHEXP,
MODEL_TENSOR.FFN_EXP_PROBS_B,
],
# TODO
}

3 changes: 3 additions & 0 deletions gguf-py/gguf/gguf_writer.py
@@ -1075,6 +1075,9 @@ def add_eom_token_id(self, id: int) -> None:
def add_classifier_output_labels(self, labels: Sequence[str]) -> None:
self.add_array(Keys.Classifier.OUTPUT_LABELS.format(arch=self.arch), labels)

def add_n_zero_experts(self, n: int) -> None:
self.add_uint32(Keys.LLM.N_ZERO_EXPERTS.format(arch=self.arch), n)
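
A hedged usage sketch of the new writer method (the file name and the value 256 are placeholders, not LongCat-Flash's actual config):

from gguf import GGUFWriter

w = GGUFWriter("model.gguf", "longcat-flash")
w.add_n_zero_experts(256)  # stored as uint32 under 'longcat-flash.n_zero_experts'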

# for vision models

def add_clip_has_vision_encoder(self, value: bool) -> None:
1 change: 1 addition & 0 deletions src/CMakeLists.txt
@@ -89,6 +89,7 @@ add_library(llama
models/llada.cpp
models/llama-iswa.cpp
models/llama.cpp
models/longcat-flash.cpp
models/maincoder.cpp
models/mamba.cpp
models/mimo2-iswa.cpp
3 changes: 3 additions & 0 deletions src/llama-arch.cpp
@@ -120,6 +120,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
{ LLM_ARCH_MIMO2, "mimo2" },
{ LLM_ARCH_LLAMA_EMBED, "llama-embed" },
{ LLM_ARCH_MAINCODER, "maincoder" },
{ LLM_ARCH_LONGCAT_FLASH, "longcat-flash" },
{ LLM_ARCH_UNKNOWN, "(unknown)" },
};

@@ -191,6 +192,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
{ LLM_KV_EMBEDDING_SCALE, "%s.embedding_scale" },
{ LLM_KV_TOKEN_SHIFT_COUNT, "%s.token_shift_count" },
{ LLM_KV_INTERLEAVE_MOE_LAYER_STEP, "%s.interleave_moe_layer_step" },
{ LLM_KV_N_ZERO_EXPERTS, "%s.n_zero_experts" },

{ LLM_KV_ATTENTION_HEAD_COUNT, "%s.attention.head_count" },
{ LLM_KV_ATTENTION_HEAD_COUNT_KV, "%s.attention.head_count_kv" },
@@ -1475,6 +1477,7 @@ static std::set<llm_tensor> llm_get_tensor_names(llm_arch arch) {
LLM_TENSOR_FFN_UP_SHEXP,
};
case LLM_ARCH_DEEPSEEK2:
case LLM_ARCH_LONGCAT_FLASH:
return {
LLM_TENSOR_TOKEN_EMBD,
LLM_TENSOR_OUTPUT_NORM,
2 changes: 2 additions & 0 deletions src/llama-arch.h
@@ -124,6 +124,7 @@ enum llm_arch {
LLM_ARCH_MIMO2,
LLM_ARCH_LLAMA_EMBED,
LLM_ARCH_MAINCODER,
LLM_ARCH_LONGCAT_FLASH,
LLM_ARCH_UNKNOWN,
};

@@ -195,6 +196,7 @@ enum llm_kv {
LLM_KV_EMBEDDING_SCALE,
LLM_KV_TOKEN_SHIFT_COUNT,
LLM_KV_INTERLEAVE_MOE_LAYER_STEP,
LLM_KV_N_ZERO_EXPERTS,

LLM_KV_ATTENTION_HEAD_COUNT,
LLM_KV_ATTENTION_HEAD_COUNT_KV,
13 changes: 11 additions & 2 deletions src/llama-graph.cpp
@@ -1114,6 +1114,9 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
const int64_t n_tokens = cur->ne[1];
const bool weight_before_ffn = arch == LLM_ARCH_LLAMA4; // for llama4, we apply the sigmoid-ed weights before the FFN

// longcat-flash routes over additional zero-computation experts, so the router produces n_expert + n_zero_experts logits
const int64_t n_probs = n_expert + hparams.n_zero_experts;

ggml_tensor * logits = nullptr;

if (probs_in == nullptr) {
@@ -1169,7 +1172,7 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
// select top n_group_used expert groups
// https://huggingface.co/deepseek-ai/DeepSeek-V3/blob/e815299b0bcbac849fa540c768ef21845365c9eb/modeling_deepseek.py#L440-L457
if (hparams.n_expert_groups > 1 && n_tokens > 0) {
const int64_t n_exp_per_group = n_expert / hparams.n_expert_groups;
const int64_t n_exp_per_group = n_probs / hparams.n_expert_groups;

// organize experts into n_expert_groups
ggml_tensor * selection_groups = ggml_reshape_3d(ctx0, selection_probs, n_exp_per_group, hparams.n_expert_groups, n_tokens); // [n_exp_per_group, n_expert_groups, n_tokens]
@@ -1187,7 +1190,7 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
// mask out the other groups
selection_probs = ggml_get_rows(ctx0, selection_groups, expert_groups); // [n_exp_per_group, n_group_used, n_tokens]
selection_probs = ggml_set_rows(ctx0, ggml_fill(ctx0, selection_groups, -INFINITY), selection_probs, expert_groups); // [n_exp_per_group, n_expert_groups, n_tokens]
selection_probs = ggml_reshape_2d(ctx0, selection_probs, n_expert, n_tokens); // [n_expert, n_tokens]
selection_probs = ggml_reshape_2d(ctx0, selection_probs, n_probs, n_tokens); // [n_probs, n_tokens]
cb(selection_probs, "ffn_moe_probs_masked", il);
}

@@ -1201,6 +1204,12 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
ggml_tensor * f_sel = ggml_cast(ctx0, selected_experts, GGML_TYPE_F32);
selected_experts = ggml_cast(ctx0, ggml_scale(ctx0, f_sel, 1.0f / float(hparams.n_group_experts)), GGML_TYPE_I32);
probs = ggml_reshape_3d(ctx0, probs, 1, hparams.n_expert, n_tokens);

} else if (arch == LLM_ARCH_LONGCAT_FLASH && hparams.n_zero_experts > 0) {
// TODO (hard): how to implement zero-computation experts here?
probs = ggml_reshape_3d(ctx0, probs, 1, n_probs, n_tokens);

} else {
probs = ggml_reshape_3d(ctx0, probs, 1, n_expert, n_tokens);
}
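
To make the open TODO concrete, a rough numpy sketch of what identity zero-computation experts mean semantically, assuming the 'identity' type checked at convert time (this is not the ggml graph implementation, and the routing-weight normalization is simplified):

import numpy as np

def moe_with_zero_experts(x, expert_fns, router_logits, top_k):
    # router logits cover len(expert_fns) real experts followed by the zero experts
    n_expert = len(expert_fns)
    top = np.argsort(router_logits)[::-1][:top_k]  # top-k expert indices
    w = np.exp(router_logits[top])
    w /= w.sum()  # normalize the selected routing weights
    out = np.zeros_like(x)
    for i, wi in zip(top, w):
        # a selected zero expert contributes its weight times the input, unchanged
        out += wi * (expert_fns[i](x) if i < n_expert else x)
    return out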
1 change: 1 addition & 0 deletions src/llama-hparams.h
@@ -77,6 +77,7 @@ struct llama_hparams {
uint32_t n_expert_groups = 0;
uint32_t n_group_used = 0;
uint32_t n_group_experts = 0;
uint32_t n_zero_experts = 0;

float expert_group_scale = 0.05f;
float expert_weights_scale = 0.0f;
11 changes: 11 additions & 0 deletions src/llama-model-loader.cpp
@@ -857,6 +857,8 @@ struct ggml_tensor * llama_model_loader::create_tensor(struct ggml_context * ctx
n_created++;
}

loaded_tensor_names.insert(name);

return tensor;

}
@@ -886,11 +888,20 @@ struct ggml_tensor * llama_model_loader::create_tensor_as_view(struct ggml_conte

n_created++;

loaded_tensor_names.insert(name);

return tensor;
}

void llama_model_loader::done_getting_tensors() const {
if (n_created != n_tensors) {
// for debugging
for (const auto & it : weights_map) {
const std::string & name = it.first;
if (loaded_tensor_names.find(name) == loaded_tensor_names.end()) {
LLAMA_LOG_DEBUG("%s: tensor '%s' was not created\n", __func__, name.c_str());
}
}
throw std::runtime_error(format("%s: wrong number of tensors; expected %d, got %d", __func__, n_tensors, n_created));
}
}
3 changes: 3 additions & 0 deletions src/llama-model-loader.h
@@ -10,6 +10,7 @@

#include <cstddef>
#include <map>
#include <set>
#include <stdexcept>
#include <unordered_map>

Expand Down Expand Up @@ -94,6 +95,8 @@ struct llama_model_loader {
size_t size_data = 0;
std::vector<std::pair<size_t, size_t>> mmaps_used;

std::set<std::string> loaded_tensor_names; // for debugging

llama_model_loader(
const std::string & fname,
std::vector<std::string> & splits, // optional, only need if the split does not follow naming scheme