Skip to content

Commit a397e03

Browse files
stduhpf and leejet authored
feat: add Longcat-Image / Longcat-Image-Edit support (#1053)
Co-authored-by: leejet <leejet714@gmail.com>
1 parent 72e512a commit a397e03

15 files changed

Lines changed: 361 additions & 43 deletions

README.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,7 @@ API and command-line option may change frequently.***
4040
- [Chroma](./docs/chroma.md)
4141
- [Chroma1-Radiance](./docs/chroma_radiance.md)
4242
- [Qwen Image](./docs/qwen_image.md)
43+
- [LongCat Image](./docs/longcat_image.md)
4344
- [Z-Image](./docs/z_image.md)
4445
- [Ovis-Image](./docs/ovis_image.md)
4546
- [Anima](./docs/anima.md)
@@ -48,6 +49,7 @@ API and command-line option may change frequently.***
4849
- Image Edit Models
4950
- [FLUX.1-Kontext-dev](./docs/kontext.md)
5051
- [Qwen Image Edit series](./docs/qwen_image_edit.md)
52+
- [LongCat Image Edit](./docs/longcat_image.md)
5153
- Video Models
5254
- [Wan2.1/Wan2.2](./docs/wan.md)
5355
- [LTX-2.3](./docs/ltx2.md)
@@ -133,6 +135,7 @@ For runtime and parameter backend placement, see the [backend selection guide](.
133135
- [Chroma](./docs/chroma.md)
134136
- [🔥Qwen Image](./docs/qwen_image.md)
135137
- [🔥Qwen Image Edit series](./docs/qwen_image_edit.md)
138+
- [🔥LongCat Image / LongCat Image Edit](./docs/longcat_image.md)
136139
- [🔥Wan2.1/Wan2.2](./docs/wan.md)
137140
- [🔥LTX-2.3](./docs/ltx2.md)
138141
- [🔥Z-Image](./docs/z_image.md)

assets/longcat/example.png

423 KB
Loading

docs/longcat_image.md

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
# How to Use
2+
3+
LongCat-Image uses a LongCat diffusion transformer, the FLUX VAE, and Qwen2.5-VL as the LLM text encoder.
4+
5+
## Download weights
6+
7+
- Download LongCat Image
8+
- safetensors: https://huggingface.co/Comfy-Org/LongCat-Image/tree/main/split_files/diffusion_models
9+
- gguf: https://huggingface.co/vantagewithai/LongCat-Image-GGUF/tree/main/comfy
10+
- Download LongCat Image Edit
11+
- LongCat Image Edit Turbo: https://huggingface.co/meituan-longcat/LongCat-Image-Edit-Turbo
12+
- gguf: https://huggingface.co/vantagewithai/LongCat-Image-Edit-GGUF/tree/main
13+
- Download vae
14+
- safetensors: https://huggingface.co/black-forest-labs/FLUX.1-dev/blob/main/ae.safetensors
15+
- Download qwen_2.5_vl 7b
16+
- safetensors: https://huggingface.co/Comfy-Org/Qwen-Image_ComfyUI/tree/main/split_files/text_encoders
17+
- gguf: https://huggingface.co/mradermacher/Qwen2.5-VL-7B-Instruct-GGUF/tree/main
18+
- For image editing with GGUF text encoders, also download the matching mmproj file and pass it with `--llm_vision`.
19+
20+
## Run
21+
22+
LongCat uses quoted text for character-level text rendering. Put target text inside single quotes, double quotes, or Chinese quotes.
23+
24+
### LongCat Image
25+
26+
```
27+
.\bin\Release\sd-cli.exe --diffusion-model ..\..\ComfyUI\models\diffusion_models\LongCat-Image-Q4_K_M.gguf --vae ..\..\ComfyUI\models\vae\ae.sft --llm ..\..\ComfyUI\models\text_encoders\Qwen2.5-VL-7B-Instruct-Q8_0.gguf -p "a lovely cat holding a sign says 'longcat.cpp'" --cfg-scale 5.0 --sampling-method euler --flow-shift 3 -v --offload-to-cpu --diffusion-fa
28+
```
29+
30+
<img alt="longcat example" src="../assets/longcat/example.png" />

src/anima.hpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -598,7 +598,8 @@ namespace Anima {
598598
{},
599599
empty_ref_latents,
600600
false,
601-
1.0f);
601+
1.0f,
602+
false);
602603

603604
std::vector<float> axis_thetas = {
604605
static_cast<float>(theta) * calc_ntk_factor(t_extrapolation_ratio, axes_dim[0]),

src/auto_encoder_kl.hpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -680,7 +680,7 @@ struct AutoEncoderKL : public VAE {
680680
} else if (sd_version_is_sd3(version)) {
681681
scale_factor = 1.5305f;
682682
shift_factor = 0.0609f;
683-
} else if (sd_version_is_flux(version) || sd_version_is_z_image(version)) {
683+
} else if (sd_version_is_flux(version) || sd_version_is_z_image(version) || sd_version_is_longcat(version)) {
684684
scale_factor = 0.3611f;
685685
shift_factor = 0.1159f;
686686
} else if (sd_version_uses_flux2_vae(version)) {

src/conditioner.hpp

Lines changed: 79 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1747,14 +1747,18 @@ struct LLMEmbedder : public Conditioner {
17471747
std::tuple<std::vector<int>, std::vector<float>, std::vector<float>> tokenize(std::string text,
17481748
const std::pair<int, int>& attn_range,
17491749
size_t min_length = 0,
1750-
size_t max_length = 100000000) {
1750+
size_t max_length = 100000000,
1751+
bool spell_quotes = false) {
17511752
std::vector<std::pair<std::string, float>> parsed_attention;
17521753
if (attn_range.first >= 0 && attn_range.second > 0) {
17531754
if (attn_range.first > 0) {
17541755
parsed_attention.emplace_back(text.substr(0, attn_range.first), 1.f);
17551756
}
17561757
if (attn_range.second - attn_range.first > 0) {
17571758
auto new_parsed_attention = parse_prompt_attention(text.substr(attn_range.first, attn_range.second - attn_range.first));
1759+
if (spell_quotes) {
1760+
new_parsed_attention = split_quotation_attention(new_parsed_attention);
1761+
}
17581762
parsed_attention.insert(parsed_attention.end(),
17591763
new_parsed_attention.begin(),
17601764
new_parsed_attention.end());
@@ -1804,8 +1808,10 @@ struct LLMEmbedder : public Conditioner {
18041808
int hidden_states_min_length,
18051809
const std::vector<std::pair<int, sd::Tensor<float>>>& image_embeds,
18061810
const std::set<int>& out_layers,
1807-
int prompt_template_encode_start_idx) {
1808-
auto tokens_weights_mask = tokenize(prompt, prompt_attn_range, min_length);
1811+
int prompt_template_encode_start_idx,
1812+
bool spell_quotes = false,
1813+
int max_length = 100000000) {
1814+
auto tokens_weights_mask = tokenize(prompt, prompt_attn_range, min_length, max_length, spell_quotes);
18091815
auto& tokens = std::get<0>(tokens_weights_mask);
18101816
auto& weights = std::get<1>(tokens_weights_mask);
18111817
auto& mask = std::get<2>(tokens_weights_mask);
@@ -1866,6 +1872,7 @@ struct LLMEmbedder : public Conditioner {
18661872
int prompt_template_encode_start_idx = 34;
18671873
int min_length = 0; // pad tokens
18681874
int hidden_states_min_length = 0; // zero pad hidden_states
1875+
bool spell_quotes = false;
18691876
std::set<int> out_layers;
18701877

18711878
int64_t t0 = ggml_time_ms();
@@ -1938,6 +1945,71 @@ struct LLMEmbedder : public Conditioner {
19381945

19391946
prompt += "<|im_end|>\n<|im_start|>assistant\n";
19401947
}
1948+
} else if (sd_version_is_longcat(version)) {
1949+
spell_quotes = true;
1950+
1951+
if (llm->enable_vision && conditioner_params.ref_images != nullptr && !conditioner_params.ref_images->empty()) {
1952+
LOG_INFO("LongCatEditPipeline");
1953+
prompt_template_encode_start_idx = 67;
1954+
min_length = 512 + prompt_template_encode_start_idx;
1955+
int image_embed_idx = 36 + 6;
1956+
1957+
int min_pixels = 384 * 384;
1958+
int max_pixels = 560 * 560;
1959+
std::string placeholder = "<|image_pad|>";
1960+
std::string img_prompt;
1961+
1962+
for (int i = 0; i < conditioner_params.ref_images->size(); i++) {
1963+
const auto& image = (*conditioner_params.ref_images)[i];
1964+
double factor = llm->params.vision.patch_size * llm->params.vision.spatial_merge_size;
1965+
int height = static_cast<int>(image.shape()[1]);
1966+
int width = static_cast<int>(image.shape()[0]);
1967+
int h_bar = static_cast<int>(std::round(height / factor) * factor);
1968+
int w_bar = static_cast<int>(std::round(width / factor) * factor);
1969+
1970+
if (static_cast<double>(h_bar) * w_bar > max_pixels) {
1971+
double beta = std::sqrt((height * width) / static_cast<double>(max_pixels));
1972+
h_bar = std::max(static_cast<int>(factor),
1973+
static_cast<int>(std::floor(height / beta / factor)) * static_cast<int>(factor));
1974+
w_bar = std::max(static_cast<int>(factor),
1975+
static_cast<int>(std::floor(width / beta / factor)) * static_cast<int>(factor));
1976+
} else if (static_cast<double>(h_bar) * w_bar < min_pixels) {
1977+
double beta = std::sqrt(static_cast<double>(min_pixels) / (height * width));
1978+
h_bar = static_cast<int>(std::ceil(height * beta / factor)) * static_cast<int>(factor);
1979+
w_bar = static_cast<int>(std::ceil(width * beta / factor)) * static_cast<int>(factor);
1980+
}
1981+
1982+
LOG_DEBUG("resize conditioner ref image %d from %dx%d to %dx%d", i, height, width, h_bar, w_bar);
1983+
1984+
auto resized_image = clip_preprocess(image, w_bar, h_bar);
1985+
auto image_embed = llm->encode_image(n_threads, resized_image);
1986+
GGML_ASSERT(!image_embed.empty());
1987+
image_embeds.emplace_back(image_embed_idx, image_embed);
1988+
image_embed_idx += 1 + static_cast<int>(image_embed.shape()[1]) + 6;
1989+
1990+
img_prompt += "<|vision_start|>";
1991+
int64_t num_image_tokens = image_embed.shape()[1];
1992+
img_prompt.reserve(num_image_tokens * placeholder.size());
1993+
for (int j = 0; j < num_image_tokens; j++) {
1994+
img_prompt += placeholder;
1995+
}
1996+
img_prompt += "<|vision_end|>";
1997+
}
1998+
1999+
prompt = "<|im_start|>system\nAs an image editing expert, first analyze the content and attributes of the input image(s). Then, based on the user's editing instructions, clearly and precisely determine how to modify the given image(s), ensuring that only the specified parts are altered and all other aspects remain consistent with the original(s).<|im_end|>\n<|im_start|>user\n";
2000+
prompt += img_prompt;
2001+
} else {
2002+
prompt_template_encode_start_idx = 36;
2003+
min_length = 512 + prompt_template_encode_start_idx;
2004+
2005+
prompt = "<|im_start|>system\nAs an image captioning expert, generate a descriptive text prompt based on an image content, suitable for input to a text-to-image model.<|im_end|>\n<|im_start|>user\n";
2006+
}
2007+
2008+
prompt_attn_range.first = static_cast<int>(prompt.size());
2009+
prompt += conditioner_params.text;
2010+
prompt_attn_range.second = static_cast<int>(prompt.size());
2011+
2012+
prompt += "<|im_end|>\n<|im_start|>assistant\n";
19412013
} else if (version == VERSION_FLUX2) {
19422014
prompt_template_encode_start_idx = 0;
19432015
hidden_states_min_length = 512;
@@ -2012,7 +2084,8 @@ struct LLMEmbedder : public Conditioner {
20122084
hidden_states_min_length,
20132085
image_embeds,
20142086
out_layers,
2015-
prompt_template_encode_start_idx);
2087+
prompt_template_encode_start_idx,
2088+
spell_quotes);
20162089
std::vector<sd::Tensor<float>> extra_hidden_states_vec;
20172090
for (int i = 0; i < extra_prompts.size(); i++) {
20182091
auto extra_hidden_states = encode_prompt(n_threads,
@@ -2022,7 +2095,8 @@ struct LLMEmbedder : public Conditioner {
20222095
hidden_states_min_length,
20232096
image_embeds,
20242097
out_layers,
2025-
prompt_template_encode_start_idx);
2098+
prompt_template_encode_start_idx,
2099+
spell_quotes);
20262100
extra_hidden_states_vec.push_back(std::move(extra_hidden_states));
20272101
}
20282102

src/flux.hpp

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -446,7 +446,6 @@ namespace Flux {
446446
if (use_yak_mlp || use_mlp_silu_act) {
447447
mlp_mult_factor = 2;
448448
}
449-
450449
blocks["linear1"] = std::shared_ptr<GGMLBlock>(new Linear(hidden_size, hidden_size * 3 + mlp_hidden_dim * mlp_mult_factor, mlp_proj_bias));
451450
blocks["linear2"] = std::shared_ptr<GGMLBlock>(new Linear(hidden_size + mlp_hidden_dim, hidden_size, mlp_proj_bias));
452451
blocks["norm"] = std::shared_ptr<GGMLBlock>(new QKNorm(head_dim));
@@ -1225,6 +1224,9 @@ namespace Flux {
12251224
flux_params.share_modulation = true;
12261225
flux_params.ref_index_scale = 10.f;
12271226
flux_params.use_mlp_silu_act = true;
1227+
} else if (sd_version_is_longcat(version)) {
1228+
flux_params.context_in_dim = 3584;
1229+
flux_params.vec_in_dim = 0;
12281230
}
12291231
int64_t head_dim = 0;
12301232
int64_t actual_radiance_patch_size = -1;
@@ -1412,7 +1414,6 @@ namespace Flux {
14121414
} else if (version == VERSION_OVIS_IMAGE) {
14131415
txt_arange_dims = {1, 2};
14141416
}
1415-
14161417
pe_vec = Rope::gen_flux_pe(static_cast<int>(x->ne[1]),
14171418
static_cast<int>(x->ne[0]),
14181419
flux_params.patch_size,
@@ -1425,7 +1426,8 @@ namespace Flux {
14251426
flux_params.theta,
14261427
circular_y_enabled,
14271428
circular_x_enabled,
1428-
flux_params.axes_dim);
1429+
flux_params.axes_dim,
1430+
sd_version_is_longcat(version));
14291431
int pos_len = static_cast<int>(pe_vec.size() / flux_params.axes_dim_sum / 2);
14301432
// LOG_DEBUG("pos_len %d", pos_len);
14311433
auto pe = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, flux_params.axes_dim_sum / 2, pos_len);

src/ggml_extend.hpp

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -953,11 +953,17 @@ __STATIC_INLINE__ ggml_tensor* ggml_ext_group_norm_32(ggml_context* ctx,
953953
return ggml_group_norm(ctx, a, 32, eps);
954954
}
955955

956+
// Reports whether tensor `x` has the stride layout checked below:
//   - nb[0] equals the size of one element of x->type, and
//   - nb[2] and nb[3] are exactly nb[1] * ne[1] and nb[2] * ne[2], i.e. the
//     two outer dimensions are packed with no extra gap.
// nb[1] itself is not constrained, so a tensor whose rows carry extra
// stride still passes; this is a weaker condition than full contiguity.
// NOTE(review): presumably mirrors ggml's internal padded-1d check - confirm
// against the ggml version in use.
__STATIC_INLINE__ bool ggml_ext_is_padded_1d(const ggml_tensor* x) {
957+
return x->nb[0] == ggml_type_size(x->type) &&
958+
x->nb[2] == x->nb[1] * x->ne[1] &&
959+
x->nb[3] == x->nb[2] * x->ne[2];
960+
}
961+
956962
__STATIC_INLINE__ ggml_tensor* ggml_ext_scale(ggml_context* ctx,
957963
ggml_tensor* x,
958964
float factor,
959965
bool inplace = false) {
960-
if (!ggml_is_contiguous(x)) {
966+
if (!ggml_ext_is_padded_1d(x)) {
961967
x = ggml_cont(ctx, x);
962968
}
963969
if (inplace) {
@@ -3664,7 +3670,7 @@ __STATIC_INLINE__ ggml_tensor* ggml_ext_lokr_forward(
36643670

36653671
ggml_tensor* hc = ggml_transpose(ctx, hc_t);
36663672
ggml_tensor* out = ggml_reshape_2d(ctx, ggml_cont(ctx, hc), up * vp, batch);
3667-
return ggml_scale(ctx, out, scale);
3673+
return ggml_ext_scale(ctx, out, scale);
36683674
} else {
36693675
int batch = (int)h->ne[3];
36703676
// 1. Reshape input: [W, H, vq*uq, batch] -> [W, H, vq, uq * batch]
@@ -3747,7 +3753,7 @@ __STATIC_INLINE__ ggml_tensor* ggml_ext_lokr_forward(
37473753
ggml_tensor* hc = ggml_transpose(ctx, hc_t);
37483754
// ungroup
37493755
ggml_tensor* out = ggml_reshape_4d(ctx, ggml_cont(ctx, hc), w_out, h_out, up * vp, batch);
3750-
return ggml_scale(ctx, out, scale);
3756+
return ggml_ext_scale(ctx, out, scale);
37513757
}
37523758
}
37533759

src/model.cpp

Lines changed: 19 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -410,7 +410,7 @@ bool ModelLoader::init_from_diffusers_file(const std::string& file_path, const s
410410
}
411411

412412
SDVersion ModelLoader::get_sd_version() {
413-
TensorStorage token_embedding_weight, input_block_weight;
413+
TensorStorage token_embedding_weight, input_block_weight, context_ebedding_weight;
414414

415415
bool has_multiple_encoders = false;
416416
bool is_unet = false;
@@ -428,7 +428,8 @@ SDVersion ModelLoader::get_sd_version() {
428428
bool has_attn_1024 = false;
429429

430430
for (auto& [name, tensor_storage] : tensor_storage_map) {
431-
if (tensor_storage.name.find("model.diffusion_model.double_blocks.") != std::string::npos) {
431+
if (tensor_storage.name.find("model.diffusion_model.double_blocks.") != std::string::npos ||
432+
tensor_storage.name.find("model.diffusion_model.single_transformer_blocks.") != std::string::npos) {
432433
is_flux = true;
433434
}
434435
if (tensor_storage.name.find("model.diffusion_model.nerf_final_layer_conv.") != std::string::npos) {
@@ -522,6 +523,9 @@ SDVersion ModelLoader::get_sd_version() {
522523
tensor_storage.name == "unet.conv_in.weight") {
523524
input_block_weight = tensor_storage;
524525
}
526+
if (tensor_storage.name == "model.diffusion_model.txt_in.weight" || tensor_storage.name == "model.diffusion_model.context_embedder.weight") {
527+
context_ebedding_weight = tensor_storage;
528+
}
525529
}
526530
if (is_wan) {
527531
LOG_DEBUG("patch_embedding_channels %d", patch_embedding_channels);
@@ -552,16 +556,20 @@ SDVersion ModelLoader::get_sd_version() {
552556
}
553557

554558
if (is_flux && !is_flux2) {
555-
if (input_block_weight.ne[0] == 384) {
556-
return VERSION_FLUX_FILL;
557-
}
558-
if (input_block_weight.ne[0] == 128) {
559-
return VERSION_FLUX_CONTROLS;
560-
}
561-
if (input_block_weight.ne[0] == 196) {
562-
return VERSION_FLEX_2;
559+
if (context_ebedding_weight.ne[0] == 3584) {
560+
return VERSION_LONGCAT;
561+
} else {
562+
if (input_block_weight.ne[0] == 384) {
563+
return VERSION_FLUX_FILL;
564+
}
565+
if (input_block_weight.ne[0] == 128) {
566+
return VERSION_FLUX_CONTROLS;
567+
}
568+
if (input_block_weight.ne[0] == 196) {
569+
return VERSION_FLEX_2;
570+
}
571+
return VERSION_FLUX;
563572
}
564-
return VERSION_FLUX;
565573
}
566574

567575
if (is_flux2) {

src/model.h

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,7 @@ enum SDVersion {
4747
VERSION_Z_IMAGE,
4848
VERSION_OVIS_IMAGE,
4949
VERSION_ERNIE_IMAGE,
50+
VERSION_LONGCAT,
5051
VERSION_COUNT,
5152
};
5253

@@ -141,6 +142,13 @@ static inline bool sd_version_is_z_image(SDVersion version) {
141142
return false;
142143
}
143144

145+
// Returns true only when `version` is VERSION_LONGCAT, false for every other
// SDVersion value.
static inline bool sd_version_is_longcat(SDVersion version) {
146+
if (version == VERSION_LONGCAT) {
147+
return true;
148+
}
149+
return false;
150+
}
151+
144152
static inline bool sd_version_is_ernie_image(SDVersion version) {
145153
if (version == VERSION_ERNIE_IMAGE) {
146154
return true;
@@ -176,7 +184,8 @@ static inline bool sd_version_is_dit(SDVersion version) {
176184
version == VERSION_HIDREAM_O1 ||
177185
sd_version_is_anima(version) ||
178186
sd_version_is_z_image(version) ||
179-
sd_version_is_ernie_image(version)) {
187+
sd_version_is_ernie_image(version) ||
188+
sd_version_is_longcat(version)) {
180189
return true;
181190
}
182191
return false;

0 commit comments

Comments
 (0)