@@ -1747,14 +1747,18 @@ struct LLMEmbedder : public Conditioner {
17471747 std::tuple<std::vector<int >, std::vector<float >, std::vector<float >> tokenize (std::string text,
17481748 const std::pair<int , int >& attn_range,
17491749 size_t min_length = 0 ,
1750- size_t max_length = 100000000 ) {
1750+ size_t max_length = 100000000 ,
1751+ bool spell_quotes = false ) {
17511752 std::vector<std::pair<std::string, float >> parsed_attention;
17521753 if (attn_range.first >= 0 && attn_range.second > 0 ) {
17531754 if (attn_range.first > 0 ) {
17541755 parsed_attention.emplace_back (text.substr (0 , attn_range.first ), 1 .f );
17551756 }
17561757 if (attn_range.second - attn_range.first > 0 ) {
17571758 auto new_parsed_attention = parse_prompt_attention (text.substr (attn_range.first , attn_range.second - attn_range.first ));
1759+ if (spell_quotes) {
1760+ new_parsed_attention = split_quotation_attention (new_parsed_attention);
1761+ }
17581762 parsed_attention.insert (parsed_attention.end (),
17591763 new_parsed_attention.begin (),
17601764 new_parsed_attention.end ());
@@ -1804,8 +1808,10 @@ struct LLMEmbedder : public Conditioner {
18041808 int hidden_states_min_length,
18051809 const std::vector<std::pair<int , sd::Tensor<float >>>& image_embeds,
18061810 const std::set<int >& out_layers,
1807- int prompt_template_encode_start_idx) {
1808- auto tokens_weights_mask = tokenize (prompt, prompt_attn_range, min_length);
1811+ int prompt_template_encode_start_idx,
1812+ bool spell_quotes = false ,
1813+ int max_length = 100000000 ) {
1814+ auto tokens_weights_mask = tokenize (prompt, prompt_attn_range, min_length, max_length, spell_quotes);
18091815 auto & tokens = std::get<0 >(tokens_weights_mask);
18101816 auto & weights = std::get<1 >(tokens_weights_mask);
18111817 auto & mask = std::get<2 >(tokens_weights_mask);
@@ -1866,6 +1872,7 @@ struct LLMEmbedder : public Conditioner {
18661872 int prompt_template_encode_start_idx = 34 ;
18671873 int min_length = 0 ; // pad tokens
18681874 int hidden_states_min_length = 0 ; // zero pad hidden_states
1875+ bool spell_quotes = false ;
18691876 std::set<int > out_layers;
18701877
18711878 int64_t t0 = ggml_time_ms ();
@@ -1938,6 +1945,71 @@ struct LLMEmbedder : public Conditioner {
19381945
19391946 prompt += " <|im_end|>\n <|im_start|>assistant\n " ;
19401947 }
1948+ } else if (sd_version_is_longcat (version)) {
1949+ spell_quotes = true ;
1950+
1951+ if (llm->enable_vision && conditioner_params.ref_images != nullptr && !conditioner_params.ref_images ->empty ()) {
1952+ LOG_INFO (" LongCatEditPipeline" );
1953+ prompt_template_encode_start_idx = 67 ;
1954+ min_length = 512 + prompt_template_encode_start_idx;
1955+ int image_embed_idx = 36 + 6 ;
1956+
1957+ int min_pixels = 384 * 384 ;
1958+ int max_pixels = 560 * 560 ;
1959+ std::string placeholder = " <|image_pad|>" ;
1960+ std::string img_prompt;
1961+
1962+ for (int i = 0 ; i < conditioner_params.ref_images ->size (); i++) {
1963+ const auto & image = (*conditioner_params.ref_images )[i];
1964+ double factor = llm->params .vision .patch_size * llm->params .vision .spatial_merge_size ;
1965+ int height = static_cast <int >(image.shape ()[1 ]);
1966+ int width = static_cast <int >(image.shape ()[0 ]);
1967+ int h_bar = static_cast <int >(std::round (height / factor) * factor);
1968+ int w_bar = static_cast <int >(std::round (width / factor) * factor);
1969+
1970+ if (static_cast <double >(h_bar) * w_bar > max_pixels) {
1971+ double beta = std::sqrt ((height * width) / static_cast <double >(max_pixels));
1972+ h_bar = std::max (static_cast <int >(factor),
1973+ static_cast <int >(std::floor (height / beta / factor)) * static_cast <int >(factor));
1974+ w_bar = std::max (static_cast <int >(factor),
1975+ static_cast <int >(std::floor (width / beta / factor)) * static_cast <int >(factor));
1976+ } else if (static_cast <double >(h_bar) * w_bar < min_pixels) {
1977+ double beta = std::sqrt (static_cast <double >(min_pixels) / (height * width));
1978+ h_bar = static_cast <int >(std::ceil (height * beta / factor)) * static_cast <int >(factor);
1979+ w_bar = static_cast <int >(std::ceil (width * beta / factor)) * static_cast <int >(factor);
1980+ }
1981+
1982+ LOG_DEBUG (" resize conditioner ref image %d from %dx%d to %dx%d" , i, height, width, h_bar, w_bar);
1983+
1984+ auto resized_image = clip_preprocess (image, w_bar, h_bar);
1985+ auto image_embed = llm->encode_image (n_threads, resized_image);
1986+ GGML_ASSERT (!image_embed.empty ());
1987+ image_embeds.emplace_back (image_embed_idx, image_embed);
1988+ image_embed_idx += 1 + static_cast <int >(image_embed.shape ()[1 ]) + 6 ;
1989+
1990+ img_prompt += " <|vision_start|>" ;
1991+ int64_t num_image_tokens = image_embed.shape ()[1 ];
1992+ img_prompt.reserve (num_image_tokens * placeholder.size ());
1993+ for (int j = 0 ; j < num_image_tokens; j++) {
1994+ img_prompt += placeholder;
1995+ }
1996+ img_prompt += " <|vision_end|>" ;
1997+ }
1998+
1999+ prompt = " <|im_start|>system\n As an image editing expert, first analyze the content and attributes of the input image(s). Then, based on the user's editing instructions, clearly and precisely determine how to modify the given image(s), ensuring that only the specified parts are altered and all other aspects remain consistent with the original(s).<|im_end|>\n <|im_start|>user\n " ;
2000+ prompt += img_prompt;
2001+ } else {
2002+ prompt_template_encode_start_idx = 36 ;
2003+ min_length = 512 + prompt_template_encode_start_idx;
2004+
2005+ prompt = " <|im_start|>system\n As an image captioning expert, generate a descriptive text prompt based on an image content, suitable for input to a text-to-image model.<|im_end|>\n <|im_start|>user\n " ;
2006+ }
2007+
2008+ prompt_attn_range.first = static_cast <int >(prompt.size ());
2009+ prompt += conditioner_params.text ;
2010+ prompt_attn_range.second = static_cast <int >(prompt.size ());
2011+
2012+ prompt += " <|im_end|>\n <|im_start|>assistant\n " ;
19412013 } else if (version == VERSION_FLUX2 ) {
19422014 prompt_template_encode_start_idx = 0 ;
19432015 hidden_states_min_length = 512 ;
@@ -2012,7 +2084,8 @@ struct LLMEmbedder : public Conditioner {
20122084 hidden_states_min_length,
20132085 image_embeds,
20142086 out_layers,
2015- prompt_template_encode_start_idx);
2087+ prompt_template_encode_start_idx,
2088+ spell_quotes);
20162089 std::vector<sd::Tensor<float >> extra_hidden_states_vec;
20172090 for (int i = 0 ; i < extra_prompts.size (); i++) {
20182091 auto extra_hidden_states = encode_prompt (n_threads,
@@ -2022,7 +2095,8 @@ struct LLMEmbedder : public Conditioner {
20222095 hidden_states_min_length,
20232096 image_embeds,
20242097 out_layers,
2025- prompt_template_encode_start_idx);
2098+ prompt_template_encode_start_idx,
2099+ spell_quotes);
20262100 extra_hidden_states_vec.push_back (std::move (extra_hidden_states));
20272101 }
20282102
0 commit comments