1- #ifndef __CONDITIONER_HPP__
2- #define __CONDITIONER_HPP__
1+ #ifndef __SD_CONDITIONING_CONDITIONER_HPP__
2+ #define __SD_CONDITIONING_CONDITIONER_HPP__
33
44#include < cmath>
55#include < limits>
66#include < optional>
77
8- #include " clip.hpp"
9- #include " llm.hpp"
10- #include " t5.hpp"
11- #include " tensor_ggml.hpp"
8+ #include " core/tensor_ggml.hpp"
9+ #include " model/te/clip.hpp"
10+ #include " model/te/llm.hpp"
11+ #include " model/te/t5.hpp"
12+ #include " model_loader.h"
1213
1314struct SDCondition {
1415 sd::Tensor<float > c_crossattn;
@@ -103,7 +104,6 @@ struct ConditionerParams {
103104 int width = -1 ;
104105 int height = -1 ;
105106 bool zero_out_masked = false ;
106- int num_input_imgs = 0 ; // for photomaker
107107 const std::vector<sd::Tensor<float >>* ref_images = nullptr ; // for qwen image edit
108108};
109109
@@ -121,25 +121,16 @@ struct Conditioner {
121121 virtual void set_stream_layers_enabled (bool enabled) {}
122122 virtual void set_flash_attention_enabled (bool enabled) = 0;
123123 virtual void set_weight_adapter (const std::shared_ptr<WeightAdapter>& adapter) {}
124- virtual std::tuple<SDCondition, std::vector<bool >> get_learned_condition_with_trigger (int n_threads,
125- const ConditionerParams& conditioner_params) {
126- GGML_ABORT (" Not implemented yet!" );
127- }
128- virtual std::string remove_trigger_from_prompt (const std::string& prompt) {
129- GGML_ABORT (" Not implemented yet!" );
130- }
131124};
132125
133126// ldm.modules.encoders.modules.FrozenCLIPEmbedder
134127// Ref: https://github.com/AUTOMATIC1111/stable-diffusion-webui/blob/cad87bf4e3e0b0a759afa94e933527c3123d59bc/modules/sd_hijack_clip.py#L283
135128struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
136- SDVersion version = VERSION_SD1 ;
137- PMVersion pm_version = PM_VERSION_1 ;
129+ SDVersion version = VERSION_SD1 ;
138130 CLIPTokenizer tokenizer;
139131 std::shared_ptr<CLIPTextModelRunner> text_model;
140132 std::shared_ptr<CLIPTextModelRunner> text_model2;
141133
142- std::string trigger_word = " img" ; // should be user settable
143134 std::map<std::string, std::string> embedding_map;
144135 int32_t num_custom_embeddings = 0 ;
145136 int32_t num_custom_embeddings_2 = 0 ;
@@ -150,9 +141,8 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
150141 ggml_backend_t params_backend,
151142 const String2TensorStorage& tensor_storage_map,
152143 const std::map<std::string, std::string>& orig_embedding_map,
153- SDVersion version = VERSION_SD1 ,
154- PMVersion pv = PM_VERSION_1 )
155- : version(version), pm_version(pv), tokenizer(sd_version_is_sd2(version) ? 0 : 49407 ) {
144+ SDVersion version = VERSION_SD1 )
145+ : version(version), tokenizer(sd_version_is_sd2(version) ? 0 : 49407 ) {
156146 for (const auto & kv : orig_embedding_map) {
157147 std::string name = kv.first ;
158148 std::transform (name.begin (), name.end (), name.begin (), [](unsigned char c) { return std::tolower (c); });
@@ -329,121 +319,6 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
329319 return tokenizer.decode (tokens);
330320 }
331321
332- std::tuple<std::vector<int >, std::vector<float >, std::vector<bool >>
333- tokenize_with_trigger_token (std::string text,
334- int num_input_imgs,
335- int32_t image_token) {
336- auto parsed_attention = parse_prompt_attention (text);
337-
338- {
339- std::stringstream ss;
340- ss << " [" ;
341- for (const auto & item : parsed_attention) {
342- ss << " ['" << item.first << " ', " << item.second << " ], " ;
343- }
344- ss << " ]" ;
345- LOG_DEBUG (" parse '%s' to %s" , text.c_str (), ss.str ().c_str ());
346- }
347-
348- auto on_new_token_cb = [&](std::string& str, std::vector<int32_t >& bpe_tokens) -> bool {
349- auto iter = embedding_map.find (str);
350- if (iter == embedding_map.end ()) {
351- return false ;
352- }
353- std::string embedding_path = iter->second ;
354- if (load_embedding (str, embedding_path, bpe_tokens)) {
355- return true ;
356- }
357- return false ;
358- };
359-
360- std::vector<int > tokens;
361- std::vector<float > weights;
362- std::vector<bool > class_token_mask;
363- int32_t class_idx = -1 , tokens_acc = 0 ;
364- for (const auto & item : parsed_attention) {
365- std::vector<int > class_token_index;
366- std::vector<int > clean_input_ids;
367- const std::string& curr_text = item.first ;
368- float curr_weight = item.second ;
369- // printf(" %s: %f \n", curr_text.c_str(), curr_weight);
370- int32_t clean_index = 0 ;
371- if (curr_text == " BREAK" && curr_weight == -1 .0f ) {
372- // Pad token array up to chunk size at this point.
373- // TODO: This is a hardcoded chunk_len, like in stable-diffusion.cpp, make it a parameter for the future?
374- // Also, this is 75 instead of 77 to leave room for BOS and EOS tokens.
375- int padding_size = 75 - (tokens_acc % 75 );
376- for (int j = 0 ; j < padding_size; j++) {
377- clean_input_ids.push_back (tokenizer.EOS_TOKEN_ID );
378- clean_index++;
379- }
380-
381- // After padding, continue to the next iteration to process the following text as a new segment
382- tokens.insert (tokens.end (), clean_input_ids.begin (), clean_input_ids.end ());
383- weights.insert (weights.end (), padding_size, curr_weight);
384- continue ;
385- }
386-
387- // Regular token, process normally
388- std::vector<int > curr_tokens = tokenizer.encode (curr_text, on_new_token_cb);
389- for (uint32_t i = 0 ; i < curr_tokens.size (); i++) {
390- int token_id = curr_tokens[i];
391- if (token_id == image_token) {
392- class_token_index.push_back (clean_index - 1 );
393- } else {
394- clean_input_ids.push_back (token_id);
395- clean_index++;
396- }
397- }
398- // GGML_ASSERT(class_token_index.size() == 1); // PhotoMaker currently does not support multiple
399- // trigger words in a single prompt.
400- if (class_token_index.size () == 1 ) {
401- // Expand the class word token and corresponding mask
402- int class_token = clean_input_ids[class_token_index[0 ]];
403- class_idx = tokens_acc + class_token_index[0 ];
404- std::vector<int > clean_input_ids_tmp;
405- for (int i = 0 ; i < class_token_index[0 ]; i++)
406- clean_input_ids_tmp.push_back (clean_input_ids[i]);
407- for (int i = 0 ; i < (pm_version == PM_VERSION_2 ? 2 * num_input_imgs : num_input_imgs); i++)
408- clean_input_ids_tmp.push_back (class_token);
409- for (int i = class_token_index[0 ] + 1 ; i < clean_input_ids.size (); i++)
410- clean_input_ids_tmp.push_back (clean_input_ids[i]);
411- clean_input_ids.clear ();
412- clean_input_ids = clean_input_ids_tmp;
413- }
414- tokens_acc += clean_index;
415- tokens.insert (tokens.end (), clean_input_ids.begin (), clean_input_ids.end ());
416- weights.insert (weights.end (), clean_input_ids.size (), curr_weight);
417- }
418- // BUG!! double couting, pad_tokens will add BOS at the beginning
419- // tokens.insert(tokens.begin(), tokenizer.BOS_TOKEN_ID);
420- // weights.insert(weights.begin(), 1.0);
421-
422- tokenizer.pad_tokens (tokens, &weights, nullptr , text_model->model .n_token , text_model->model .n_token , true );
423- int offset = pm_version == PM_VERSION_2 ? 2 * num_input_imgs : num_input_imgs;
424- for (int i = 0 ; i < tokens.size (); i++) {
425- // if (class_idx + 1 <= i && i < class_idx + 1 + 2*num_input_imgs) // photomaker V2 has num_tokens(=2)*num_input_imgs
426- if (class_idx + 1 <= i && i < class_idx + 1 + offset) // photomaker V2 has num_tokens(=2)*num_input_imgs
427- // hardcode for now
428- class_token_mask.push_back (true );
429- else
430- class_token_mask.push_back (false );
431- }
432-
433- // printf("[");
434- // for (int i = 0; i < tokens.size(); i++) {
435- // printf("%d, ", class_token_mask[i] ? 1 : 0);
436- // }
437- // printf("]\n");
438-
439- // for (int i = 0; i < tokens.size(); i++) {
440- // std::cout << tokens[i] << ":" << weights[i] << ", ";
441- // }
442- // std::cout << std::endl;
443-
444- return std::make_tuple (tokens, weights, class_token_mask);
445- }
446-
447322 std::pair<std::vector<int >, std::vector<float >> tokenize (std::string text,
448323 size_t min_length = 0 ,
449324 size_t max_length = 0 ,
@@ -631,49 +506,6 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
631506 return result;
632507 }
633508
634- std::tuple<SDCondition, std::vector<bool >>
635- get_learned_condition_with_trigger (int n_threads,
636- const ConditionerParams& conditioner_params) override {
637- auto image_tokens = convert_token_to_id (trigger_word);
638- // if(image_tokens.size() == 1){
639- // printf(" image token id is: %d \n", image_tokens[0]);
640- // }
641- GGML_ASSERT (image_tokens.size () == 1 );
642- auto tokens_and_weights = tokenize_with_trigger_token (conditioner_params.text ,
643- conditioner_params.num_input_imgs ,
644- image_tokens[0 ]);
645- std::vector<int >& tokens = std::get<0 >(tokens_and_weights);
646- std::vector<float >& weights = std::get<1 >(tokens_and_weights);
647- std::vector<bool >& clsm = std::get<2 >(tokens_and_weights);
648- // printf("tokens: \n");
649- // for(int i = 0; i < tokens.size(); ++i)
650- // printf("%d ", tokens[i]);
651- // printf("\n");
652- // printf("clsm: \n");
653- // for(int i = 0; i < clsm.size(); ++i)
654- // printf("%d ", clsm[i]?1:0);
655- // printf("\n");
656- auto cond = get_learned_condition_common (n_threads,
657- tokens,
658- weights,
659- conditioner_params.clip_skip ,
660- conditioner_params.width ,
661- conditioner_params.height ,
662- conditioner_params.zero_out_masked );
663- return std::make_tuple (cond, clsm);
664- }
665-
666- std::string remove_trigger_from_prompt (const std::string& prompt) override {
667- auto image_tokens = convert_token_to_id (trigger_word);
668- GGML_ASSERT (image_tokens.size () == 1 );
669- auto tokens_and_weights = tokenize (prompt);
670- std::vector<int >& tokens = tokens_and_weights.first ;
671- auto it = std::find (tokens.begin (), tokens.end (), image_tokens[0 ]);
672- GGML_ASSERT (it != tokens.end ()); // prompt must have trigger word
673- tokens.erase (it);
674- return decode (tokens);
675- }
676-
677509 SDCondition get_learned_condition (int n_threads,
678510 const ConditionerParams& conditioner_params) override {
679511 auto tokens_and_weights = tokenize (conditioner_params.text , text_model->model .n_token , text_model->model .n_token , true );
@@ -2554,4 +2386,4 @@ struct LTXAVEmbedder : public Conditioner {
25542386 }
25552387};
25562388
2557- #endif
2389+ #endif // __SD_CONDITIONING_CONDITIONER_HPP__
0 commit comments