Skip to content

Commit 1f8c609

Browse files
committed
Merge remote-tracking branch 'upstream/master' into perf/smaller-merged-segments
2 parents 88a5ee4 + b3d56d0 commit 1f8c609

86 files changed

Lines changed: 1338 additions & 1173 deletions

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

CMakeLists.txt

Lines changed: 16 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -210,6 +210,21 @@ file(GLOB SD_LIB_SOURCES CONFIGURE_DEPENDS
210210
"src/*.h"
211211
"src/*.cpp"
212212
"src/*.hpp"
213+
"src/conditioning/*.h"
214+
"src/conditioning/*.cpp"
215+
"src/conditioning/*.hpp"
216+
"src/core/*.h"
217+
"src/core/*.cpp"
218+
"src/core/*.hpp"
219+
"src/extensions/*.h"
220+
"src/extensions/*.cpp"
221+
"src/extensions/*.hpp"
222+
"src/model/*/*.h"
223+
"src/model/*/*.cpp"
224+
"src/model/*/*.hpp"
225+
"src/runtime/*.h"
226+
"src/runtime/*.cpp"
227+
"src/runtime/*.hpp"
213228
"src/model_io/*.h"
214229
"src/model_io/*.cpp"
215230
"src/tokenizers/*.h"
@@ -312,6 +327,7 @@ add_subdirectory(thirdparty)
312327

313328
target_link_libraries(${SD_LIB} PUBLIC ggml zip)
314329
target_include_directories(${SD_LIB} PUBLIC . src include)
330+
target_include_directories(${SD_LIB} PRIVATE src/core)
315331
target_include_directories(${SD_LIB} PUBLIC . thirdparty)
316332
target_compile_features(${SD_LIB} PUBLIC c_std_11 cxx_std_17)
317333

format-code.sh

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,17 @@
1-
for f in src/*.cpp src/*.h src/*.hpp src/tokenizers/*.h src/tokenizers/*.cpp src/tokenizers/vocab/*.h src/tokenizers/vocab/*.cpp \
1+
for f in src/*.cpp src/*.h src/*.hpp \
2+
src/conditioning/*.cpp src/conditioning/*.h src/conditioning/*.hpp \
3+
src/core/*.cpp src/core/*.h src/core/*.hpp \
4+
src/extensions/*.cpp src/extensions/*.h src/extensions/*.hpp \
5+
src/runtime/*.cpp src/runtime/*.h src/runtime/*.hpp \
6+
src/model/*/*.cpp src/model/*/*.h src/model/*/*.hpp \
7+
src/tokenizers/*.h src/tokenizers/*.cpp src/tokenizers/vocab/*.h src/tokenizers/vocab/*.cpp \
28
src/model_io/*.h src/model_io/*.cpp examples/cli/*.cpp examples/cli/*.h examples/server/*.cpp \
39
examples/common/*.hpp examples/common/*.h examples/common/*.cpp; do
10+
[[ -e "$f" ]] || continue
411
[[ "$f" == vocab* ]] && continue
512
echo "formatting '$f'"
613
# if [ "$f" != "stable-diffusion.h" ]; then
714
# clang-tidy -fix -p build_linux/ "$f"
815
# fi
916
clang-format -style=file -i "$f"
10-
done
17+
done
Lines changed: 11 additions & 179 deletions
Original file line number | Diff line number | Diff line change
@@ -1,14 +1,15 @@
1-
#ifndef __CONDITIONER_HPP__
2-
#define __CONDITIONER_HPP__
1+
#ifndef __SD_CONDITIONING_CONDITIONER_HPP__
2+
#define __SD_CONDITIONING_CONDITIONER_HPP__
33

44
#include <cmath>
55
#include <limits>
66
#include <optional>
77

8-
#include "clip.hpp"
9-
#include "llm.hpp"
10-
#include "t5.hpp"
11-
#include "tensor_ggml.hpp"
8+
#include "core/tensor_ggml.hpp"
9+
#include "model/te/clip.hpp"
10+
#include "model/te/llm.hpp"
11+
#include "model/te/t5.hpp"
12+
#include "model_loader.h"
1213

1314
struct SDCondition {
1415
sd::Tensor<float> c_crossattn;
@@ -103,7 +104,6 @@ struct ConditionerParams {
103104
int width = -1;
104105
int height = -1;
105106
bool zero_out_masked = false;
106-
int num_input_imgs = 0; // for photomaker
107107
const std::vector<sd::Tensor<float>>* ref_images = nullptr; // for qwen image edit
108108
};
109109

@@ -121,25 +121,16 @@ struct Conditioner {
121121
virtual void set_stream_layers_enabled(bool enabled) {}
122122
virtual void set_flash_attention_enabled(bool enabled) = 0;
123123
virtual void set_weight_adapter(const std::shared_ptr<WeightAdapter>& adapter) {}
124-
virtual std::tuple<SDCondition, std::vector<bool>> get_learned_condition_with_trigger(int n_threads,
125-
const ConditionerParams& conditioner_params) {
126-
GGML_ABORT("Not implemented yet!");
127-
}
128-
virtual std::string remove_trigger_from_prompt(const std::string& prompt) {
129-
GGML_ABORT("Not implemented yet!");
130-
}
131124
};
132125

133126
// ldm.modules.encoders.modules.FrozenCLIPEmbedder
134127
// Ref: https://github.com/AUTOMATIC1111/stable-diffusion-webui/blob/cad87bf4e3e0b0a759afa94e933527c3123d59bc/modules/sd_hijack_clip.py#L283
135128
struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
136-
SDVersion version = VERSION_SD1;
137-
PMVersion pm_version = PM_VERSION_1;
129+
SDVersion version = VERSION_SD1;
138130
CLIPTokenizer tokenizer;
139131
std::shared_ptr<CLIPTextModelRunner> text_model;
140132
std::shared_ptr<CLIPTextModelRunner> text_model2;
141133

142-
std::string trigger_word = "img"; // should be user settable
143134
std::map<std::string, std::string> embedding_map;
144135
int32_t num_custom_embeddings = 0;
145136
int32_t num_custom_embeddings_2 = 0;
@@ -150,9 +141,8 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
150141
ggml_backend_t params_backend,
151142
const String2TensorStorage& tensor_storage_map,
152143
const std::map<std::string, std::string>& orig_embedding_map,
153-
SDVersion version = VERSION_SD1,
154-
PMVersion pv = PM_VERSION_1)
155-
: version(version), pm_version(pv), tokenizer(sd_version_is_sd2(version) ? 0 : 49407) {
144+
SDVersion version = VERSION_SD1)
145+
: version(version), tokenizer(sd_version_is_sd2(version) ? 0 : 49407) {
156146
for (const auto& kv : orig_embedding_map) {
157147
std::string name = kv.first;
158148
std::transform(name.begin(), name.end(), name.begin(), [](unsigned char c) { return std::tolower(c); });
@@ -329,121 +319,6 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
329319
return tokenizer.decode(tokens);
330320
}
331321

332-
std::tuple<std::vector<int>, std::vector<float>, std::vector<bool>>
333-
tokenize_with_trigger_token(std::string text,
334-
int num_input_imgs,
335-
int32_t image_token) {
336-
auto parsed_attention = parse_prompt_attention(text);
337-
338-
{
339-
std::stringstream ss;
340-
ss << "[";
341-
for (const auto& item : parsed_attention) {
342-
ss << "['" << item.first << "', " << item.second << "], ";
343-
}
344-
ss << "]";
345-
LOG_DEBUG("parse '%s' to %s", text.c_str(), ss.str().c_str());
346-
}
347-
348-
auto on_new_token_cb = [&](std::string& str, std::vector<int32_t>& bpe_tokens) -> bool {
349-
auto iter = embedding_map.find(str);
350-
if (iter == embedding_map.end()) {
351-
return false;
352-
}
353-
std::string embedding_path = iter->second;
354-
if (load_embedding(str, embedding_path, bpe_tokens)) {
355-
return true;
356-
}
357-
return false;
358-
};
359-
360-
std::vector<int> tokens;
361-
std::vector<float> weights;
362-
std::vector<bool> class_token_mask;
363-
int32_t class_idx = -1, tokens_acc = 0;
364-
for (const auto& item : parsed_attention) {
365-
std::vector<int> class_token_index;
366-
std::vector<int> clean_input_ids;
367-
const std::string& curr_text = item.first;
368-
float curr_weight = item.second;
369-
// printf(" %s: %f \n", curr_text.c_str(), curr_weight);
370-
int32_t clean_index = 0;
371-
if (curr_text == "BREAK" && curr_weight == -1.0f) {
372-
// Pad token array up to chunk size at this point.
373-
// TODO: This is a hardcoded chunk_len, like in stable-diffusion.cpp, make it a parameter for the future?
374-
// Also, this is 75 instead of 77 to leave room for BOS and EOS tokens.
375-
int padding_size = 75 - (tokens_acc % 75);
376-
for (int j = 0; j < padding_size; j++) {
377-
clean_input_ids.push_back(tokenizer.EOS_TOKEN_ID);
378-
clean_index++;
379-
}
380-
381-
// After padding, continue to the next iteration to process the following text as a new segment
382-
tokens.insert(tokens.end(), clean_input_ids.begin(), clean_input_ids.end());
383-
weights.insert(weights.end(), padding_size, curr_weight);
384-
continue;
385-
}
386-
387-
// Regular token, process normally
388-
std::vector<int> curr_tokens = tokenizer.encode(curr_text, on_new_token_cb);
389-
for (uint32_t i = 0; i < curr_tokens.size(); i++) {
390-
int token_id = curr_tokens[i];
391-
if (token_id == image_token) {
392-
class_token_index.push_back(clean_index - 1);
393-
} else {
394-
clean_input_ids.push_back(token_id);
395-
clean_index++;
396-
}
397-
}
398-
// GGML_ASSERT(class_token_index.size() == 1); // PhotoMaker currently does not support multiple
399-
// trigger words in a single prompt.
400-
if (class_token_index.size() == 1) {
401-
// Expand the class word token and corresponding mask
402-
int class_token = clean_input_ids[class_token_index[0]];
403-
class_idx = tokens_acc + class_token_index[0];
404-
std::vector<int> clean_input_ids_tmp;
405-
for (int i = 0; i < class_token_index[0]; i++)
406-
clean_input_ids_tmp.push_back(clean_input_ids[i]);
407-
for (int i = 0; i < (pm_version == PM_VERSION_2 ? 2 * num_input_imgs : num_input_imgs); i++)
408-
clean_input_ids_tmp.push_back(class_token);
409-
for (int i = class_token_index[0] + 1; i < clean_input_ids.size(); i++)
410-
clean_input_ids_tmp.push_back(clean_input_ids[i]);
411-
clean_input_ids.clear();
412-
clean_input_ids = clean_input_ids_tmp;
413-
}
414-
tokens_acc += clean_index;
415-
tokens.insert(tokens.end(), clean_input_ids.begin(), clean_input_ids.end());
416-
weights.insert(weights.end(), clean_input_ids.size(), curr_weight);
417-
}
418-
// BUG!! double counting, pad_tokens will add BOS at the beginning
419-
// tokens.insert(tokens.begin(), tokenizer.BOS_TOKEN_ID);
420-
// weights.insert(weights.begin(), 1.0);
421-
422-
tokenizer.pad_tokens(tokens, &weights, nullptr, text_model->model.n_token, text_model->model.n_token, true);
423-
int offset = pm_version == PM_VERSION_2 ? 2 * num_input_imgs : num_input_imgs;
424-
for (int i = 0; i < tokens.size(); i++) {
425-
// if (class_idx + 1 <= i && i < class_idx + 1 + 2*num_input_imgs) // photomaker V2 has num_tokens(=2)*num_input_imgs
426-
if (class_idx + 1 <= i && i < class_idx + 1 + offset) // photomaker V2 has num_tokens(=2)*num_input_imgs
427-
// hardcode for now
428-
class_token_mask.push_back(true);
429-
else
430-
class_token_mask.push_back(false);
431-
}
432-
433-
// printf("[");
434-
// for (int i = 0; i < tokens.size(); i++) {
435-
// printf("%d, ", class_token_mask[i] ? 1 : 0);
436-
// }
437-
// printf("]\n");
438-
439-
// for (int i = 0; i < tokens.size(); i++) {
440-
// std::cout << tokens[i] << ":" << weights[i] << ", ";
441-
// }
442-
// std::cout << std::endl;
443-
444-
return std::make_tuple(tokens, weights, class_token_mask);
445-
}
446-
447322
std::pair<std::vector<int>, std::vector<float>> tokenize(std::string text,
448323
size_t min_length = 0,
449324
size_t max_length = 0,
@@ -631,49 +506,6 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
631506
return result;
632507
}
633508

634-
std::tuple<SDCondition, std::vector<bool>>
635-
get_learned_condition_with_trigger(int n_threads,
636-
const ConditionerParams& conditioner_params) override {
637-
auto image_tokens = convert_token_to_id(trigger_word);
638-
// if(image_tokens.size() == 1){
639-
// printf(" image token id is: %d \n", image_tokens[0]);
640-
// }
641-
GGML_ASSERT(image_tokens.size() == 1);
642-
auto tokens_and_weights = tokenize_with_trigger_token(conditioner_params.text,
643-
conditioner_params.num_input_imgs,
644-
image_tokens[0]);
645-
std::vector<int>& tokens = std::get<0>(tokens_and_weights);
646-
std::vector<float>& weights = std::get<1>(tokens_and_weights);
647-
std::vector<bool>& clsm = std::get<2>(tokens_and_weights);
648-
// printf("tokens: \n");
649-
// for(int i = 0; i < tokens.size(); ++i)
650-
// printf("%d ", tokens[i]);
651-
// printf("\n");
652-
// printf("clsm: \n");
653-
// for(int i = 0; i < clsm.size(); ++i)
654-
// printf("%d ", clsm[i]?1:0);
655-
// printf("\n");
656-
auto cond = get_learned_condition_common(n_threads,
657-
tokens,
658-
weights,
659-
conditioner_params.clip_skip,
660-
conditioner_params.width,
661-
conditioner_params.height,
662-
conditioner_params.zero_out_masked);
663-
return std::make_tuple(cond, clsm);
664-
}
665-
666-
std::string remove_trigger_from_prompt(const std::string& prompt) override {
667-
auto image_tokens = convert_token_to_id(trigger_word);
668-
GGML_ASSERT(image_tokens.size() == 1);
669-
auto tokens_and_weights = tokenize(prompt);
670-
std::vector<int>& tokens = tokens_and_weights.first;
671-
auto it = std::find(tokens.begin(), tokens.end(), image_tokens[0]);
672-
GGML_ASSERT(it != tokens.end()); // prompt must have trigger word
673-
tokens.erase(it);
674-
return decode(tokens);
675-
}
676-
677509
SDCondition get_learned_condition(int n_threads,
678510
const ConditionerParams& conditioner_params) override {
679511
auto tokens_and_weights = tokenize(conditioner_params.text, text_model->model.n_token, text_model->model.n_token, true);
@@ -2554,4 +2386,4 @@ struct LTXAVEmbedder : public Conditioner {
25542386
}
25552387
};
25562388

2557-
#endif
2389+
#endif // __SD_CONDITIONING_CONDITIONER_HPP__

src/convert.cpp

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -3,9 +3,9 @@
33
#include <regex>
44
#include <vector>
55

6-
#include "model.h"
76
#include "model_io/gguf_io.h"
87
#include "model_io/safetensors_io.h"
8+
#include "model_loader.h"
99
#include "util.h"
1010

1111
#include "ggml_extend_backend.h"
Lines changed: 10 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,5 @@
1-
#ifndef __GGML_EXTEND_HPP__
2-
#define __GGML_EXTEND_HPP__
1+
#ifndef __SD_CORE_GGML_EXTEND_HPP__
2+
#define __SD_CORE_GGML_EXTEND_HPP__
33

44
#include <assert.h>
55
#include <inttypes.h>
@@ -23,19 +23,19 @@
2323
#include <unordered_map>
2424
#include <vector>
2525

26+
#include "core/ggml_extend_backend.h"
27+
#include "core/ggml_graph_cut.h"
28+
#include "core/layer_registry.h"
2629
#include "ggml-alloc.h"
2730
#include "ggml-backend.h"
2831
#include "ggml.h"
29-
#include "ggml_extend_backend.h"
30-
#include "ggml_graph_cut.h"
31-
#include "layer_registry.h"
3232

33+
#include "core/tensor.hpp"
3334
#include "model.h"
34-
#include "tensor.hpp"
3535

36-
#include "rng.hpp"
37-
#include "tensor_ggml.hpp"
38-
#include "util.h"
36+
#include "core/rng.hpp"
37+
#include "core/tensor_ggml.hpp"
38+
#include "core/util.h"
3939

4040
#define EPS 1e-05f
4141

@@ -4161,4 +4161,4 @@ __STATIC_INLINE__ ggml_tensor* ggml_ext_lokr_forward(
41614161
}
41624162
}
41634163

4164-
#endif // __GGML_EXTEND__HPP__
4164+
#endif // __SD_CORE_GGML_EXTEND_HPP__
Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,4 @@
1-
#include "ggml_extend_backend.h"
1+
#include "core/ggml_extend_backend.h"
22

33
#include <algorithm>
44
#include <cctype>
@@ -8,8 +8,8 @@
88
#include <stdexcept>
99
#include <vector>
1010

11+
#include "core/util.h"
1112
#include "stable-diffusion.h"
12-
#include "util.h"
1313

1414
static std::string trim_copy(const std::string& value) {
1515
size_t begin = 0;
Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,5 @@
1-
#ifndef __SD_GGML_EXTEND_BACKEND_H__
2-
#define __SD_GGML_EXTEND_BACKEND_H__
1+
#ifndef __SD_CORE_GGML_EXTEND_BACKEND_H__
2+
#define __SD_CORE_GGML_EXTEND_BACKEND_H__
33

44
#include <cstdint>
55
#include <cstring>
@@ -76,4 +76,4 @@ ggml_backend_t sd_backend_cpu_init();
7676
bool sd_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads);
7777
const char* sd_backend_module_name(SDBackendModule module);
7878
void ggml_ext_im_set_f32_1d(const struct ggml_tensor* tensor, int i, float value);
79-
#endif
79+
#endif // __SD_CORE_GGML_EXTEND_BACKEND_H__
Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,4 @@
1-
#include "ggml_graph_cut.h"
1+
#include "core/ggml_graph_cut.h"
22

33
#include <algorithm>
44
#include <cstring>
@@ -8,11 +8,11 @@
88
#include <stack>
99
#include <unordered_map>
1010

11+
#include "core/util.h"
1112
#include "ggml-alloc.h"
1213
#include "ggml-backend.h"
13-
#include "util.h"
1414

15-
#include "../ggml/src/ggml-impl.h"
15+
#include "ggml/src/ggml-impl.h"
1616

1717
namespace sd::ggml_graph_cut {
1818

0 commit comments

Comments
 (0)