@@ -343,6 +343,13 @@ class CLIPTokenizer {
         }
     }
 
+    std::string clean_up_tokenization(std::string& text) {
+        std::regex pattern(R"( ,)");
+        // Replace " ," with ","
+        std::string result = std::regex_replace(text, pattern, ",");
+        return result;
+    }
+
     std::string decode(const std::vector<int>& tokens) {
         std::string text = "";
         for (int t : tokens) {
@@ -351,8 +358,12 @@ class CLIPTokenizer {
             std::u32string ts = decoder[t];
             // printf("%d, %s \n", t, utf32_to_utf8(ts).c_str());
             std::string s = utf32_to_utf8(ts);
-            if (s.length() >= 4 && ends_with(s, "</w>")) {
-                text += " " + s.replace(s.length() - 4, s.length() - 1, "");
+            if (s.length() >= 4) {
+                if (ends_with(s, "</w>")) {
+                    text += s.replace(s.length() - 4, s.length() - 1, "") + " ";
+                } else {
+                    text += s;
+                }
             } else {
                 text += " " + s;
             }
@@ -364,6 +375,7 @@ class CLIPTokenizer {
 
         // std::string s((char *)bytes.data());
         // std::string s = "";
+        text = clean_up_tokenization(text);
         return trim(text);
     }
 
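To make the change concrete, here is a standalone sketch of the new decode path (token pieces are illustrative, and substr stands in for the in-place replace call). Sub-word pieces are now glued together and the word-boundary space goes after the word; the old branch space-prefixed every piece, turning the same input into " toke nizer , rocks". clean_up_tokenization then deletes the space this scheme leaves before commas.

#include <iostream>
#include <regex>
#include <string>
#include <vector>

// Standalone sketch of the patched decode loop; token pieces are
// illustrative. "</w>" marks the end of a word: strip the marker and
// append a space after the word; pieces without it are glued on.
static bool ends_with(const std::string& s, const std::string& suffix) {
    return s.size() >= suffix.size() &&
           s.compare(s.size() - suffix.size(), suffix.size(), suffix) == 0;
}

int main() {
    std::vector<std::string> pieces = {"toke", "nizer</w>", ",</w>", "rocks</w>"};
    std::string text;
    for (const std::string& s : pieces) {
        if (s.length() >= 4) {
            if (ends_with(s, "</w>")) {
                text += s.substr(0, s.length() - 4) + " ";  // word boundary
            } else {
                text += s;  // sub-word piece: glue to the previous piece
            }
        } else {
            text += " " + s;
        }
    }
    // "tokenizer , rocks "  ->  "tokenizer, rocks "
    text = std::regex_replace(text, std::regex(R"( ,)"), ",");
    std::cout << "'" << text << "'" << std::endl;  // trim() would drop the trailing space
    return 0;
}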
@@ -533,9 +545,12 @@ class CLIPEmbeddings : public GGMLBlock {
     int64_t vocab_size;
     int64_t num_positions;
 
-    void init_params(struct ggml_context* ctx, ggml_type wtype) {
-        params["token_embedding.weight"]    = ggml_new_tensor_2d(ctx, wtype, embed_dim, vocab_size);
-        params["position_embedding.weight"] = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, embed_dim, num_positions);
+    void init_params(struct ggml_context* ctx, std::map<std::string, enum ggml_type>& tensor_types, const std::string prefix = "") {
+        enum ggml_type token_wtype    = (tensor_types.find(prefix + "token_embedding.weight") != tensor_types.end()) ? tensor_types[prefix + "token_embedding.weight"] : GGML_TYPE_F32;
+        enum ggml_type position_wtype = GGML_TYPE_F32;  // (tensor_types.find(prefix + "position_embedding.weight") != tensor_types.end()) ? tensor_types[prefix + "position_embedding.weight"] : GGML_TYPE_F32;
+
+        params["token_embedding.weight"]    = ggml_new_tensor_2d(ctx, token_wtype, embed_dim, vocab_size);
+        params["position_embedding.weight"] = ggml_new_tensor_2d(ctx, position_wtype, embed_dim, num_positions);
     }
 
 public:
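The pattern introduced here recurs in every init_params overload in this patch: look the tensor's fully qualified name up in the loader's tensor_types map, and fall back to a default when it is absent. A hypothetical helper (not part of the patch) makes the convention explicit:

#include <map>
#include <string>
#include "ggml.h"

// Hypothetical helper, not in the patch: return the per-tensor type the
// loader recorded for prefix + name, or a caller-supplied fallback when
// the tensor is absent from the map (e.g. a freshly created model).
static enum ggml_type get_wtype(const std::map<std::string, enum ggml_type>& tensor_types,
                                const std::string& prefix,
                                const std::string& name,
                                enum ggml_type fallback = GGML_TYPE_F32) {
    auto it = tensor_types.find(prefix + name);
    return it != tensor_types.end() ? it->second : fallback;
}

Note that position_embedding.weight stays pinned to GGML_TYPE_F32 (its map lookup is left commented out), presumably so the small, precision-sensitive position table is never quantized.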
@@ -579,11 +594,14 @@ class CLIPVisionEmbeddings : public GGMLBlock {
     int64_t image_size;
     int64_t num_patches;
     int64_t num_positions;
+    void init_params(struct ggml_context* ctx, std::map<std::string, enum ggml_type>& tensor_types, const std::string prefix = "") {
+        enum ggml_type patch_wtype    = GGML_TYPE_F16;  // tensor_types.find(prefix + "patch_embedding.weight") != tensor_types.end() ? tensor_types[prefix + "patch_embedding.weight"] : GGML_TYPE_F16;
+        enum ggml_type class_wtype    = GGML_TYPE_F32;  // tensor_types.find(prefix + "class_embedding") != tensor_types.end() ? tensor_types[prefix + "class_embedding"] : GGML_TYPE_F32;
+        enum ggml_type position_wtype = GGML_TYPE_F32;  // tensor_types.find(prefix + "position_embedding.weight") != tensor_types.end() ? tensor_types[prefix + "position_embedding.weight"] : GGML_TYPE_F32;
 
-    void init_params(struct ggml_context* ctx, ggml_type wtype) {
-        params["patch_embedding.weight"]    = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, patch_size, patch_size, num_channels, embed_dim);
-        params["class_embedding"]           = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, embed_dim);
-        params["position_embedding.weight"] = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, embed_dim, num_positions);
+        params["patch_embedding.weight"]    = ggml_new_tensor_4d(ctx, patch_wtype, patch_size, patch_size, num_channels, embed_dim);
+        params["class_embedding"]           = ggml_new_tensor_1d(ctx, class_wtype, embed_dim);
+        params["position_embedding.weight"] = ggml_new_tensor_2d(ctx, position_wtype, embed_dim, num_positions);
     }
 
 public:
@@ -639,9 +657,10 @@ enum CLIPVersion {
 
 class CLIPTextModel : public GGMLBlock {
 protected:
-    void init_params(struct ggml_context* ctx, ggml_type wtype) {
+    void init_params(struct ggml_context* ctx, std::map<std::string, enum ggml_type>& tensor_types, const std::string prefix = "") {
         if (version == OPEN_CLIP_VIT_BIGG_14) {
-            params["text_projection"] = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, projection_dim, hidden_size);
+            enum ggml_type wtype = GGML_TYPE_F32;  // tensor_types.find(prefix + "text_projection") != tensor_types.end() ? tensor_types[prefix + "text_projection"] : GGML_TYPE_F32;
+            params["text_projection"] = ggml_new_tensor_2d(ctx, wtype, projection_dim, hidden_size);
         }
     }
@@ -711,8 +730,12 @@ class CLIPTextModel : public GGMLBlock {
         if (return_pooled) {
             auto text_projection = params["text_projection"];
             ggml_tensor* pooled  = ggml_view_1d(ctx, x, hidden_size, x->nb[1] * max_token_idx);
-            pooled = ggml_mul_mat(ctx, ggml_cont(ctx, ggml_transpose(ctx, text_projection)), pooled);
-            return pooled;
+            if (text_projection != NULL) {
+                pooled = ggml_nn_linear(ctx, pooled, text_projection, NULL);
+            } else {
+                LOG_DEBUG("Missing text_projection matrix, assuming identity...");
+            }
+            return pooled;  // [hidden_size, 1, 1]
         }
 
         return x;  // [N, n_token, hidden_size]
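ggml_nn_linear comes from this repo's ggml_extend.hpp. Paraphrased below for reference (check the header for the authoritative version), it is a thin wrapper around ggml_mul_mat plus an optional bias add, so with a NULL bias the pooled projection reduces to ggml_mul_mat(ctx, text_projection, pooled). The NULL guard additionally lets checkpoints that ship no text_projection fall back to the un-projected pooled vector instead of crashing.

// Paraphrase of ggml_nn_linear from ggml_extend.hpp:
// y = w * x, plus b when a bias is supplied.
static struct ggml_tensor* nn_linear_sketch(struct ggml_context* ctx,
                                            struct ggml_tensor* x,
                                            struct ggml_tensor* w,
                                            struct ggml_tensor* b) {
    x = ggml_mul_mat(ctx, w, x);  // result ne0 == w->ne[1]
    if (b != NULL) {
        x = ggml_add(ctx, x, b);
    }
    return x;
}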
@@ -761,14 +784,17 @@ class CLIPVisionModel : public GGMLBlock {
         auto x = embeddings->forward(ctx, pixel_values);  // [N, num_positions, embed_dim]
         x      = pre_layernorm->forward(ctx, x);
         x      = encoder->forward(ctx, x, -1, false);
-        x      = post_layernorm->forward(ctx, x);  // [N, n_token, hidden_size]
+        // print_ggml_tensor(x, true, "ClipVisionModel x: ");
+        auto last_hidden_state = x;
+        x                      = post_layernorm->forward(ctx, x);  // [N, n_token, hidden_size]
 
         GGML_ASSERT(x->ne[3] == 1);
         if (return_pooled) {
             ggml_tensor* pooled = ggml_cont(ctx, ggml_view_2d(ctx, x, x->ne[0], x->ne[2], x->nb[2], 0));
             return pooled;  // [N, hidden_size]
         } else {
-            return x;  // [N, n_token, hidden_size]
+            // return x;  // [N, n_token, hidden_size]
+            return last_hidden_state;  // [N, n_token, hidden_size]
         }
     }
 };
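Note the behavioral change in the non-pooled path: callers now receive the encoder output captured before post_layernorm (the last hidden state), while the pooled path still reads from the normalized tensor. Presumably this serves consumers that expect raw hidden states rather than the layernormed output; the commented-out return keeps the old behavior visible for reference.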
@@ -779,9 +805,9 @@ class CLIPProjection : public UnaryBlock {
     int64_t out_features;
     bool transpose_weight;
 
-    void init_params(struct ggml_context* ctx, ggml_type wtype) {
+    void init_params(struct ggml_context* ctx, std::map<std::string, enum ggml_type>& tensor_types, const std::string prefix = "") {
+        enum ggml_type wtype = tensor_types.find(prefix + "weight") != tensor_types.end() ? tensor_types[prefix + "weight"] : GGML_TYPE_F32;
         if (transpose_weight) {
-            LOG_ERROR("transpose_weight");
             params["weight"] = ggml_new_tensor_2d(ctx, wtype, out_features, in_features);
         } else {
             params["weight"] = ggml_new_tensor_2d(ctx, wtype, in_features, out_features);
@@ -842,12 +868,13 @@ struct CLIPTextModelRunner : public GGMLRunner {
    CLIPTextModel model;
 
     CLIPTextModelRunner(ggml_backend_t backend,
-                        ggml_type wtype,
+                        std::map<std::string, enum ggml_type>& tensor_types,
+                        const std::string prefix,
                         CLIPVersion version = OPENAI_CLIP_VIT_L_14,
                         int clip_skip_value = 1,
                         bool with_final_ln  = true)
-        : GGMLRunner(backend, wtype), model(version, clip_skip_value, with_final_ln) {
-        model.init(params_ctx, wtype);
+        : GGMLRunner(backend), model(version, clip_skip_value, with_final_ln) {
+        model.init(params_ctx, tensor_types, prefix);
     }
 
     std::string get_desc() {
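A hypothetical call site for the new constructor (the backend setup, map contents, and prefix string are illustrative, not taken from the patch): instead of one wtype for the whole runner, the caller passes the loader's per-tensor type map plus the prefix that scopes lookups to this text model's tensors.

#include <map>
#include <string>
#include "clip.hpp"

int main() {
    ggml_backend_t backend = ggml_backend_cpu_init();

    // Illustrative map: in practice the model loader fills this with one
    // entry per tensor found in the checkpoint, quantized or not.
    std::map<std::string, enum ggml_type> tensor_types;
    tensor_types["te.text_model.embeddings.token_embedding.weight"] = GGML_TYPE_Q8_0;

    CLIPTextModelRunner runner(backend, tensor_types, "te.text_model",
                               OPENAI_CLIP_VIT_L_14);
    // ... load weights and run compute as before ...

    ggml_backend_free(backend);
    return 0;
}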
@@ -889,13 +916,13 @@ struct CLIPTextModelRunner : public GGMLRunner {
         struct ggml_tensor* embeddings = NULL;
 
         if (num_custom_embeddings > 0 && custom_embeddings_data != NULL) {
-            auto custom_embeddings = ggml_new_tensor_2d(compute_ctx,
-                                                        wtype,
-                                                        model.hidden_size,
-                                                        num_custom_embeddings);
+            auto token_embed_weight = model.get_token_embed_weight();
+            auto custom_embeddings  = ggml_new_tensor_2d(compute_ctx,
+                                                         token_embed_weight->type,
+                                                         model.hidden_size,
+                                                         num_custom_embeddings);
             set_backend_tensor_data(custom_embeddings, custom_embeddings_data);
 
-            auto token_embed_weight = model.get_token_embed_weight();
             // concatenate custom embeddings
             embeddings = ggml_concat(compute_ctx, token_embed_weight, custom_embeddings, 1);
         }
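The reordering matters: the runner-wide wtype no longer exists, so the custom table is created with token_embed_weight->type, which also keeps the two ggml_concat operands type-consistent. A runnable sketch of the shape arithmetic with illustrative sizes; appending rows along dim 1 means custom token ids simply start at vocab_size:

#include <cstdio>
#include "ggml.h"

int main() {
    struct ggml_init_params ip = {16 * 1024 * 1024, NULL, false};
    struct ggml_context* ctx   = ggml_init(ip);

    const int64_t hidden_size = 8, vocab_size = 16, n_custom = 2;
    // [hidden_size, vocab_size] token table plus [hidden_size, n_custom]
    // custom rows, concatenated along dim 1 as in the patch above.
    struct ggml_tensor* tok = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, hidden_size, vocab_size);
    struct ggml_tensor* cus = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, hidden_size, n_custom);
    struct ggml_tensor* all = ggml_concat(ctx, tok, cus, 1);

    printf("embeddings: %lld x %lld\n", (long long)all->ne[0], (long long)all->ne[1]);
    // prints: embeddings: 8 x 18
    ggml_free(ctx);
    return 0;
}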