Skip to content

Commit d42fd59

Browse files
authored
feat: add OpenCL backend support (#680)
1 parent 0d8b39f commit d42fd59

10 files changed

Lines changed: 102 additions & 7 deletions

File tree

CMakeLists.txt

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ option(SD_CUDA "sd: cuda backend" OFF)
2828
option(SD_HIPBLAS "sd: rocm backend" OFF)
2929
option(SD_METAL "sd: metal backend" OFF)
3030
option(SD_VULKAN "sd: vulkan backend" OFF)
31+
option(SD_OPENCL "sd: opencl backend" OFF)
3132
option(SD_SYCL "sd: sycl backend" OFF)
3233
option(SD_MUSA "sd: musa backend" OFF)
3334
option(SD_FAST_SOFTMAX "sd: x1.5 faster softmax, indeterministic (sometimes, same seed don't generate same image), cuda only" OFF)
@@ -52,6 +53,12 @@ if (SD_VULKAN)
5253
add_definitions(-DSD_USE_VULKAN)
5354
endif ()
5455

56+
if (SD_OPENCL)
57+
message("-- Use OpenCL as backend stable-diffusion")
58+
set(GGML_OPENCL ON)
59+
add_definitions(-DSD_USE_OPENCL)
60+
endif ()
61+
5562
if (SD_HIPBLAS)
5663
message("-- Use HIPBLAS as backend stable-diffusion")
5764
set(GGML_HIP ON)

README.md

Lines changed: 68 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ Inference of Stable Diffusion and Flux in pure C/C++
2222
- Accelerated memory-efficient CPU inference
2323
- Only requires ~2.3GB when using txt2img with fp16 precision to generate a 512x512 image; with Flash Attention enabled, it requires only ~1.8GB.
2424
- AVX, AVX2 and AVX512 support for x86 architectures
25-
- Full CUDA, Metal, Vulkan and SYCL backend for GPU acceleration.
25+
- Full CUDA, Metal, Vulkan, OpenCL and SYCL backends for GPU acceleration.
2626
- Can load ckpt, safetensors and diffusers models/checkpoints, as well as standalone VAE models
2727
- No need to convert to `.ggml` or `.gguf` anymore!
2828
- Flash Attention for memory usage optimization
@@ -160,6 +160,73 @@ cmake .. -DSD_VULKAN=ON
160160
cmake --build . --config Release
161161
```
162162

163+
##### Using OpenCL (for Adreno GPU)
164+
165+
Currently, it supports only Adreno GPUs and is primarily optimized for the Q4_0 type.
166+
167+
To build for Windows ARM, please refer to [Windows 11 Arm64
168+
](https://github.com/ggml-org/llama.cpp/blob/master/docs/backend/OPENCL.md#windows-11-arm64)
169+
170+
Building for Android:
171+
172+
Android NDK:
173+
Download and install the Android NDK from the [official Android developer site](https://developer.android.com/ndk/downloads).
174+
175+
Setup OpenCL Dependencies for NDK:
176+
177+
You need to provide OpenCL headers and the ICD loader library to your NDK sysroot.
178+
179+
* OpenCL Headers:
180+
```bash
181+
# In a temporary working directory
182+
git clone https://github.com/KhronosGroup/OpenCL-Headers
183+
cd OpenCL-Headers
184+
# Replace <YOUR_NDK_PATH> with your actual NDK installation path
185+
# e.g., cp -r CL /path/to/android-ndk-r26c/toolchains/llvm/prebuilt/linux-x86_64/sysroot/usr/include
186+
sudo cp -r CL <YOUR_NDK_PATH>/toolchains/llvm/prebuilt/linux-x86_64/sysroot/usr/include
187+
cd ..
188+
```
189+
190+
* OpenCL ICD Loader:
191+
```bash
192+
# In the same temporary working directory
193+
git clone https://github.com/KhronosGroup/OpenCL-ICD-Loader
194+
cd OpenCL-ICD-Loader
195+
mkdir build_ndk && cd build_ndk
196+
197+
# Replace <YOUR_NDK_PATH> in the CMAKE_TOOLCHAIN_FILE and OPENCL_ICD_LOADER_HEADERS_DIR
198+
cmake .. -G Ninja -DCMAKE_BUILD_TYPE=Release \
199+
-DCMAKE_TOOLCHAIN_FILE=<YOUR_NDK_PATH>/build/cmake/android.toolchain.cmake \
200+
-DOPENCL_ICD_LOADER_HEADERS_DIR=<YOUR_NDK_PATH>/toolchains/llvm/prebuilt/linux-x86_64/sysroot/usr/include \
201+
-DANDROID_ABI=arm64-v8a \
202+
-DANDROID_PLATFORM=24 \
203+
-DANDROID_STL=c++_shared
204+
205+
ninja
206+
# Replace <YOUR_NDK_PATH>
207+
# e.g., cp libOpenCL.so /path/to/android-ndk-r26c/toolchains/llvm/prebuilt/linux-x86_64/sysroot/usr/lib/aarch64-linux-android
208+
sudo cp libOpenCL.so <YOUR_NDK_PATH>/toolchains/llvm/prebuilt/linux-x86_64/sysroot/usr/lib/aarch64-linux-android
209+
cd ../..
210+
```
211+
212+
Build `stable-diffusion.cpp` for Android with OpenCL:
213+
214+
```bash
215+
mkdir build-android && cd build-android
216+
217+
# Replace <YOUR_NDK_PATH> with your actual NDK installation path
218+
# e.g., -DCMAKE_TOOLCHAIN_FILE=/path/to/android-ndk-r26c/build/cmake/android.toolchain.cmake
219+
cmake .. -G Ninja \
220+
-DCMAKE_TOOLCHAIN_FILE=<YOUR_NDK_PATH>/build/cmake/android.toolchain.cmake \
221+
-DANDROID_ABI=arm64-v8a \
222+
-DANDROID_PLATFORM=android-28 \
223+
-DGGML_OPENMP=OFF \
224+
-DSD_OPENCL=ON
225+
226+
ninja
227+
```
228+
*(Note: Don't forget to include `LD_LIBRARY_PATH=/vendor/lib64` in your command line before running the binary)*
229+
163230
##### Using SYCL
164231
165232
Using SYCL makes the computation run on the Intel GPU. Please make sure you have installed the related driver and [Intel® oneAPI Base toolkit](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit.html) before starting. For more details and steps, refer to [llama.cpp SYCL backend](https://github.com/ggerganov/llama.cpp/blob/master/docs/backend/SYCL.md#linux).

common.hpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,7 @@ class UpSampleBlock : public GGMLBlock {
5656
// x: [N, channels, h, w]
5757
auto conv = std::dynamic_pointer_cast<Conv2d>(blocks["conv"]);
5858

59-
x = ggml_upscale(ctx, x, 2); // [N, channels, h*2, w*2]
59+
x = ggml_upscale(ctx, x, 2, GGML_SCALE_MODE_NEAREST); // [N, channels, h*2, w*2]
6060
x = conv->forward(ctx, x); // [N, out_channels, h*2, w*2]
6161
return x;
6262
}

esrgan.hpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -130,8 +130,8 @@ class RRDBNet : public GGMLBlock {
130130
body_feat = conv_body->forward(ctx, body_feat);
131131
feat = ggml_add(ctx, feat, body_feat);
132132
// upsample
133-
feat = lrelu(ctx, conv_up1->forward(ctx, ggml_upscale(ctx, feat, 2)));
134-
feat = lrelu(ctx, conv_up2->forward(ctx, ggml_upscale(ctx, feat, 2)));
133+
feat = lrelu(ctx, conv_up1->forward(ctx, ggml_upscale(ctx, feat, 2, GGML_SCALE_MODE_NEAREST)));
134+
feat = lrelu(ctx, conv_up2->forward(ctx, ggml_upscale(ctx, feat, 2, GGML_SCALE_MODE_NEAREST)));
135135
auto out = conv_last->forward(ctx, lrelu(ctx, conv_hr->forward(ctx, feat)));
136136
return out;
137137
}

ggml

Submodule ggml updated from ff90529 to 9e4bee1

ggml_extend.hpp

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,10 @@
3939
#include "ggml-vulkan.h"
4040
#endif
4141

42+
#ifdef SD_USE_OPENCL
43+
#include "ggml-opencl.h"
44+
#endif
45+
4246
#ifdef SD_USE_SYCL
4347
#include "ggml-sycl.h"
4448
#endif
@@ -113,7 +117,8 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_kronecker(ggml_context* ctx, struct g
113117
a->ne[0] * b->ne[0],
114118
a->ne[1] * b->ne[1],
115119
a->ne[2] * b->ne[2],
116-
a->ne[3] * b->ne[3]),
120+
a->ne[3] * b->ne[3],
121+
GGML_SCALE_MODE_NEAREST),
117122
b);
118123
}
119124

model.cpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,10 @@
2626
#include "ggml-vulkan.h"
2727
#endif
2828

29+
#ifdef SD_USE_OPENCL
30+
#include "ggml-opencl.h"
31+
#endif
32+
2933
#define ST_HEADER_SIZE_LEN 8
3034

3135
uint64_t read_u64(uint8_t* buffer) {

stable-diffusion.cpp

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -181,6 +181,14 @@ class StableDiffusionGGML {
181181
LOG_WARN("Failed to initialize Vulkan backend");
182182
}
183183
#endif
184+
#ifdef SD_USE_OPENCL
185+
LOG_DEBUG("Using OpenCL backend");
186+
// ggml_log_set(ggml_log_callback_default, nullptr); // Optional ggml logs
187+
backend = ggml_backend_opencl_init();
188+
if (!backend) {
189+
LOG_WARN("Failed to initialize OpenCL backend");
190+
}
191+
#endif
184192
#ifdef SD_USE_SYCL
185193
LOG_DEBUG("Using SYCL backend");
186194
backend = ggml_backend_sycl_init(0);

tae.hpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -149,7 +149,7 @@ class TinyDecoder : public UnaryBlock {
149149
if (i == 1) {
150150
h = ggml_relu_inplace(ctx, h);
151151
} else {
152-
h = ggml_upscale(ctx, h, 2);
152+
h = ggml_upscale(ctx, h, 2, GGML_SCALE_MODE_NEAREST);
153153
}
154154
continue;
155155
}

upscaler.cpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,10 @@ struct UpscalerGGML {
2828
LOG_DEBUG("Using Vulkan backend");
2929
backend = ggml_backend_vk_init(0);
3030
#endif
31+
#ifdef SD_USE_OPENCL
32+
LOG_DEBUG("Using OpenCL backend");
33+
backend = ggml_backend_opencl_init();
34+
#endif
3135
#ifdef SD_USE_SYCL
3236
LOG_DEBUG("Using SYCL backend");
3337
backend = ggml_backend_sycl_init(0);

0 commit comments

Comments
 (0)