Skip to content

Commit 778ccb7

Browse files
author
Mark Caldwell
committed
Merge upstream/master: re-seat PuLID-Flux onto the refactored DiffusionParams
Upstream leejet#1569 ("simplify diffusion model runner params") split the monolithic DiffusionParams into per-model Extra structs. Re-seated the PuLID-Flux feature onto the new architecture: - diffusion_model.hpp: pulid_id + pulid_id_weight added to FluxDiffusionExtra. - flux.hpp: compute(DiffusionParams) now reads extra->pulid_id / extra->pulid_id_weight and threads them through to build_graph (the PuLID cross-attention code itself merged cleanly). - stable-diffusion.cpp: the FluxDiffusionExtra construction carries the PuLID id embedding + weight; obsolete monolithic param assignments dropped. Verified end-to-end on three GPUs/backends (compiles + the 3-way off / zero-weight / on PuLID falsification all pass; zero-weight is byte-identical to baseline, weight 1.0 alters output and preserves identity): - AMD R9700 (RDNA4, ROCm) - AMD RX 6700 XT (RDNA2, Vulkan) - NVIDIA RTX 3060 (Vulkan)
2 parents aef4d29 + f8935d6 commit 778ccb7

66 files changed

Lines changed: 4083 additions & 1200 deletions

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

.github/workflows/build.yml

Lines changed: 36 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -545,6 +545,30 @@ jobs:
545545
- name: Pack artifacts
546546
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
547547
run: |
548+
$ErrorActionPreference = "Stop"
549+
$dst = "build\bin"
550+
$rocmBin = Join-Path "${env:HIP_PATH}" "bin"
551+
$requiredRocmPaths = @(
552+
(Join-Path $rocmBin "rocblas.dll"),
553+
(Join-Path $rocmBin "rocblas\library")
554+
)
555+
foreach ($path in $requiredRocmPaths) {
556+
if (!(Test-Path $path)) {
557+
throw "Missing ROCm runtime dependency: $path"
558+
}
559+
}
560+
561+
foreach ($pattern in @("rocblas*.dll", "hipblas*.dll", "libhipblas*.dll")) {
562+
Copy-Item -Path (Join-Path $rocmBin $pattern) -Destination $dst -Force -ErrorAction SilentlyContinue
563+
}
564+
565+
foreach ($dir in @("rocblas", "hipblaslt")) {
566+
$src = Join-Path $rocmBin $dir
567+
if (Test-Path $src) {
568+
Copy-Item -Path $src -Destination $dst -Recurse -Force
569+
}
570+
}
571+
548572
7z a sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-rocm-${{ env.ROCM_VERSION }}-x64.zip .\build\bin\*
549573
550574
- name: Upload artifacts
@@ -687,16 +711,6 @@ jobs:
687711
with:
688712
submodules: recursive
689713

690-
- name: Setup Node
691-
uses: actions/setup-node@v4
692-
with:
693-
node-version: 20
694-
695-
- name: Setup pnpm
696-
uses: pnpm/action-setup@v4
697-
with:
698-
version: 10.15.1
699-
700714
- name: ccache
701715
uses: ggml-org/ccache-action@v1.2.16
702716
with:
@@ -754,6 +768,18 @@ jobs:
754768
echo PATH=$PATH:$ROCM_PATH/bin >> $GITHUB_ENV
755769
echo LD_LIBRARY_PATH=$ROCM_PATH/lib:$ROCM_PATH/llvm/lib:$ROCM_PATH/lib/rocprofiler-systems >> $GITHUB_ENV
756770
771+
# setup-node installs into /opt/hostedtoolcache, which is removed above.
772+
# Keep Node/pnpm setup after disk cleanup so the server frontend can be embedded.
773+
- name: Setup Node
774+
uses: actions/setup-node@v4
775+
with:
776+
node-version: 20
777+
778+
- name: Setup pnpm
779+
uses: pnpm/action-setup@v4
780+
with:
781+
version: 10.15.1
782+
757783
- name: Build
758784
id: cmake_build
759785
run: |

CMakeLists.txt

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,31 @@ endif()
2222
set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
2323
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
2424

25+
if(APPLE)
# sd_set_macos_rpaths(<target>)
#
# Gives <target> relocatable runtime search paths on macOS.
#   target - name of an existing CMake target.
#
# Behavior by target TYPE:
#   EXECUTABLE                      - rpaths are relative to @executable_path.
#   SHARED_LIBRARY / MODULE_LIBRARY - rpaths are relative to @loader_path, and
#                                     the library's install name is placed
#                                     under @rpath.
#   anything else                   - returns without modifying the target.
26+
function(sd_set_macos_rpaths target)
27+
get_target_property(target_type ${target} TYPE)
28+
if(target_type STREQUAL "EXECUTABLE")
# Search beside the executable and in a sibling lib/ directory.
29+
set(runtime_paths "@executable_path" "@executable_path/../lib")
30+
elseif(target_type STREQUAL "SHARED_LIBRARY" OR target_type STREQUAL "MODULE_LIBRARY")
# Search beside the library itself and in a sibling lib/ directory.
31+
set(runtime_paths "@loader_path" "@loader_path/../lib")
# Use an @rpath-based install name, and apply it in the build tree too
# (BUILD_WITH_INSTALL_NAME_DIR), so the build output needs no fix-up.
32+
set_target_properties(${target} PROPERTIES
33+
MACOSX_RPATH ON
34+
INSTALL_NAME_DIR "@rpath"
35+
BUILD_WITH_INSTALL_NAME_DIR ON
36+
)
37+
else()
# Remaining target types (e.g. static libraries) are left untouched.
38+
return()
39+
endif()
40+
41+
# Release artifacts zip the build output directly, so keep macOS rpaths relocatable.
# BUILD_WITH_INSTALL_RPATH makes the build tree link with INSTALL_RPATH.
42+
set_target_properties(${target} PROPERTIES
43+
BUILD_RPATH "${runtime_paths}"
44+
INSTALL_RPATH "${runtime_paths}"
45+
BUILD_WITH_INSTALL_RPATH ON
46+
)
47+
endfunction()
48+
endif()
49+
2550
if(CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR)
2651
set(SD_STANDALONE ON)
2752
else()
@@ -237,6 +262,10 @@ else()
237262
add_library(${SD_LIB} STATIC ${SD_LIB_SOURCES})
238263
endif()
239264

265+
if(APPLE)
266+
sd_set_macos_rpaths(${SD_LIB})
267+
endif()
268+
240269
if(SD_SYCL)
241270
message("-- Use SYCL as backend stable-diffusion")
242271
set(GGML_SYCL ON)

README.md

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,8 @@ API and command-line option may change frequently.***
1515

1616
## 🔥Important News
1717

18+
* **2026/05/31** 🚀 stable-diffusion.cpp now supports **PiD**
19+
* **2026/05/27** 🚀 stable-diffusion.cpp now supports **Lens**
1820
* **2026/05/17** 🚀 stable-diffusion.cpp now supports **LTX-2.3**
1921
* **2026/04/11** 🚀 stable-diffusion.cpp now uses a brand-new embedded web UI.
2022
* **2026/01/18** 🚀 stable-diffusion.cpp now supports **FLUX.2-klein**
@@ -37,9 +39,12 @@ API and command-line option may change frequently.***
3739
- [SD3/SD3.5](./docs/sd3.md)
3840
- [FLUX.1-dev/FLUX.1-schnell](./docs/flux.md)
3941
- [FLUX.2-dev/FLUX.2-klein](./docs/flux2.md)
42+
- [Lens](./docs/lens.md)
4043
- [Chroma](./docs/chroma.md)
4144
- [Chroma1-Radiance](./docs/chroma_radiance.md)
4245
- [Qwen Image](./docs/qwen_image.md)
46+
- [PiD](./docs/pid.md)
47+
- [LongCat Image](./docs/longcat_image.md)
4348
- [Z-Image](./docs/z_image.md)
4449
- [Ovis-Image](./docs/ovis_image.md)
4550
- [Anima](./docs/anima.md)
@@ -48,6 +53,7 @@ API and command-line option may change frequently.***
4853
- Image Edit Models
4954
- [FLUX.1-Kontext-dev](./docs/kontext.md)
5055
- [Qwen Image Edit series](./docs/qwen_image_edit.md)
56+
- [LongCat Image Edit](./docs/longcat_image.md)
5157
- Video Models
5258
- [Wan2.1/Wan2.2](./docs/wan.md)
5359
- [LTX-2.3](./docs/ltx2.md)
@@ -140,6 +146,8 @@ For runtime and parameter backend placement, see the [backend selection guide](.
140146
- [Anima](./docs/anima.md)
141147
- [ERNIE-Image](./docs/ernie_image.md)
142148
- [HiDream-O1-Image](./docs/hidream_o1_image.md)
149+
- [Lens](./docs/lens.md)
150+
- [LongCat Image / LongCat Image Edit](./docs/longcat_image.md)
143151
- [LoRA](./docs/lora.md)
144152
- [LCM/LCM-LoRA](./docs/lcm.md)
145153
- [Using PhotoMaker to personalize image generation](./docs/photo_maker.md)

assets/lens/example.png

630 KB
Loading

assets/lens/turbo_example.png

555 KB
Loading

assets/longcat/example.png

423 KB
Loading

assets/pid/example.png

9.03 MB
Loading

docs/lens.md

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
# How to Use
2+
3+
Lens uses a Lens diffusion transformer, the FLUX.2 VAE, and GPT-OSS-20B as the LLM text encoder.
4+
5+
## Download weights
6+
7+
- Download Lens
8+
- safetensors: https://huggingface.co/Comfy-Org/Lens/tree/main/diffusion_models
9+
- Download Lens Turbo
10+
- safetensors: https://huggingface.co/Comfy-Org/Lens/tree/main/diffusion_models
11+
- Download vae
12+
- safetensors: https://huggingface.co/black-forest-labs/FLUX.2-dev/tree/main
13+
- Download GPT-OSS-20B
14+
- gguf: https://huggingface.co/unsloth/gpt-oss-20b-GGUF/tree/main
15+
16+
## Examples
17+
18+
### Lens
19+
20+
```
21+
.\bin\Release\sd-cli.exe --diffusion-model ..\..\ComfyUI\models\diffusion_models\lens_bf16.safetensors --llm "..\..\llm\gpt-oss-20b-UD-Q8_K_XL.gguf" --vae ..\..\ComfyUI\models\vae\flux2_ae.safetensors --cfg-scale 5.0 -p "A crystal dragon soaring through an aurora borealis sky, its entire body made of transparent faceted crystal refracting the green and purple aurora light into rainbow spectra, ice particles trailing from its wings, high fantasy digital art" --diffusion-fa -v
22+
```
23+
24+
<img width="256" alt="Lens example" src="../assets/lens/example.png" />
25+
26+
### Lens Turbo
27+
28+
```
29+
.\bin\Release\sd-cli.exe --diffusion-model ..\..\ComfyUI\models\diffusion_models\lens_turbo_bf16.safetensors --llm "..\..\llm\gpt-oss-20b-UD-Q8_K_XL.gguf" --vae ..\..\ComfyUI\models\vae\flux2_ae.safetensors --cfg-scale 1.0 -p "A crystal dragon soaring through an aurora borealis sky, its entire body made of transparent faceted crystal refracting the green and purple aurora light into rainbow spectra, ice particles trailing from its wings, high fantasy digital art" --diffusion-fa -v --steps 4
30+
```
31+
32+
<img width="256" alt="Lens Turbo example" src="../assets/lens/turbo_example.png" />

docs/longcat_image.md

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
# How to Use
2+
3+
LongCat-Image uses a LongCat diffusion transformer, the FLUX VAE, and Qwen2.5-VL as the LLM text encoder.
4+
5+
## Download weights
6+
7+
- Download LongCat Image
8+
- safetensors: https://huggingface.co/Comfy-Org/LongCat-Image/tree/main/split_files/diffusion_models
9+
- gguf: https://huggingface.co/vantagewithai/LongCat-Image-GGUF/tree/main/comfy
10+
- Download LongCat Image Edit
11+
- LongCat Image Edit Turbo: https://huggingface.co/meituan-longcat/LongCat-Image-Edit-Turbo
12+
- gguf: https://huggingface.co/vantagewithai/LongCat-Image-Edit-GGUF/tree/main
13+
- Download vae
14+
- safetensors: https://huggingface.co/black-forest-labs/FLUX.1-dev/blob/main/ae.safetensors
15+
- Download qwen_2.5_vl 7b
16+
- safetensors: https://huggingface.co/Comfy-Org/Qwen-Image_ComfyUI/tree/main/split_files/text_encoders
17+
- gguf: https://huggingface.co/mradermacher/Qwen2.5-VL-7B-Instruct-GGUF/tree/main
18+
- For image editing with GGUF text encoders, also download the matching mmproj file and pass it with `--llm_vision`.
19+
20+
## Run
21+
22+
LongCat uses quoted text for character-level text rendering. Put target text inside single quotes, double quotes, or Chinese quotes.
23+
24+
### LongCat Image
25+
26+
```
27+
.\bin\Release\sd-cli.exe --diffusion-model ..\..\ComfyUI\models\diffusion_models\LongCat-Image-Q4_K_M.gguf --vae ..\..\ComfyUI\models\vae\ae.sft --llm ..\..\ComfyUI\models\text_encoders\Qwen2.5-VL-7B-Instruct-Q8_0.gguf -p "a lovely cat holding a sign says 'longcat.cpp'" --cfg-scale 5.0 --sampling-method euler --flow-shift 3 -v --offload-to-cpu --diffusion-fa
28+
```
29+
30+
<img alt="longcat example" src="../assets/longcat/example.png" />

docs/pid.md

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
# How to Use
2+
3+
PiD is NVIDIA's Pixel Diffusion Decoder. It replaces the usual VAE decode or decode-then-upscale path with a pixel-space diffusion decoder conditioned on a
4+
source latent and text prompt.
5+
6+
In stable-diffusion.cpp, PiD currently runs as an image edit pipeline: provide a reference image with `-r`/`--ref-image`, encode that image with a matching VAE, then let the PiD diffusion model decode/upscale directly to RGB.
7+
8+
## Download weights
9+
10+
- Download PiD
11+
- safetensors: https://huggingface.co/Comfy-Org/PixelDiT/tree/main/diffusion_models
12+
- Download Gemma 2 2B
13+
- safetensors: https://huggingface.co/Comfy-Org/PixelDiT/tree/main/text_encoders
14+
- Download the VAE that matches the PiD checkpoint backbone
15+
- safetensors: https://huggingface.co/nvidia/PiD/tree/main/checkpoints
16+
- Flux / Z-Image PiD: use the Flux VAE and pass `--vae-format flux`
17+
- SD3 PiD: use the SD3 VAE and pass `--vae-format sd3`
18+
- Flux.2 PiD: use the Flux.2 VAE and pass `--vae-format flux2`
19+
20+
Check the official PiD model card before use. At the time of the initial PiD release, the official weights were distributed under the NSCLv1 non-commercial license.
21+
22+
## Examples
23+
24+
```
25+
.\bin\Release\sd-cli.exe --diffusion-model ..\..\ComfyUI\models\diffusion_models\pid_flux1_512_to_2048_4step_bf16.safetensors --llm "..\..\ComfyUI\models\text_encoders\gemma_2_2b_it_elm_bf16.safetensors" --vae ..\..\ComfyUI\models\vae\ae.sft --vae-format flux --cfg-scale 1.0 -p "a lovely cat" -r ..\assets\ernie_image\turbo_example.png --diffusion-fa -v --steps 4 -H 2048 -W 2048 --rng cpu
26+
```
27+
28+
Before:
29+
30+
<img width="256" alt="ERNIE-Image Turbo example" src="../assets/ernie_image/turbo_example.png" />
31+
32+
After:
33+
<img width="1024" alt="PiD example" src="../assets/pid/example.png" />
34+
35+
## Notes
36+
37+
- `-r`/`--ref-image` is required. PiD uses the first reference image as the source latent condition.
38+
- `--vae-format` should match the VAE latent layout used by the PiD checkpoint. This is important when using standalone VAE files because the PiD diffusion
39+
checkpoint alone does not identify the VAE format.

0 commit comments

Comments
 (0)