Skip to content

Commit 92dc726

Browse files
authored
feat: add microsoft lens support (#1560)
1 parent 07b2b18 commit 92dc726

19 files changed

Lines changed: 1042 additions & 20 deletions

README.md

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ API and command-line option may change frequently.***
1515

1616
## 🔥Important News
1717

18+
* **2026/05/27** 🚀 stable-diffusion.cpp now supports **Lens**
1819
* **2026/05/17** 🚀 stable-diffusion.cpp now supports **LTX-2.3**
1920
* **2026/04/11** 🚀 stable-diffusion.cpp now uses a brand-new embedded web UI.
2021
* **2026/01/18** 🚀 stable-diffusion.cpp now supports **FLUX.2-klein**
@@ -37,6 +38,7 @@ API and command-line option may change frequently.***
3738
- [SD3/SD3.5](./docs/sd3.md)
3839
- [FLUX.1-dev/FLUX.1-schnell](./docs/flux.md)
3940
- [FLUX.2-dev/FLUX.2-klein](./docs/flux2.md)
41+
- [Lens](./docs/lens.md)
4042
- [Chroma](./docs/chroma.md)
4143
- [Chroma1-Radiance](./docs/chroma_radiance.md)
4244
- [Qwen Image](./docs/qwen_image.md)
@@ -135,14 +137,15 @@ For runtime and parameter backend placement, see the [backend selection guide](.
135137
- [Chroma](./docs/chroma.md)
136138
- [🔥Qwen Image](./docs/qwen_image.md)
137139
- [🔥Qwen Image Edit series](./docs/qwen_image_edit.md)
138-
- [🔥LongCat Image / LongCat Image Edit](./docs/longcat_image.md)
139140
- [🔥Wan2.1/Wan2.2](./docs/wan.md)
140141
- [🔥LTX-2.3](./docs/ltx2.md)
141142
- [🔥Z-Image](./docs/z_image.md)
142143
- [Ovis-Image](./docs/ovis_image.md)
143144
- [Anima](./docs/anima.md)
144145
- [ERNIE-Image](./docs/ernie_image.md)
145146
- [HiDream-O1-Image](./docs/hidream_o1_image.md)
147+
- [Lens](./docs/lens.md)
148+
- [LongCat Image / LongCat Image Edit](./docs/longcat_image.md)
146149
- [LoRA](./docs/lora.md)
147150
- [LCM/LCM-LoRA](./docs/lcm.md)
148151
- [Using PhotoMaker to personalize image generation](./docs/photo_maker.md)

assets/lens/example.png

630 KB
Loading

assets/lens/turbo_example.png

555 KB
Loading

docs/lens.md

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
# How to Use
2+
3+
Lens combines the Lens diffusion transformer with the FLUX.2 VAE and uses GPT-OSS-20B as its LLM text encoder.
4+
5+
## Download weights
6+
7+
- Download Lens
8+
- safetensors: https://huggingface.co/Comfy-Org/Lens/tree/main/diffusion_models
9+
- Download Lens Turbo
10+
- safetensors: https://huggingface.co/Comfy-Org/Lens/tree/main/diffusion_models
11+
- Download the FLUX.2 VAE
12+
- safetensors: https://huggingface.co/black-forest-labs/FLUX.2-dev/tree/main
13+
- Download GPT-OSS-20B
14+
- gguf: https://huggingface.co/unsloth/gpt-oss-20b-GGUF/tree/main
15+
16+
## Examples
17+
18+
### Lens
19+
20+
```
21+
.\bin\Release\sd-cli.exe --diffusion-model ..\..\ComfyUI\models\diffusion_models\lens_bf16.safetensors --llm "..\..\llm\gpt-oss-20b-UD-Q8_K_XL.gguf" --vae ..\..\ComfyUI\models\vae\flux2_ae.safetensors --cfg-scale 5.0 -p "A crystal dragon soaring through an aurora borealis sky, its entire body made of transparent faceted crystal refracting the green and purple aurora light into rainbow spectra, ice particles trailing from its wings, high fantasy digital art" --diffusion-fa -v
22+
```
23+
24+
<img width="256" alt="Lens example" src="../assets/lens/example.png" />
25+
26+
### Lens Turbo
27+
28+
```
29+
.\bin\Release\sd-cli.exe --diffusion-model ..\..\ComfyUI\models\diffusion_models\lens_turbo_bf16.safetensors --llm "..\..\llm\gpt-oss-20b-UD-Q8_K_XL.gguf" --vae ..\..\ComfyUI\models\vae\flux2_ae.safetensors --cfg-scale 1.0 -p "A crystal dragon soaring through an aurora borealis sky, its entire body made of transparent faceted crystal refracting the green and purple aurora light into rainbow spectra, ice particles trailing from its wings, high fantasy digital art" --diffusion-fa -v --steps 4
30+
```
31+
32+
<img width="256" alt="Lens Turbo example" src="../assets/lens/turbo_example.png" />

src/conditioner.hpp

Lines changed: 33 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1696,11 +1696,15 @@ struct LLMEmbedder : public Conditioner {
16961696
arch = LLM::LLMArch::MISTRAL_SMALL_3_2;
16971697
} else if (sd_version_is_ernie_image(version)) {
16981698
arch = LLM::LLMArch::MINISTRAL_3_3B;
1699+
} else if (sd_version_is_lens(version)) {
1700+
arch = LLM::LLMArch::GPT_OSS_20B;
16991701
} else if (sd_version_is_z_image(version) || version == VERSION_OVIS_IMAGE || version == VERSION_FLUX2_KLEIN) {
17001702
arch = LLM::LLMArch::QWEN3;
17011703
}
17021704
if (arch == LLM::LLMArch::MISTRAL_SMALL_3_2 || arch == LLM::LLMArch::MINISTRAL_3_3B) {
17031705
tokenizer = std::make_shared<MistralTokenizer>();
1706+
} else if (arch == LLM::LLMArch::GPT_OSS_20B) {
1707+
tokenizer = std::make_shared<GPTOSSTokenizer>();
17041708
} else {
17051709
tokenizer = std::make_shared<Qwen2Tokenizer>();
17061710
}
@@ -1871,6 +1875,7 @@ struct LLMEmbedder : public Conditioner {
18711875
std::vector<std::pair<int, sd::Tensor<float>>> image_embeds;
18721876
int prompt_template_encode_start_idx = 34;
18731877
int min_length = 0; // pad tokens
1878+
int max_length = 100000000;
18741879
int hidden_states_min_length = 0; // zero pad hidden_states
18751880
bool spell_quotes = false;
18761881
std::set<int> out_layers;
@@ -2029,6 +2034,30 @@ struct LLMEmbedder : public Conditioner {
20292034
prompt_attn_range.first = 0;
20302035
prompt += conditioner_params.text;
20312036
prompt_attn_range.second = static_cast<int>(prompt.size());
2037+
} else if (sd_version_is_lens(version)) {
2038+
prompt_template_encode_start_idx = 97;
2039+
min_length = 0;
2040+
max_length = 512;
2041+
out_layers = {6, 12, 18, 24};
2042+
2043+
prompt =
2044+
"<|start|>system<|message|>You are ChatGPT, a large language model trained by OpenAI.\n"
2045+
"Knowledge cutoff: 2024-06\n"
2046+
"Current date: 2026-05-26\n" // fix for current date
2047+
"\n"
2048+
"Reasoning: medium\n"
2049+
"\n"
2050+
"# Valid channels: analysis, commentary, final. Channel must be included for every message.<|end|><|start|>developer<|message|># Instructions\n"
2051+
"\n"
2052+
"Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the objects and background.\n"
2053+
"\n"
2054+
"<|end|><|start|>user<|message|>";
2055+
2056+
prompt_attn_range.first = static_cast<int>(prompt.size());
2057+
prompt += conditioner_params.text;
2058+
prompt_attn_range.second = static_cast<int>(prompt.size());
2059+
2060+
prompt += "<|end|><|start|>assistant<|channel|>analysis<|message|>Need to generate one image according to the description.<|end|><|start|>assistant<|channel|>final<|message|>";
20322061
} else if (sd_version_is_z_image(version)) {
20332062
prompt_template_encode_start_idx = 0;
20342063
out_layers = {35}; // -2
@@ -2085,7 +2114,8 @@ struct LLMEmbedder : public Conditioner {
20852114
image_embeds,
20862115
out_layers,
20872116
prompt_template_encode_start_idx,
2088-
spell_quotes);
2117+
spell_quotes,
2118+
max_length);
20892119
std::vector<sd::Tensor<float>> extra_hidden_states_vec;
20902120
for (int i = 0; i < extra_prompts.size(); i++) {
20912121
auto extra_hidden_states = encode_prompt(n_threads,
@@ -2096,7 +2126,8 @@ struct LLMEmbedder : public Conditioner {
20962126
image_embeds,
20972127
out_layers,
20982128
prompt_template_encode_start_idx,
2099-
spell_quotes);
2129+
spell_quotes,
2130+
max_length);
21002131
extra_hidden_states_vec.push_back(std::move(extra_hidden_states));
21012132
}
21022133

src/diffusion_model.hpp

Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
#include "ernie_image.hpp"
77
#include "flux.hpp"
88
#include "hidream_o1.hpp"
9+
#include "lens.hpp"
910
#include "ltxv.hpp"
1011
#include "mmdit.hpp"
1112
#include "qwen_image.hpp"
@@ -701,6 +702,72 @@ struct ErnieImageModel : public DiffusionModel {
701702
}
702703
};
703704

705+
struct LensModel : public DiffusionModel {
706+
std::string prefix;
707+
Lens::LensRunner lens;
708+
709+
LensModel(ggml_backend_t backend,
710+
ggml_backend_t params_backend,
711+
const String2TensorStorage& tensor_storage_map = {},
712+
const std::string prefix = "model.diffusion_model")
713+
: prefix(prefix), lens(backend, params_backend, tensor_storage_map, prefix) {
714+
}
715+
716+
std::string get_desc() override {
717+
return lens.get_desc();
718+
}
719+
720+
void alloc_params_buffer() override {
721+
lens.alloc_params_buffer();
722+
}
723+
724+
void free_params_buffer() override {
725+
lens.free_params_buffer();
726+
}
727+
728+
void free_compute_buffer() override {
729+
lens.free_compute_buffer();
730+
}
731+
732+
void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors) override {
733+
lens.get_param_tensors(tensors, prefix);
734+
}
735+
736+
size_t get_params_buffer_size() override {
737+
return lens.get_params_buffer_size();
738+
}
739+
740+
void set_weight_adapter(const std::shared_ptr<WeightAdapter>& adapter) override {
741+
lens.set_weight_adapter(adapter);
742+
}
743+
744+
int64_t get_adm_in_channels() override {
745+
return 768;
746+
}
747+
748+
void set_flash_attention_enabled(bool enabled) {
749+
lens.set_flash_attention_enabled(enabled);
750+
}
751+
752+
void set_max_graph_vram_bytes(size_t max_vram_bytes) override {
753+
lens.set_max_graph_vram_bytes(max_vram_bytes);
754+
}
755+
756+
void set_circular_axes(bool circular_x, bool circular_y) override {
757+
lens.set_circular_axes(circular_x, circular_y);
758+
}
759+
760+
sd::Tensor<float> compute(int n_threads,
761+
const DiffusionParams& diffusion_params) override {
762+
GGML_ASSERT(diffusion_params.x != nullptr);
763+
GGML_ASSERT(diffusion_params.timesteps != nullptr);
764+
return lens.compute(n_threads,
765+
*diffusion_params.x,
766+
*diffusion_params.timesteps,
767+
tensor_or_empty(diffusion_params.context));
768+
}
769+
};
770+
704771
struct LTXAVModel : public DiffusionModel {
705772
std::string prefix;
706773
LTXV::LTXAVRunner ltxav;

0 commit comments

Comments
 (0)