[POC] Image generation multi-concurrency idea #2113

Open · wants to merge 10 commits into base: master
47 changes: 37 additions & 10 deletions samples/cpp/image_generation/text2image.cpp
@@ -1,27 +1,54 @@
// Copyright (C) 2023-2025 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
#include <iostream>
#include <string>
#include <thread> // required for the std::thread workers used below
#include <vector>

#include "openvino/genai/image_generation/text2image_pipeline.hpp"


#include "imwrite.hpp"
#include "progress_bar.hpp"

int32_t main(int32_t argc, char* argv[]) try {
OPENVINO_ASSERT(argc == 3, "Usage: ", argv[0], " <MODEL_DIR> '<PROMPT>'");
OPENVINO_ASSERT(argc == 3, "Usage: ", argv[0], " <MODEL_DIR> '<PROMPT>'"); // TODO: <PROMPT> unused

const std::string models_path = argv[1], prompt = argv[2];
const std::string device = "CPU"; // GPU can be used as well

std::vector<std::string> prompts = {
"happy dog",
"black cat",
"yellow raspberry",
"retro personal computer",
"walking astronaut",
"fish with a hat",
"flying car",
};

ov::genai::Text2ImagePipeline pipe(models_path, device);
ov::Tensor image = pipe.generate(prompt,
ov::genai::width(512),
ov::genai::height(512),
ov::genai::num_inference_steps(20),
ov::genai::num_images_per_prompt(1),
ov::genai::callback(progress_bar));

// writes `num_images_per_prompt` images by pattern name
imwrite("image_%d.bmp", image, true);

std::vector<std::thread> threads;

for (size_t i = 0; i < prompts.size(); ++i) {
const std::string p = prompts[i];
threads.emplace_back([i, &pipe, p] () {
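// each worker thread creates its own GenerationRequest; the pipeline object itself is captured by reference and shared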
std::cout << "Generating... " << i << std::endl;
ov::genai::Text2ImagePipeline::GenerationRequest request = pipe.create_generation_request();
ov::Tensor image = request.generate(p,
ov::AnyMap{
ov::genai::width(512),
ov::genai::height(512),
ov::genai::num_inference_steps(20),
ov::genai::num_images_per_prompt(1)});
std::cout << "Generated " << i << std::endl;
imwrite("mt_image_512" + std::to_string(i) + "_%d.bmp", image, true);
std::cout << "Generation saved" << std::endl;
});
}

// join all threads
for (auto& thread : threads) {
thread.join();
}

return EXIT_SUCCESS;
} catch (const std::exception& error) {
@@ -127,9 +127,9 @@ class OPENVINO_GENAI_EXPORTS AutoencoderKL {
return compile(device, ov::AnyMap{std::forward<Properties>(properties)...});
}

ov::Tensor decode(ov::Tensor latent);
ov::Tensor decode(ov::Tensor latent, size_t request_idx = 0);

ov::Tensor encode(ov::Tensor image, std::shared_ptr<Generator> generator);
ov::Tensor encode(ov::Tensor image, std::shared_ptr<Generator> generator, size_t request_idx = 0);

const Config& get_config() const;

@@ -139,7 +139,7 @@ class OPENVINO_GENAI_EXPORTS AutoencoderKL {
void merge_vae_image_post_processing() const;

Config m_config;
ov::InferRequest m_encoder_request, m_decoder_request;
std::vector<ov::InferRequest> m_encoder_requests, m_decoder_requests;
std::shared_ptr<ov::Model> m_encoder_model = nullptr, m_decoder_model = nullptr;
};

@@ -82,16 +82,16 @@ class OPENVINO_GENAI_EXPORTS CLIPTextModel {
return compile(device, ov::AnyMap{std::forward<Properties>(properties)...});
}

void set_adapters(const std::optional<AdapterConfig>& adapters);
void set_adapters(const std::optional<AdapterConfig>& adapters, size_t request_idx = 0);

ov::Tensor infer(const std::string& pos_prompt, const std::string& neg_prompt, bool do_classifier_free_guidance);
ov::Tensor infer(const std::string& pos_prompt, const std::string& neg_prompt, bool do_classifier_free_guidance, size_t request_idx = 0);

ov::Tensor get_output_tensor(const size_t idx);
ov::Tensor get_output_tensor(const size_t idx, size_t request_idx = 0);

private:
Config m_config;
AdapterController m_adapter_controller;
ov::InferRequest m_request;
std::vector<ov::InferRequest> m_requests;
std::shared_ptr<ov::Model> m_model;

Tokenizer m_clip_tokenizer;
@@ -8,6 +8,11 @@
namespace ov {
namespace genai {

namespace utils {
class RequestIdxQueue;
}


/**
* Text to image pipeline which provides a unified API to all supported model types.
* Model-specific aspects are hidden in the image generation config, which includes multiple prompts support or
@@ -237,6 +242,22 @@ class OPENVINO_GENAI_EXPORTS Text2ImagePipeline {
return generate(positive_prompt, ov::AnyMap{std::forward<Properties>(properties)...});
}

class OPENVINO_GENAI_EXPORTS GenerationRequest {
Text2ImagePipeline& m_pipeline;
size_t m_request_idx;
public:
GenerationRequest(size_t request_idx, Text2ImagePipeline& pipeline);
~GenerationRequest();
GenerationRequest(const GenerationRequest&) = delete;
GenerationRequest& operator=(const GenerationRequest&) = delete;

ov::Tensor generate(const std::string& positive_prompt, const ov::AnyMap& properties = {});

friend class Text2ImagePipeline;
};

GenerationRequest create_generation_request();

/**
* Performs latent image decoding. It can be useful within a 'callback' which accepts the current latent image
* @param latent A latent image
@@ -248,6 +269,7 @@ class OPENVINO_GENAI_EXPORTS Text2ImagePipeline {

private:
std::shared_ptr<DiffusionPipeline> m_impl;
std::shared_ptr<ov::genai::utils::RequestIdxQueue> m_request_idx_queue;

explicit Text2ImagePipeline(const std::shared_ptr<DiffusionPipeline>& impl);
};
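The header above only forward-declares ov::genai::utils::RequestIdxQueue and stores it behind a shared_ptr, so its definition is not part of this excerpt. Below is a minimal sketch of what such a blocking index queue could look like; the class body, the acquire/release names, and the blocking behavior are assumptions made for illustration, not the PR's actual implementation.

#include <condition_variable>
#include <cstddef>
#include <mutex>
#include <queue>

namespace ov::genai::utils {

// Hypothetical sketch: hands out free request indices to GenerationRequest
// objects and blocks callers while all underlying infer requests are busy.
class RequestIdxQueue {
public:
    explicit RequestIdxQueue(size_t size) {
        for (size_t idx = 0; idx < size; ++idx)
            m_free.push(idx);
    }

    // Blocks until an index is available, then removes and returns it.
    size_t acquire() {
        std::unique_lock<std::mutex> lock(m_mutex);
        m_cv.wait(lock, [this] { return !m_free.empty(); });
        size_t idx = m_free.front();
        m_free.pop();
        return idx;
    }

    // Returns an index to the pool, e.g. from ~GenerationRequest().
    void release(size_t idx) {
        {
            std::lock_guard<std::mutex> lock(m_mutex);
            m_free.push(idx);
        }
        m_cv.notify_one();
    }

private:
    std::mutex m_mutex;
    std::condition_variable m_cv;
    std::queue<size_t> m_free;
};

} // namespace ov::genai::utils

Under this reading, GenerationRequest's constructor would call acquire() and its destructor release(), which matches the deleted copy operations and the RAII shape of the nested class above; again, this pairing is assumed rather than confirmed by the diff.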
@@ -85,11 +85,11 @@ class OPENVINO_GENAI_EXPORTS UNet2DConditionModel {
return compile(device, ov::AnyMap{std::forward<Properties>(properties)...});
}

void set_hidden_states(const std::string& tensor_name, ov::Tensor encoder_hidden_states);
void set_hidden_states(const std::string& tensor_name, ov::Tensor encoder_hidden_states, size_t request_idx = 0);

void set_adapters(const std::optional<AdapterConfig>& adapters);
void set_adapters(const std::optional<AdapterConfig>& adapters, size_t request_idx = 0);

ov::Tensor infer(ov::Tensor sample, ov::Tensor timestep);
ov::Tensor infer(ov::Tensor sample, ov::Tensor timestep, size_t request_idx = 0);

bool do_classifier_free_guidance(float guidance_scale) const {
return guidance_scale > 1.0f && m_config.time_cond_proj_dim < 0;
2 changes: 1 addition & 1 deletion src/cpp/include/openvino/genai/tokenizer.hpp
@@ -47,7 +47,7 @@ class OPENVINO_GENAI_EXPORTS Tokenizer {
* @param tokenizer_path openvino_tokenizer.xml and openvino_detokenizer.xml should be located in the tokenizer_path
* @param properties Properties passed to ov::Core::compile_model
*/
explicit Tokenizer(const std::filesystem::path& tokenizer_path, const ov::AnyMap& properties = {});
explicit Tokenizer(const std::filesystem::path& tokenizer_path, const ov::AnyMap& properties = {}, size_t infer_request_queue_size = 0);

/**
* @brief ov::genai::Tokenizer constructor to initialize directly from model and weights
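The Tokenizer constructor above gains an infer_request_queue_size argument. Judging from the default of 0 and the multi-request changes elsewhere in this PR, it presumably sizes a pool of tokenizer infer requests for concurrent callers; that reading, and the value used below, are assumptions since the parameter is not documented in the diff.

// models_path is assumed to point at a directory containing openvino_tokenizer.xml / openvino_detokenizer.xml;
// the queue size of 4 is an illustrative value for a pipeline serving several worker threads
ov::genai::Tokenizer tokenizer(models_path, ov::AnyMap{}, /*infer_request_queue_size=*/4);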
29 changes: 16 additions & 13 deletions src/cpp/src/image_generation/diffusion_pipeline.hpp
@@ -92,10 +92,13 @@ class DiffusionPipeline {
m_generation_config.validate();
}

void set_scheduler(std::shared_ptr<Scheduler> scheduler) {
void set_scheduler(std::shared_ptr<Scheduler> scheduler, size_t request_idx = 0) {
auto casted = std::dynamic_pointer_cast<IScheduler>(scheduler);
OPENVINO_ASSERT(casted != nullptr, "Passed incorrect scheduler type");
m_scheduler = casted;
if (m_schedulers.size() <= request_idx) {
m_schedulers.resize(request_idx + 1);
}
m_schedulers[request_idx] = casted;
}

virtual void reshape(const int num_images_per_prompt, const int height, const int width, const float guidance_scale) = 0;
@@ -110,15 +113,15 @@
const std::string& vae_device,
const ov::AnyMap& properties) = 0;

virtual std::tuple<ov::Tensor, ov::Tensor, ov::Tensor, ov::Tensor> prepare_latents(ov::Tensor initial_image, const ImageGenerationConfig& generation_config) = 0;
virtual std::tuple<ov::Tensor, ov::Tensor, ov::Tensor, ov::Tensor> prepare_latents(ov::Tensor initial_image, const ImageGenerationConfig& generation_config, size_t request_idx = 0) = 0;

virtual void compute_hidden_states(const std::string& positive_prompt, const ImageGenerationConfig& generation_config) = 0;
virtual void compute_hidden_states(const std::string& positive_prompt, const ImageGenerationConfig& generation_config, size_t request_idx = 0) = 0;

virtual void set_lora_adapters(std::optional<AdapterConfig> adapters) = 0;
virtual void set_lora_adapters(std::optional<AdapterConfig> adapters, size_t request_idx = 0) = 0;

virtual ov::Tensor generate(const std::string& positive_prompt, ov::Tensor initial_image, ov::Tensor mask_image, const ov::AnyMap& properties) = 0;
virtual ov::Tensor generate(const std::string& positive_prompt, ov::Tensor initial_image, ov::Tensor mask_image, const ov::AnyMap& properties, size_t request_idx = 0) = 0;

virtual ov::Tensor decode(const ov::Tensor latent) = 0;
virtual ov::Tensor decode(const ov::Tensor latent, size_t request_idx = 0) = 0;

virtual ImageGenerationPerfMetrics get_performance_metrics() = 0;

@@ -143,18 +146,18 @@

virtual size_t get_config_in_channels() const = 0;

virtual void blend_latents(ov::Tensor image_latent, ov::Tensor noise, ov::Tensor mask, ov::Tensor latent, size_t inference_step) {
virtual void blend_latents(ov::Tensor image_latent, ov::Tensor noise, ov::Tensor mask, ov::Tensor latent, size_t inference_step, size_t request_idx = 0) {
OPENVINO_ASSERT(m_pipeline_type == PipelineType::INPAINTING, "'blend_latents' can be called for inpainting pipeline only");
OPENVINO_ASSERT(image_latent.get_shape() == latent.get_shape(), "Shapes for current ", latent.get_shape(), " and initial image latents ", image_latent.get_shape(), " must match");

ov::Tensor noised_image_latent(image_latent.get_element_type(), {});
std::vector<std::int64_t> timesteps = m_scheduler->get_timesteps();
std::vector<std::int64_t> timesteps = m_schedulers[request_idx]->get_timesteps();

if (inference_step < timesteps.size() - 1) {
image_latent.copy_to(noised_image_latent);

int64_t noise_timestep = timesteps[inference_step + 1];
m_scheduler->add_noise(noised_image_latent, noise, noise_timestep);
m_schedulers[request_idx]->add_noise(noised_image_latent, noise, noise_timestep);
} else {
noised_image_latent = image_latent;
}
@@ -215,16 +218,16 @@
// encode masked image to latent space
auto encode_start = std::chrono::steady_clock::now();
masked_image_latent = m_vae->encode(masked_image, generation_config.generator);
m_perf_metrics.vae_encoder_inference_duration += std::chrono::duration_cast<std::chrono::milliseconds>(
std::chrono::steady_clock::now() - encode_start).count();
//m_perf_metrics.vae_encoder_inference_duration += std::chrono::duration_cast<std::chrono::milliseconds>(
// std::chrono::steady_clock::now() - encode_start).count();
masked_image_latent = numpy_utils::repeat(masked_image_latent, generation_config.num_images_per_prompt * batch_size_multiplier);
}

return std::make_tuple(mask, masked_image_latent);
}

PipelineType m_pipeline_type;
std::shared_ptr<IScheduler> m_scheduler;
std::vector<std::shared_ptr<IScheduler>> m_schedulers;
ImageGenerationConfig m_generation_config;
float m_load_time_ms = 0.0f;
ImageGenerationPerfMetrics m_perf_metrics;
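With the single m_scheduler replaced by the m_schedulers vector, every m_schedulers[request_idx] access in this class assumes that an entry was registered through set_scheduler(scheduler, request_idx) first. The accessor below is only a sketch of how that bounds check could be centralized, reusing the OPENVINO_ASSERT style already present in this file; get_scheduler is not part of the PR.

// Hypothetical helper for DiffusionPipeline (not in this PR): one place to
// validate request_idx before dereferencing the per-request scheduler.
std::shared_ptr<IScheduler> get_scheduler(size_t request_idx) const {
    OPENVINO_ASSERT(request_idx < m_schedulers.size() && m_schedulers[request_idx] != nullptr,
                    "No scheduler is set for request index ", request_idx,
                    "; call set_scheduler(scheduler, request_idx) first");
    return m_schedulers[request_idx];
}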
11 changes: 6 additions & 5 deletions src/cpp/src/image_generation/flux_fill_pipeline.hpp
@@ -35,7 +35,7 @@ class FluxFillPipeline : public FluxPipeline {
initialize_generation_config("FluxFillPipeline");
}

std::tuple<ov::Tensor, ov::Tensor, ov::Tensor, ov::Tensor> prepare_latents(ov::Tensor initial_image, const ImageGenerationConfig& generation_config) override {
std::tuple<ov::Tensor, ov::Tensor, ov::Tensor, ov::Tensor> prepare_latents(ov::Tensor initial_image, const ImageGenerationConfig& generation_config, size_t request_idx = 0) override {

const size_t vae_scale_factor = m_vae->get_vae_scale_factor();

@@ -119,7 +119,8 @@
ov::Tensor generate(const std::string& positive_prompt,
ov::Tensor initial_image,
ov::Tensor mask_image,
const ov::AnyMap& properties) override {
const ov::AnyMap& properties,
size_t request_idx = 0) override {
const auto gen_start = std::chrono::steady_clock::now();
m_perf_metrics.clean_up();
m_custom_generation_config = m_generation_config;
@@ -149,10 +150,10 @@
std::tie(latents, processed_image, image_latent, noise) = prepare_latents(initial_image, m_custom_generation_config);

size_t image_seq_len = latents.get_shape()[1];
m_scheduler->set_timesteps(image_seq_len, m_custom_generation_config.num_inference_steps, m_custom_generation_config.strength);
m_schedulers[request_idx]->set_timesteps(image_seq_len, m_custom_generation_config.num_inference_steps, m_custom_generation_config.strength);

// Prepare timesteps
std::vector<float> timesteps = m_scheduler->get_float_timesteps();
std::vector<float> timesteps = m_schedulers[request_idx]->get_float_timesteps();
m_latent_timestep = timesteps[0];

// Prepare mask latents
@@ -174,7 +175,7 @@
auto infer_duration = ov::genai::PerfMetrics::get_microsec(std::chrono::steady_clock::now() - infer_start);
m_perf_metrics.raw_metrics.transformer_inference_durations.emplace_back(MicroSeconds(infer_duration));

auto scheduler_step_result = m_scheduler->step(noise_pred_tensor, latents, inference_step, m_custom_generation_config.generator);
auto scheduler_step_result = m_schedulers[request_idx]->step(noise_pred_tensor, latents, inference_step, m_custom_generation_config.generator);
latents = scheduler_step_result["latent"];

if (callback && callback(inference_step, timesteps.size(), latents)) {