diff --git a/samples/cpp/image_generation/text2image.cpp b/samples/cpp/image_generation/text2image.cpp
index 0ec0f924e2..7b22e6891b 100644
--- a/samples/cpp/image_generation/text2image.cpp
+++ b/samples/cpp/image_generation/text2image.cpp
@@ -1,27 +1,54 @@
 // Copyright (C) 2023-2025 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0

+#include <thread>
 #include "openvino/genai/image_generation/text2image_pipeline.hpp"
+
 #include "imwrite.hpp"
 #include "progress_bar.hpp"

 int32_t main(int32_t argc, char* argv[]) try {
-    OPENVINO_ASSERT(argc == 3, "Usage: ", argv[0], " <MODEL_DIR> '<PROMPT>'");
+    OPENVINO_ASSERT(argc == 3, "Usage: ", argv[0], " <MODEL_DIR> '<PROMPT>'"); // TODO: unused

     const std::string models_path = argv[1], prompt = argv[2];
     const std::string device = "CPU";  // GPU can be used as well

+    std::vector<std::string> prompts = {
+        "happy dog",
+        "black cat",
+        "yellow raspberry",
+        "retro personal computer",
+        "walking astronaut",
+        "fish with a hat",
+        "flying car",
+    };
+
     ov::genai::Text2ImagePipeline pipe(models_path, device);
-    ov::Tensor image = pipe.generate(prompt,
-        ov::genai::width(512),
-        ov::genai::height(512),
-        ov::genai::num_inference_steps(20),
-        ov::genai::num_images_per_prompt(1),
-        ov::genai::callback(progress_bar));
-
-    // writes `num_images_per_prompt` images by pattern name
-    imwrite("image_%d.bmp", image, true);
+
+    std::vector<std::thread> threads;
+
+    for (size_t i = 0; i < prompts.size(); ++i) {
+        const std::string p = prompts[i];
+        threads.emplace_back([i, &pipe, p] () {
+            std::cout << "Generating... " << i << std::endl;
+            ov::genai::Text2ImagePipeline::GenerationRequest request = pipe.create_generation_request();
+            ov::Tensor image = request.generate(p,
+                ov::AnyMap{
+                    ov::genai::width(512),
+                    ov::genai::height(512),
+                    ov::genai::num_inference_steps(20),
+                    ov::genai::num_images_per_prompt(1)});
+            std::cout << "Generated " << i << std::endl;
+            imwrite("mt_image_512" + std::to_string(i) + "_%d.bmp", image, true);
+            std::cout << "Generation saved" << std::endl;
+        });
+    }
+
+    // join all threads
+    for (auto& thread : threads) {
+        thread.join();
+    }

     return EXIT_SUCCESS;
 } catch (const std::exception& error) {
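Note: the sample spawns one std::thread per prompt (seven here) while the pipeline is wired to a hard-coded pool of four request slots, so the surplus threads are expected to park inside create_generation_request() until a slot is released. A minimal sketch of an alternative worker loop sized to the pool — the pool_size constant is an assumption mirroring the POC's hard-coded 4, and generate_all is a hypothetical helper, not part of the patch:

    // Sketch only: cap the workers at the request-pool size so no thread ever
    // blocks inside create_generation_request(). Assumes the hard-coded pool of 4.
    #include <atomic>
    #include <thread>
    #include <vector>

    void generate_all(ov::genai::Text2ImagePipeline& pipe, const std::vector<std::string>& prompts) {
        const size_t pool_size = 4;  // assumption: matches the POC's slot count
        std::atomic<size_t> next{0};
        std::vector<std::thread> workers;
        for (size_t w = 0; w < pool_size; ++w) {
            workers.emplace_back([&]() {
                for (size_t i = next++; i < prompts.size(); i = next++) {
                    auto request = pipe.create_generation_request();  // acquires a free slot
                    ov::Tensor image = request.generate(prompts[i],
                        ov::AnyMap{ov::genai::width(512), ov::genai::height(512),
                                   ov::genai::num_inference_steps(20),
                                   ov::genai::num_images_per_prompt(1)});
                    imwrite("image_" + std::to_string(i) + "_%d.bmp", image, true);
                }  // slot returns to the pool when 'request' is destroyed
            });
        }
        for (auto& w : workers) w.join();
    }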
diff --git a/src/cpp/include/openvino/genai/image_generation/autoencoder_kl.hpp b/src/cpp/include/openvino/genai/image_generation/autoencoder_kl.hpp
index 13da4a5317..7b0baee3fd 100644
--- a/src/cpp/include/openvino/genai/image_generation/autoencoder_kl.hpp
+++ b/src/cpp/include/openvino/genai/image_generation/autoencoder_kl.hpp
@@ -127,9 +127,9 @@ class OPENVINO_GENAI_EXPORTS AutoencoderKL {
         return compile(device, ov::AnyMap{std::forward<Properties>(properties)...});
     }

-    ov::Tensor decode(ov::Tensor latent);
+    ov::Tensor decode(ov::Tensor latent, size_t request_idx = 0);

-    ov::Tensor encode(ov::Tensor image, std::shared_ptr<Generator> generator);
+    ov::Tensor encode(ov::Tensor image, std::shared_ptr<Generator> generator, size_t request_idx = 0);

     const Config& get_config() const;

@@ -139,7 +139,7 @@ class OPENVINO_GENAI_EXPORTS AutoencoderKL {
     void merge_vae_image_post_processing() const;

     Config m_config;
-    ov::InferRequest m_encoder_request, m_decoder_request;
+    std::vector<ov::InferRequest> m_encoder_requests, m_decoder_requests;
     std::shared_ptr<ov::Model> m_encoder_model = nullptr, m_decoder_model = nullptr;
 };

diff --git a/src/cpp/include/openvino/genai/image_generation/clip_text_model.hpp b/src/cpp/include/openvino/genai/image_generation/clip_text_model.hpp
index aa9f16c12b..7589652ac0 100644
--- a/src/cpp/include/openvino/genai/image_generation/clip_text_model.hpp
+++ b/src/cpp/include/openvino/genai/image_generation/clip_text_model.hpp
@@ -82,16 +82,16 @@ class OPENVINO_GENAI_EXPORTS CLIPTextModel {
         return compile(device, ov::AnyMap{std::forward<Properties>(properties)...});
     }

-    void set_adapters(const std::optional<AdapterConfig>& adapters);
+    void set_adapters(const std::optional<AdapterConfig>& adapters, size_t request_idx = 0);

-    ov::Tensor infer(const std::string& pos_prompt, const std::string& neg_prompt, bool do_classifier_free_guidance);
+    ov::Tensor infer(const std::string& pos_prompt, const std::string& neg_prompt, bool do_classifier_free_guidance, size_t request_idx = 0);

-    ov::Tensor get_output_tensor(const size_t idx);
+    ov::Tensor get_output_tensor(const size_t idx, size_t request_idx = 0);

 private:
     Config m_config;
     AdapterController m_adapter_controller;
-    ov::InferRequest m_request;
+    std::vector<ov::InferRequest> m_requests;
     std::shared_ptr<ov::Model> m_model;

     Tokenizer m_clip_tokenizer;
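Note: both headers follow the same pattern — the single ov::InferRequest member becomes a vector, and every public entry point gains a request_idx that defaults to 0 so existing single-request callers keep compiling. A sketch of the access invariant these classes rely on; the checked_request helper is hypothetical (the PR indexes the vectors directly and only asserts non-emptiness):

    // Sketch only: each concurrent generation owns one slot, so indexing must
    // stay within the pool created at compile() time.
    #include <vector>
    #include "openvino/core/except.hpp"
    #include "openvino/runtime/infer_request.hpp"

    ov::InferRequest& checked_request(std::vector<ov::InferRequest>& requests, size_t request_idx) {
        OPENVINO_ASSERT(!requests.empty(), "Model must be compiled first");
        OPENVINO_ASSERT(request_idx < requests.size(),
                        "request_idx ", request_idx, " exceeds pool size ", requests.size());
        return requests[request_idx];
    }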
diff --git a/src/cpp/include/openvino/genai/image_generation/text2image_pipeline.hpp b/src/cpp/include/openvino/genai/image_generation/text2image_pipeline.hpp
index 21a00188e6..939481ecb6 100644
--- a/src/cpp/include/openvino/genai/image_generation/text2image_pipeline.hpp
+++ b/src/cpp/include/openvino/genai/image_generation/text2image_pipeline.hpp
@@ -8,6 +8,11 @@
 namespace ov {
 namespace genai {

+namespace utils {
+class RequestIdxQueue;
+}
+
+
 /**
  * Text to image pipelines which provides unified API to all supported models types.
  * Models specific aspects are hidden in image generation config, which includes multiple prompts support or
@@ -237,6 +242,22 @@ class OPENVINO_GENAI_EXPORTS Text2ImagePipeline {
         return generate(positive_prompt, ov::AnyMap{std::forward<Properties>(properties)...});
     }

+    class OPENVINO_GENAI_EXPORTS GenerationRequest {
+        Text2ImagePipeline& m_pipeline;
+        size_t m_request_idx;
+    public:
+        GenerationRequest(size_t request_idx, Text2ImagePipeline& pipeline);
+        ~GenerationRequest();
+        GenerationRequest(const GenerationRequest&) = delete;
+        GenerationRequest& operator=(const GenerationRequest&) = delete;
+
+        ov::Tensor generate(const std::string& positive_prompt, const ov::AnyMap& properties = {});
+
+        friend class Text2ImagePipeline;
+    };
+
+    GenerationRequest create_generation_request();
+
     /**
      * Performs latent image decoding. It can be useful to use within 'callback' which accepts current latent image
      * @param latent A latent image
@@ -248,6 +269,7 @@ class OPENVINO_GENAI_EXPORTS Text2ImagePipeline {

 private:
     std::shared_ptr<DiffusionPipeline> m_impl;
+    std::shared_ptr<utils::RequestIdxQueue> m_request_idx_queue;

     explicit Text2ImagePipeline(const std::shared_ptr<DiffusionPipeline>& impl);
 };

diff --git a/src/cpp/include/openvino/genai/image_generation/unet2d_condition_model.hpp b/src/cpp/include/openvino/genai/image_generation/unet2d_condition_model.hpp
index f0084d44b4..99ccc004c9 100644
--- a/src/cpp/include/openvino/genai/image_generation/unet2d_condition_model.hpp
+++ b/src/cpp/include/openvino/genai/image_generation/unet2d_condition_model.hpp
@@ -85,11 +85,11 @@ class OPENVINO_GENAI_EXPORTS UNet2DConditionModel {
         return compile(device, ov::AnyMap{std::forward<Properties>(properties)...});
     }

-    void set_hidden_states(const std::string& tensor_name, ov::Tensor encoder_hidden_states);
+    void set_hidden_states(const std::string& tensor_name, ov::Tensor encoder_hidden_states, size_t request_idx = 0);

-    void set_adapters(const std::optional<AdapterConfig>& adapters);
+    void set_adapters(const std::optional<AdapterConfig>& adapters, size_t request_idx = 0);

-    ov::Tensor infer(ov::Tensor sample, ov::Tensor timestep);
+    ov::Tensor infer(ov::Tensor sample, ov::Tensor timestep, size_t request_idx = 0);

     bool do_classifier_free_guidance(float guidance_scale) const {
         return guidance_scale > 1.0f && m_config.time_cond_proj_dim < 0;
diff --git a/src/cpp/include/openvino/genai/tokenizer.hpp b/src/cpp/include/openvino/genai/tokenizer.hpp
index 633d078608..087ec872f9 100644
--- a/src/cpp/include/openvino/genai/tokenizer.hpp
+++ b/src/cpp/include/openvino/genai/tokenizer.hpp
@@ -47,7 +47,7 @@ class OPENVINO_GENAI_EXPORTS Tokenizer {
      * @param tokenizer_path openvino_tokenizer.xml and openvino_detokenizer.xml should be located in the tokenizer_path
      * @param properties Properties passed to ov::Core::compile_model
      */
-    explicit Tokenizer(const std::filesystem::path& tokenizer_path, const ov::AnyMap& properties = {});
+    explicit Tokenizer(const std::filesystem::path& tokenizer_path, const ov::AnyMap& properties = {}, size_t infer_request_queue_size = 0);

     /**
      * @brief ov::genai::Tokenizer constructor to initialize directly from model and weights
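Note: GenerationRequest is a RAII handle — construction pins a slot index, the destructor returns it to the pipeline's RequestIdxQueue. It cannot be copied or moved, so returning it from create_generation_request() relies on C++17 guaranteed copy elision, and because it holds a plain reference to the pipeline it must not outlive it. A usage sketch (the pipe object is assumed to exist in scope):

    // Sketch only: scoped acquisition of a generation slot.
    {
        auto request = pipe.create_generation_request();   // may block until a slot is free
        ov::Tensor image = request.generate("astronaut riding a horse",
            ov::AnyMap{ov::genai::width(512), ov::genai::height(512)});
        imwrite("image_%d.bmp", image, true);
    }   // ~GenerationRequest() returns the slot index to the queue, even on exceptions

The new Tokenizer parameter serves the same pool: infer_request_queue_size == 0 keeps the old behaviour (sizing the internal queue by ov::optimal_number_of_infer_requests), while a positive value forces a queue deep enough for the pipeline's concurrency.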
diff --git a/src/cpp/src/image_generation/diffusion_pipeline.hpp b/src/cpp/src/image_generation/diffusion_pipeline.hpp
index 06dd7ac5b6..59bbdcd8c3 100644
--- a/src/cpp/src/image_generation/diffusion_pipeline.hpp
+++ b/src/cpp/src/image_generation/diffusion_pipeline.hpp
@@ -92,10 +92,13 @@ class DiffusionPipeline {
         m_generation_config.validate();
     }

-    void set_scheduler(std::shared_ptr<Scheduler> scheduler) {
+    void set_scheduler(std::shared_ptr<Scheduler> scheduler, size_t request_idx = 0) {
         auto casted = std::dynamic_pointer_cast<IScheduler>(scheduler);
         OPENVINO_ASSERT(casted != nullptr, "Passed incorrect scheduler type");
-        m_scheduler = casted;
+        if (m_schedulers.size() <= request_idx) {
+            m_schedulers.resize(request_idx + 1);
+        }
+        m_schedulers[request_idx] = casted;
     }

     virtual void reshape(const int num_images_per_prompt, const int height, const int width, const float guidance_scale) = 0;
@@ -110,15 +113,15 @@ class DiffusionPipeline {
                          const std::string& vae_device,
                          const ov::AnyMap& properties) = 0;

-    virtual std::tuple<ov::Tensor, ov::Tensor, ov::Tensor, ov::Tensor> prepare_latents(ov::Tensor initial_image, const ImageGenerationConfig& generation_config) = 0;
+    virtual std::tuple<ov::Tensor, ov::Tensor, ov::Tensor, ov::Tensor> prepare_latents(ov::Tensor initial_image, const ImageGenerationConfig& generation_config, size_t request_idx = 0) = 0;

-    virtual void compute_hidden_states(const std::string& positive_prompt,
-                                       const ImageGenerationConfig& generation_config) = 0;
+    virtual void compute_hidden_states(const std::string& positive_prompt, const ImageGenerationConfig& generation_config, size_t request_idx = 0) = 0;

-    virtual void set_lora_adapters(std::optional<AdapterConfig> adapters) = 0;
+    virtual void set_lora_adapters(std::optional<AdapterConfig> adapters, size_t request_idx = 0) = 0;

-    virtual ov::Tensor generate(const std::string& positive_prompt, ov::Tensor initial_image, ov::Tensor mask_image, const ov::AnyMap& properties) = 0;
+    virtual ov::Tensor generate(const std::string& positive_prompt, ov::Tensor initial_image, ov::Tensor mask_image, const ov::AnyMap& properties, size_t request_idx = 0) = 0;

-    virtual ov::Tensor decode(const ov::Tensor latent) = 0;
+    virtual ov::Tensor decode(const ov::Tensor latent, size_t request_idx = 0) = 0;

     virtual ImageGenerationPerfMetrics get_performance_metrics() = 0;

@@ -143,18 +146,18 @@ class DiffusionPipeline {
     virtual size_t get_config_in_channels() const = 0;

-    virtual void blend_latents(ov::Tensor image_latent, ov::Tensor noise, ov::Tensor mask, ov::Tensor latent, size_t inference_step) {
+    virtual void blend_latents(ov::Tensor image_latent, ov::Tensor noise, ov::Tensor mask, ov::Tensor latent, size_t inference_step, size_t request_idx = 0) {
         OPENVINO_ASSERT(m_pipeline_type == PipelineType::INPAINTING, "'blend_latents' can be called for inpainting pipeline only");
         OPENVINO_ASSERT(image_latent.get_shape() == latent.get_shape(), "Shapes for current", latent.get_shape(), "and initial image latents ", image_latent.get_shape(), " must match");

         ov::Tensor noised_image_latent(image_latent.get_element_type(), {});

-        std::vector<int64_t> timesteps = m_scheduler->get_timesteps();
+        std::vector<int64_t> timesteps = m_schedulers[request_idx]->get_timesteps();

         if (inference_step < timesteps.size() - 1) {
             image_latent.copy_to(noised_image_latent);

             int64_t noise_timestep = timesteps[inference_step + 1];
-            m_scheduler->add_noise(noised_image_latent, noise, noise_timestep);
+            m_schedulers[request_idx]->add_noise(noised_image_latent, noise, noise_timestep);
         } else {
             noised_image_latent = image_latent;
         }
@@ -215,8 +218,8 @@ class DiffusionPipeline {
             // encode masked image to latent scape
             auto encode_start = std::chrono::steady_clock::now();
             masked_image_latent = m_vae->encode(masked_image, generation_config.generator);
-            m_perf_metrics.vae_encoder_inference_duration += std::chrono::duration_cast<std::chrono::milliseconds>(
-                std::chrono::steady_clock::now() - encode_start).count();
+            //m_perf_metrics.vae_encoder_inference_duration += std::chrono::duration_cast<std::chrono::milliseconds>(
+            //    std::chrono::steady_clock::now() - encode_start).count();
             masked_image_latent = numpy_utils::repeat(masked_image_latent, generation_config.num_images_per_prompt * batch_size_multiplier);
         }

@@ -224,7 +227,7 @@ class DiffusionPipeline {
     }

     PipelineType m_pipeline_type;
-    std::shared_ptr<IScheduler> m_scheduler;
+    std::vector<std::shared_ptr<IScheduler>> m_schedulers;
     ImageGenerationConfig m_generation_config;
     float m_load_time_ms = 0.0f;
     ImageGenerationPerfMetrics m_perf_metrics;
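Note: replicating the scheduler per request slot is load-bearing, not cosmetic — an IScheduler carries mutable per-generation state (the timestep list produced by set_timesteps() and the step-to-step noise state), so two concurrent generate() calls sharing one instance would corrupt each other. A sketch of the per-slot initialization this enables, mirroring the constructor loop that appears later in stable_diffusion_pipeline.hpp (pool_size stands in for the POC's hard-coded 4, root_dir is assumed in scope):

    // Sketch only: each slot gets its own scheduler built from the same JSON
    // config, so concurrent denoise loops cannot interleave timestep state.
    for (size_t request_idx = 0; request_idx < pool_size; ++request_idx) {
        set_scheduler(Scheduler::from_config(root_dir / "scheduler/scheduler_config.json"), request_idx);
    }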
diff --git a/src/cpp/src/image_generation/flux_fill_pipeline.hpp b/src/cpp/src/image_generation/flux_fill_pipeline.hpp
index e41d31fd07..8363e3d9b8 100644
--- a/src/cpp/src/image_generation/flux_fill_pipeline.hpp
+++ b/src/cpp/src/image_generation/flux_fill_pipeline.hpp
@@ -35,7 +35,7 @@ class FluxFillPipeline : public FluxPipeline {
         initialize_generation_config("FluxFillPipeline");
     }

-    std::tuple<ov::Tensor, ov::Tensor, ov::Tensor, ov::Tensor> prepare_latents(ov::Tensor initial_image, const ImageGenerationConfig& generation_config) override {
+    std::tuple<ov::Tensor, ov::Tensor, ov::Tensor, ov::Tensor> prepare_latents(ov::Tensor initial_image, const ImageGenerationConfig& generation_config, size_t request_idx = 0) override {
         const size_t vae_scale_factor = m_vae->get_vae_scale_factor();

@@ -119,7 +119,8 @@ class FluxFillPipeline : public FluxPipeline {
     ov::Tensor generate(const std::string& positive_prompt,
                         ov::Tensor initial_image,
                         ov::Tensor mask_image,
-                        const ov::AnyMap& properties) override {
+                        const ov::AnyMap& properties,
+                        size_t request_idx = 0) override {
         const auto gen_start = std::chrono::steady_clock::now();
         m_perf_metrics.clean_up();
         m_custom_generation_config = m_generation_config;
@@ -149,10 +150,10 @@ class FluxFillPipeline : public FluxPipeline {
         std::tie(latents, processed_image, image_latent, noise) = prepare_latents(initial_image, m_custom_generation_config);

         size_t image_seq_len = latents.get_shape()[1];
-        m_scheduler->set_timesteps(image_seq_len, m_custom_generation_config.num_inference_steps, m_custom_generation_config.strength);
+        m_schedulers[request_idx]->set_timesteps(image_seq_len, m_custom_generation_config.num_inference_steps, m_custom_generation_config.strength);

         // Prepare timesteps
-        std::vector<float> timesteps = m_scheduler->get_float_timesteps();
+        std::vector<float> timesteps = m_schedulers[request_idx]->get_float_timesteps();
         m_latent_timestep = timesteps[0];

         // Prepare mask latents
@@ -174,7 +175,7 @@ class FluxFillPipeline : public FluxPipeline {
             auto infer_duration = ov::genai::PerfMetrics::get_microsec(std::chrono::steady_clock::now() - infer_start);
             m_perf_metrics.raw_metrics.transformer_inference_durations.emplace_back(MicroSeconds(infer_duration));

-            auto scheduler_step_result = m_scheduler->step(noise_pred_tensor, latents, inference_step, m_custom_generation_config.generator);
+            auto scheduler_step_result = m_schedulers[request_idx]->step(noise_pred_tensor, latents, inference_step, m_custom_generation_config.generator);
             latents = scheduler_step_result["latent"];

             if (callback && callback(inference_step, timesteps.size(), latents)) {
diff --git a/src/cpp/src/image_generation/flux_pipeline.hpp b/src/cpp/src/image_generation/flux_pipeline.hpp
index aac412a9eb..eada6a8214 100644
--- a/src/cpp/src/image_generation/flux_pipeline.hpp
+++ b/src/cpp/src/image_generation/flux_pipeline.hpp
@@ -269,7 +269,7 @@ class FluxPipeline : public DiffusionPipeline {
         m_transformer->compile(denoise_device, *updated_properties);
     }

-    void compute_hidden_states(const std::string& positive_prompt, const ImageGenerationConfig& generation_config) override {
+    void compute_hidden_states(const std::string& positive_prompt, const ImageGenerationConfig& generation_config, size_t request_idx = 0) override {
         // encode_prompt
         std::string prompt_2_str = generation_config.prompt_2 != std::nullopt ? *generation_config.prompt_2 : positive_prompt;
@@ -310,7 +310,7 @@ class FluxPipeline : public DiffusionPipeline {
         m_transformer->set_hidden_states("img_ids", latent_image_ids);
     }

-    std::tuple<ov::Tensor, ov::Tensor, ov::Tensor, ov::Tensor> prepare_latents(ov::Tensor initial_image, const ImageGenerationConfig& generation_config) override {
+    std::tuple<ov::Tensor, ov::Tensor, ov::Tensor, ov::Tensor> prepare_latents(ov::Tensor initial_image, const ImageGenerationConfig& generation_config, size_t request_idx = 0) override {
         const size_t vae_scale_factor = m_vae->get_vae_scale_factor();

         size_t num_channels_latents = m_transformer->get_config().in_channels / 4;
@@ -336,7 +336,7 @@ class FluxPipeline : public DiffusionPipeline {
             latent = ov::Tensor(image_latents.get_element_type(), image_latents.get_shape());
             image_latents.copy_to(latent);
-            m_scheduler->scale_noise(latent, m_latent_timestep, noise);
+            m_schedulers[request_idx]->scale_noise(latent, m_latent_timestep, noise);
             latent = pack_latents(latent, generation_config.num_images_per_prompt, num_channels_latents, height, width);

             if (m_pipeline_type == PipelineType::INPAINTING) {
@@ -351,13 +351,13 @@ class FluxPipeline : public DiffusionPipeline {
         return std::make_tuple(latent, proccesed_image, image_latents, noise);
     }

-    void set_lora_adapters(std::optional<AdapterConfig> adapters) override {
+    void set_lora_adapters(std::optional<AdapterConfig> adapters, size_t request_idx = 0) override {
         if(adapters) {
             if(auto updated_adapters = derived_adapters(*adapters)) {
                 adapters = updated_adapters;
             }
-            m_clip_text_encoder->set_adapters(adapters);
-            m_transformer->set_adapters(adapters);
+            m_clip_text_encoder->set_adapters(adapters); // TODO support Flux pipeline after the idea is approved
+            m_transformer->set_adapters(adapters); // TODO support Flux pipeline after the idea is approved
         }
     }

@@ -430,7 +430,8 @@ class FluxPipeline : public DiffusionPipeline {
     ov::Tensor generate(const std::string& positive_prompt,
                         ov::Tensor initial_image,
                         ov::Tensor mask_image,
-                        const ov::AnyMap& properties) override {
+                        const ov::AnyMap& properties,
+                        size_t request_idx = 0) override {
         const auto gen_start = std::chrono::steady_clock::now();
         m_perf_metrics.clean_up();
         m_custom_generation_config = m_generation_config;
@@ -459,10 +460,10 @@ class FluxPipeline : public DiffusionPipeline {
         size_t image_seq_len = (m_custom_generation_config.height / vae_scale_factor / 2) *
                                (m_custom_generation_config.width / vae_scale_factor / 2);
-        m_scheduler->set_timesteps(image_seq_len, m_custom_generation_config.num_inference_steps, m_custom_generation_config.strength);
+        m_schedulers[request_idx]->set_timesteps(image_seq_len, m_custom_generation_config.num_inference_steps, m_custom_generation_config.strength);

         // Prepare timesteps
-        std::vector<float> timesteps = m_scheduler->get_float_timesteps();
+        std::vector<float> timesteps = m_schedulers[request_idx]->get_float_timesteps();
         m_latent_timestep = timesteps[0];

         // Prepare latent variables
@@ -488,7 +489,7 @@ class FluxPipeline : public DiffusionPipeline {
             auto infer_duration = ov::genai::PerfMetrics::get_microsec(std::chrono::steady_clock::now() - infer_start);
             m_perf_metrics.raw_metrics.transformer_inference_durations.emplace_back(MicroSeconds(infer_duration));

-            auto scheduler_step_result = m_scheduler->step(noise_pred_tensor, latents, inference_step, m_custom_generation_config.generator);
+            auto scheduler_step_result = m_schedulers[request_idx]->step(noise_pred_tensor, latents, inference_step, m_custom_generation_config.generator);
             latents = scheduler_step_result["latent"];

             if (m_pipeline_type == PipelineType::INPAINTING) {
@@ -521,12 +522,12 @@ class FluxPipeline : public DiffusionPipeline {
         return image;
     }

-    ov::Tensor decode(const ov::Tensor latent) override {
+    ov::Tensor decode(const ov::Tensor latent, size_t request_idx) override {
         ov::Tensor unpacked_latent = unpack_latents(latent,
                                                     m_custom_generation_config.height,
                                                     m_custom_generation_config.width,
                                                     m_vae->get_vae_scale_factor());
-        return m_vae->decode(unpacked_latent);
+        return m_vae->decode(unpacked_latent, request_idx);
     }

     ImageGenerationPerfMetrics get_performance_metrics() override {
@@ -614,7 +615,8 @@ class FluxPipeline : public DiffusionPipeline {
                                const ov::Tensor image_latent,
                                const ov::Tensor mask,
                                const ov::Tensor noise,
-                               size_t inference_step) override {
+                               size_t inference_step,
+                               size_t request_idx = 0) override {
         OPENVINO_ASSERT(m_pipeline_type == PipelineType::INPAINTING, "'blend_latents' can be called for inpainting pipeline only");
         OPENVINO_ASSERT(image_latent.get_shape() == latents.get_shape(), "Shapes for current ", latents.get_shape(), " and initial image latents ", image_latent.get_shape(), " must match");

@@ -622,10 +624,10 @@ class FluxPipeline : public DiffusionPipeline {
         ov::Tensor init_latents_proper(image_latent.get_element_type(), image_latent.get_shape());
         image_latent.copy_to(init_latents_proper);

-        std::vector<float> timesteps = m_scheduler->get_float_timesteps();
+        std::vector<float> timesteps = m_schedulers[request_idx]->get_float_timesteps();
         if (inference_step < timesteps.size() - 1) {
             float noise_timestep = timesteps[inference_step + 1];
-            m_scheduler->scale_noise(init_latents_proper, noise_timestep, noise);
+            m_schedulers[request_idx]->scale_noise(init_latents_proper, noise_timestep, noise);
         }

         float * latents_data = latents.data<float>();
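Note: every compile() below builds its request pool with a literal 4 and a "To be passed in constructor after idea is approved" comment. A sketch of what the parameterized form could look like once that idea lands — create_request_pool and its num_requests argument are assumptions, not part of this patch:

    // Sketch only: hypothetical pool-size parameter replacing the hard-coded 4.
    // 'num_requests' would come from the pipeline constructor once approved.
    #include <vector>
    #include "openvino/runtime/compiled_model.hpp"
    #include "openvino/runtime/infer_request.hpp"

    void create_request_pool(ov::CompiledModel& compiled_model,
                             std::vector<ov::InferRequest>& requests,
                             size_t num_requests) {
        requests.clear();
        requests.reserve(num_requests);
        for (size_t i = 0; i < num_requests; ++i)
            requests.emplace_back(compiled_model.create_infer_request());
    }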
Cannot infer non-compiled model"); - m_decoder_request.set_input_tensor(latent); - m_decoder_request.infer(); - return m_decoder_request.get_output_tensor(); + m_decoder_requests[request_idx].set_input_tensor(latent); + m_decoder_requests[request_idx].infer(); + return m_decoder_requests[request_idx].get_output_tensor(); } -ov::Tensor AutoencoderKL::encode(ov::Tensor image, std::shared_ptr generator) { - OPENVINO_ASSERT(m_encoder_request || m_encoder_model, "AutoencoderKL is created without 'VAE encoder' capability. Please, pass extra argument to constructor to create 'VAE encoder'"); - OPENVINO_ASSERT(m_encoder_request, "VAE encoder model must be compiled first. Cannot infer non-compiled model"); +ov::Tensor AutoencoderKL::encode(ov::Tensor image, std::shared_ptr generator, size_t request_idx) { + OPENVINO_ASSERT(m_encoder_requests.size() || m_encoder_model, "AutoencoderKL is created without 'VAE encoder' capability. Please, pass extra argument to constructor to create 'VAE encoder'"); + OPENVINO_ASSERT(m_encoder_requests.size(), "VAE encoder model must be compiled first. Cannot infer non-compiled model"); - m_encoder_request.set_input_tensor(image); - m_encoder_request.infer(); + m_encoder_requests[request_idx].set_input_tensor(image); + m_encoder_requests[request_idx].infer(); - ov::Tensor output = m_encoder_request.get_output_tensor(), latent; + ov::Tensor output = m_encoder_requests[request_idx].get_output_tensor(), latent; - ov::CompiledModel compiled_model = m_encoder_request.get_compiled_model(); + ov::CompiledModel compiled_model = m_encoder_requests[request_idx].get_compiled_model(); auto outputs = compiled_model.outputs(); OPENVINO_ASSERT(outputs.size() == 1, "AutoencoderKL encoder model is expected to have a single output"); diff --git a/src/cpp/src/image_generation/models/clip_text_model.cpp b/src/cpp/src/image_generation/models/clip_text_model.cpp index ab977408e2..4a5a0c0c28 100644 --- a/src/cpp/src/image_generation/models/clip_text_model.cpp +++ b/src/cpp/src/image_generation/models/clip_text_model.cpp @@ -35,7 +35,7 @@ CLIPTextModel::Config::Config(const std::filesystem::path& config_path) { } CLIPTextModel::CLIPTextModel(const std::filesystem::path& root_dir) : - m_clip_tokenizer(get_tokenizer_path_by_text_encoder(root_dir)), + m_clip_tokenizer(get_tokenizer_path_by_text_encoder(root_dir), {}, 4/* To be passed in constructor after idea is approved */), m_config(root_dir / "config.json") { m_model = utils::singleton_core().read_model(root_dir / "openvino_model.xml"); } @@ -93,21 +93,23 @@ CLIPTextModel& CLIPTextModel::compile(const std::string& device, const ov::AnyMa } ov::CompiledModel compiled_model = utils::singleton_core().compile_model(m_model, device, *filtered_properties); ov::genai::utils::print_compiled_model_properties(compiled_model, "Clip Text model"); - m_request = compiled_model.create_infer_request(); + for (size_t i = 0; i < 4 /* To be passed in constructor after idea is approved */; i++) + m_requests.emplace_back(compiled_model.create_infer_request()); // release the original model m_model.reset(); return *this; } -void CLIPTextModel::set_adapters(const std::optional& adapters) { +void CLIPTextModel::set_adapters(const std::optional& adapters, size_t request_idx) { + // TODO: Missing OPENVINO_ASSERT? 
     if (adapters) {
-        m_adapter_controller.apply(m_request, *adapters);
+        m_adapter_controller.apply(m_requests[request_idx], *adapters);
     }
 }

-ov::Tensor CLIPTextModel::infer(const std::string& pos_prompt, const std::string& neg_prompt, bool do_classifier_free_guidance) {
-    OPENVINO_ASSERT(m_request, "CLIP text encoder model must be compiled first. Cannot infer non-compiled model");
+ov::Tensor CLIPTextModel::infer(const std::string& pos_prompt, const std::string& neg_prompt, bool do_classifier_free_guidance, size_t request_idx) {
+    OPENVINO_ASSERT(m_requests.size(), "CLIP text encoder model must be compiled first. Cannot infer non-compiled model");

     const int32_t pad_token_id = m_clip_tokenizer.get_pad_token_id();
     const size_t text_embedding_batch_size = do_classifier_free_guidance ? 2 : 1;
@@ -124,9 +126,9 @@ ov::Tensor CLIPTextModel::infer(const std::string& pos_prompt, const std::string
         }
     };

-    ov::PartialShape compiled_input_partial_shape = m_request.get_compiled_model().inputs()[0].get_partial_shape();
+    ov::PartialShape compiled_input_partial_shape = m_requests[request_idx].get_compiled_model().inputs()[0].get_partial_shape();

-    ov::Tensor input_ids = m_request.get_input_tensor();
+    ov::Tensor input_ids = m_requests[request_idx].get_input_tensor();

     if (compiled_input_partial_shape.is_dynamic()) {
         input_ids.set_shape({text_embedding_batch_size, m_config.max_position_embeddings});
@@ -143,6 +145,7 @@ ov::Tensor CLIPTextModel::infer(const std::string& pos_prompt, const std::string

     size_t current_batch_idx = 0;

+
     if (input_ids.get_shape()[0] == 2) {
         perform_tokenization(neg_prompt,
                              ov::Tensor(input_ids, {current_batch_idx , 0},
                                         {current_batch_idx + 1, m_config.max_position_embeddings}));
@@ -157,16 +160,17 @@ ov::Tensor CLIPTextModel::infer(const std::string& pos_prompt, const std::string
                                         {current_batch_idx + 1, m_config.max_position_embeddings}));

     // text embeddings
-    m_request.infer();
+    m_requests[request_idx].infer();

     // This is true when text_embedding_batch_size is 1, but model was reshaped / compiled as batch size 2.
     m_slice_batch1_output = (text_embedding_batch_size != input_ids.get_shape()[0]);

-    return get_output_tensor(0);
+    return get_output_tensor(0, request_idx);
 }

-ov::Tensor CLIPTextModel::get_output_tensor(const size_t idx) {
-    auto infer_out_tensor = m_request.get_output_tensor(idx);
+ov::Tensor CLIPTextModel::get_output_tensor(const size_t idx, size_t request_idx) {
+    // TODO: Missing OPENVINO_ASSERT?
+    auto infer_out_tensor = m_requests[request_idx].get_output_tensor(idx);
     if (m_slice_batch1_output) {
         //Slice and return batch index 1 output.
         auto out_shape = infer_out_tensor.get_shape();
diff --git a/src/cpp/src/image_generation/models/unet2d_condition_model.cpp b/src/cpp/src/image_generation/models/unet2d_condition_model.cpp
index 2e8f62eb92..77dfd26a73 100644
--- a/src/cpp/src/image_generation/models/unet2d_condition_model.cpp
+++ b/src/cpp/src/image_generation/models/unet2d_condition_model.cpp
@@ -99,21 +99,21 @@ UNet2DConditionModel& UNet2DConditionModel::compile(const std::string& device, c
     return *this;
 }

-void UNet2DConditionModel::set_hidden_states(const std::string& tensor_name, ov::Tensor encoder_hidden_states) {
+void UNet2DConditionModel::set_hidden_states(const std::string& tensor_name, ov::Tensor encoder_hidden_states, size_t request_idx) {
     OPENVINO_ASSERT(m_impl, "UNet model must be compiled first");
-    m_impl->set_hidden_states(tensor_name, encoder_hidden_states);
+    m_impl->set_hidden_states(tensor_name, encoder_hidden_states, request_idx);
 }

-void UNet2DConditionModel::set_adapters(const std::optional<AdapterConfig>& adapters) {
+void UNet2DConditionModel::set_adapters(const std::optional<AdapterConfig>& adapters, size_t request_idx) {
     OPENVINO_ASSERT(m_impl, "UNet model must be compiled first");
     if(adapters) {
-        m_impl->set_adapters(m_adapter_controller, *adapters);
+        m_impl->set_adapters(m_adapter_controller, *adapters, request_idx);
     }
 }

-ov::Tensor UNet2DConditionModel::infer(ov::Tensor sample, ov::Tensor timestep) {
+ov::Tensor UNet2DConditionModel::infer(ov::Tensor sample, ov::Tensor timestep, size_t request_idx) {
     OPENVINO_ASSERT(m_impl, "UNet model must be compiled first. Cannot infer non-compiled model");
-    return m_impl->infer(sample, timestep);
+    return m_impl->infer(sample, timestep, request_idx);
 }

 } // namespace genai
diff --git a/src/cpp/src/image_generation/models/unet_inference.hpp b/src/cpp/src/image_generation/models/unet_inference.hpp
index 639338901b..b8cc6aab54 100644
--- a/src/cpp/src/image_generation/models/unet_inference.hpp
+++ b/src/cpp/src/image_generation/models/unet_inference.hpp
@@ -12,9 +12,9 @@ class UNet2DConditionModel::UNetInference {
 public:
     virtual void compile(std::shared_ptr<ov::Model> model, const std::string& device, const ov::AnyMap& properties) = 0;
-    virtual void set_hidden_states(const std::string& tensor_name, ov::Tensor encoder_hidden_states) = 0;
-    virtual void set_adapters(AdapterController& adapter_controller, const AdapterConfig& adapters) = 0;
-    virtual ov::Tensor infer(ov::Tensor sample, ov::Tensor timestep) = 0;
+    virtual void set_hidden_states(const std::string& tensor_name, ov::Tensor encoder_hidden_states, size_t request_idx = 0) = 0;
+    virtual void set_adapters(AdapterController& adapter_controller, const AdapterConfig& adapters, size_t request_idx = 0) = 0;
+    virtual ov::Tensor infer(ov::Tensor sample, ov::Tensor timestep, size_t request_idx = 0) = 0;

     // utility function to resize model given optional dimensions.
     static void reshape(std::shared_ptr<ov::Model> model,
diff --git a/src/cpp/src/image_generation/models/unet_inference_dynamic.hpp b/src/cpp/src/image_generation/models/unet_inference_dynamic.hpp
index 6bc86a5f06..14e119b6d8 100644
--- a/src/cpp/src/image_generation/models/unet_inference_dynamic.hpp
+++ b/src/cpp/src/image_generation/models/unet_inference_dynamic.hpp
@@ -15,32 +15,35 @@ class UNet2DConditionModel::UNetInferenceDynamic : public UNet2DConditionModel::
     virtual void compile(std::shared_ptr<ov::Model> model, const std::string& device, const ov::AnyMap& properties) override {
         ov::CompiledModel compiled_model = utils::singleton_core().compile_model(model, device, properties);
         ov::genai::utils::print_compiled_model_properties(compiled_model, "UNet 2D Condition dynamic model");
-        m_request = compiled_model.create_infer_request();
+
+        for (size_t i = 0; i < 4 /* To be passed in constructor after idea is approved */; i++)
+            m_requests.emplace_back(compiled_model.create_infer_request());
     }

-    virtual void set_hidden_states(const std::string& tensor_name, ov::Tensor encoder_hidden_states) override {
-        OPENVINO_ASSERT(m_request, "UNet model must be compiled first");
-        m_request.set_tensor(tensor_name, encoder_hidden_states);
+    virtual void set_hidden_states(const std::string& tensor_name, ov::Tensor encoder_hidden_states, size_t request_idx = 0) override {
+        OPENVINO_ASSERT(m_requests.size(), "UNet model must be compiled first");
+        std::cout << "Setting hidden states for request " << request_idx << std::endl;
+        m_requests[request_idx].set_tensor(tensor_name, encoder_hidden_states);
     }

-    virtual void set_adapters(AdapterController &adapter_controller, const AdapterConfig& adapters) override {
-        OPENVINO_ASSERT(m_request, "UNet model must be compiled first");
-        adapter_controller.apply(m_request, adapters);
+    virtual void set_adapters(AdapterController &adapter_controller, const AdapterConfig& adapters, size_t request_idx = 0) override {
+        OPENVINO_ASSERT(m_requests.size(), "UNet model must be compiled first");
+        adapter_controller.apply(m_requests[request_idx], adapters);
     }

-    virtual ov::Tensor infer(ov::Tensor sample, ov::Tensor timestep) override {
-        OPENVINO_ASSERT(m_request, "UNet model must be compiled first. Cannot infer non-compiled model");
+    virtual ov::Tensor infer(ov::Tensor sample, ov::Tensor timestep, size_t request_idx = 0) override {
+        OPENVINO_ASSERT(m_requests.size(), "UNet model must be compiled first. Cannot infer non-compiled model");
Cannot infer non-compiled model"); - m_request.set_tensor("sample", sample); - m_request.set_tensor("timestep", timestep); + m_requests[request_idx].set_tensor("sample", sample); + m_requests[request_idx].set_tensor("timestep", timestep); - m_request.infer(); + m_requests[request_idx].infer(); - return m_request.get_output_tensor(); + return m_requests[request_idx].get_output_tensor(); } private: - ov::InferRequest m_request; + std::vector m_requests; }; } // namespace genai diff --git a/src/cpp/src/image_generation/models/unet_inference_static_bs1.hpp b/src/cpp/src/image_generation/models/unet_inference_static_bs1.hpp index fd5d53e1d1..a6f2763fbb 100644 --- a/src/cpp/src/image_generation/models/unet_inference_static_bs1.hpp +++ b/src/cpp/src/image_generation/models/unet_inference_static_bs1.hpp @@ -47,9 +47,10 @@ class UNet2DConditionModel::UNetInferenceStaticBS1 : public UNet2DConditionModel } } - virtual void set_hidden_states(const std::string& tensor_name, ov::Tensor encoder_hidden_states) override { + virtual void set_hidden_states(const std::string& tensor_name, ov::Tensor encoder_hidden_states, size_t request_idx = 0) override { OPENVINO_ASSERT(m_native_batch_size && m_native_batch_size == m_requests.size(), "UNet model must be compiled first"); + OPENVINO_ASSERT(request_idx == 0, "multi concurrency not supported for NPU"); size_t encoder_hidden_states_bs = encoder_hidden_states.get_shape()[0]; @@ -80,17 +81,19 @@ class UNet2DConditionModel::UNetInferenceStaticBS1 : public UNet2DConditionModel } } - virtual void set_adapters(AdapterController& adapter_controller, const AdapterConfig& adapters) override { + virtual void set_adapters(AdapterController& adapter_controller, const AdapterConfig& adapters, size_t request_idx = 0) override { OPENVINO_ASSERT(m_native_batch_size && m_native_batch_size == m_requests.size(), "UNet model must be compiled first"); + OPENVINO_ASSERT(request_idx == 0, "multi concurrency not supported for NPU"); for (int i = 0; i < m_native_batch_size; i++) { adapter_controller.apply(m_requests[i], adapters); } } - virtual ov::Tensor infer(ov::Tensor sample, ov::Tensor timestep) override { + virtual ov::Tensor infer(ov::Tensor sample, ov::Tensor timestep, size_t request_idx = 0) override { OPENVINO_ASSERT(m_native_batch_size && m_native_batch_size == m_requests.size(), "UNet model must be compiled first"); + OPENVINO_ASSERT(request_idx == 0, "multi concurrency not supported for NPU"); OPENVINO_ASSERT(sample.get_shape()[0] == m_native_batch_size, "sample batch size must match native batch size"); diff --git a/src/cpp/src/image_generation/stable_diffusion_3_pipeline.hpp b/src/cpp/src/image_generation/stable_diffusion_3_pipeline.hpp index f3cf5a90dc..008e621f32 100644 --- a/src/cpp/src/image_generation/stable_diffusion_3_pipeline.hpp +++ b/src/cpp/src/image_generation/stable_diffusion_3_pipeline.hpp @@ -267,7 +267,7 @@ class StableDiffusion3Pipeline : public DiffusionPipeline { m_vae->compile(vae_device, properties); } - void compute_hidden_states(const std::string& positive_prompt, const ImageGenerationConfig& generation_config) override { + void compute_hidden_states(const std::string& positive_prompt, const ImageGenerationConfig& generation_config, size_t request_idx = 0) override { const auto& transformer_config = m_transformer->get_config(); const size_t batch_size_multiplier = do_classifier_free_guidance(generation_config.guidance_scale) ? 
diff --git a/src/cpp/src/image_generation/stable_diffusion_3_pipeline.hpp b/src/cpp/src/image_generation/stable_diffusion_3_pipeline.hpp
index f3cf5a90dc..008e621f32 100644
--- a/src/cpp/src/image_generation/stable_diffusion_3_pipeline.hpp
+++ b/src/cpp/src/image_generation/stable_diffusion_3_pipeline.hpp
@@ -267,7 +267,7 @@ class StableDiffusion3Pipeline : public DiffusionPipeline {
         m_vae->compile(vae_device, properties);
     }

-    void compute_hidden_states(const std::string& positive_prompt, const ImageGenerationConfig& generation_config) override {
+    void compute_hidden_states(const std::string& positive_prompt, const ImageGenerationConfig& generation_config, size_t request_idx = 0) override {
         const auto& transformer_config = m_transformer->get_config();
         const size_t batch_size_multiplier = do_classifier_free_guidance(generation_config.guidance_scale) ? 2 : 1;  // Transformer accepts 2x batch in case of CFG
@@ -419,7 +419,7 @@ class StableDiffusion3Pipeline : public DiffusionPipeline {
         m_transformer->set_hidden_states("pooled_projections", pooled_prompt_embeds_inp);
     }

-    std::tuple<ov::Tensor, ov::Tensor, ov::Tensor, ov::Tensor> prepare_latents(ov::Tensor initial_image, const ImageGenerationConfig& generation_config) override {
+    std::tuple<ov::Tensor, ov::Tensor, ov::Tensor, ov::Tensor> prepare_latents(ov::Tensor initial_image, const ImageGenerationConfig& generation_config, size_t request_idx = 0) override {
         const size_t vae_scale_factor = m_vae->get_vae_scale_factor();
         ov::Shape latent_shape{generation_config.num_images_per_prompt,
                                m_transformer->get_config().in_channels,
@@ -440,7 +440,7 @@ class StableDiffusion3Pipeline : public DiffusionPipeline {
             noise = generation_config.generator->randn_tensor(latent_shape);
             latent = ov::Tensor(image_latents.get_element_type(), image_latents.get_shape());
             image_latents.copy_to(latent);
-            m_scheduler->scale_noise(latent, m_latent_timestep, noise);
+            m_schedulers[request_idx]->scale_noise(latent, m_latent_timestep, noise);
         } else {
             noise = generation_config.generator->randn_tensor(latent_shape);
             latent.set_shape(latent_shape);
@@ -449,20 +449,21 @@ class StableDiffusion3Pipeline : public DiffusionPipeline {
             const float * noise_data = noise.data<const float>();
             float * latent_data = latent.data<float>();
             for (size_t i = 0; i < latent.get_size(); ++i)
-                latent_data[i] = noise_data[i] * m_scheduler->get_init_noise_sigma();
+                latent_data[i] = noise_data[i] * m_schedulers[request_idx]->get_init_noise_sigma();
         }

         return std::make_tuple(latent, proccesed_image, image_latents, noise);
     }

-    void set_lora_adapters(std::optional<AdapterConfig> adapters) override {
+    void set_lora_adapters(std::optional<AdapterConfig> adapters, size_t request_idx = 0) override {
         OPENVINO_THROW("LORA adapters are not implemented for Stable Diffusion 3 yet");
     }

     ov::Tensor generate(const std::string& positive_prompt,
                         ov::Tensor initial_image,
                         ov::Tensor mask_image,
-                        const ov::AnyMap& properties) override {
+                        const ov::AnyMap& properties,
+                        size_t request_idx = 0) override {
         const auto gen_start = std::chrono::steady_clock::now();
         m_perf_metrics.clean_up();
         ImageGenerationConfig generation_config = m_generation_config;
@@ -487,9 +488,9 @@ class StableDiffusion3Pipeline : public DiffusionPipeline {
         check_inputs(generation_config, initial_image);

         // 3. Prepare timesteps
-        m_scheduler->set_timesteps(generation_config.num_inference_steps, generation_config.strength);
+        m_schedulers[request_idx]->set_timesteps(generation_config.num_inference_steps, generation_config.strength);

-        std::vector<float> timesteps = m_scheduler->get_float_timesteps();
+        std::vector<float> timesteps = m_schedulers[request_idx]->get_float_timesteps();
         m_latent_timestep = timesteps[0];

         // 4. Compute text encoders and set hidden states
@@ -547,7 +548,7 @@ class StableDiffusion3Pipeline : public DiffusionPipeline {
                 noisy_residual_tensor = noise_pred_tensor;
             }

-            auto scheduler_step_result = m_scheduler->step(noisy_residual_tensor, latent, inference_step, generation_config.generator);
+            auto scheduler_step_result = m_schedulers[request_idx]->step(noisy_residual_tensor, latent, inference_step, generation_config.generator);
             latent = scheduler_step_result["latent"];

             if (m_pipeline_type == PipelineType::INPAINTING && !is_inpainting_model()) {
@@ -577,8 +578,8 @@ class StableDiffusion3Pipeline : public DiffusionPipeline {
         return image;
     }

-    ov::Tensor decode(const ov::Tensor latent) override {
-        return m_vae->decode(latent);
+    ov::Tensor decode(const ov::Tensor latent, size_t request_idx = 0) override {
+        return m_vae->decode(latent, request_idx);
     }

     ImageGenerationPerfMetrics get_performance_metrics() override {
@@ -592,18 +593,18 @@ class StableDiffusion3Pipeline : public DiffusionPipeline {
         return m_transformer->get_config().in_channels;
     }

-    void blend_latents(ov::Tensor image_latent, ov::Tensor noise, ov::Tensor mask, ov::Tensor latent, size_t inference_step) override {
+    void blend_latents(ov::Tensor image_latent, ov::Tensor noise, ov::Tensor mask, ov::Tensor latent, size_t inference_step, size_t request_idx = 0) override {
         OPENVINO_ASSERT(m_pipeline_type == PipelineType::INPAINTING, "'blend_latents' can be called for inpainting pipeline only");
         OPENVINO_ASSERT(image_latent.get_shape() == latent.get_shape(), "Shapes for current", latent.get_shape(), "and initial image latents ", image_latent.get_shape(), " must match");

         ov::Tensor noised_image_latent(image_latent.get_element_type(), {});

-        std::vector<float> timesteps = m_scheduler->get_float_timesteps();
+        std::vector<float> timesteps = m_schedulers[request_idx]->get_float_timesteps();

         if (inference_step < timesteps.size() - 1) {
             image_latent.copy_to(noised_image_latent);

             float noise_timestep = timesteps[inference_step + 1];
-            m_scheduler->scale_noise(noised_image_latent, noise_timestep, noise);
+            m_schedulers[request_idx]->scale_noise(noised_image_latent, noise_timestep, noise);
         } else {
             noised_image_latent = image_latent;
         }
diff --git a/src/cpp/src/image_generation/stable_diffusion_pipeline.hpp b/src/cpp/src/image_generation/stable_diffusion_pipeline.hpp
index ff2116e5f8..f71983be9a 100644
--- a/src/cpp/src/image_generation/stable_diffusion_pipeline.hpp
+++ b/src/cpp/src/image_generation/stable_diffusion_pipeline.hpp
@@ -36,7 +36,8 @@ class StableDiffusionPipeline : public DiffusionPipeline {
         nlohmann::json data = nlohmann::json::parse(file);
         using utils::read_json_param;

-        set_scheduler(Scheduler::from_config(root_dir / "scheduler/scheduler_config.json"));
+        for (size_t i = 0; i < 4 /* To be passed in constructor after idea is approved */; i++)
+            set_scheduler(Scheduler::from_config(root_dir / "scheduler/scheduler_config.json"), i);

         const std::string text_encoder = data["text_encoder"][1].get<std::string>();
         if (text_encoder == "CLIPTextModel") {
@@ -78,7 +79,8 @@ class StableDiffusionPipeline : public DiffusionPipeline {
         nlohmann::json data = nlohmann::json::parse(file);
         using utils::read_json_param;

-        set_scheduler(Scheduler::from_config(root_dir / "scheduler/scheduler_config.json"));
+        for (size_t i = 0; i < 4 /* To be passed in constructor after idea is approved */; i++)
+            set_scheduler(Scheduler::from_config(root_dir / "scheduler/scheduler_config.json"), i);

         auto updated_properties = update_adapters_in_properties(properties, &DiffusionPipeline::derived_adapters);
@@ -163,21 +165,23 @@ class StableDiffusionPipeline : public DiffusionPipeline {
         m_vae->compile(vae_device, *updated_properties);
     }

-    void compute_hidden_states(const std::string& positive_prompt, const ImageGenerationConfig& generation_config) override {
+    void compute_hidden_states(const std::string& positive_prompt, const ImageGenerationConfig& generation_config, size_t request_idx = 0) override {
         const auto& unet_config = m_unet->get_config();
         const size_t batch_size_multiplier = m_unet->do_classifier_free_guidance(generation_config.guidance_scale) ? 2 : 1;  // Unet accepts 2x batch in case of CFG

         std::string negative_prompt = generation_config.negative_prompt != std::nullopt ? *generation_config.negative_prompt : std::string{};
         auto infer_start = std::chrono::steady_clock::now();
         ov::Tensor encoder_hidden_states = m_clip_text_encoder->infer(positive_prompt, negative_prompt,
-            batch_size_multiplier > 1);
+            batch_size_multiplier > 1, request_idx);
         auto infer_duration = std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::steady_clock::now() - infer_start).count();
-        m_perf_metrics.encoder_inference_duration["text_encoder"] = infer_duration;
+
+        // TODO: Metrics for POC purposes, metrics need to be handled per-request
+        //m_perf_metrics.encoder_inference_duration["text_encoder"] = infer_duration;

         // replicate encoder hidden state to UNet model
         if (generation_config.num_images_per_prompt == 1) {
             // reuse output of text encoder directly w/o extra memory copy
-            m_unet->set_hidden_states("encoder_hidden_states", encoder_hidden_states);
+            m_unet->set_hidden_states("encoder_hidden_states", encoder_hidden_states, request_idx);
         } else {
             ov::Shape enc_shape = encoder_hidden_states.get_shape();
             enc_shape[0] *= generation_config.num_images_per_prompt;
@@ -191,17 +195,17 @@ class StableDiffusionPipeline : public DiffusionPipeline {
                 }
             }

-            m_unet->set_hidden_states("encoder_hidden_states", encoder_hidden_states_repeated);
+            m_unet->set_hidden_states("encoder_hidden_states", encoder_hidden_states_repeated, request_idx);
         }

         if (unet_config.time_cond_proj_dim >= 0) { // LCM
             ov::Tensor timestep_cond = get_guidance_scale_embedding(generation_config.guidance_scale - 1.0f, unet_config.time_cond_proj_dim);
-            m_unet->set_hidden_states("timestep_cond", timestep_cond);
+            m_unet->set_hidden_states("timestep_cond", timestep_cond, request_idx);
         }
     }

-    std::tuple<ov::Tensor, ov::Tensor, ov::Tensor, ov::Tensor> prepare_latents(ov::Tensor initial_image, const ImageGenerationConfig& generation_config) override {
-        std::vector<int64_t> timesteps = m_scheduler->get_timesteps();
+    std::tuple<ov::Tensor, ov::Tensor, ov::Tensor, ov::Tensor> prepare_latents(ov::Tensor initial_image, const ImageGenerationConfig& generation_config, size_t request_idx = 0) override {
+        std::vector<int64_t> timesteps = m_schedulers[request_idx]->get_timesteps();
         OPENVINO_ASSERT(!timesteps.empty(), "Timesteps are not computed yet");
         int64_t latent_timestep = timesteps.front();

@@ -224,10 +228,11 @@ class StableDiffusionPipeline : public DiffusionPipeline {
         // - inpainting with non-specialized model
         if (!is_strength_max || return_image_latent) {
             auto encode_start = std::chrono::steady_clock::now();
-            image_latent = m_vae->encode(proccesed_image, generation_config.generator);
-            m_perf_metrics.vae_encoder_inference_duration = std::chrono::duration_cast<std::chrono::milliseconds>(
-                std::chrono::steady_clock::now() - encode_start)
-                .count();
+            image_latent = m_vae->encode(proccesed_image, generation_config.generator, request_idx);
+            // TODO: Metrics for POC purposes, metrics need to be handled per-request
+            //m_perf_metrics.vae_encoder_inference_duration = std::chrono::duration_cast<std::chrono::milliseconds>(
+            //    std::chrono::steady_clock::now() - encode_start)
+            //    .count();
             // in case of image to image or inpaining with strength < 1.0, we need to initialize initial latent with
             // image_latent
             if (!is_strength_max) {
@@ -240,7 +245,7 @@ class StableDiffusionPipeline : public DiffusionPipeline {
         noise = generation_config.generator->randn_tensor(latent_shape);
         if (!latent.get_shape().empty()) {
-            m_scheduler->add_noise(latent, noise, latent_timestep);
+            m_schedulers[request_idx]->add_noise(latent, noise, latent_timestep);
         } else {
             latent.set_shape(latent_shape);
@@ -248,29 +253,31 @@ class StableDiffusionPipeline : public DiffusionPipeline {
             const float * noise_data = noise.data<const float>();
             float * latent_data = latent.data<float>();
             for (size_t i = 0; i < latent.get_size(); ++i)
-                latent_data[i] = noise_data[i] * m_scheduler->get_init_noise_sigma();
+                latent_data[i] = noise_data[i] * m_schedulers[request_idx]->get_init_noise_sigma();
         }

         return std::make_tuple(latent, proccesed_image, image_latent, noise);
     }

-    void set_lora_adapters(std::optional<AdapterConfig> adapters) override {
+    void set_lora_adapters(std::optional<AdapterConfig> adapters, size_t request_idx = 0) override {
         if(adapters) {
             if(auto updated_adapters = derived_adapters(*adapters)) {
                 adapters = updated_adapters;
             }
-            m_clip_text_encoder->set_adapters(adapters);
-            m_unet->set_adapters(adapters);
+            m_clip_text_encoder->set_adapters(adapters, request_idx);
+            m_unet->set_adapters(adapters, request_idx);
         }
     }

     ov::Tensor generate(const std::string& positive_prompt,
                         ov::Tensor initial_image,
                         ov::Tensor mask_image,
-                        const ov::AnyMap& properties) override {
+                        const ov::AnyMap& properties,
+                        size_t request_idx = 0) override {
         const auto gen_start = std::chrono::steady_clock::now();
         using namespace numpy_utils;
-        m_perf_metrics.clean_up();
+        // TODO: Metrics for POC purposes, metrics need to be handled per-request
+        //m_perf_metrics.clean_up();
         ImageGenerationConfig generation_config = m_generation_config;
         generation_config.update_generation_config(properties);
@@ -295,17 +302,17 @@ class StableDiffusionPipeline : public DiffusionPipeline {
         check_inputs(generation_config, initial_image);

-        set_lora_adapters(generation_config.adapters);
+        set_lora_adapters(generation_config.adapters, request_idx);

-        m_scheduler->set_timesteps(generation_config.num_inference_steps, generation_config.strength);
-        std::vector<int64_t> timesteps = m_scheduler->get_timesteps();
+        m_schedulers[request_idx]->set_timesteps(generation_config.num_inference_steps, generation_config.strength);
+        std::vector<int64_t> timesteps = m_schedulers[request_idx]->get_timesteps();

         // compute text encoders and set hidden states
-        compute_hidden_states(positive_prompt, generation_config);
+        compute_hidden_states(positive_prompt, generation_config, request_idx);

         // preparate initial / image latents
         ov::Tensor latent, processed_image, image_latent, noise;
-        std::tie(latent, processed_image, image_latent, noise) = prepare_latents(initial_image, generation_config);
+        std::tie(latent, processed_image, image_latent, noise) = prepare_latents(initial_image, generation_config, request_idx);

         // prepare mask latents
         ov::Tensor mask, masked_image_latent;
@@ -320,6 +327,7 @@ class StableDiffusionPipeline : public DiffusionPipeline {
         ov::Tensor latent_cfg(ov::element::f32, latent_shape_cfg), denoised, noisy_residual_tensor(ov::element::f32, {}), latent_model_input;

         for (size_t inference_step = 0; inference_step < timesteps.size(); inference_step++) {
+            std::cout << "Iter " << inference_step << std::endl;
             auto step_start = std::chrono::steady_clock::now();
             numpy_utils::batch_copy(latent, latent_cfg, 0, 0, generation_config.num_images_per_prompt);
             // concat the same latent twice along a batch dimension in case of CFG
             if (batch_size_multiplier > 1) {
                 numpy_utils::batch_copy(latent, latent_cfg, 0, generation_config.num_images_per_prompt, generation_config.num_images_per_prompt);
             }

-            m_scheduler->scale_model_input(latent_cfg, inference_step);
+            m_schedulers[request_idx]->scale_model_input(latent_cfg, inference_step);

             ov::Tensor latent_model_input = is_inpainting_model() ? numpy_utils::concat(numpy_utils::concat(latent_cfg, mask, 1), masked_image_latent, 1) : latent_cfg;
             ov::Tensor timestep(ov::element::i64, {1}, &timesteps[inference_step]);

             auto infer_start = std::chrono::steady_clock::now();
-            ov::Tensor noise_pred_tensor = m_unet->infer(latent_model_input, timestep);
+            ov::Tensor noise_pred_tensor = m_unet->infer(latent_model_input, timestep, request_idx);
             auto infer_duration = ov::genai::PerfMetrics::get_microsec(std::chrono::steady_clock::now() - infer_start);
-            m_perf_metrics.raw_metrics.unet_inference_durations.emplace_back(MicroSeconds(infer_duration));
+            //m_perf_metrics.raw_metrics.unet_inference_durations.emplace_back(MicroSeconds(infer_duration));

             ov::Shape noise_pred_shape = noise_pred_tensor.get_shape();
             noise_pred_shape[0] /= batch_size_multiplier;
@@ -355,7 +363,7 @@ class StableDiffusionPipeline : public DiffusionPipeline {
                 noisy_residual_tensor = noise_pred_tensor;
             }

-            auto scheduler_step_result = m_scheduler->step(noisy_residual_tensor, latent, inference_step, generation_config.generator);
+            auto scheduler_step_result = m_schedulers[request_idx]->step(noisy_residual_tensor, latent, inference_step, generation_config.generator);
             latent = scheduler_step_result["latent"];

             // in case of non-specialized inpainting model, we need manually mask current denoised latent and initial image latent
@@ -369,30 +377,32 @@ class StableDiffusionPipeline : public DiffusionPipeline {

             if (callback && callback(inference_step, timesteps.size(), denoised)) {
                 auto step_ms = ov::genai::PerfMetrics::get_microsec(std::chrono::steady_clock::now() - step_start);
-                m_perf_metrics.raw_metrics.iteration_durations.emplace_back(MicroSeconds(step_ms));
+                //m_perf_metrics.raw_metrics.iteration_durations.emplace_back(MicroSeconds(step_ms));

                 auto image = ov::Tensor(ov::element::u8, {});
-                m_perf_metrics.generate_duration =
-                    std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::steady_clock::now() - gen_start)
-                        .count();
+                // TODO: Metrics for POC purposes, metrics need to be handled per-request
+                //m_perf_metrics.generate_duration =
+                //    std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::steady_clock::now() - gen_start)
+                //        .count();
                 return image;
             }

             auto step_ms = ov::genai::PerfMetrics::get_microsec(std::chrono::steady_clock::now() - step_start);
-            m_perf_metrics.raw_metrics.iteration_durations.emplace_back(MicroSeconds(step_ms));
+            //m_perf_metrics.raw_metrics.iteration_durations.emplace_back(MicroSeconds(step_ms));
         }

+        std::cout << "Iterations finished" << std::endl;
         auto decode_start = std::chrono::steady_clock::now();
-        auto image = decode(denoised);
-        m_perf_metrics.vae_decoder_inference_duration =
-            std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::steady_clock::now() - decode_start)
-                .count();
-        m_perf_metrics.generate_duration =
-            std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::steady_clock::now() - gen_start).count();
+        auto image = decode(denoised, request_idx);
+        // m_perf_metrics.vae_decoder_inference_duration =
+        //     std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::steady_clock::now() - decode_start)
+        //         .count();
+        // m_perf_metrics.generate_duration =
+        //     std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::steady_clock::now() - gen_start).count();
         return image;
     }

-    ov::Tensor decode(const ov::Tensor latent) override {
-        return m_vae->decode(latent);
+    ov::Tensor decode(const ov::Tensor latent, size_t request_idx = 0) override {
+        return m_vae->decode(latent, request_idx);
     }

     ImageGenerationPerfMetrics get_performance_metrics() override {
diff --git a/src/cpp/src/image_generation/stable_diffusion_xl_pipeline.hpp b/src/cpp/src/image_generation/stable_diffusion_xl_pipeline.hpp
index 7b49f956f8..3ed1a2f965 100644
--- a/src/cpp/src/image_generation/stable_diffusion_xl_pipeline.hpp
+++ b/src/cpp/src/image_generation/stable_diffusion_xl_pipeline.hpp
@@ -185,7 +185,7 @@ class StableDiffusionXLPipeline : public StableDiffusionPipeline {
         m_vae->compile(vae_device, *updated_properties);
     }

-    void compute_hidden_states(const std::string& positive_prompt, const ImageGenerationConfig& generation_config) override {
+    void compute_hidden_states(const std::string& positive_prompt, const ImageGenerationConfig& generation_config, size_t request_idx = 0) override {
         const auto& unet_config = m_unet->get_config();
         const size_t batch_size_multiplier = m_unet->do_classifier_free_guidance(generation_config.guidance_scale) ? 2 : 1;  // Unet accepts 2x batch in case of CFG
@@ -340,14 +340,14 @@ class StableDiffusionXLPipeline : public StableDiffusionPipeline {
         }
     }

-    void set_lora_adapters(std::optional<AdapterConfig> adapters) override {
+    void set_lora_adapters(std::optional<AdapterConfig> adapters, size_t request_idx = 0) override {
         if (adapters) {
             if (auto updated_adapters = derived_adapters(*adapters)) {
                 adapters = updated_adapters;
             }
-            m_clip_text_encoder->set_adapters(adapters);
-            m_clip_text_encoder_with_projection->set_adapters(adapters);
-            m_unet->set_adapters(adapters);
+            m_clip_text_encoder->set_adapters(adapters); // TODO
+            m_clip_text_encoder_with_projection->set_adapters(adapters); // TODO
+            m_unet->set_adapters(adapters); // TODO
         }
     }

diff --git a/src/cpp/src/image_generation/text2image_pipeline.cpp b/src/cpp/src/image_generation/text2image_pipeline.cpp
index 75e2ca21f4..1db266f5b6 100644
--- a/src/cpp/src/image_generation/text2image_pipeline.cpp
+++ b/src/cpp/src/image_generation/text2image_pipeline.cpp
@@ -17,7 +17,7 @@
 namespace ov {
 namespace genai {

-Text2ImagePipeline::Text2ImagePipeline(const std::filesystem::path& root_dir) {
+Text2ImagePipeline::Text2ImagePipeline(const std::filesystem::path& root_dir) : m_request_idx_queue(std::make_shared<utils::RequestIdxQueue>(4)/* To be inherited from root_dir config or constructor param once idea is approved */) {
     const std::string class_name = get_class_name(root_dir);

     auto start_time = std::chrono::steady_clock::now();
@@ -36,7 +36,7 @@ Text2ImagePipeline::Text2ImagePipeline(const std::filesystem::path& root_dir) {
     m_impl->save_load_time(start_time);
 }

-Text2ImagePipeline::Text2ImagePipeline(const std::filesystem::path& root_dir, const std::string& device, const ov::AnyMap& properties) {
+Text2ImagePipeline::Text2ImagePipeline(const std::filesystem::path& root_dir, const std::string& device, const ov::AnyMap& properties) : m_request_idx_queue(std::make_shared<utils::RequestIdxQueue>(4)/* To be inherited from root_dir config or constructor param once idea is approved */) {
     const std::string class_name = get_class_name(root_dir);

     auto start_time = std::chrono::steady_clock::now();
@@ -55,7 +55,7 @@ Text2ImagePipeline::Text2ImagePipeline(const std::filesystem::path& root_dir, co
     m_impl->save_load_time(start_time);
 }
diff --git a/src/cpp/src/tokenizer.cpp b/src/cpp/src/tokenizer.cpp
index 1fa521d0d0..ed8ed42cdf 100644
--- a/src/cpp/src/tokenizer.cpp
+++ b/src/cpp/src/tokenizer.cpp
@@ -210,15 +210,15 @@ class Tokenizer::TokenizerImpl {
         }
     }
 
-    TokenizerImpl(const std::filesystem::path& models_path, const ov::AnyMap& properties) {
-        setup_tokenizer(models_path, properties);
+    TokenizerImpl(const std::filesystem::path& models_path, const ov::AnyMap& properties, size_t infer_request_queue_size = 0) {
+        setup_tokenizer(models_path, properties, infer_request_queue_size);
     }
 
-    TokenizerImpl(const std::pair<std::shared_ptr<ov::Model>, std::shared_ptr<ov::Model>>& models, const ov::AnyMap& properties) {
-        setup_tokenizer(models, properties);
+    TokenizerImpl(const std::pair<std::shared_ptr<ov::Model>, std::shared_ptr<ov::Model>>& models, const ov::AnyMap& properties, size_t infer_request_queue_size = 0) {
+        setup_tokenizer(models, properties, infer_request_queue_size);
     }
 
-    void setup_tokenizer(const std::filesystem::path& models_path, const ov::AnyMap& properties) {
+    void setup_tokenizer(const std::filesystem::path& models_path, const ov::AnyMap& properties, size_t infer_request_queue_size = 0) {
         ScopedVar env_manager(tokenizers_relative_to_genai());
         auto core = get_core_singleton();
 
@@ -242,10 +242,10 @@ class Tokenizer::TokenizerImpl {
         parse_if_exists(models_path / "tokenizer_config.json", m_chat_template);
         parse_if_exists(models_path / "processor_config.json", m_chat_template);
         parse_if_exists(models_path / "chat_template.json", m_chat_template);
-        setup_tokenizer(std::make_pair(ov_tokenizer, ov_detokenizer), properties);
+        setup_tokenizer(std::make_pair(ov_tokenizer, ov_detokenizer), properties, infer_request_queue_size);
     }
 
-    void setup_tokenizer(const std::pair<std::shared_ptr<ov::Model>, std::shared_ptr<ov::Model>>& models, const ov::AnyMap& properties) {
+    void setup_tokenizer(const std::pair<std::shared_ptr<ov::Model>, std::shared_ptr<ov::Model>>& models, const ov::AnyMap& properties, size_t infer_request_queue_size = 0) {
         auto [ov_tokenizer, ov_detokenizer] = models;
         OPENVINO_ASSERT(ov_tokenizer || ov_detokenizer, "Neither tokenizer nor detokenzier models were provided");
 
@@ -264,7 +264,7 @@ class Tokenizer::TokenizerImpl {
             ov::genai::utils::print_compiled_model_properties(tokenizer, "OV Tokenizer");
 
             m_ireq_queue_tokenizer = std::make_unique<CircularBufferQueue<ov::InferRequest>>(
-                tokenizer.get_property(ov::optimal_number_of_infer_requests),
+                infer_request_queue_size > 0 ? infer_request_queue_size : tokenizer.get_property(ov::optimal_number_of_infer_requests),
                 [&tokenizer]() -> ov::InferRequest {
                     return tokenizer.create_infer_request();
                 });
@@ -601,8 +601,8 @@ class Tokenizer::TokenizerImpl {
     }
 };
 
-Tokenizer::Tokenizer(const std::filesystem::path& tokenizer_path, const ov::AnyMap& properties) {
-    m_pimpl = std::make_shared<TokenizerImpl>(tokenizer_path, properties);
+Tokenizer::Tokenizer(const std::filesystem::path& tokenizer_path, const ov::AnyMap& properties, size_t infer_request_queue_size) {
+    m_pimpl = std::make_shared<TokenizerImpl>(tokenizer_path, properties, infer_request_queue_size);
 }
 
 Tokenizer::Tokenizer(
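
Note: the sizing rule added above is "an explicit queue size wins, otherwise use the
plugin's suggestion". A one-function sketch of the same logic (resolve_queue_size is
an illustrative name, not part of this patch):

    #include <cstddef>

    // Mirrors: infer_request_queue_size > 0 ? infer_request_queue_size
    //                                        : tokenizer.get_property(ov::optimal_number_of_infer_requests)
    std::size_t resolve_queue_size(std::size_t requested, std::size_t plugin_optimal) {
        return requested > 0 ? requested : plugin_optimal;
    }
    // resolve_queue_size(4, 2) == 4   caller pins the concurrency limit
    // resolve_queue_size(0, 2) == 2   fall back to the plugin default
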
diff --git a/src/cpp/src/utils.cpp b/src/cpp/src/utils.cpp
index f18e848c79..5b78d81ba4 100644
--- a/src/cpp/src/utils.cpp
+++ b/src/cpp/src/utils.cpp
@@ -3,6 +3,7 @@
 
 #include "utils.hpp"
 
+#include 
 #include 
 #include 
 #include 
@@ -17,6 +18,8 @@
 #include "openvino/op/transpose.hpp"
 #include "openvino/genai/text_streamer.hpp"
 
+#include "circular_buffer_queue.hpp"
+
 #include "sampler.hpp"
 
@@ -496,6 +499,29 @@ std::pair<ov::AnyMap, SchedulerConfig> extract_scheduler_config(const ov::AnyMap& properties, std::optional<SchedulerConfig> default_config) {
     return {plugin_config, scheduler_config};
 };
 
+struct RequestIdxQueueImpl {
+    size_t m_concurrency_limit;
+    size_t m_val = 0; // only used to initialize the queue
+    // Declared last on purpose: members are constructed in declaration order,
+    // and the fill lambda below reads m_val, which must be initialized first.
+    std::unique_ptr<CircularBufferQueue<size_t>> m_queue;
+
+    RequestIdxQueueImpl(size_t concurrency_limit)
+        : m_concurrency_limit(concurrency_limit),
+          m_queue(std::make_unique<CircularBufferQueue<size_t>>(concurrency_limit, [this] {
+              return this->m_val++;
+          })) {}
+};
+
+RequestIdxQueue::RequestIdxQueue(size_t concurrency_limit) {
+    m_impl = std::make_shared<RequestIdxQueueImpl>(concurrency_limit);
+}
+
+size_t RequestIdxQueue::get() {
+    size_t res = m_impl->m_queue->get_idle().get();
+    return res;
+}
+
+void RequestIdxQueue::return_to(size_t value) {
+    m_impl->m_queue->return_to(value);
+}
+
 } // namespace utils
 } // namespace genai
 } // namespace ov
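
Note: behind CircularBufferQueue, the contract of RequestIdxQueue is simply a blocking
pool of the indices 0..concurrency_limit-1. An equivalent, self-contained sketch with a
mutex and condition variable (BlockingIdxQueue is illustrative, not the implementation
above):

    #include <condition_variable>
    #include <cstddef>
    #include <deque>
    #include <mutex>

    class BlockingIdxQueue {
        std::deque<std::size_t> m_free;
        std::mutex m_mutex;
        std::condition_variable m_cv;
    public:
        explicit BlockingIdxQueue(std::size_t concurrency_limit) {
            for (std::size_t i = 0; i < concurrency_limit; ++i)
                m_free.push_back(i);  // indices 0..limit-1 start out free
        }

        // Blocks the caller until some index is free, then leases it.
        std::size_t get() {
            std::unique_lock<std::mutex> lock(m_mutex);
            m_cv.wait(lock, [this] { return !m_free.empty(); });
            std::size_t idx = m_free.front();
            m_free.pop_front();
            return idx;
        }

        // Hands a leased index back and wakes one blocked caller.
        void return_to(std::size_t idx) {
            {
                std::lock_guard<std::mutex> lock(m_mutex);
                m_free.push_back(idx);
            }
            m_cv.notify_one();
        }
    };
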
diff --git a/src/cpp/src/utils.hpp b/src/cpp/src/utils.hpp
index 8891c1d42b..7bd26df94b 100644
--- a/src/cpp/src/utils.hpp
+++ b/src/cpp/src/utils.hpp
@@ -3,6 +3,7 @@
 #pragma once
 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -223,6 +224,22 @@ const ModelsMap::mapped_type& get_model_weights_pair(const ModelsMap& models_map,
 
 std::pair<ov::AnyMap, SchedulerConfig> extract_scheduler_config(const ov::AnyMap& properties, std::optional<SchedulerConfig> default_config = std::nullopt);
 
+
+class RequestIdxQueueImpl;
+
+class RequestIdxQueue {
+    std::shared_ptr<RequestIdxQueueImpl> m_impl;
+public:
+    RequestIdxQueue(size_t concurrency_limit);
+    ~RequestIdxQueue() = default;
+
+    // Blocking call: acquires a free request index
+    size_t get();
+
+    // Returns a previously acquired request index to the queue
+    void return_to(size_t value);
+};
+
 } // namespace utils
 } // namespace genai
 } // namespace ov
diff --git a/src/python/py_image_generation_models.cpp b/src/python/py_image_generation_models.cpp
index 0b5f4fee48..6cd070ca53 100644
--- a/src/python/py_image_generation_models.cpp
+++ b/src/python/py_image_generation_models.cpp
@@ -69,14 +69,15 @@ void init_clip_text_model(py::module_& m) {
 
     clip_text_model.def("get_config", &ov::genai::CLIPTextModel::get_config)
         .def("reshape", &ov::genai::CLIPTextModel::reshape, py::arg("batch_size"))
-        .def("set_adapters", &ov::genai::CLIPTextModel::set_adapters, py::arg("adapters"))
+        .def("set_adapters", &ov::genai::CLIPTextModel::set_adapters, py::arg("adapters"), py::arg("request_idx") = 0)
         .def("infer",
             &ov::genai::CLIPTextModel::infer,
             py::call_guard<py::gil_scoped_release>(),
             py::arg("pos_prompt"),
             py::arg("neg_prompt"),
-            py::arg("do_classifier_free_guidance"))
-        .def("get_output_tensor", &ov::genai::CLIPTextModel::get_output_tensor, py::arg("idx"))
+            py::arg("do_classifier_free_guidance"),
+            py::arg("request_idx") = 0)
+        .def("get_output_tensor", &ov::genai::CLIPTextModel::get_output_tensor, py::arg("idx"), py::arg("request_idx") = 0)
         .def(
             "compile",
             [](ov::genai::CLIPTextModel& self,
@@ -279,13 +280,14 @@ void init_unet2d_condition_model(py::module_& m) {
 
     unet2d_condition_model.def("get_config", &ov::genai::UNet2DConditionModel::get_config)
         .def("reshape", &ov::genai::UNet2DConditionModel::reshape, py::arg("batch_size"), py::arg("height"), py::arg("width"), py::arg("tokenizer_model_max_length"))
-        .def("set_adapters", &ov::genai::UNet2DConditionModel::set_adapters, py::arg("adapters"))
+        .def("set_adapters", &ov::genai::UNet2DConditionModel::set_adapters, py::arg("adapters"), py::arg("request_idx") = 0)
         .def("infer",
            &ov::genai::UNet2DConditionModel::infer,
            py::call_guard<py::gil_scoped_release>(),
            py::arg("sample"),
-           py::arg("timestep"))
-        .def("set_hidden_states", &ov::genai::UNet2DConditionModel::set_hidden_states, py::arg("tensor_name"), py::arg("encoder_hidden_states"))
+           py::arg("timestep"),
+           py::arg("request_idx") = 0)
+        .def("set_hidden_states", &ov::genai::UNet2DConditionModel::set_hidden_states, py::arg("tensor_name"), py::arg("encoder_hidden_states"), py::arg("request_idx") = 0)
         .def("do_classifier_free_guidance", &ov::genai::UNet2DConditionModel::do_classifier_free_guidance, py::arg("guidance_scale"))
         .def(
             "compile",
@@ -544,8 +546,8 @@ void init_autoencoder_kl(py::module_& m) {
                 device (str): Device to run the model on (e.g., CPU, GPU).
                 kwargs: Device properties.
         )")
-        .def("decode", &ov::genai::AutoencoderKL::decode, py::call_guard<py::gil_scoped_release>(), py::arg("latent"))
-        .def("encode", &ov::genai::AutoencoderKL::encode, py::call_guard<py::gil_scoped_release>(), py::arg("image"), py::arg("generator"))
+        .def("decode", &ov::genai::AutoencoderKL::decode, py::call_guard<py::gil_scoped_release>(), py::arg("latent"), py::arg("request_idx") = 0)
+        .def("encode", &ov::genai::AutoencoderKL::encode, py::call_guard<py::gil_scoped_release>(), py::arg("image"), py::arg("generator"), py::arg("request_idx") = 0)
         .def("get_config", &ov::genai::AutoencoderKL::get_config)
        .def("get_vae_scale_factor", &ov::genai::AutoencoderKL::get_vae_scale_factor);
 }
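
Note: every binding above stays backward compatible from Python because the new
argument is a defaulted trailing py::arg ("request_idx" = 0). Minimal stand-alone
pybind11 sketch of the pattern (module and function names are illustrative):

    #include <cstddef>

    #include <pybind11/pybind11.h>
    namespace py = pybind11;

    // decode(latent) keeps working; decode(latent, request_idx=2) picks a slot.
    int decode(int latent, std::size_t request_idx) {
        return latent + static_cast<int>(request_idx);
    }

    PYBIND11_MODULE(example, m) {
        m.def("decode", &decode, py::arg("latent"), py::arg("request_idx") = 0);
    }
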
)") - .def("decode", &ov::genai::AutoencoderKL::decode, py::call_guard(), py::arg("latent")) - .def("encode", &ov::genai::AutoencoderKL::encode, py::call_guard(), py::arg("image"), py::arg("generator")) + .def("decode", &ov::genai::AutoencoderKL::decode, py::call_guard(), py::arg("latent"), py::arg("request_idx") = 0) + .def("encode", &ov::genai::AutoencoderKL::encode, py::call_guard(), py::arg("image"), py::arg("generator"), py::arg("request_idx") = 0) .def("get_config", &ov::genai::AutoencoderKL::get_config) .def("get_vae_scale_factor", &ov::genai::AutoencoderKL::get_vae_scale_factor); } diff --git a/tests/cpp/scheduler.cpp b/tests/cpp/scheduler.cpp index 40ea18381e..cd4cbc3a67 100644 --- a/tests/cpp/scheduler.cpp +++ b/tests/cpp/scheduler.cpp @@ -974,7 +974,7 @@ TEST(TestScheduler, FullyPreemptsCacheEvictedSequences) { // mock-generate 4 more tokens in the 1-st sequence group so that the remaining 2 blocks are filled up std::vector first_seq_group_only = { requests[0] }; - for (size_t i = 0; i < 4; i++) { + for (size_t i = 0; i < 4 /* To be passed in constructor after idea is approved */; i++) { // Since eviction arena size is less than the cache_size - BLOCK_SIZE, no preemption is expected to occur yet // - tokens are added 1 by 1 and once a new block fills, an older one is evicted automatically _schedule_one_mock_generation_token_for_each_sequence_group(scheduler, first_seq_group_only);